Merge branch 'for-linus-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

+1

fs/btrfs/async-thread.c

··· 85 85 BTRFS_WORK_HELPER(scrub_helper); 86 86 BTRFS_WORK_HELPER(scrubwrc_helper); 87 87 BTRFS_WORK_HELPER(scrubnc_helper); 88 + BTRFS_WORK_HELPER(scrubparity_helper); 88 89 89 90 static struct __btrfs_workqueue * 90 91 __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,

+2

fs/btrfs/async-thread.h

··· 64 64 BTRFS_WORK_HELPER_PROTO(scrub_helper); 65 65 BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); 66 66 BTRFS_WORK_HELPER_PROTO(scrubnc_helper); 67 + BTRFS_WORK_HELPER_PROTO(scrubparity_helper); 68 + 67 69 68 70 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, 69 71 unsigned int flags,

+41 -18

fs/btrfs/backref.c

··· 250 250 * the first item to check. But sometimes, we may enter it with 251 251 * slot==nritems. In that case, go to the next leaf before we continue. 252 252 */ 253 - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 254 - ret = btrfs_next_old_leaf(root, path, time_seq); 253 + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 254 + if (time_seq == (u64)-1) 255 + ret = btrfs_next_leaf(root, path); 256 + else 257 + ret = btrfs_next_old_leaf(root, path, time_seq); 258 + } 255 259 256 260 while (!ret && count < total_refs) { 257 261 eb = path->nodes[0]; ··· 295 291 eie = NULL; 296 292 } 297 293 next: 298 - ret = btrfs_next_old_item(root, path, time_seq); 294 + if (time_seq == (u64)-1) 295 + ret = btrfs_next_item(root, path); 296 + else 297 + ret = btrfs_next_old_item(root, path, time_seq); 299 298 } 300 299 301 300 if (ret > 0) ··· 341 334 342 335 if (path->search_commit_root) 343 336 root_level = btrfs_header_level(root->commit_root); 337 + else if (time_seq == (u64)-1) 338 + root_level = btrfs_header_level(root->node); 344 339 else 345 340 root_level = btrfs_old_root_level(root, time_seq); 346 341 ··· 352 343 } 353 344 354 345 path->lowest_level = level; 355 - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); 346 + if (time_seq == (u64)-1) 347 + ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, 348 + 0, 0); 349 + else 350 + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, 351 + time_seq); 356 352 357 353 /* root node has been locked, we can release @subvol_srcu safely here */ 358 354 srcu_read_unlock(&fs_info->subvol_srcu, index); ··· 505 491 BUG_ON(!ref->wanted_disk_byte); 506 492 eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, 507 493 0); 508 - if (!eb || !extent_buffer_uptodate(eb)) { 494 + if (IS_ERR(eb)) { 495 + return PTR_ERR(eb); 496 + } else if (!extent_buffer_uptodate(eb)) { 509 497 free_extent_buffer(eb); 510 498 return -EIO; 511 499 } ··· 523 507 } 524 508 525 509 /* 526 - * merge two lists of backrefs and adjust counts accordingly 510 + * merge backrefs and adjust counts accordingly 527 511 * 528 512 * mode = 1: merge identical keys, if key is set 529 513 * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. ··· 551 535 552 536 ref2 = list_entry(pos2, struct __prelim_ref, list); 553 537 538 + if (!ref_for_same_block(ref1, ref2)) 539 + continue; 554 540 if (mode == 1) { 555 - if (!ref_for_same_block(ref1, ref2)) 556 - continue; 557 541 if (!ref1->parent && ref2->parent) { 558 542 xchg = ref1; 559 543 ref1 = ref2; ··· 588 572 struct list_head *prefs, u64 *total_refs, 589 573 u64 inum) 590 574 { 575 + struct btrfs_delayed_ref_node *node; 591 576 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 592 - struct rb_node *n = &head->node.rb_node; 593 577 struct btrfs_key key; 594 578 struct btrfs_key op_key = {0}; 595 579 int sgn; ··· 599 583 btrfs_disk_key_to_cpu(&op_key, &extent_op->key); 600 584 601 585 spin_lock(&head->lock); 602 - n = rb_first(&head->ref_root); 603 - while (n) { 604 - struct btrfs_delayed_ref_node *node; 605 - node = rb_entry(n, struct btrfs_delayed_ref_node, 606 - rb_node); 607 - n = rb_next(n); 586 + list_for_each_entry(node, &head->ref_list, list) { 608 587 if (node->seq > seq) 609 588 continue; 610 589 ··· 893 882 * 894 883 * NOTE: This can return values > 0 895 884 * 885 + * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave 886 + * much like trans == NULL case, the difference only lies in it will not 887 + * commit root. 888 + * The special case is for qgroup to search roots in commit_transaction(). 889 + * 896 890 * FIXME some caching might speed things up 897 891 */ 898 892 static int find_parent_nodes(struct btrfs_trans_handle *trans, ··· 936 920 path->skip_locking = 1; 937 921 } 938 922 923 + if (time_seq == (u64)-1) 924 + path->skip_locking = 1; 925 + 939 926 /* 940 927 * grab both a lock on the path and a lock on the delayed ref head. 941 928 * We need both to get a consistent picture of how the refs look ··· 953 934 BUG_ON(ret == 0); 954 935 955 936 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 956 - if (trans && likely(trans->type != __TRANS_DUMMY)) { 937 + if (trans && likely(trans->type != __TRANS_DUMMY) && 938 + time_seq != (u64)-1) { 957 939 #else 958 - if (trans) { 940 + if (trans && time_seq != (u64)-1) { 959 941 #endif 960 942 /* 961 943 * look if there are updates for this ref queued and lock the ··· 1054 1034 1055 1035 eb = read_tree_block(fs_info->extent_root, 1056 1036 ref->parent, 0); 1057 - if (!eb || !extent_buffer_uptodate(eb)) { 1037 + if (IS_ERR(eb)) { 1038 + ret = PTR_ERR(eb); 1039 + goto out; 1040 + } else if (!extent_buffer_uptodate(eb)) { 1058 1041 free_extent_buffer(eb); 1059 1042 ret = -EIO; 1060 1043 goto out;

+10 -6

fs/btrfs/ctree.c

··· 1439 1439 btrfs_tree_read_unlock(eb_root); 1440 1440 free_extent_buffer(eb_root); 1441 1441 old = read_tree_block(root, logical, 0); 1442 - if (WARN_ON(!old || !extent_buffer_uptodate(old))) { 1443 - free_extent_buffer(old); 1442 + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { 1443 + if (!IS_ERR(old)) 1444 + free_extent_buffer(old); 1444 1445 btrfs_warn(root->fs_info, 1445 1446 "failed to read tree block %llu from get_old_root", logical); 1446 1447 } else { ··· 1686 1685 if (!cur || !uptodate) { 1687 1686 if (!cur) { 1688 1687 cur = read_tree_block(root, blocknr, gen); 1689 - if (!cur || !extent_buffer_uptodate(cur)) { 1688 + if (IS_ERR(cur)) { 1689 + return PTR_ERR(cur); 1690 + } else if (!extent_buffer_uptodate(cur)) { 1690 1691 free_extent_buffer(cur); 1691 1692 return -EIO; 1692 1693 } ··· 1867 1864 1868 1865 eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), 1869 1866 btrfs_node_ptr_generation(parent, slot)); 1870 - if (eb && !extent_buffer_uptodate(eb)) { 1871 - free_extent_buffer(eb); 1867 + if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { 1868 + if (!IS_ERR(eb)) 1869 + free_extent_buffer(eb); 1872 1870 eb = NULL; 1873 1871 } 1874 1872 ··· 2498 2494 2499 2495 ret = -EAGAIN; 2500 2496 tmp = read_tree_block(root, blocknr, 0); 2501 - if (tmp) { 2497 + if (!IS_ERR(tmp)) { 2502 2498 /* 2503 2499 * If the read above didn't mark this buffer up to date, 2504 2500 * it will never end up being up to date. Set ret to EIO now

+20 -8

fs/btrfs/ctree.h

··· 174 174 /* csum types */ 175 175 #define BTRFS_CSUM_TYPE_CRC32 0 176 176 177 - static int btrfs_csum_sizes[] = { 4, 0 }; 177 + static int btrfs_csum_sizes[] = { 4 }; 178 178 179 179 /* four bytes for CRC32 */ 180 180 #define BTRFS_EMPTY_DIR_SIZE 0 ··· 1619 1619 struct task_struct *cleaner_kthread; 1620 1620 int thread_pool_size; 1621 1621 1622 - struct kobject super_kobj; 1623 1622 struct kobject *space_info_kobj; 1624 - struct kobject *device_dir_kobj; 1625 - struct completion kobj_unregister; 1626 1623 int do_barriers; 1627 1624 int closing; 1628 1625 int log_root_recovering; ··· 1695 1698 struct btrfs_workqueue *scrub_workers; 1696 1699 struct btrfs_workqueue *scrub_wr_completion_workers; 1697 1700 struct btrfs_workqueue *scrub_nocow_workers; 1701 + struct btrfs_workqueue *scrub_parity_workers; 1698 1702 1699 1703 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1700 1704 u32 check_integrity_print_mask; ··· 1733 1735 /* list of dirty qgroups to be written at next commit */ 1734 1736 struct list_head dirty_qgroups; 1735 1737 1736 - /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ 1738 + /* used by qgroup for an efficient tree traversal */ 1737 1739 u64 qgroup_seq; 1738 1740 1739 1741 /* qgroup rescan items */ ··· 3456 3458 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 3457 3459 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3458 3460 struct btrfs_root *root); 3461 + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); 3459 3462 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 3460 3463 struct inode *inode); 3461 3464 void btrfs_orphan_release_metadata(struct inode *inode); ··· 3514 3515 int __get_raid_index(u64 flags); 3515 3516 int btrfs_start_write_no_snapshoting(struct btrfs_root *root); 3516 3517 void btrfs_end_write_no_snapshoting(struct btrfs_root *root); 3518 + void check_system_chunk(struct btrfs_trans_handle *trans, 3519 + struct btrfs_root *root, 3520 + const u64 type); 3517 3521 /* ctree.c */ 3518 3522 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3519 3523 int level, int *slot); ··· 4052 4050 4053 4051 #ifdef CONFIG_BTRFS_ASSERT 4054 4052 4053 + __cold 4055 4054 static inline void assfail(char *expr, char *file, int line) 4056 4055 { 4057 4056 pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", ··· 4068 4065 4069 4066 #define btrfs_assert() 4070 4067 __printf(5, 6) 4068 + __cold 4071 4069 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, 4072 4070 unsigned int line, int errno, const char *fmt, ...); 4073 4071 4074 4072 4073 + __cold 4075 4074 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, 4076 4075 struct btrfs_root *root, const char *function, 4077 4076 unsigned int line, int errno); ··· 4116 4111 * Call btrfs_abort_transaction as early as possible when an error condition is 4117 4112 * detected, that way the exact line number is reported. 4118 4113 */ 4119 - 4120 4114 #define btrfs_abort_transaction(trans, root, errno) \ 4121 4115 do { \ 4122 - __btrfs_abort_transaction(trans, root, __func__, \ 4123 - __LINE__, errno); \ 4116 + /* Report first abort since mount */ \ 4117 + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ 4118 + &((root)->fs_info->fs_state))) { \ 4119 + WARN(1, KERN_DEBUG \ 4120 + "BTRFS: Transaction aborted (error %d)\n", \ 4121 + (errno)); \ 4122 + } \ 4123 + __btrfs_abort_transaction((trans), (root), __func__, \ 4124 + __LINE__, (errno)); \ 4124 4125 } while (0) 4125 4126 4126 4127 #define btrfs_std_error(fs_info, errno) \ ··· 4143 4132 } while (0) 4144 4133 4145 4134 __printf(5, 6) 4135 + __cold 4146 4136 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, 4147 4137 unsigned int line, int errno, const char *fmt, ...); 4148 4138

+122 -250

fs/btrfs/delayed-ref.c

··· 22 22 #include "ctree.h" 23 23 #include "delayed-ref.h" 24 24 #include "transaction.h" 25 + #include "qgroup.h" 25 26 26 27 struct kmem_cache *btrfs_delayed_ref_head_cachep; 27 28 struct kmem_cache *btrfs_delayed_tree_ref_cachep; ··· 83 82 return 1; 84 83 } 85 84 return 0; 86 - } 87 - 88 - /* 89 - * entries in the rb tree are ordered by the byte number of the extent, 90 - * type of the delayed backrefs and content of delayed backrefs. 91 - */ 92 - static int comp_entry(struct btrfs_delayed_ref_node *ref2, 93 - struct btrfs_delayed_ref_node *ref1, 94 - bool compare_seq) 95 - { 96 - if (ref1->bytenr < ref2->bytenr) 97 - return -1; 98 - if (ref1->bytenr > ref2->bytenr) 99 - return 1; 100 - if (ref1->is_head && ref2->is_head) 101 - return 0; 102 - if (ref2->is_head) 103 - return -1; 104 - if (ref1->is_head) 105 - return 1; 106 - if (ref1->type < ref2->type) 107 - return -1; 108 - if (ref1->type > ref2->type) 109 - return 1; 110 - if (ref1->no_quota > ref2->no_quota) 111 - return 1; 112 - if (ref1->no_quota < ref2->no_quota) 113 - return -1; 114 - /* merging of sequenced refs is not allowed */ 115 - if (compare_seq) { 116 - if (ref1->seq < ref2->seq) 117 - return -1; 118 - if (ref1->seq > ref2->seq) 119 - return 1; 120 - } 121 - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 122 - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { 123 - return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), 124 - btrfs_delayed_node_to_tree_ref(ref1), 125 - ref1->type); 126 - } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || 127 - ref1->type == BTRFS_SHARED_DATA_REF_KEY) { 128 - return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), 129 - btrfs_delayed_node_to_data_ref(ref1)); 130 - } 131 - BUG(); 132 - return 0; 133 - } 134 - 135 - /* 136 - * insert a new ref into the rbtree. This returns any existing refs 137 - * for the same (bytenr,parent) tuple, or NULL if the new node was properly 138 - * inserted. 139 - */ 140 - static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, 141 - struct rb_node *node) 142 - { 143 - struct rb_node **p = &root->rb_node; 144 - struct rb_node *parent_node = NULL; 145 - struct btrfs_delayed_ref_node *entry; 146 - struct btrfs_delayed_ref_node *ins; 147 - int cmp; 148 - 149 - ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 150 - while (*p) { 151 - parent_node = *p; 152 - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, 153 - rb_node); 154 - 155 - cmp = comp_entry(entry, ins, 1); 156 - if (cmp < 0) 157 - p = &(*p)->rb_left; 158 - else if (cmp > 0) 159 - p = &(*p)->rb_right; 160 - else 161 - return entry; 162 - } 163 - 164 - rb_link_node(node, parent_node, p); 165 - rb_insert_color(node, root); 166 - return NULL; 167 85 } 168 86 169 87 /* insert a new ref to head ref rbtree */ ··· 188 268 rb_erase(&head->href_node, &delayed_refs->href_root); 189 269 } else { 190 270 assert_spin_locked(&head->lock); 191 - rb_erase(&ref->rb_node, &head->ref_root); 271 + list_del(&ref->list); 192 272 } 193 273 ref->in_tree = 0; 194 274 btrfs_put_delayed_ref(ref); 195 275 atomic_dec(&delayed_refs->num_entries); 196 276 if (trans->delayed_ref_updates) 197 277 trans->delayed_ref_updates--; 198 - } 199 - 200 - static int merge_ref(struct btrfs_trans_handle *trans, 201 - struct btrfs_delayed_ref_root *delayed_refs, 202 - struct btrfs_delayed_ref_head *head, 203 - struct btrfs_delayed_ref_node *ref, u64 seq) 204 - { 205 - struct rb_node *node; 206 - int mod = 0; 207 - int done = 0; 208 - 209 - node = rb_next(&ref->rb_node); 210 - while (!done && node) { 211 - struct btrfs_delayed_ref_node *next; 212 - 213 - next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 214 - node = rb_next(node); 215 - if (seq && next->seq >= seq) 216 - break; 217 - if (comp_entry(ref, next, 0)) 218 - continue; 219 - 220 - if (ref->action == next->action) { 221 - mod = next->ref_mod; 222 - } else { 223 - if (ref->ref_mod < next->ref_mod) { 224 - struct btrfs_delayed_ref_node *tmp; 225 - 226 - tmp = ref; 227 - ref = next; 228 - next = tmp; 229 - done = 1; 230 - } 231 - mod = -next->ref_mod; 232 - } 233 - 234 - drop_delayed_ref(trans, delayed_refs, head, next); 235 - ref->ref_mod += mod; 236 - if (ref->ref_mod == 0) { 237 - drop_delayed_ref(trans, delayed_refs, head, ref); 238 - done = 1; 239 - } else { 240 - /* 241 - * You can't have multiples of the same ref on a tree 242 - * block. 243 - */ 244 - WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || 245 - ref->type == BTRFS_SHARED_BLOCK_REF_KEY); 246 - } 247 - } 248 - return done; 249 - } 250 - 251 - void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, 252 - struct btrfs_fs_info *fs_info, 253 - struct btrfs_delayed_ref_root *delayed_refs, 254 - struct btrfs_delayed_ref_head *head) 255 - { 256 - struct rb_node *node; 257 - u64 seq = 0; 258 - 259 - assert_spin_locked(&head->lock); 260 - /* 261 - * We don't have too much refs to merge in the case of delayed data 262 - * refs. 263 - */ 264 - if (head->is_data) 265 - return; 266 - 267 - spin_lock(&fs_info->tree_mod_seq_lock); 268 - if (!list_empty(&fs_info->tree_mod_seq_list)) { 269 - struct seq_list *elem; 270 - 271 - elem = list_first_entry(&fs_info->tree_mod_seq_list, 272 - struct seq_list, list); 273 - seq = elem->seq; 274 - } 275 - spin_unlock(&fs_info->tree_mod_seq_lock); 276 - 277 - node = rb_first(&head->ref_root); 278 - while (node) { 279 - struct btrfs_delayed_ref_node *ref; 280 - 281 - ref = rb_entry(node, struct btrfs_delayed_ref_node, 282 - rb_node); 283 - /* We can't merge refs that are outside of our seq count */ 284 - if (seq && ref->seq >= seq) 285 - break; 286 - if (merge_ref(trans, delayed_refs, head, ref, seq)) 287 - node = rb_first(&head->ref_root); 288 - else 289 - node = rb_next(&ref->rb_node); 290 - } 291 278 } 292 279 293 280 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, ··· 270 443 } 271 444 272 445 /* 273 - * helper function to update an extent delayed ref in the 274 - * rbtree. existing and update must both have the same 275 - * bytenr and parent 446 + * Helper to insert the ref_node to the tail or merge with tail. 276 447 * 277 - * This may free existing if the update cancels out whatever 278 - * operation it was doing. 448 + * Return 0 for insert. 449 + * Return >0 for merge. 279 450 */ 280 - static noinline void 281 - update_existing_ref(struct btrfs_trans_handle *trans, 282 - struct btrfs_delayed_ref_root *delayed_refs, 283 - struct btrfs_delayed_ref_head *head, 284 - struct btrfs_delayed_ref_node *existing, 285 - struct btrfs_delayed_ref_node *update) 451 + static int 452 + add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, 453 + struct btrfs_delayed_ref_root *root, 454 + struct btrfs_delayed_ref_head *href, 455 + struct btrfs_delayed_ref_node *ref) 286 456 { 287 - if (update->action != existing->action) { 288 - /* 289 - * this is effectively undoing either an add or a 290 - * drop. We decrement the ref_mod, and if it goes 291 - * down to zero we just delete the entry without 292 - * every changing the extent allocation tree. 293 - */ 294 - existing->ref_mod--; 295 - if (existing->ref_mod == 0) 296 - drop_delayed_ref(trans, delayed_refs, head, existing); 297 - else 298 - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || 299 - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); 457 + struct btrfs_delayed_ref_node *exist; 458 + int mod; 459 + int ret = 0; 460 + 461 + spin_lock(&href->lock); 462 + /* Check whether we can merge the tail node with ref */ 463 + if (list_empty(&href->ref_list)) 464 + goto add_tail; 465 + exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, 466 + list); 467 + /* No need to compare bytenr nor is_head */ 468 + if (exist->type != ref->type || exist->no_quota != ref->no_quota || 469 + exist->seq != ref->seq) 470 + goto add_tail; 471 + 472 + if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || 473 + exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && 474 + comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), 475 + btrfs_delayed_node_to_tree_ref(ref), 476 + ref->type)) 477 + goto add_tail; 478 + if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || 479 + exist->type == BTRFS_SHARED_DATA_REF_KEY) && 480 + comp_data_refs(btrfs_delayed_node_to_data_ref(exist), 481 + btrfs_delayed_node_to_data_ref(ref))) 482 + goto add_tail; 483 + 484 + /* Now we are sure we can merge */ 485 + ret = 1; 486 + if (exist->action == ref->action) { 487 + mod = ref->ref_mod; 300 488 } else { 301 - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || 302 - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); 303 - /* 304 - * the action on the existing ref matches 305 - * the action on the ref we're trying to add. 306 - * Bump the ref_mod by one so the backref that 307 - * is eventually added/removed has the correct 308 - * reference count 309 - */ 310 - existing->ref_mod += update->ref_mod; 489 + /* Need to change action */ 490 + if (exist->ref_mod < ref->ref_mod) { 491 + exist->action = ref->action; 492 + mod = -exist->ref_mod; 493 + exist->ref_mod = ref->ref_mod; 494 + } else 495 + mod = -ref->ref_mod; 311 496 } 497 + exist->ref_mod += mod; 498 + 499 + /* remove existing tail if its ref_mod is zero */ 500 + if (exist->ref_mod == 0) 501 + drop_delayed_ref(trans, root, href, exist); 502 + spin_unlock(&href->lock); 503 + return ret; 504 + 505 + add_tail: 506 + list_add_tail(&ref->list, &href->ref_list); 507 + atomic_inc(&root->num_entries); 508 + trans->delayed_ref_updates++; 509 + spin_unlock(&href->lock); 510 + return ret; 312 511 } 313 512 314 513 /* ··· 421 568 static noinline struct btrfs_delayed_ref_head * 422 569 add_delayed_ref_head(struct btrfs_fs_info *fs_info, 423 570 struct btrfs_trans_handle *trans, 424 - struct btrfs_delayed_ref_node *ref, u64 bytenr, 425 - u64 num_bytes, int action, int is_data) 571 + struct btrfs_delayed_ref_node *ref, 572 + struct btrfs_qgroup_extent_record *qrecord, 573 + u64 bytenr, u64 num_bytes, int action, int is_data) 426 574 { 427 575 struct btrfs_delayed_ref_head *existing; 428 576 struct btrfs_delayed_ref_head *head_ref = NULL; 429 577 struct btrfs_delayed_ref_root *delayed_refs; 578 + struct btrfs_qgroup_extent_record *qexisting; 430 579 int count_mod = 1; 431 580 int must_insert_reserved = 0; 432 581 ··· 473 618 head_ref = btrfs_delayed_node_to_head(ref); 474 619 head_ref->must_insert_reserved = must_insert_reserved; 475 620 head_ref->is_data = is_data; 476 - head_ref->ref_root = RB_ROOT; 621 + INIT_LIST_HEAD(&head_ref->ref_list); 477 622 head_ref->processing = 0; 478 623 head_ref->total_ref_mod = count_mod; 624 + 625 + /* Record qgroup extent info if provided */ 626 + if (qrecord) { 627 + qrecord->bytenr = bytenr; 628 + qrecord->num_bytes = num_bytes; 629 + qrecord->old_roots = NULL; 630 + 631 + qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, 632 + qrecord); 633 + if (qexisting) 634 + kfree(qrecord); 635 + } 479 636 480 637 spin_lock_init(&head_ref->lock); 481 638 mutex_init(&head_ref->mutex); ··· 526 659 u64 num_bytes, u64 parent, u64 ref_root, int level, 527 660 int action, int no_quota) 528 661 { 529 - struct btrfs_delayed_ref_node *existing; 530 662 struct btrfs_delayed_tree_ref *full_ref; 531 663 struct btrfs_delayed_ref_root *delayed_refs; 532 664 u64 seq = 0; 665 + int ret; 533 666 534 667 if (action == BTRFS_ADD_DELAYED_EXTENT) 535 668 action = BTRFS_ADD_DELAYED_REF; ··· 560 693 561 694 trace_add_delayed_tree_ref(ref, full_ref, action); 562 695 563 - spin_lock(&head_ref->lock); 564 - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); 565 - if (existing) { 566 - update_existing_ref(trans, delayed_refs, head_ref, existing, 567 - ref); 568 - /* 569 - * we've updated the existing ref, free the newly 570 - * allocated ref 571 - */ 696 + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); 697 + 698 + /* 699 + * XXX: memory should be freed at the same level allocated. 700 + * But bad practice is anywhere... Follow it now. Need cleanup. 701 + */ 702 + if (ret > 0) 572 703 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); 573 - } else { 574 - atomic_inc(&delayed_refs->num_entries); 575 - trans->delayed_ref_updates++; 576 - } 577 - spin_unlock(&head_ref->lock); 578 704 } 579 705 580 706 /* ··· 581 721 u64 num_bytes, u64 parent, u64 ref_root, u64 owner, 582 722 u64 offset, int action, int no_quota) 583 723 { 584 - struct btrfs_delayed_ref_node *existing; 585 724 struct btrfs_delayed_data_ref *full_ref; 586 725 struct btrfs_delayed_ref_root *delayed_refs; 587 726 u64 seq = 0; 727 + int ret; 588 728 589 729 if (action == BTRFS_ADD_DELAYED_EXTENT) 590 730 action = BTRFS_ADD_DELAYED_REF; ··· 618 758 619 759 trace_add_delayed_data_ref(ref, full_ref, action); 620 760 621 - spin_lock(&head_ref->lock); 622 - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); 623 - if (existing) { 624 - update_existing_ref(trans, delayed_refs, head_ref, existing, 625 - ref); 626 - /* 627 - * we've updated the existing ref, free the newly 628 - * allocated ref 629 - */ 761 + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); 762 + 763 + if (ret > 0) 630 764 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); 631 - } else { 632 - atomic_inc(&delayed_refs->num_entries); 633 - trans->delayed_ref_updates++; 634 - } 635 - spin_unlock(&head_ref->lock); 636 765 } 637 766 638 767 /* ··· 639 790 struct btrfs_delayed_tree_ref *ref; 640 791 struct btrfs_delayed_ref_head *head_ref; 641 792 struct btrfs_delayed_ref_root *delayed_refs; 793 + struct btrfs_qgroup_extent_record *record = NULL; 642 794 643 795 if (!is_fstree(ref_root) || !fs_info->quota_enabled) 644 796 no_quota = 0; ··· 650 800 return -ENOMEM; 651 801 652 802 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); 653 - if (!head_ref) { 654 - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); 655 - return -ENOMEM; 803 + if (!head_ref) 804 + goto free_ref; 805 + 806 + if (fs_info->quota_enabled && is_fstree(ref_root)) { 807 + record = kmalloc(sizeof(*record), GFP_NOFS); 808 + if (!record) 809 + goto free_head_ref; 656 810 } 657 811 658 812 head_ref->extent_op = extent_op; ··· 668 814 * insert both the head node and the new ref without dropping 669 815 * the spin lock 670 816 */ 671 - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, 817 + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, 672 818 bytenr, num_bytes, action, 0); 673 819 674 820 add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, ··· 677 823 spin_unlock(&delayed_refs->lock); 678 824 679 825 return 0; 826 + 827 + free_head_ref: 828 + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 829 + free_ref: 830 + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); 831 + 832 + return -ENOMEM; 680 833 } 681 834 682 835 /* ··· 700 839 struct btrfs_delayed_data_ref *ref; 701 840 struct btrfs_delayed_ref_head *head_ref; 702 841 struct btrfs_delayed_ref_root *delayed_refs; 842 + struct btrfs_qgroup_extent_record *record = NULL; 703 843 704 844 if (!is_fstree(ref_root) || !fs_info->quota_enabled) 705 845 no_quota = 0; ··· 716 854 return -ENOMEM; 717 855 } 718 856 857 + if (fs_info->quota_enabled && is_fstree(ref_root)) { 858 + record = kmalloc(sizeof(*record), GFP_NOFS); 859 + if (!record) { 860 + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); 861 + kmem_cache_free(btrfs_delayed_ref_head_cachep, 862 + head_ref); 863 + return -ENOMEM; 864 + } 865 + } 866 + 719 867 head_ref->extent_op = extent_op; 720 868 721 869 delayed_refs = &trans->transaction->delayed_refs; ··· 735 863 * insert both the head node and the new ref without dropping 736 864 * the spin lock 737 865 */ 738 - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, 866 + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, 739 867 bytenr, num_bytes, action, 1); 740 868 741 869 add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, ··· 763 891 delayed_refs = &trans->transaction->delayed_refs; 764 892 spin_lock(&delayed_refs->lock); 765 893 766 - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, 767 - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 768 - extent_op->is_data); 894 + add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, 895 + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 896 + extent_op->is_data); 769 897 770 898 spin_unlock(&delayed_refs->lock); 771 899 return 0;

+28 -1

fs/btrfs/delayed-ref.h

··· 24 24 #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ 25 25 #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ 26 26 27 + /* 28 + * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the 29 + * same ref_node structure. 30 + * Ref_head is in a higher logic level than tree/data ref, and duplicated 31 + * bytenr/num_bytes in ref_node is really a waste or memory, they should be 32 + * referred from ref_head. 33 + * This gets more disgusting after we use list to store tree/data ref in 34 + * ref_head. Must clean this mess up later. 35 + */ 27 36 struct btrfs_delayed_ref_node { 37 + /* 38 + * ref_head use rb tree, stored in ref_root->href. 39 + * indexed by bytenr 40 + */ 28 41 struct rb_node rb_node; 42 + 43 + /*data/tree ref use list, stored in ref_head->ref_list. */ 44 + struct list_head list; 29 45 30 46 /* the starting bytenr of the extent */ 31 47 u64 bytenr; ··· 99 83 struct mutex mutex; 100 84 101 85 spinlock_t lock; 102 - struct rb_root ref_root; 86 + struct list_head ref_list; 103 87 104 88 struct rb_node href_node; 105 89 ··· 148 132 /* head ref rbtree */ 149 133 struct rb_root href_root; 150 134 135 + /* dirty extent records */ 136 + struct rb_root dirty_extent_root; 137 + 151 138 /* this spin lock protects the rbtree and the entries inside */ 152 139 spinlock_t lock; 153 140 ··· 175 156 int flushing; 176 157 177 158 u64 run_delayed_start; 159 + 160 + /* 161 + * To make qgroup to skip given root. 162 + * This is for snapshot, as btrfs_qgroup_inherit() will manully 163 + * modify counters for snapshot and its source, so we should skip 164 + * the snapshot in new_root/old_roots or it will get calculated twice 165 + */ 166 + u64 qgroup_to_skip; 178 167 }; 179 168 180 169 extern struct kmem_cache *btrfs_delayed_ref_head_cachep;

+5 -2

fs/btrfs/dev-replace.c

··· 376 376 WARN_ON(!tgt_device); 377 377 dev_replace->tgtdev = tgt_device; 378 378 379 + ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); 380 + if (ret) 381 + btrfs_error(root->fs_info, ret, "kobj add dev failed"); 382 + 379 383 printk_in_rcu(KERN_INFO 380 384 "BTRFS: dev_replace from %s (devid %llu) to %s started\n", 381 385 src_device->missing ? "<missing disk>" : ··· 587 583 mutex_unlock(&uuid_mutex); 588 584 589 585 /* replace the sysfs entry */ 590 - btrfs_kobj_rm_device(fs_info, src_device); 591 - btrfs_kobj_add_device(fs_info, tgt_device); 586 + btrfs_kobj_rm_device(fs_info->fs_devices, src_device); 592 587 btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); 593 588 594 589 /* write back the superblocks */

+37 -19

fs/btrfs/disk-io.c

··· 1149 1149 1150 1150 buf = btrfs_find_create_tree_block(root, bytenr); 1151 1151 if (!buf) 1152 - return NULL; 1152 + return ERR_PTR(-ENOMEM); 1153 1153 1154 1154 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 1155 1155 if (ret) { 1156 1156 free_extent_buffer(buf); 1157 - return NULL; 1157 + return ERR_PTR(ret); 1158 1158 } 1159 1159 return buf; 1160 1160 ··· 1509 1509 generation = btrfs_root_generation(&root->root_item); 1510 1510 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1511 1511 generation); 1512 - if (!root->node) { 1513 - ret = -ENOMEM; 1512 + if (IS_ERR(root->node)) { 1513 + ret = PTR_ERR(root->node); 1514 1514 goto find_fail; 1515 1515 } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { 1516 1516 ret = -EIO; 1517 - goto read_fail; 1517 + free_extent_buffer(root->node); 1518 + goto find_fail; 1518 1519 } 1519 1520 root->commit_root = btrfs_root_node(root); 1520 1521 out: 1521 1522 btrfs_free_path(path); 1522 1523 return root; 1523 1524 1524 - read_fail: 1525 - free_extent_buffer(root->node); 1526 1525 find_fail: 1527 1526 kfree(root); 1528 1527 alloc_fail: ··· 2319 2320 2320 2321 log_tree_root->node = read_tree_block(tree_root, bytenr, 2321 2322 fs_info->generation + 1); 2322 - if (!log_tree_root->node || 2323 - !extent_buffer_uptodate(log_tree_root->node)) { 2323 + if (IS_ERR(log_tree_root->node)) { 2324 + printk(KERN_ERR "BTRFS: failed to read log tree\n"); 2325 + ret = PTR_ERR(log_tree_root->node); 2326 + kfree(log_tree_root); 2327 + return ret; 2328 + } else if (!extent_buffer_uptodate(log_tree_root->node)) { 2324 2329 printk(KERN_ERR "BTRFS: failed to read log tree\n"); 2325 2330 free_extent_buffer(log_tree_root->node); 2326 2331 kfree(log_tree_root); ··· 2497 2494 seqlock_init(&fs_info->profiles_lock); 2498 2495 init_rwsem(&fs_info->delayed_iput_sem); 2499 2496 2500 - init_completion(&fs_info->kobj_unregister); 2501 2497 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 2502 2498 INIT_LIST_HEAD(&fs_info->space_info); 2503 2499 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); ··· 2799 2797 chunk_root->node = read_tree_block(chunk_root, 2800 2798 btrfs_super_chunk_root(disk_super), 2801 2799 generation); 2802 - if (!chunk_root->node || 2803 - !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2800 + if (IS_ERR(chunk_root->node) || 2801 + !extent_buffer_uptodate(chunk_root->node)) { 2804 2802 printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", 2805 2803 sb->s_id); 2806 2804 goto fail_tree_roots; ··· 2836 2834 tree_root->node = read_tree_block(tree_root, 2837 2835 btrfs_super_root(disk_super), 2838 2836 generation); 2839 - if (!tree_root->node || 2840 - !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { 2837 + if (IS_ERR(tree_root->node) || 2838 + !extent_buffer_uptodate(tree_root->node)) { 2841 2839 printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", 2842 2840 sb->s_id); 2843 2841 ··· 2876 2874 2877 2875 btrfs_close_extra_devices(fs_devices, 1); 2878 2876 2877 + ret = btrfs_sysfs_add_fsid(fs_devices, NULL); 2878 + if (ret) { 2879 + pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); 2880 + goto fail_block_groups; 2881 + } 2882 + 2883 + ret = btrfs_sysfs_add_device(fs_devices); 2884 + if (ret) { 2885 + pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); 2886 + goto fail_fsdev_sysfs; 2887 + } 2888 + 2879 2889 ret = btrfs_sysfs_add_one(fs_info); 2880 2890 if (ret) { 2881 2891 pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); 2882 - goto fail_block_groups; 2892 + goto fail_fsdev_sysfs; 2883 2893 } 2884 2894 2885 2895 ret = btrfs_init_space_info(fs_info); ··· 3068 3054 3069 3055 fail_sysfs: 3070 3056 btrfs_sysfs_remove_one(fs_info); 3057 + 3058 + fail_fsdev_sysfs: 3059 + btrfs_sysfs_remove_fsid(fs_info->fs_devices); 3071 3060 3072 3061 fail_block_groups: 3073 3062 btrfs_put_block_group_cache(fs_info); ··· 3742 3725 } 3743 3726 3744 3727 btrfs_sysfs_remove_one(fs_info); 3728 + btrfs_sysfs_remove_fsid(fs_info->fs_devices); 3745 3729 3746 3730 btrfs_free_fs_roots(fs_info); 3747 3731 ··· 4071 4053 4072 4054 while ((node = rb_first(&delayed_refs->href_root)) != NULL) { 4073 4055 struct btrfs_delayed_ref_head *head; 4056 + struct btrfs_delayed_ref_node *tmp; 4074 4057 bool pin_bytes = false; 4075 4058 4076 4059 head = rb_entry(node, struct btrfs_delayed_ref_head, ··· 4087 4068 continue; 4088 4069 } 4089 4070 spin_lock(&head->lock); 4090 - while ((node = rb_first(&head->ref_root)) != NULL) { 4091 - ref = rb_entry(node, struct btrfs_delayed_ref_node, 4092 - rb_node); 4071 + list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, 4072 + list) { 4093 4073 ref->in_tree = 0; 4094 - rb_erase(&ref->rb_node, &head->ref_root); 4074 + list_del(&ref->list); 4095 4075 atomic_dec(&delayed_refs->num_entries); 4096 4076 btrfs_put_delayed_ref(ref); 4097 4077 }

+133 -175

fs/btrfs/extent-tree.c

··· 79 79 u64 num_bytes, int alloc); 80 80 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 81 81 struct btrfs_root *root, 82 - u64 bytenr, u64 num_bytes, u64 parent, 82 + struct btrfs_delayed_ref_node *node, u64 parent, 83 83 u64 root_objectid, u64 owner_objectid, 84 84 u64 owner_offset, int refs_to_drop, 85 - struct btrfs_delayed_extent_op *extra_op, 86 - int no_quota); 85 + struct btrfs_delayed_extent_op *extra_op); 87 86 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 88 87 struct extent_buffer *leaf, 89 88 struct btrfs_extent_item *ei); ··· 1966 1967 1967 1968 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1968 1969 struct btrfs_root *root, 1969 - u64 bytenr, u64 num_bytes, 1970 + struct btrfs_delayed_ref_node *node, 1970 1971 u64 parent, u64 root_objectid, 1971 1972 u64 owner, u64 offset, int refs_to_add, 1972 - int no_quota, 1973 1973 struct btrfs_delayed_extent_op *extent_op) 1974 1974 { 1975 1975 struct btrfs_fs_info *fs_info = root->fs_info; ··· 1976 1978 struct extent_buffer *leaf; 1977 1979 struct btrfs_extent_item *item; 1978 1980 struct btrfs_key key; 1981 + u64 bytenr = node->bytenr; 1982 + u64 num_bytes = node->num_bytes; 1979 1983 u64 refs; 1980 1984 int ret; 1981 - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; 1985 + int no_quota = node->no_quota; 1982 1986 1983 1987 path = btrfs_alloc_path(); 1984 1988 if (!path) ··· 1996 1996 bytenr, num_bytes, parent, 1997 1997 root_objectid, owner, offset, 1998 1998 refs_to_add, extent_op); 1999 - if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) 1999 + if ((ret < 0 && ret != -EAGAIN) || !ret) 2000 2000 goto out; 2001 - /* 2002 - * Ok we were able to insert an inline extent and it appears to be a new 2003 - * reference, deal with the qgroup accounting. 2004 - */ 2005 - if (!ret && !no_quota) { 2006 - ASSERT(root->fs_info->quota_enabled); 2007 - leaf = path->nodes[0]; 2008 - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2009 - item = btrfs_item_ptr(leaf, path->slots[0], 2010 - struct btrfs_extent_item); 2011 - if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) 2012 - type = BTRFS_QGROUP_OPER_ADD_SHARED; 2013 - btrfs_release_path(path); 2014 - 2015 - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2016 - bytenr, num_bytes, type, 0); 2017 - goto out; 2018 - } 2019 2001 2020 2002 /* 2021 2003 * Ok we had -EAGAIN which means we didn't have space to insert and ··· 2008 2026 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2009 2027 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2010 2028 refs = btrfs_extent_refs(leaf, item); 2011 - if (refs) 2012 - type = BTRFS_QGROUP_OPER_ADD_SHARED; 2013 2029 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2014 2030 if (extent_op) 2015 2031 __run_delayed_extent_op(extent_op, leaf, item); 2016 2032 2017 2033 btrfs_mark_buffer_dirty(leaf); 2018 2034 btrfs_release_path(path); 2019 - 2020 - if (!no_quota) { 2021 - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2022 - bytenr, num_bytes, type, 0); 2023 - if (ret) 2024 - goto out; 2025 - } 2026 2035 2027 2036 path->reada = 1; 2028 2037 path->leave_spinning = 1; ··· 2060 2087 ref->objectid, ref->offset, 2061 2088 &ins, node->ref_mod); 2062 2089 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2063 - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2064 - node->num_bytes, parent, 2090 + ret = __btrfs_inc_extent_ref(trans, root, node, parent, 2065 2091 ref_root, ref->objectid, 2066 2092 ref->offset, node->ref_mod, 2067 - node->no_quota, extent_op); 2093 + extent_op); 2068 2094 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2069 - ret = __btrfs_free_extent(trans, root, node->bytenr, 2070 - node->num_bytes, parent, 2095 + ret = __btrfs_free_extent(trans, root, node, parent, 2071 2096 ref_root, ref->objectid, 2072 2097 ref->offset, node->ref_mod, 2073 - extent_op, node->no_quota); 2098 + extent_op); 2074 2099 } else { 2075 2100 BUG(); 2076 2101 } ··· 2226 2255 ref->level, &ins, 2227 2256 node->no_quota); 2228 2257 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2229 - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2230 - node->num_bytes, parent, ref_root, 2231 - ref->level, 0, 1, node->no_quota, 2258 + ret = __btrfs_inc_extent_ref(trans, root, node, 2259 + parent, ref_root, 2260 + ref->level, 0, 1, 2232 2261 extent_op); 2233 2262 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2234 - ret = __btrfs_free_extent(trans, root, node->bytenr, 2235 - node->num_bytes, parent, ref_root, 2236 - ref->level, 0, 1, extent_op, 2237 - node->no_quota); 2263 + ret = __btrfs_free_extent(trans, root, node, 2264 + parent, ref_root, 2265 + ref->level, 0, 1, extent_op); 2238 2266 } else { 2239 2267 BUG(); 2240 2268 } ··· 2293 2323 return ret; 2294 2324 } 2295 2325 2296 - static noinline struct btrfs_delayed_ref_node * 2326 + static inline struct btrfs_delayed_ref_node * 2297 2327 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2298 2328 { 2299 - struct rb_node *node; 2300 - struct btrfs_delayed_ref_node *ref, *last = NULL;; 2329 + if (list_empty(&head->ref_list)) 2330 + return NULL; 2301 2331 2302 - /* 2303 - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2304 - * this prevents ref count from going down to zero when 2305 - * there still are pending delayed ref. 2306 - */ 2307 - node = rb_first(&head->ref_root); 2308 - while (node) { 2309 - ref = rb_entry(node, struct btrfs_delayed_ref_node, 2310 - rb_node); 2311 - if (ref->action == BTRFS_ADD_DELAYED_REF) 2312 - return ref; 2313 - else if (last == NULL) 2314 - last = ref; 2315 - node = rb_next(node); 2316 - } 2317 - return last; 2332 + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, 2333 + list); 2318 2334 } 2319 2335 2320 2336 /* ··· 2352 2396 } 2353 2397 } 2354 2398 2355 - /* 2356 - * We need to try and merge add/drops of the same ref since we 2357 - * can run into issues with relocate dropping the implicit ref 2358 - * and then it being added back again before the drop can 2359 - * finish. If we merged anything we need to re-loop so we can 2360 - * get a good ref. 2361 - */ 2362 2399 spin_lock(&locked_ref->lock); 2363 - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2364 - locked_ref); 2365 2400 2366 2401 /* 2367 2402 * locked_ref is the head node, so we have to go one ··· 2429 2482 spin_unlock(&locked_ref->lock); 2430 2483 spin_lock(&delayed_refs->lock); 2431 2484 spin_lock(&locked_ref->lock); 2432 - if (rb_first(&locked_ref->ref_root) || 2485 + if (!list_empty(&locked_ref->ref_list) || 2433 2486 locked_ref->extent_op) { 2434 2487 spin_unlock(&locked_ref->lock); 2435 2488 spin_unlock(&delayed_refs->lock); ··· 2443 2496 } else { 2444 2497 actual_count++; 2445 2498 ref->in_tree = 0; 2446 - rb_erase(&ref->rb_node, &locked_ref->ref_root); 2499 + list_del(&ref->list); 2447 2500 } 2448 2501 atomic_dec(&delayed_refs->num_entries); 2449 2502 ··· 2811 2864 goto again; 2812 2865 } 2813 2866 out: 2814 - ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); 2815 - if (ret) 2816 - return ret; 2817 2867 assert_qgroups_uptodate(trans); 2818 2868 return 0; 2819 2869 } ··· 2849 2905 struct btrfs_delayed_ref_node *ref; 2850 2906 struct btrfs_delayed_data_ref *data_ref; 2851 2907 struct btrfs_delayed_ref_root *delayed_refs; 2852 - struct rb_node *node; 2853 2908 int ret = 0; 2854 2909 2855 2910 delayed_refs = &trans->transaction->delayed_refs; ··· 2877 2934 spin_unlock(&delayed_refs->lock); 2878 2935 2879 2936 spin_lock(&head->lock); 2880 - node = rb_first(&head->ref_root); 2881 - while (node) { 2882 - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2883 - node = rb_next(node); 2884 - 2937 + list_for_each_entry(ref, &head->ref_list, list) { 2885 2938 /* If it's a shared ref we know a cross reference exists */ 2886 2939 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2887 2940 ret = 1; ··· 3632 3693 found->disk_total += total_bytes * factor; 3633 3694 found->bytes_used += bytes_used; 3634 3695 found->disk_used += bytes_used * factor; 3635 - found->full = 0; 3696 + if (total_bytes > 0) 3697 + found->full = 0; 3636 3698 spin_unlock(&found->lock); 3637 3699 *space_info = found; 3638 3700 return 0; ··· 3661 3721 found->bytes_reserved = 0; 3662 3722 found->bytes_readonly = 0; 3663 3723 found->bytes_may_use = 0; 3664 - found->full = 0; 3724 + if (total_bytes > 0) 3725 + found->full = 0; 3726 + else 3727 + found->full = 1; 3665 3728 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3666 3729 found->chunk_alloc = 0; 3667 3730 found->flush = 0; ··· 3918 3975 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3919 3976 need_commit--; 3920 3977 3978 + if (need_commit > 0) 3979 + btrfs_wait_ordered_roots(fs_info, -1); 3980 + 3921 3981 trans = btrfs_join_transaction(root); 3922 3982 if (IS_ERR(trans)) 3923 3983 return PTR_ERR(trans); ··· 4034 4088 return 1; 4035 4089 } 4036 4090 4037 - static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 4091 + static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) 4038 4092 { 4039 4093 u64 num_dev; 4040 4094 ··· 4048 4102 else 4049 4103 num_dev = 1; /* DUP or single */ 4050 4104 4051 - /* metadata for updaing devices and chunk tree */ 4052 - return btrfs_calc_trans_metadata_size(root, num_dev + 1); 4105 + return num_dev; 4053 4106 } 4054 4107 4055 - static void check_system_chunk(struct btrfs_trans_handle *trans, 4056 - struct btrfs_root *root, u64 type) 4108 + /* 4109 + * If @is_allocation is true, reserve space in the system space info necessary 4110 + * for allocating a chunk, otherwise if it's false, reserve space necessary for 4111 + * removing a chunk. 4112 + */ 4113 + void check_system_chunk(struct btrfs_trans_handle *trans, 4114 + struct btrfs_root *root, 4115 + u64 type) 4057 4116 { 4058 4117 struct btrfs_space_info *info; 4059 4118 u64 left; 4060 4119 u64 thresh; 4120 + int ret = 0; 4121 + u64 num_devs; 4122 + 4123 + /* 4124 + * Needed because we can end up allocating a system chunk and for an 4125 + * atomic and race free space reservation in the chunk block reserve. 4126 + */ 4127 + ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); 4061 4128 4062 4129 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4063 4130 spin_lock(&info->lock); 4064 4131 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 4065 - info->bytes_reserved - info->bytes_readonly; 4132 + info->bytes_reserved - info->bytes_readonly - 4133 + info->bytes_may_use; 4066 4134 spin_unlock(&info->lock); 4067 4135 4068 - thresh = get_system_chunk_thresh(root, type); 4136 + num_devs = get_profile_num_devs(root, type); 4137 + 4138 + /* num_devs device items to update and 1 chunk item to add or remove */ 4139 + thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + 4140 + btrfs_calc_trans_metadata_size(root, 1); 4141 + 4069 4142 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 4070 4143 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 4071 4144 left, thresh, type); ··· 4095 4130 u64 flags; 4096 4131 4097 4132 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 4098 - btrfs_alloc_chunk(trans, root, flags); 4133 + /* 4134 + * Ignore failure to create system chunk. We might end up not 4135 + * needing it, as we might not need to COW all nodes/leafs from 4136 + * the paths we visit in the chunk tree (they were already COWed 4137 + * or created in the current transaction for example). 4138 + */ 4139 + ret = btrfs_alloc_chunk(trans, root, flags); 4140 + } 4141 + 4142 + if (!ret) { 4143 + ret = btrfs_block_rsv_add(root->fs_info->chunk_root, 4144 + &root->fs_info->chunk_block_rsv, 4145 + thresh, BTRFS_RESERVE_NO_FLUSH); 4146 + if (!ret) 4147 + trans->chunk_bytes_reserved += thresh; 4099 4148 } 4100 4149 } 4101 4150 ··· 5167 5188 trans->bytes_reserved = 0; 5168 5189 } 5169 5190 5191 + /* 5192 + * To be called after all the new block groups attached to the transaction 5193 + * handle have been created (btrfs_create_pending_block_groups()). 5194 + */ 5195 + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5196 + { 5197 + struct btrfs_fs_info *fs_info = trans->root->fs_info; 5198 + 5199 + if (!trans->chunk_bytes_reserved) 5200 + return; 5201 + 5202 + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5203 + 5204 + block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5205 + trans->chunk_bytes_reserved); 5206 + trans->chunk_bytes_reserved = 0; 5207 + } 5208 + 5170 5209 /* Can only return 0 or -ENOSPC */ 5171 5210 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 5172 5211 struct inode *inode) ··· 6089 6092 6090 6093 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6091 6094 struct btrfs_root *root, 6092 - u64 bytenr, u64 num_bytes, u64 parent, 6095 + struct btrfs_delayed_ref_node *node, u64 parent, 6093 6096 u64 root_objectid, u64 owner_objectid, 6094 6097 u64 owner_offset, int refs_to_drop, 6095 - struct btrfs_delayed_extent_op *extent_op, 6096 - int no_quota) 6098 + struct btrfs_delayed_extent_op *extent_op) 6097 6099 { 6098 6100 struct btrfs_key key; 6099 6101 struct btrfs_path *path; ··· 6106 6110 int extent_slot = 0; 6107 6111 int found_extent = 0; 6108 6112 int num_to_del = 1; 6113 + int no_quota = node->no_quota; 6109 6114 u32 item_size; 6110 6115 u64 refs; 6116 + u64 bytenr = node->bytenr; 6117 + u64 num_bytes = node->num_bytes; 6111 6118 int last_ref = 0; 6112 - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; 6113 6119 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6114 6120 SKINNY_METADATA); 6115 6121 ··· 6292 6294 refs -= refs_to_drop; 6293 6295 6294 6296 if (refs > 0) { 6295 - type = BTRFS_QGROUP_OPER_SUB_SHARED; 6296 6297 if (extent_op) 6297 6298 __run_delayed_extent_op(extent_op, leaf, ei); 6298 6299 /* ··· 6353 6356 } 6354 6357 btrfs_release_path(path); 6355 6358 6356 - /* Deal with the quota accounting */ 6357 - if (!ret && last_ref && !no_quota) { 6358 - int mod_seq = 0; 6359 - 6360 - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && 6361 - type == BTRFS_QGROUP_OPER_SUB_SHARED) 6362 - mod_seq = 1; 6363 - 6364 - ret = btrfs_qgroup_record_ref(trans, info, root_objectid, 6365 - bytenr, num_bytes, type, 6366 - mod_seq); 6367 - } 6368 6359 out: 6369 6360 btrfs_free_path(path); 6370 6361 return ret; ··· 6378 6393 goto out_delayed_unlock; 6379 6394 6380 6395 spin_lock(&head->lock); 6381 - if (rb_first(&head->ref_root)) 6396 + if (!list_empty(&head->ref_list)) 6382 6397 goto out; 6383 6398 6384 6399 if (head->extent_op) { ··· 7288 7303 btrfs_mark_buffer_dirty(path->nodes[0]); 7289 7304 btrfs_free_path(path); 7290 7305 7291 - /* Always set parent to 0 here since its exclusive anyway. */ 7292 - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7293 - ins->objectid, ins->offset, 7294 - BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7295 - if (ret) 7296 - return ret; 7297 - 7298 7306 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 7299 7307 if (ret) { /* -ENOENT, logic error */ 7300 7308 btrfs_err(fs_info, "update block group failed for %llu %llu", ··· 7368 7390 7369 7391 btrfs_mark_buffer_dirty(leaf); 7370 7392 btrfs_free_path(path); 7371 - 7372 - if (!no_quota) { 7373 - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7374 - ins->objectid, num_bytes, 7375 - BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7376 - if (ret) 7377 - return ret; 7378 - } 7379 7393 7380 7394 ret = update_block_group(trans, root, ins->objectid, root->nodesize, 7381 7395 1); ··· 7725 7755 wc->reada_slot = slot; 7726 7756 } 7727 7757 7758 + /* 7759 + * TODO: Modify related function to add related node/leaf to dirty_extent_root, 7760 + * for later qgroup accounting. 7761 + * 7762 + * Current, this function does nothing. 7763 + */ 7728 7764 static int account_leaf_items(struct btrfs_trans_handle *trans, 7729 7765 struct btrfs_root *root, 7730 7766 struct extent_buffer *eb) 7731 7767 { 7732 7768 int nr = btrfs_header_nritems(eb); 7733 - int i, extent_type, ret; 7769 + int i, extent_type; 7734 7770 struct btrfs_key key; 7735 7771 struct btrfs_file_extent_item *fi; 7736 7772 u64 bytenr, num_bytes; ··· 7759 7783 continue; 7760 7784 7761 7785 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 7762 - 7763 - ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7764 - root->objectid, 7765 - bytenr, num_bytes, 7766 - BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); 7767 - if (ret) 7768 - return ret; 7769 7786 } 7770 7787 return 0; 7771 7788 } ··· 7827 7858 7828 7859 /* 7829 7860 * root_eb is the subtree root and is locked before this function is called. 7861 + * TODO: Modify this function to mark all (including complete shared node) 7862 + * to dirty_extent_root to allow it get accounted in qgroup. 7830 7863 */ 7831 7864 static int account_shared_subtree(struct btrfs_trans_handle *trans, 7832 7865 struct btrfs_root *root, ··· 7891 7920 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 7892 7921 7893 7922 eb = read_tree_block(root, child_bytenr, child_gen); 7894 - if (!eb || !extent_buffer_uptodate(eb)) { 7923 + if (IS_ERR(eb)) { 7924 + ret = PTR_ERR(eb); 7925 + goto out; 7926 + } else if (!extent_buffer_uptodate(eb)) { 7927 + free_extent_buffer(eb); 7895 7928 ret = -EIO; 7896 7929 goto out; 7897 7930 } ··· 7906 7931 btrfs_tree_read_lock(eb); 7907 7932 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 7908 7933 path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 7909 - 7910 - ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7911 - root->objectid, 7912 - child_bytenr, 7913 - root->nodesize, 7914 - BTRFS_QGROUP_OPER_SUB_SUBTREE, 7915 - 0); 7916 - if (ret) 7917 - goto out; 7918 - 7919 7934 } 7920 7935 7921 7936 if (level == 0) { ··· 8116 8151 if (reada && level == 1) 8117 8152 reada_walk_down(trans, root, wc, path); 8118 8153 next = read_tree_block(root, bytenr, generation); 8119 - if (!next || !extent_buffer_uptodate(next)) { 8154 + if (IS_ERR(next)) { 8155 + return PTR_ERR(next); 8156 + } else if (!extent_buffer_uptodate(next)) { 8120 8157 free_extent_buffer(next); 8121 8158 return -EIO; 8122 8159 } ··· 8500 8533 goto out_end_trans; 8501 8534 } 8502 8535 8503 - /* 8504 - * Qgroup update accounting is run from 8505 - * delayed ref handling. This usually works 8506 - * out because delayed refs are normally the 8507 - * only way qgroup updates are added. However, 8508 - * we may have added updates during our tree 8509 - * walk so run qgroups here to make sure we 8510 - * don't lose any updates. 8511 - */ 8512 - ret = btrfs_delayed_qgroup_accounting(trans, 8513 - root->fs_info); 8514 - if (ret) 8515 - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8516 - "running qgroup updates " 8517 - "during snapshot delete. " 8518 - "Quota is out of sync, " 8519 - "rescan required.\n", ret); 8520 - 8521 8536 btrfs_end_transaction_throttle(trans, tree_root); 8522 8537 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 8523 8538 pr_debug("BTRFS: drop snapshot early exit\n"); ··· 8553 8604 } 8554 8605 root_dropped = true; 8555 8606 out_end_trans: 8556 - ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); 8557 - if (ret) 8558 - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8559 - "running qgroup updates " 8560 - "during snapshot delete. " 8561 - "Quota is out of sync, " 8562 - "rescan required.\n", ret); 8563 - 8564 8607 btrfs_end_transaction_throttle(trans, tree_root); 8565 8608 out_free: 8566 8609 kfree(wc); ··· 9503 9562 9504 9563 free_excluded_extents(root, cache); 9505 9564 9565 + /* 9566 + * Call to ensure the corresponding space_info object is created and 9567 + * assigned to our block group, but don't update its counters just yet. 9568 + * We want our bg to be added to the rbtree with its ->space_info set. 9569 + */ 9570 + ret = update_space_info(root->fs_info, cache->flags, 0, 0, 9571 + &cache->space_info); 9572 + if (ret) { 9573 + btrfs_remove_free_space_cache(cache); 9574 + btrfs_put_block_group(cache); 9575 + return ret; 9576 + } 9577 + 9506 9578 ret = btrfs_add_block_group_cache(root->fs_info, cache); 9507 9579 if (ret) { 9508 9580 btrfs_remove_free_space_cache(cache); ··· 9523 9569 return ret; 9524 9570 } 9525 9571 9572 + /* 9573 + * Now that our block group has its ->space_info set and is inserted in 9574 + * the rbtree, update the space info's counters. 9575 + */ 9526 9576 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 9527 9577 &cache->space_info); 9528 9578 if (ret) {

fs/btrfs/extent-tree.h

+8 -1

fs/btrfs/extent_io.c

··· 1277 1277 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1278 1278 unsigned bits, gfp_t mask) 1279 1279 { 1280 - return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 1280 + int wake = 0; 1281 + 1282 + if (bits & EXTENT_LOCKED) 1283 + wake = 1; 1284 + 1285 + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); 1281 1286 } 1282 1287 1283 1288 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, ··· 4495 4490 } 4496 4491 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4497 4492 flags |= FIEMAP_EXTENT_ENCODED; 4493 + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4494 + flags |= FIEMAP_EXTENT_UNWRITTEN; 4498 4495 4499 4496 free_extent_map(em); 4500 4497 em = NULL;

+6 -3

fs/btrfs/file.c

··· 1868 1868 struct btrfs_log_ctx ctx; 1869 1869 int ret = 0; 1870 1870 bool full_sync = 0; 1871 + const u64 len = end - start + 1; 1871 1872 1872 1873 trace_btrfs_sync_file(file, datasync); 1873 1874 ··· 1897 1896 * all extents are persisted and the respective file extent 1898 1897 * items are in the fs/subvol btree. 1899 1898 */ 1900 - ret = btrfs_wait_ordered_range(inode, start, end - start + 1); 1899 + ret = btrfs_wait_ordered_range(inode, start, len); 1901 1900 } else { 1902 1901 /* 1903 1902 * Start any new ordered operations before starting to log the ··· 1969 1968 */ 1970 1969 smp_mb(); 1971 1970 if (btrfs_inode_in_log(inode, root->fs_info->generation) || 1972 - (full_sync && BTRFS_I(inode)->last_trans <= 1973 - root->fs_info->last_trans_committed)) { 1971 + (BTRFS_I(inode)->last_trans <= 1972 + root->fs_info->last_trans_committed && 1973 + (full_sync || 1974 + !btrfs_have_ordered_extents_in_range(inode, start, len)))) { 1974 1975 /* 1975 1976 * We'v had everything committed since the last time we were 1976 1977 * modified so clear this flag in case it was set for whatever

+6 -8

fs/btrfs/free-space-cache.c

··· 231 231 { 232 232 int ret = 0; 233 233 struct btrfs_path *path = btrfs_alloc_path(); 234 + bool locked = false; 234 235 235 236 if (!path) { 236 237 ret = -ENOMEM; ··· 239 238 } 240 239 241 240 if (block_group) { 241 + locked = true; 242 242 mutex_lock(&trans->transaction->cache_write_mutex); 243 243 if (!list_empty(&block_group->io_list)) { 244 244 list_del_init(&block_group->io_list); ··· 271 269 */ 272 270 ret = btrfs_truncate_inode_items(trans, root, inode, 273 271 0, BTRFS_EXTENT_DATA_KEY); 274 - if (ret) { 275 - mutex_unlock(&trans->transaction->cache_write_mutex); 276 - btrfs_abort_transaction(trans, root, ret); 277 - return ret; 278 - } 272 + if (ret) 273 + goto fail; 279 274 280 275 ret = btrfs_update_inode(trans, root, inode); 281 276 282 - if (block_group) 283 - mutex_unlock(&trans->transaction->cache_write_mutex); 284 - 285 277 fail: 278 + if (locked) 279 + mutex_unlock(&trans->transaction->cache_write_mutex); 286 280 if (ret) 287 281 btrfs_abort_transaction(trans, root, ret); 288 282

+21 -5

fs/btrfs/inode.c

··· 4986 4986 } 4987 4987 write_unlock(&map_tree->lock); 4988 4988 4989 + /* 4990 + * Keep looping until we have no more ranges in the io tree. 4991 + * We can have ongoing bios started by readpages (called from readahead) 4992 + * that didn't get their end io callbacks called yet or they are still 4993 + * in progress ((extent_io.c:end_bio_extent_readpage()). This means some 4994 + * ranges can still be locked and eviction started because before 4995 + * submitting those bios, which are executed by a separate task (work 4996 + * queue kthread), inode references (inode->i_count) were not taken 4997 + * (which would be dropped in the end io callback of each bio). 4998 + * Therefore here we effectively end up waiting for those bios and 4999 + * anyone else holding locked ranges without having bumped the inode's 5000 + * reference count - if we don't do it, when they access the inode's 5001 + * io_tree to unlock a range it may be too late, leading to an 5002 + * use-after-free issue. 5003 + */ 4989 5004 spin_lock(&io_tree->lock); 4990 5005 while (!RB_EMPTY_ROOT(&io_tree->state)) { 4991 5006 struct extent_state *state; 4992 5007 struct extent_state *cached_state = NULL; 5008 + u64 start; 5009 + u64 end; 4993 5010 4994 5011 node = rb_first(&io_tree->state); 4995 5012 state = rb_entry(node, struct extent_state, rb_node); 4996 - atomic_inc(&state->refs); 5013 + start = state->start; 5014 + end = state->end; 4997 5015 spin_unlock(&io_tree->lock); 4998 5016 4999 - lock_extent_bits(io_tree, state->start, state->end, 5000 - 0, &cached_state); 5001 - clear_extent_bit(io_tree, state->start, state->end, 5017 + lock_extent_bits(io_tree, start, end, 0, &cached_state); 5018 + clear_extent_bit(io_tree, start, end, 5002 5019 EXTENT_LOCKED | EXTENT_DIRTY | 5003 5020 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 5004 5021 EXTENT_DEFRAG, 1, 1, 5005 5022 &cached_state, GFP_NOFS); 5006 - free_extent_state(state); 5007 5023 5008 5024 cond_resched(); 5009 5025 spin_lock(&io_tree->lock);

+34 -16

fs/btrfs/ioctl.c

··· 553 553 key.offset = (u64)-1; 554 554 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); 555 555 if (IS_ERR(new_root)) { 556 - btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); 557 556 ret = PTR_ERR(new_root); 557 + btrfs_abort_transaction(trans, root, ret); 558 558 goto fail; 559 559 } 560 560 ··· 1318 1318 i = range->start >> PAGE_CACHE_SHIFT; 1319 1319 } 1320 1320 if (!max_to_defrag) 1321 - max_to_defrag = last_index + 1; 1321 + max_to_defrag = last_index - i + 1; 1322 1322 1323 1323 /* 1324 1324 * make writeback starts from i, so the defrag range can be ··· 1368 1368 ra_index = max(i, ra_index); 1369 1369 btrfs_force_ra(inode->i_mapping, ra, file, ra_index, 1370 1370 cluster); 1371 - ra_index += max_cluster; 1371 + ra_index += cluster; 1372 1372 } 1373 1373 1374 1374 mutex_lock(&inode->i_mutex); ··· 2271 2271 { 2272 2272 struct btrfs_ioctl_ino_lookup_args *args; 2273 2273 struct inode *inode; 2274 - int ret; 2275 - 2276 - if (!capable(CAP_SYS_ADMIN)) 2277 - return -EPERM; 2274 + int ret = 0; 2278 2275 2279 2276 args = memdup_user(argp, sizeof(*args)); 2280 2277 if (IS_ERR(args)) ··· 2279 2282 2280 2283 inode = file_inode(file); 2281 2284 2285 + /* 2286 + * Unprivileged query to obtain the containing subvolume root id. The 2287 + * path is reset so it's consistent with btrfs_search_path_in_tree. 2288 + */ 2282 2289 if (args->treeid == 0) 2283 2290 args->treeid = BTRFS_I(inode)->root->root_key.objectid; 2291 + 2292 + if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { 2293 + args->name[0] = 0; 2294 + goto out; 2295 + } 2296 + 2297 + if (!capable(CAP_SYS_ADMIN)) { 2298 + ret = -EPERM; 2299 + goto out; 2300 + } 2284 2301 2285 2302 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, 2286 2303 args->treeid, args->objectid, 2287 2304 args->name); 2288 2305 2306 + out: 2289 2307 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 2290 2308 ret = -EFAULT; 2291 2309 ··· 2425 2413 goto out_unlock_inode; 2426 2414 } 2427 2415 2428 - d_invalidate(dentry); 2429 - 2430 2416 down_write(&root->fs_info->subvol_sem); 2431 2417 2432 2418 err = may_destroy_subvol(dest); ··· 2518 2508 out_unlock_inode: 2519 2509 mutex_unlock(&inode->i_mutex); 2520 2510 if (!err) { 2521 - shrink_dcache_sb(root->fs_info->sb); 2511 + d_invalidate(dentry); 2522 2512 btrfs_invalidate_inodes(dest); 2523 2513 d_delete(dentry); 2524 2514 ASSERT(dest->send_in_progress == 0); ··· 2889 2879 return ret; 2890 2880 } 2891 2881 2892 - static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) 2882 + static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, 2883 + u64 olen) 2893 2884 { 2885 + u64 len = *plen; 2894 2886 u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; 2895 2887 2896 - if (off + len > inode->i_size || off + len < off) 2888 + if (off + olen > inode->i_size || off + olen < off) 2897 2889 return -EINVAL; 2890 + 2891 + /* if we extend to eof, continue to block boundary */ 2892 + if (off + len == inode->i_size) 2893 + *plen = len = ALIGN(inode->i_size, bs) - off; 2894 + 2898 2895 /* Check that we are block aligned - btrfs_clone() requires this */ 2899 2896 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) 2900 2897 return -EINVAL; ··· 2909 2892 return 0; 2910 2893 } 2911 2894 2912 - static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, 2895 + static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, 2913 2896 struct inode *dst, u64 dst_loff) 2914 2897 { 2915 2898 int ret; 2899 + u64 len = olen; 2916 2900 2917 2901 /* 2918 2902 * btrfs_clone() can't handle extents in the same file ··· 2928 2910 2929 2911 btrfs_double_lock(src, loff, dst, dst_loff, len); 2930 2912 2931 - ret = extent_same_check_offsets(src, loff, len); 2913 + ret = extent_same_check_offsets(src, loff, &len, olen); 2932 2914 if (ret) 2933 2915 goto out_unlock; 2934 2916 2935 - ret = extent_same_check_offsets(dst, dst_loff, len); 2917 + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); 2936 2918 if (ret) 2937 2919 goto out_unlock; 2938 2920 ··· 2945 2927 2946 2928 ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); 2947 2929 if (ret == 0) 2948 - ret = btrfs_clone(src, dst, loff, len, len, dst_loff); 2930 + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff); 2949 2931 2950 2932 out_unlock: 2951 2933 btrfs_double_unlock(src, loff, dst, dst_loff, len);

+29 -8

fs/btrfs/ordered-data.c

··· 198 198 entry->file_offset = file_offset; 199 199 entry->start = start; 200 200 entry->len = len; 201 - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && 202 - !(type == BTRFS_ORDERED_NOCOW)) 203 - entry->csum_bytes_left = disk_len; 204 201 entry->disk_len = disk_len; 205 202 entry->bytes_left = len; 206 203 entry->inode = igrab(inode); ··· 283 286 tree = &BTRFS_I(inode)->ordered_tree; 284 287 spin_lock_irq(&tree->lock); 285 288 list_add_tail(&sum->list, &entry->list); 286 - WARN_ON(entry->csum_bytes_left < sum->len); 287 - entry->csum_bytes_left -= sum->len; 288 - if (entry->csum_bytes_left == 0) 289 - wake_up(&entry->wait); 290 289 spin_unlock_irq(&tree->lock); 291 290 } 292 291 ··· 502 509 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, 503 510 &ordered->flags)); 504 511 505 - list_add_tail(&ordered->trans_list, &trans->ordered); 512 + /* 513 + * If our ordered extent completed it means it updated the 514 + * fs/subvol and csum trees already, so no need to make the 515 + * current transaction's commit wait for it, as we end up 516 + * holding memory unnecessarily and delaying the inode's iput 517 + * until the transaction commit (we schedule an iput for the 518 + * inode when the ordered extent's refcount drops to 0), which 519 + * prevents it from being evictable until the transaction 520 + * commits. 521 + */ 522 + if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) 523 + btrfs_put_ordered_extent(ordered); 524 + else 525 + list_add_tail(&ordered->trans_list, &trans->ordered); 526 + 506 527 spin_lock_irq(&log->log_extents_lock[index]); 507 528 } 508 529 spin_unlock_irq(&log->log_extents_lock[index]); ··· 849 842 atomic_inc(&entry->refs); 850 843 spin_unlock_irq(&tree->lock); 851 844 return entry; 845 + } 846 + 847 + bool btrfs_have_ordered_extents_in_range(struct inode *inode, 848 + u64 file_offset, 849 + u64 len) 850 + { 851 + struct btrfs_ordered_extent *oe; 852 + 853 + oe = btrfs_lookup_ordered_range(inode, file_offset, len); 854 + if (oe) { 855 + btrfs_put_ordered_extent(oe); 856 + return true; 857 + } 858 + return false; 852 859 } 853 860 854 861 /*

+3 -3

fs/btrfs/ordered-data.h

··· 89 89 /* number of bytes that still need writing */ 90 90 u64 bytes_left; 91 91 92 - /* number of bytes that still need csumming */ 93 - u64 csum_bytes_left; 94 - 95 92 /* 96 93 * the end of the ordered extent which is behind it but 97 94 * didn't update disk_i_size. Please see the comment of ··· 188 191 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, 189 192 u64 file_offset, 190 193 u64 len); 194 + bool btrfs_have_ordered_extents_in_range(struct inode *inode, 195 + u64 file_offset, 196 + u64 len); 191 197 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 192 198 struct btrfs_ordered_extent *ordered); 193 199 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,

+286 -802

fs/btrfs/qgroup.c

··· 34 34 #include "extent_io.h" 35 35 #include "qgroup.h" 36 36 37 + 37 38 /* TODO XXX FIXME 38 39 * - subvol delete -> delete when ref goes to 0? delete limits also? 39 40 * - reorganize keys ··· 85 84 86 85 /* 87 86 * temp variables for accounting operations 87 + * Refer to qgroup_shared_accouting() for details. 88 88 */ 89 89 u64 old_refcnt; 90 90 u64 new_refcnt; 91 91 }; 92 + 93 + static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, 94 + int mod) 95 + { 96 + if (qg->old_refcnt < seq) 97 + qg->old_refcnt = seq; 98 + qg->old_refcnt += mod; 99 + } 100 + 101 + static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, 102 + int mod) 103 + { 104 + if (qg->new_refcnt < seq) 105 + qg->new_refcnt = seq; 106 + qg->new_refcnt += mod; 107 + } 108 + 109 + static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) 110 + { 111 + if (qg->old_refcnt < seq) 112 + return 0; 113 + return qg->old_refcnt - seq; 114 + } 115 + 116 + static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) 117 + { 118 + if (qg->new_refcnt < seq) 119 + return 0; 120 + return qg->new_refcnt - seq; 121 + } 92 122 93 123 /* 94 124 * glue structure to represent the relations between qgroups. ··· 1147 1115 struct ulist *tmp; 1148 1116 int ret = 0; 1149 1117 1150 - tmp = ulist_alloc(GFP_NOFS); 1151 - if (!tmp) 1152 - return -ENOMEM; 1153 - 1154 1118 /* Check the level of src and dst first */ 1155 1119 if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) 1156 1120 return -EINVAL; 1121 + 1122 + tmp = ulist_alloc(GFP_NOFS); 1123 + if (!tmp) 1124 + return -ENOMEM; 1157 1125 1158 1126 mutex_lock(&fs_info->qgroup_ioctl_lock); 1159 1127 quota_root = fs_info->quota_root; ··· 1388 1356 return ret; 1389 1357 } 1390 1358 1391 - static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, 1392 - struct btrfs_qgroup_operation *oper2) 1359 + int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, 1360 + struct btrfs_fs_info *fs_info) 1393 1361 { 1394 - /* 1395 - * Ignore seq and type here, we're looking for any operation 1396 - * at all related to this extent on that root. 1397 - */ 1398 - if (oper1->bytenr < oper2->bytenr) 1399 - return -1; 1400 - if (oper1->bytenr > oper2->bytenr) 1401 - return 1; 1402 - if (oper1->ref_root < oper2->ref_root) 1403 - return -1; 1404 - if (oper1->ref_root > oper2->ref_root) 1405 - return 1; 1406 - return 0; 1407 - } 1408 - 1409 - static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, 1410 - struct btrfs_qgroup_operation *oper) 1411 - { 1412 - struct rb_node *n; 1413 - struct btrfs_qgroup_operation *cur; 1414 - int cmp; 1415 - 1416 - spin_lock(&fs_info->qgroup_op_lock); 1417 - n = fs_info->qgroup_op_tree.rb_node; 1418 - while (n) { 1419 - cur = rb_entry(n, struct btrfs_qgroup_operation, n); 1420 - cmp = comp_oper_exist(cur, oper); 1421 - if (cmp < 0) { 1422 - n = n->rb_right; 1423 - } else if (cmp) { 1424 - n = n->rb_left; 1425 - } else { 1426 - spin_unlock(&fs_info->qgroup_op_lock); 1427 - return -EEXIST; 1428 - } 1429 - } 1430 - spin_unlock(&fs_info->qgroup_op_lock); 1431 - return 0; 1432 - } 1433 - 1434 - static int comp_oper(struct btrfs_qgroup_operation *oper1, 1435 - struct btrfs_qgroup_operation *oper2) 1436 - { 1437 - if (oper1->bytenr < oper2->bytenr) 1438 - return -1; 1439 - if (oper1->bytenr > oper2->bytenr) 1440 - return 1; 1441 - if (oper1->ref_root < oper2->ref_root) 1442 - return -1; 1443 - if (oper1->ref_root > oper2->ref_root) 1444 - return 1; 1445 - if (oper1->seq < oper2->seq) 1446 - return -1; 1447 - if (oper1->seq > oper2->seq) 1448 - return 1; 1449 - if (oper1->type < oper2->type) 1450 - return -1; 1451 - if (oper1->type > oper2->type) 1452 - return 1; 1453 - return 0; 1454 - } 1455 - 1456 - static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, 1457 - struct btrfs_qgroup_operation *oper) 1458 - { 1459 - struct rb_node **p; 1460 - struct rb_node *parent = NULL; 1461 - struct btrfs_qgroup_operation *cur; 1462 - int cmp; 1463 - 1464 - spin_lock(&fs_info->qgroup_op_lock); 1465 - p = &fs_info->qgroup_op_tree.rb_node; 1466 - while (*p) { 1467 - parent = *p; 1468 - cur = rb_entry(parent, struct btrfs_qgroup_operation, n); 1469 - cmp = comp_oper(cur, oper); 1470 - if (cmp < 0) { 1471 - p = &(*p)->rb_right; 1472 - } else if (cmp) { 1473 - p = &(*p)->rb_left; 1474 - } else { 1475 - spin_unlock(&fs_info->qgroup_op_lock); 1476 - return -EEXIST; 1477 - } 1478 - } 1479 - rb_link_node(&oper->n, parent, p); 1480 - rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); 1481 - spin_unlock(&fs_info->qgroup_op_lock); 1482 - return 0; 1483 - } 1484 - 1485 - /* 1486 - * Record a quota operation for processing later on. 1487 - * @trans: the transaction we are adding the delayed op to. 1488 - * @fs_info: the fs_info for this fs. 1489 - * @ref_root: the root of the reference we are acting on, 1490 - * @bytenr: the bytenr we are acting on. 1491 - * @num_bytes: the number of bytes in the reference. 1492 - * @type: the type of operation this is. 1493 - * @mod_seq: do we need to get a sequence number for looking up roots. 1494 - * 1495 - * We just add it to our trans qgroup_ref_list and carry on and process these 1496 - * operations in order at some later point. If the reference root isn't a fs 1497 - * root then we don't bother with doing anything. 1498 - * 1499 - * MUST BE HOLDING THE REF LOCK. 1500 - */ 1501 - int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, 1502 - struct btrfs_fs_info *fs_info, u64 ref_root, 1503 - u64 bytenr, u64 num_bytes, 1504 - enum btrfs_qgroup_operation_type type, int mod_seq) 1505 - { 1506 - struct btrfs_qgroup_operation *oper; 1507 - int ret; 1508 - 1509 - if (!is_fstree(ref_root) || !fs_info->quota_enabled) 1510 - return 0; 1511 - 1512 - oper = kmalloc(sizeof(*oper), GFP_NOFS); 1513 - if (!oper) 1514 - return -ENOMEM; 1515 - 1516 - oper->ref_root = ref_root; 1517 - oper->bytenr = bytenr; 1518 - oper->num_bytes = num_bytes; 1519 - oper->type = type; 1520 - oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); 1521 - INIT_LIST_HEAD(&oper->elem.list); 1522 - oper->elem.seq = 0; 1523 - 1524 - trace_btrfs_qgroup_record_ref(oper); 1525 - 1526 - if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { 1527 - /* 1528 - * If any operation for this bytenr/ref_root combo 1529 - * exists, then we know it's not exclusively owned and 1530 - * shouldn't be queued up. 1531 - * 1532 - * This also catches the case where we have a cloned 1533 - * extent that gets queued up multiple times during 1534 - * drop snapshot. 1535 - */ 1536 - if (qgroup_oper_exists(fs_info, oper)) { 1537 - kfree(oper); 1538 - return 0; 1539 - } 1540 - } 1541 - 1542 - ret = insert_qgroup_oper(fs_info, oper); 1543 - if (ret) { 1544 - /* Shouldn't happen so have an assert for developers */ 1545 - ASSERT(0); 1546 - kfree(oper); 1547 - return ret; 1548 - } 1549 - list_add_tail(&oper->list, &trans->qgroup_ref_list); 1550 - 1551 - if (mod_seq) 1552 - btrfs_get_tree_mod_seq(fs_info, &oper->elem); 1553 - 1554 - return 0; 1555 - } 1556 - 1557 - static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, 1558 - struct btrfs_qgroup_operation *oper) 1559 - { 1560 - struct ulist *tmp; 1561 - int sign = 0; 1362 + struct btrfs_qgroup_extent_record *record; 1363 + struct btrfs_delayed_ref_root *delayed_refs; 1364 + struct rb_node *node; 1365 + u64 qgroup_to_skip; 1562 1366 int ret = 0; 1563 1367 1564 - tmp = ulist_alloc(GFP_NOFS); 1565 - if (!tmp) 1566 - return -ENOMEM; 1368 + delayed_refs = &trans->transaction->delayed_refs; 1369 + qgroup_to_skip = delayed_refs->qgroup_to_skip; 1567 1370 1568 - spin_lock(&fs_info->qgroup_lock); 1569 - if (!fs_info->quota_root) 1570 - goto out; 1571 - 1572 - switch (oper->type) { 1573 - case BTRFS_QGROUP_OPER_ADD_EXCL: 1574 - sign = 1; 1575 - break; 1576 - case BTRFS_QGROUP_OPER_SUB_EXCL: 1577 - sign = -1; 1578 - break; 1579 - default: 1580 - ASSERT(0); 1371 + /* 1372 + * No need to do lock, since this function will only be called in 1373 + * btrfs_commmit_transaction(). 1374 + */ 1375 + node = rb_first(&delayed_refs->dirty_extent_root); 1376 + while (node) { 1377 + record = rb_entry(node, struct btrfs_qgroup_extent_record, 1378 + node); 1379 + ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, 1380 + &record->old_roots); 1381 + if (ret < 0) 1382 + break; 1383 + if (qgroup_to_skip) 1384 + ulist_del(record->old_roots, qgroup_to_skip, 0); 1385 + node = rb_next(node); 1581 1386 } 1582 - ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root, 1583 - oper->num_bytes, sign); 1584 - out: 1585 - spin_unlock(&fs_info->qgroup_lock); 1586 - ulist_free(tmp); 1587 1387 return ret; 1588 1388 } 1589 1389 1390 + struct btrfs_qgroup_extent_record 1391 + *btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, 1392 + struct btrfs_qgroup_extent_record *record) 1393 + { 1394 + struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; 1395 + struct rb_node *parent_node = NULL; 1396 + struct btrfs_qgroup_extent_record *entry; 1397 + u64 bytenr = record->bytenr; 1398 + 1399 + while (*p) { 1400 + parent_node = *p; 1401 + entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, 1402 + node); 1403 + if (bytenr < entry->bytenr) 1404 + p = &(*p)->rb_left; 1405 + else if (bytenr > entry->bytenr) 1406 + p = &(*p)->rb_right; 1407 + else 1408 + return entry; 1409 + } 1410 + 1411 + rb_link_node(&record->node, parent_node, p); 1412 + rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); 1413 + return NULL; 1414 + } 1415 + 1416 + #define UPDATE_NEW 0 1417 + #define UPDATE_OLD 1 1590 1418 /* 1591 - * Walk all of the roots that pointed to our bytenr and adjust their refcnts as 1592 - * properly. 1419 + * Walk all of the roots that points to the bytenr and adjust their refcnts. 1593 1420 */ 1594 - static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, 1595 - u64 root_to_skip, struct ulist *tmp, 1596 - struct ulist *roots, struct ulist *qgroups, 1597 - u64 seq, int *old_roots, int rescan) 1421 + static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 1422 + struct ulist *roots, struct ulist *tmp, 1423 + struct ulist *qgroups, u64 seq, int update_old) 1598 1424 { 1599 1425 struct ulist_node *unode; 1600 1426 struct ulist_iterator uiter; 1601 1427 struct ulist_node *tmp_unode; 1602 1428 struct ulist_iterator tmp_uiter; 1603 1429 struct btrfs_qgroup *qg; 1604 - int ret; 1430 + int ret = 0; 1605 1431 1432 + if (!roots) 1433 + return 0; 1606 1434 ULIST_ITER_INIT(&uiter); 1607 1435 while ((unode = ulist_next(roots, &uiter))) { 1608 - /* We don't count our current root here */ 1609 - if (unode->val == root_to_skip) 1610 - continue; 1611 1436 qg = find_qgroup_rb(fs_info, unode->val); 1612 1437 if (!qg) 1613 1438 continue; 1614 - /* 1615 - * We could have a pending removal of this same ref so we may 1616 - * not have actually found our ref root when doing 1617 - * btrfs_find_all_roots, so we need to keep track of how many 1618 - * old roots we find in case we removed ours and added a 1619 - * different one at the same time. I don't think this could 1620 - * happen in practice but that sort of thinking leads to pain 1621 - * and suffering and to the dark side. 1622 - */ 1623 - (*old_roots)++; 1624 1439 1625 1440 ulist_reinit(tmp); 1626 1441 ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), ··· 1482 1603 struct btrfs_qgroup_list *glist; 1483 1604 1484 1605 qg = u64_to_ptr(tmp_unode->aux); 1485 - /* 1486 - * We use this sequence number to keep from having to 1487 - * run the whole list and 0 out the refcnt every time. 1488 - * We basically use sequnce as the known 0 count and 1489 - * then add 1 everytime we see a qgroup. This is how we 1490 - * get how many of the roots actually point up to the 1491 - * upper level qgroups in order to determine exclusive 1492 - * counts. 1493 - * 1494 - * For rescan we want to set old_refcnt to seq so our 1495 - * exclusive calculations end up correct. 1496 - */ 1497 - if (rescan) 1498 - qg->old_refcnt = seq; 1499 - else if (qg->old_refcnt < seq) 1500 - qg->old_refcnt = seq + 1; 1606 + if (update_old) 1607 + btrfs_qgroup_update_old_refcnt(qg, seq, 1); 1501 1608 else 1502 - qg->old_refcnt++; 1503 - 1504 - if (qg->new_refcnt < seq) 1505 - qg->new_refcnt = seq + 1; 1506 - else 1507 - qg->new_refcnt++; 1609 + btrfs_qgroup_update_new_refcnt(qg, seq, 1); 1508 1610 list_for_each_entry(glist, &qg->groups, next_group) { 1509 1611 ret = ulist_add(qgroups, glist->group->qgroupid, 1510 1612 ptr_to_u64(glist->group), ··· 1504 1644 } 1505 1645 1506 1646 /* 1507 - * We need to walk forward in our operation tree and account for any roots that 1508 - * were deleted after we made this operation. 1647 + * Update qgroup rfer/excl counters. 1648 + * Rfer update is easy, codes can explain themselves. 1649 + * 1650 + * Excl update is tricky, the update is split into 2 part. 1651 + * Part 1: Possible exclusive <-> sharing detect: 1652 + * | A | !A | 1653 + * ------------------------------------- 1654 + * B | * | - | 1655 + * ------------------------------------- 1656 + * !B | + | ** | 1657 + * ------------------------------------- 1658 + * 1659 + * Conditions: 1660 + * A: cur_old_roots < nr_old_roots (not exclusive before) 1661 + * !A: cur_old_roots == nr_old_roots (possible exclusive before) 1662 + * B: cur_new_roots < nr_new_roots (not exclusive now) 1663 + * !B: cur_new_roots == nr_new_roots (possible exclsuive now) 1664 + * 1665 + * Results: 1666 + * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 1667 + * *: Definitely not changed. **: Possible unchanged. 1668 + * 1669 + * For !A and !B condition, the exception is cur_old/new_roots == 0 case. 1670 + * 1671 + * To make the logic clear, we first use condition A and B to split 1672 + * combination into 4 results. 1673 + * 1674 + * Then, for result "+" and "-", check old/new_roots == 0 case, as in them 1675 + * only on variant maybe 0. 1676 + * 1677 + * Lastly, check result **, since there are 2 variants maybe 0, split them 1678 + * again(2x2). 1679 + * But this time we don't need to consider other things, the codes and logic 1680 + * is easy to understand now. 1509 1681 */ 1510 - static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, 1511 - struct btrfs_qgroup_operation *oper, 1512 - struct ulist *tmp, 1513 - struct ulist *qgroups, u64 seq, 1514 - int *old_roots) 1515 - { 1516 - struct ulist_node *unode; 1517 - struct ulist_iterator uiter; 1518 - struct btrfs_qgroup *qg; 1519 - struct btrfs_qgroup_operation *tmp_oper; 1520 - struct rb_node *n; 1521 - int ret; 1522 - 1523 - ulist_reinit(tmp); 1524 - 1525 - /* 1526 - * We only walk forward in the tree since we're only interested in 1527 - * removals that happened _after_ our operation. 1528 - */ 1529 - spin_lock(&fs_info->qgroup_op_lock); 1530 - n = rb_next(&oper->n); 1531 - spin_unlock(&fs_info->qgroup_op_lock); 1532 - if (!n) 1533 - return 0; 1534 - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); 1535 - while (tmp_oper->bytenr == oper->bytenr) { 1536 - /* 1537 - * If it's not a removal we don't care, additions work out 1538 - * properly with our refcnt tracking. 1539 - */ 1540 - if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && 1541 - tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) 1542 - goto next; 1543 - qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); 1544 - if (!qg) 1545 - goto next; 1546 - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), 1547 - GFP_ATOMIC); 1548 - if (ret) { 1549 - if (ret < 0) 1550 - return ret; 1551 - /* 1552 - * We only want to increase old_roots if this qgroup is 1553 - * not already in the list of qgroups. If it is already 1554 - * there then that means it must have been re-added or 1555 - * the delete will be discarded because we had an 1556 - * existing ref that we haven't looked up yet. In this 1557 - * case we don't want to increase old_roots. So if ret 1558 - * == 1 then we know that this is the first time we've 1559 - * seen this qgroup and we can bump the old_roots. 1560 - */ 1561 - (*old_roots)++; 1562 - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), 1563 - GFP_ATOMIC); 1564 - if (ret < 0) 1565 - return ret; 1566 - } 1567 - next: 1568 - spin_lock(&fs_info->qgroup_op_lock); 1569 - n = rb_next(&tmp_oper->n); 1570 - spin_unlock(&fs_info->qgroup_op_lock); 1571 - if (!n) 1572 - break; 1573 - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); 1574 - } 1575 - 1576 - /* Ok now process the qgroups we found */ 1577 - ULIST_ITER_INIT(&uiter); 1578 - while ((unode = ulist_next(tmp, &uiter))) { 1579 - struct btrfs_qgroup_list *glist; 1580 - 1581 - qg = u64_to_ptr(unode->aux); 1582 - if (qg->old_refcnt < seq) 1583 - qg->old_refcnt = seq + 1; 1584 - else 1585 - qg->old_refcnt++; 1586 - if (qg->new_refcnt < seq) 1587 - qg->new_refcnt = seq + 1; 1588 - else 1589 - qg->new_refcnt++; 1590 - list_for_each_entry(glist, &qg->groups, next_group) { 1591 - ret = ulist_add(qgroups, glist->group->qgroupid, 1592 - ptr_to_u64(glist->group), GFP_ATOMIC); 1593 - if (ret < 0) 1594 - return ret; 1595 - ret = ulist_add(tmp, glist->group->qgroupid, 1596 - ptr_to_u64(glist->group), GFP_ATOMIC); 1597 - if (ret < 0) 1598 - return ret; 1599 - } 1600 - } 1601 - return 0; 1602 - } 1603 - 1604 - /* Add refcnt for the newly added reference. */ 1605 - static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, 1606 - struct btrfs_qgroup_operation *oper, 1607 - struct btrfs_qgroup *qgroup, 1608 - struct ulist *tmp, struct ulist *qgroups, 1609 - u64 seq) 1610 - { 1611 - struct ulist_node *unode; 1612 - struct ulist_iterator uiter; 1613 - struct btrfs_qgroup *qg; 1614 - int ret; 1615 - 1616 - ulist_reinit(tmp); 1617 - ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), 1618 - GFP_ATOMIC); 1619 - if (ret < 0) 1620 - return ret; 1621 - ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), 1622 - GFP_ATOMIC); 1623 - if (ret < 0) 1624 - return ret; 1625 - ULIST_ITER_INIT(&uiter); 1626 - while ((unode = ulist_next(tmp, &uiter))) { 1627 - struct btrfs_qgroup_list *glist; 1628 - 1629 - qg = u64_to_ptr(unode->aux); 1630 - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { 1631 - if (qg->new_refcnt < seq) 1632 - qg->new_refcnt = seq + 1; 1633 - else 1634 - qg->new_refcnt++; 1635 - } else { 1636 - if (qg->old_refcnt < seq) 1637 - qg->old_refcnt = seq + 1; 1638 - else 1639 - qg->old_refcnt++; 1640 - } 1641 - list_for_each_entry(glist, &qg->groups, next_group) { 1642 - ret = ulist_add(tmp, glist->group->qgroupid, 1643 - ptr_to_u64(glist->group), GFP_ATOMIC); 1644 - if (ret < 0) 1645 - return ret; 1646 - ret = ulist_add(qgroups, glist->group->qgroupid, 1647 - ptr_to_u64(glist->group), GFP_ATOMIC); 1648 - if (ret < 0) 1649 - return ret; 1650 - } 1651 - } 1652 - return 0; 1653 - } 1654 - 1655 - /* 1656 - * This adjusts the counters for all referenced qgroups if need be. 1657 - */ 1658 - static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, 1659 - u64 root_to_skip, u64 num_bytes, 1660 - struct ulist *qgroups, u64 seq, 1661 - int old_roots, int new_roots, int rescan) 1682 + static int qgroup_update_counters(struct btrfs_fs_info *fs_info, 1683 + struct ulist *qgroups, 1684 + u64 nr_old_roots, 1685 + u64 nr_new_roots, 1686 + u64 num_bytes, u64 seq) 1662 1687 { 1663 1688 struct ulist_node *unode; 1664 1689 struct ulist_iterator uiter; ··· 1555 1810 bool dirty = false; 1556 1811 1557 1812 qg = u64_to_ptr(unode->aux); 1558 - /* 1559 - * Wasn't referenced before but is now, add to the reference 1560 - * counters. 1561 - */ 1562 - if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { 1813 + cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 1814 + cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 1815 + 1816 + /* Rfer update part */ 1817 + if (cur_old_count == 0 && cur_new_count > 0) { 1563 1818 qg->rfer += num_bytes; 1564 1819 qg->rfer_cmpr += num_bytes; 1565 1820 dirty = true; 1566 1821 } 1567 - 1568 - /* 1569 - * Was referenced before but isn't now, subtract from the 1570 - * reference counters. 1571 - */ 1572 - if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { 1822 + if (cur_old_count > 0 && cur_new_count == 0) { 1573 1823 qg->rfer -= num_bytes; 1574 1824 qg->rfer_cmpr -= num_bytes; 1575 1825 dirty = true; 1576 1826 } 1577 1827 1578 - if (qg->old_refcnt < seq) 1579 - cur_old_count = 0; 1580 - else 1581 - cur_old_count = qg->old_refcnt - seq; 1582 - if (qg->new_refcnt < seq) 1583 - cur_new_count = 0; 1584 - else 1585 - cur_new_count = qg->new_refcnt - seq; 1586 - 1587 - /* 1588 - * If our refcount was the same as the roots previously but our 1589 - * new count isn't the same as the number of roots now then we 1590 - * went from having a exclusive reference on this range to not. 1591 - */ 1592 - if (old_roots && cur_old_count == old_roots && 1593 - (cur_new_count != new_roots || new_roots == 0)) { 1594 - WARN_ON(cur_new_count != new_roots && new_roots == 0); 1595 - qg->excl -= num_bytes; 1596 - qg->excl_cmpr -= num_bytes; 1597 - dirty = true; 1828 + /* Excl update part */ 1829 + /* Exclusive/none -> shared case */ 1830 + if (cur_old_count == nr_old_roots && 1831 + cur_new_count < nr_new_roots) { 1832 + /* Exclusive -> shared */ 1833 + if (cur_old_count != 0) { 1834 + qg->excl -= num_bytes; 1835 + qg->excl_cmpr -= num_bytes; 1836 + dirty = true; 1837 + } 1598 1838 } 1599 1839 1600 - /* 1601 - * If we didn't reference all the roots before but now we do we 1602 - * have an exclusive reference to this range. 1603 - */ 1604 - if ((!old_roots || (old_roots && cur_old_count != old_roots)) 1605 - && cur_new_count == new_roots) { 1606 - qg->excl += num_bytes; 1607 - qg->excl_cmpr += num_bytes; 1608 - dirty = true; 1840 + /* Shared -> exclusive/none case */ 1841 + if (cur_old_count < nr_old_roots && 1842 + cur_new_count == nr_new_roots) { 1843 + /* Shared->exclusive */ 1844 + if (cur_new_count != 0) { 1845 + qg->excl += num_bytes; 1846 + qg->excl_cmpr += num_bytes; 1847 + dirty = true; 1848 + } 1609 1849 } 1610 1850 1851 + /* Exclusive/none -> exclusive/none case */ 1852 + if (cur_old_count == nr_old_roots && 1853 + cur_new_count == nr_new_roots) { 1854 + if (cur_old_count == 0) { 1855 + /* None -> exclusive/none */ 1856 + 1857 + if (cur_new_count != 0) { 1858 + /* None -> exclusive */ 1859 + qg->excl += num_bytes; 1860 + qg->excl_cmpr += num_bytes; 1861 + dirty = true; 1862 + } 1863 + /* None -> none, nothing changed */ 1864 + } else { 1865 + /* Exclusive -> exclusive/none */ 1866 + 1867 + if (cur_new_count == 0) { 1868 + /* Exclusive -> none */ 1869 + qg->excl -= num_bytes; 1870 + qg->excl_cmpr -= num_bytes; 1871 + dirty = true; 1872 + } 1873 + /* Exclusive -> exclusive, nothing changed */ 1874 + } 1875 + } 1611 1876 if (dirty) 1612 1877 qgroup_dirty(fs_info, qg); 1613 1878 } 1614 1879 return 0; 1615 1880 } 1616 1881 1617 - /* 1618 - * If we removed a data extent and there were other references for that bytenr 1619 - * then we need to lookup all referenced roots to make sure we still don't 1620 - * reference this bytenr. If we do then we can just discard this operation. 1621 - */ 1622 - static int check_existing_refs(struct btrfs_trans_handle *trans, 1623 - struct btrfs_fs_info *fs_info, 1624 - struct btrfs_qgroup_operation *oper) 1882 + int 1883 + btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, 1884 + struct btrfs_fs_info *fs_info, 1885 + u64 bytenr, u64 num_bytes, 1886 + struct ulist *old_roots, struct ulist *new_roots) 1625 1887 { 1626 - struct ulist *roots = NULL; 1627 - struct ulist_node *unode; 1628 - struct ulist_iterator uiter; 1629 - int ret = 0; 1630 - 1631 - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, 1632 - oper->elem.seq, &roots); 1633 - if (ret < 0) 1634 - return ret; 1635 - ret = 0; 1636 - 1637 - ULIST_ITER_INIT(&uiter); 1638 - while ((unode = ulist_next(roots, &uiter))) { 1639 - if (unode->val == oper->ref_root) { 1640 - ret = 1; 1641 - break; 1642 - } 1643 - } 1644 - ulist_free(roots); 1645 - btrfs_put_tree_mod_seq(fs_info, &oper->elem); 1646 - 1647 - return ret; 1648 - } 1649 - 1650 - /* 1651 - * If we share a reference across multiple roots then we may need to adjust 1652 - * various qgroups referenced and exclusive counters. The basic premise is this 1653 - * 1654 - * 1) We have seq to represent a 0 count. Instead of looping through all of the 1655 - * qgroups and resetting their refcount to 0 we just constantly bump this 1656 - * sequence number to act as the base reference count. This means that if 1657 - * anybody is equal to or below this sequence they were never referenced. We 1658 - * jack this sequence up by the number of roots we found each time in order to 1659 - * make sure we don't have any overlap. 1660 - * 1661 - * 2) We first search all the roots that reference the area _except_ the root 1662 - * we're acting on currently. This makes up the old_refcnt of all the qgroups 1663 - * before. 1664 - * 1665 - * 3) We walk all of the qgroups referenced by the root we are currently acting 1666 - * on, and will either adjust old_refcnt in the case of a removal or the 1667 - * new_refcnt in the case of an addition. 1668 - * 1669 - * 4) Finally we walk all the qgroups that are referenced by this range 1670 - * including the root we are acting on currently. We will adjust the counters 1671 - * based on the number of roots we had and will have after this operation. 1672 - * 1673 - * Take this example as an illustration 1674 - * 1675 - * [qgroup 1/0] 1676 - * / | \ 1677 - * [qg 0/0] [qg 0/1] [qg 0/2] 1678 - * \ | / 1679 - * [ extent ] 1680 - * 1681 - * Say we are adding a reference that is covered by qg 0/0. The first step 1682 - * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with 1683 - * old_roots being 2. Because it is adding new_roots will be 1. We then go 1684 - * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's 1685 - * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we 1686 - * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a 1687 - * reference and thus must add the size to the referenced bytes. Everything 1688 - * else is the same so nothing else changes. 1689 - */ 1690 - static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, 1691 - struct btrfs_fs_info *fs_info, 1692 - struct btrfs_qgroup_operation *oper) 1693 - { 1694 - struct ulist *roots = NULL; 1695 - struct ulist *qgroups, *tmp; 1696 - struct btrfs_qgroup *qgroup; 1697 - struct seq_list elem = SEQ_LIST_INIT(elem); 1888 + struct ulist *qgroups = NULL; 1889 + struct ulist *tmp = NULL; 1698 1890 u64 seq; 1699 - int old_roots = 0; 1700 - int new_roots = 0; 1891 + u64 nr_new_roots = 0; 1892 + u64 nr_old_roots = 0; 1701 1893 int ret = 0; 1702 1894 1703 - if (oper->elem.seq) { 1704 - ret = check_existing_refs(trans, fs_info, oper); 1705 - if (ret < 0) 1706 - return ret; 1707 - if (ret) 1708 - return 0; 1709 - } 1710 - 1711 - qgroups = ulist_alloc(GFP_NOFS); 1712 - if (!qgroups) 1713 - return -ENOMEM; 1714 - 1715 - tmp = ulist_alloc(GFP_NOFS); 1716 - if (!tmp) { 1717 - ulist_free(qgroups); 1718 - return -ENOMEM; 1719 - } 1720 - 1721 - btrfs_get_tree_mod_seq(fs_info, &elem); 1722 - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, 1723 - &roots); 1724 - btrfs_put_tree_mod_seq(fs_info, &elem); 1725 - if (ret < 0) { 1726 - ulist_free(qgroups); 1727 - ulist_free(tmp); 1728 - return ret; 1729 - } 1730 - spin_lock(&fs_info->qgroup_lock); 1731 - qgroup = find_qgroup_rb(fs_info, oper->ref_root); 1732 - if (!qgroup) 1733 - goto out; 1734 - seq = fs_info->qgroup_seq; 1735 - 1736 - /* 1737 - * So roots is the list of all the roots currently pointing at the 1738 - * bytenr, including the ref we are adding if we are adding, or not if 1739 - * we are removing a ref. So we pass in the ref_root to skip that root 1740 - * in our calculations. We set old_refnct and new_refcnt cause who the 1741 - * hell knows what everything looked like before, and it doesn't matter 1742 - * except... 1743 - */ 1744 - ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, 1745 - seq, &old_roots, 0); 1746 - if (ret < 0) 1747 - goto out; 1748 - 1749 - /* 1750 - * Now adjust the refcounts of the qgroups that care about this 1751 - * reference, either the old_count in the case of removal or new_count 1752 - * in the case of an addition. 1753 - */ 1754 - ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, 1755 - seq); 1756 - if (ret < 0) 1757 - goto out; 1758 - 1759 - /* 1760 - * ...in the case of removals. If we had a removal before we got around 1761 - * to processing this operation then we need to find that guy and count 1762 - * his references as if they really existed so we don't end up screwing 1763 - * up the exclusive counts. Then whenever we go to process the delete 1764 - * everything will be grand and we can account for whatever exclusive 1765 - * changes need to be made there. We also have to pass in old_roots so 1766 - * we have an accurate count of the roots as it pertains to this 1767 - * operations view of the world. 1768 - */ 1769 - ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, 1770 - &old_roots); 1771 - if (ret < 0) 1772 - goto out; 1773 - 1774 - /* 1775 - * We are adding our root, need to adjust up the number of roots, 1776 - * otherwise old_roots is the number of roots we want. 1777 - */ 1778 - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { 1779 - new_roots = old_roots + 1; 1780 - } else { 1781 - new_roots = old_roots; 1782 - old_roots++; 1783 - } 1784 - fs_info->qgroup_seq += old_roots + 1; 1785 - 1786 - 1787 - /* 1788 - * And now the magic happens, bless Arne for having a pretty elegant 1789 - * solution for this. 1790 - */ 1791 - qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, 1792 - qgroups, seq, old_roots, new_roots, 0); 1793 - out: 1794 - spin_unlock(&fs_info->qgroup_lock); 1795 - ulist_free(qgroups); 1796 - ulist_free(roots); 1797 - ulist_free(tmp); 1798 - return ret; 1799 - } 1800 - 1801 - /* 1802 - * Process a reference to a shared subtree. This type of operation is 1803 - * queued during snapshot removal when we encounter extents which are 1804 - * shared between more than one root. 1805 - */ 1806 - static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, 1807 - struct btrfs_fs_info *fs_info, 1808 - struct btrfs_qgroup_operation *oper) 1809 - { 1810 - struct ulist *roots = NULL; 1811 - struct ulist_node *unode; 1812 - struct ulist_iterator uiter; 1813 - struct btrfs_qgroup_list *glist; 1814 - struct ulist *parents; 1815 - int ret = 0; 1816 - int err; 1817 - struct btrfs_qgroup *qg; 1818 - u64 root_obj = 0; 1819 - struct seq_list elem = SEQ_LIST_INIT(elem); 1820 - 1821 - parents = ulist_alloc(GFP_NOFS); 1822 - if (!parents) 1823 - return -ENOMEM; 1824 - 1825 - btrfs_get_tree_mod_seq(fs_info, &elem); 1826 - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, 1827 - elem.seq, &roots); 1828 - btrfs_put_tree_mod_seq(fs_info, &elem); 1829 - if (ret < 0) 1830 - goto out; 1831 - 1832 - if (roots->nnodes != 1) 1833 - goto out; 1834 - 1835 - ULIST_ITER_INIT(&uiter); 1836 - unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ 1837 - /* 1838 - * If we find our ref root then that means all refs 1839 - * this extent has to the root have not yet been 1840 - * deleted. In that case, we do nothing and let the 1841 - * last ref for this bytenr drive our update. 1842 - * 1843 - * This can happen for example if an extent is 1844 - * referenced multiple times in a snapshot (clone, 1845 - * etc). If we are in the middle of snapshot removal, 1846 - * queued updates for such an extent will find the 1847 - * root if we have not yet finished removing the 1848 - * snapshot. 1849 - */ 1850 - if (unode->val == oper->ref_root) 1851 - goto out; 1852 - 1853 - root_obj = unode->val; 1854 - BUG_ON(!root_obj); 1855 - 1856 - spin_lock(&fs_info->qgroup_lock); 1857 - qg = find_qgroup_rb(fs_info, root_obj); 1858 - if (!qg) 1859 - goto out_unlock; 1860 - 1861 - qg->excl += oper->num_bytes; 1862 - qg->excl_cmpr += oper->num_bytes; 1863 - qgroup_dirty(fs_info, qg); 1864 - 1865 - /* 1866 - * Adjust counts for parent groups. First we find all 1867 - * parents, then in the 2nd loop we do the adjustment 1868 - * while adding parents of the parents to our ulist. 1869 - */ 1870 - list_for_each_entry(glist, &qg->groups, next_group) { 1871 - err = ulist_add(parents, glist->group->qgroupid, 1872 - ptr_to_u64(glist->group), GFP_ATOMIC); 1873 - if (err < 0) { 1874 - ret = err; 1875 - goto out_unlock; 1876 - } 1877 - } 1878 - 1879 - ULIST_ITER_INIT(&uiter); 1880 - while ((unode = ulist_next(parents, &uiter))) { 1881 - qg = u64_to_ptr(unode->aux); 1882 - qg->excl += oper->num_bytes; 1883 - qg->excl_cmpr += oper->num_bytes; 1884 - qgroup_dirty(fs_info, qg); 1885 - 1886 - /* Add any parents of the parents */ 1887 - list_for_each_entry(glist, &qg->groups, next_group) { 1888 - err = ulist_add(parents, glist->group->qgroupid, 1889 - ptr_to_u64(glist->group), GFP_ATOMIC); 1890 - if (err < 0) { 1891 - ret = err; 1892 - goto out_unlock; 1893 - } 1894 - } 1895 - } 1896 - 1897 - out_unlock: 1898 - spin_unlock(&fs_info->qgroup_lock); 1899 - 1900 - out: 1901 - ulist_free(roots); 1902 - ulist_free(parents); 1903 - return ret; 1904 - } 1905 - 1906 - /* 1907 - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted 1908 - * from the fs. First, all roots referencing the extent are searched, and 1909 - * then the space is accounted accordingly to the different roots. The 1910 - * accounting algorithm works in 3 steps documented inline. 1911 - */ 1912 - static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, 1913 - struct btrfs_fs_info *fs_info, 1914 - struct btrfs_qgroup_operation *oper) 1915 - { 1916 - int ret = 0; 1895 + if (new_roots) 1896 + nr_new_roots = new_roots->nnodes; 1897 + if (old_roots) 1898 + nr_old_roots = old_roots->nnodes; 1917 1899 1918 1900 if (!fs_info->quota_enabled) 1919 - return 0; 1920 - 1901 + goto out_free; 1921 1902 BUG_ON(!fs_info->quota_root); 1903 + 1904 + qgroups = ulist_alloc(GFP_NOFS); 1905 + if (!qgroups) { 1906 + ret = -ENOMEM; 1907 + goto out_free; 1908 + } 1909 + tmp = ulist_alloc(GFP_NOFS); 1910 + if (!tmp) { 1911 + ret = -ENOMEM; 1912 + goto out_free; 1913 + } 1922 1914 1923 1915 mutex_lock(&fs_info->qgroup_rescan_lock); 1924 1916 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 1925 - if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { 1917 + if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 1926 1918 mutex_unlock(&fs_info->qgroup_rescan_lock); 1927 - return 0; 1919 + ret = 0; 1920 + goto out_free; 1928 1921 } 1929 1922 } 1930 1923 mutex_unlock(&fs_info->qgroup_rescan_lock); 1931 1924 1932 - ASSERT(is_fstree(oper->ref_root)); 1925 + spin_lock(&fs_info->qgroup_lock); 1926 + seq = fs_info->qgroup_seq; 1933 1927 1934 - trace_btrfs_qgroup_account(oper); 1928 + /* Update old refcnts using old_roots */ 1929 + ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, 1930 + UPDATE_OLD); 1931 + if (ret < 0) 1932 + goto out; 1935 1933 1936 - switch (oper->type) { 1937 - case BTRFS_QGROUP_OPER_ADD_EXCL: 1938 - case BTRFS_QGROUP_OPER_SUB_EXCL: 1939 - ret = qgroup_excl_accounting(fs_info, oper); 1940 - break; 1941 - case BTRFS_QGROUP_OPER_ADD_SHARED: 1942 - case BTRFS_QGROUP_OPER_SUB_SHARED: 1943 - ret = qgroup_shared_accounting(trans, fs_info, oper); 1944 - break; 1945 - case BTRFS_QGROUP_OPER_SUB_SUBTREE: 1946 - ret = qgroup_subtree_accounting(trans, fs_info, oper); 1947 - break; 1948 - default: 1949 - ASSERT(0); 1950 - } 1934 + /* Update new refcnts using new_roots */ 1935 + ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, 1936 + UPDATE_NEW); 1937 + if (ret < 0) 1938 + goto out; 1939 + 1940 + qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, 1941 + num_bytes, seq); 1942 + 1943 + /* 1944 + * Bump qgroup_seq to avoid seq overlap 1945 + */ 1946 + fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 1947 + out: 1948 + spin_unlock(&fs_info->qgroup_lock); 1949 + out_free: 1950 + ulist_free(tmp); 1951 + ulist_free(qgroups); 1952 + ulist_free(old_roots); 1953 + ulist_free(new_roots); 1951 1954 return ret; 1952 1955 } 1953 1956 1954 - /* 1955 - * Needs to be called everytime we run delayed refs, even if there is an error 1956 - * in order to cleanup outstanding operations. 1957 - */ 1958 - int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, 1959 - struct btrfs_fs_info *fs_info) 1957 + int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, 1958 + struct btrfs_fs_info *fs_info) 1960 1959 { 1961 - struct btrfs_qgroup_operation *oper; 1960 + struct btrfs_qgroup_extent_record *record; 1961 + struct btrfs_delayed_ref_root *delayed_refs; 1962 + struct ulist *new_roots = NULL; 1963 + struct rb_node *node; 1964 + u64 qgroup_to_skip; 1962 1965 int ret = 0; 1963 1966 1964 - while (!list_empty(&trans->qgroup_ref_list)) { 1965 - oper = list_first_entry(&trans->qgroup_ref_list, 1966 - struct btrfs_qgroup_operation, list); 1967 - list_del_init(&oper->list); 1968 - if (!ret || !trans->aborted) 1969 - ret = btrfs_qgroup_account(trans, fs_info, oper); 1970 - spin_lock(&fs_info->qgroup_op_lock); 1971 - rb_erase(&oper->n, &fs_info->qgroup_op_tree); 1972 - spin_unlock(&fs_info->qgroup_op_lock); 1973 - btrfs_put_tree_mod_seq(fs_info, &oper->elem); 1974 - kfree(oper); 1967 + delayed_refs = &trans->transaction->delayed_refs; 1968 + qgroup_to_skip = delayed_refs->qgroup_to_skip; 1969 + while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 1970 + record = rb_entry(node, struct btrfs_qgroup_extent_record, 1971 + node); 1972 + 1973 + if (!ret) { 1974 + /* 1975 + * Use (u64)-1 as time_seq to do special search, which 1976 + * doesn't lock tree or delayed_refs and search current 1977 + * root. It's safe inside commit_transaction(). 1978 + */ 1979 + ret = btrfs_find_all_roots(trans, fs_info, 1980 + record->bytenr, (u64)-1, &new_roots); 1981 + if (ret < 0) 1982 + goto cleanup; 1983 + if (qgroup_to_skip) 1984 + ulist_del(new_roots, qgroup_to_skip, 0); 1985 + ret = btrfs_qgroup_account_extent(trans, fs_info, 1986 + record->bytenr, record->num_bytes, 1987 + record->old_roots, new_roots); 1988 + record->old_roots = NULL; 1989 + new_roots = NULL; 1990 + } 1991 + cleanup: 1992 + ulist_free(record->old_roots); 1993 + ulist_free(new_roots); 1994 + new_roots = NULL; 1995 + rb_erase(node, &delayed_refs->dirty_extent_root); 1996 + kfree(record); 1997 + 1975 1998 } 1976 1999 return ret; 1977 2000 } ··· 2150 2637 */ 2151 2638 static int 2152 2639 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 2153 - struct btrfs_trans_handle *trans, struct ulist *qgroups, 2154 - struct ulist *tmp, struct extent_buffer *scratch_leaf) 2640 + struct btrfs_trans_handle *trans, 2641 + struct extent_buffer *scratch_leaf) 2155 2642 { 2156 2643 struct btrfs_key found; 2157 2644 struct ulist *roots = NULL; 2158 2645 struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); 2159 2646 u64 num_bytes; 2160 - u64 seq; 2161 - int new_roots; 2162 2647 int slot; 2163 2648 int ret; 2164 2649 ··· 2206 2695 else 2207 2696 num_bytes = found.offset; 2208 2697 2209 - ulist_reinit(qgroups); 2210 2698 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, 2211 2699 &roots); 2212 2700 if (ret < 0) 2213 2701 goto out; 2214 - spin_lock(&fs_info->qgroup_lock); 2215 - seq = fs_info->qgroup_seq; 2216 - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 2217 - 2218 - new_roots = 0; 2219 - ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, 2220 - seq, &new_roots, 1); 2221 - if (ret < 0) { 2222 - spin_unlock(&fs_info->qgroup_lock); 2223 - ulist_free(roots); 2702 + /* For rescan, just pass old_roots as NULL */ 2703 + ret = btrfs_qgroup_account_extent(trans, fs_info, 2704 + found.objectid, num_bytes, NULL, roots); 2705 + if (ret < 0) 2224 2706 goto out; 2225 - } 2226 - 2227 - ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, 2228 - seq, 0, new_roots, 1); 2229 - if (ret < 0) { 2230 - spin_unlock(&fs_info->qgroup_lock); 2231 - ulist_free(roots); 2232 - goto out; 2233 - } 2234 - spin_unlock(&fs_info->qgroup_lock); 2235 - ulist_free(roots); 2236 2707 } 2237 2708 out: 2238 2709 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); ··· 2228 2735 qgroup_rescan_work); 2229 2736 struct btrfs_path *path; 2230 2737 struct btrfs_trans_handle *trans = NULL; 2231 - struct ulist *tmp = NULL, *qgroups = NULL; 2232 2738 struct extent_buffer *scratch_leaf = NULL; 2233 2739 int err = -ENOMEM; 2234 2740 int ret = 0; 2235 2741 2236 2742 path = btrfs_alloc_path(); 2237 2743 if (!path) 2238 - goto out; 2239 - qgroups = ulist_alloc(GFP_NOFS); 2240 - if (!qgroups) 2241 - goto out; 2242 - tmp = ulist_alloc(GFP_NOFS); 2243 - if (!tmp) 2244 2744 goto out; 2245 2745 scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); 2246 2746 if (!scratch_leaf) ··· 2250 2764 err = -EINTR; 2251 2765 } else { 2252 2766 err = qgroup_rescan_leaf(fs_info, path, trans, 2253 - qgroups, tmp, scratch_leaf); 2767 + scratch_leaf); 2254 2768 } 2255 2769 if (err > 0) 2256 2770 btrfs_commit_transaction(trans, fs_info->fs_root); ··· 2260 2774 2261 2775 out: 2262 2776 kfree(scratch_leaf); 2263 - ulist_free(qgroups); 2264 - ulist_free(tmp); 2265 2777 btrfs_free_path(path); 2266 2778 2267 2779 mutex_lock(&fs_info->qgroup_rescan_lock);

+20 -43

fs/btrfs/qgroup.h

··· 19 19 #ifndef __BTRFS_QGROUP__ 20 20 #define __BTRFS_QGROUP__ 21 21 22 - /* 23 - * A description of the operations, all of these operations only happen when we 24 - * are adding the 1st reference for that subvolume in the case of adding space 25 - * or on the last reference delete in the case of subtraction. The only 26 - * exception is the last one, which is added for confusion. 27 - * 28 - * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only 29 - * one pointing at the bytes we are adding. This is called on the first 30 - * allocation. 31 - * 32 - * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be 33 - * shared between subvols. This is called on the creation of a ref that already 34 - * has refs from a different subvolume, so basically reflink. 35 - * 36 - * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only 37 - * one referencing the range. 38 - * 39 - * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with 40 - * refs with other subvolumes. 41 - */ 42 - enum btrfs_qgroup_operation_type { 43 - BTRFS_QGROUP_OPER_ADD_EXCL, 44 - BTRFS_QGROUP_OPER_ADD_SHARED, 45 - BTRFS_QGROUP_OPER_SUB_EXCL, 46 - BTRFS_QGROUP_OPER_SUB_SHARED, 47 - BTRFS_QGROUP_OPER_SUB_SUBTREE, 48 - }; 22 + #include "ulist.h" 23 + #include "delayed-ref.h" 49 24 50 - struct btrfs_qgroup_operation { 51 - u64 ref_root; 25 + /* 26 + * Record a dirty extent, and info qgroup to update quota on it 27 + * TODO: Use kmem cache to alloc it. 28 + */ 29 + struct btrfs_qgroup_extent_record { 30 + struct rb_node node; 52 31 u64 bytenr; 53 32 u64 num_bytes; 54 - u64 seq; 55 - enum btrfs_qgroup_operation_type type; 56 - struct seq_list elem; 57 - struct rb_node n; 58 - struct list_head list; 33 + struct ulist *old_roots; 59 34 }; 60 35 61 36 int btrfs_quota_enable(struct btrfs_trans_handle *trans, ··· 54 79 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); 55 80 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); 56 81 struct btrfs_delayed_extent_op; 57 - int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, 58 - struct btrfs_fs_info *fs_info, u64 ref_root, 82 + int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, 83 + struct btrfs_fs_info *fs_info); 84 + struct btrfs_qgroup_extent_record 85 + *btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, 86 + struct btrfs_qgroup_extent_record *record); 87 + int 88 + btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, 89 + struct btrfs_fs_info *fs_info, 59 90 u64 bytenr, u64 num_bytes, 60 - enum btrfs_qgroup_operation_type type, 61 - int mod_seq); 62 - int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, 63 - struct btrfs_fs_info *fs_info); 64 - void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, 65 - struct btrfs_fs_info *fs_info, 66 - struct btrfs_qgroup_operation *oper); 91 + struct ulist *old_roots, struct ulist *new_roots); 92 + int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, 93 + struct btrfs_fs_info *fs_info); 67 94 int btrfs_run_qgroups(struct btrfs_trans_handle *trans, 68 95 struct btrfs_fs_info *fs_info); 69 96 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,

+14 -5

fs/btrfs/relocation.c

··· 1847 1847 } 1848 1848 1849 1849 eb = read_tree_block(dest, old_bytenr, old_ptr_gen); 1850 - if (!eb || !extent_buffer_uptodate(eb)) { 1851 - ret = (!eb) ? -ENOMEM : -EIO; 1850 + if (IS_ERR(eb)) { 1851 + ret = PTR_ERR(eb); 1852 + } else if (!extent_buffer_uptodate(eb)) { 1853 + ret = -EIO; 1852 1854 free_extent_buffer(eb); 1853 1855 break; 1854 1856 } ··· 2004 2002 2005 2003 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2006 2004 eb = read_tree_block(root, bytenr, ptr_gen); 2007 - if (!eb || !extent_buffer_uptodate(eb)) { 2005 + if (IS_ERR(eb)) { 2006 + return PTR_ERR(eb); 2007 + } else if (!extent_buffer_uptodate(eb)) { 2008 2008 free_extent_buffer(eb); 2009 2009 return -EIO; 2010 2010 } ··· 2714 2710 blocksize = root->nodesize; 2715 2711 generation = btrfs_node_ptr_generation(upper->eb, slot); 2716 2712 eb = read_tree_block(root, bytenr, generation); 2717 - if (!eb || !extent_buffer_uptodate(eb)) { 2713 + if (IS_ERR(eb)) { 2714 + err = PTR_ERR(eb); 2715 + goto next; 2716 + } else if (!extent_buffer_uptodate(eb)) { 2718 2717 free_extent_buffer(eb); 2719 2718 err = -EIO; 2720 2719 goto next; ··· 2880 2873 BUG_ON(block->key_ready); 2881 2874 eb = read_tree_block(rc->extent_root, block->bytenr, 2882 2875 block->key.offset); 2883 - if (!eb || !extent_buffer_uptodate(eb)) { 2876 + if (IS_ERR(eb)) { 2877 + return PTR_ERR(eb); 2878 + } else if (!extent_buffer_uptodate(eb)) { 2884 2879 free_extent_buffer(eb); 2885 2880 return -EIO; 2886 2881 }

+23 -3

fs/btrfs/scrub.c

··· 2662 2662 kfree(sparity); 2663 2663 } 2664 2664 2665 + static void scrub_parity_bio_endio_worker(struct btrfs_work *work) 2666 + { 2667 + struct scrub_parity *sparity = container_of(work, struct scrub_parity, 2668 + work); 2669 + struct scrub_ctx *sctx = sparity->sctx; 2670 + 2671 + scrub_free_parity(sparity); 2672 + scrub_pending_bio_dec(sctx); 2673 + } 2674 + 2665 2675 static void scrub_parity_bio_endio(struct bio *bio, int error) 2666 2676 { 2667 2677 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; 2668 - struct scrub_ctx *sctx = sparity->sctx; 2669 2678 2670 2679 if (error) 2671 2680 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, 2672 2681 sparity->nsectors); 2673 2682 2674 - scrub_free_parity(sparity); 2675 - scrub_pending_bio_dec(sctx); 2676 2683 bio_put(bio); 2684 + 2685 + btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, 2686 + scrub_parity_bio_endio_worker, NULL, NULL); 2687 + btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers, 2688 + &sparity->work); 2677 2689 } 2678 2690 2679 2691 static void scrub_parity_check_and_repair(struct scrub_parity *sparity) ··· 3601 3589 ret = -ENOMEM; 3602 3590 goto out; 3603 3591 } 3592 + fs_info->scrub_parity_workers = 3593 + btrfs_alloc_workqueue("btrfs-scrubparity", flags, 3594 + max_active, 2); 3595 + if (!fs_info->scrub_parity_workers) { 3596 + ret = -ENOMEM; 3597 + goto out; 3598 + } 3604 3599 } 3605 3600 ++fs_info->scrub_workers_refcnt; 3606 3601 out: ··· 3620 3601 btrfs_destroy_workqueue(fs_info->scrub_workers); 3621 3602 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); 3622 3603 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); 3604 + btrfs_destroy_workqueue(fs_info->scrub_parity_workers); 3623 3605 } 3624 3606 WARN_ON(fs_info->scrub_workers_refcnt < 0); 3625 3607 }

+121 -26

fs/btrfs/send.c

··· 243 243 * after this directory is moved, we can try to rmdir the ino rmdir_ino. 244 244 */ 245 245 u64 rmdir_ino; 246 + bool orphanized; 246 247 }; 247 248 248 249 struct orphan_dir_info { ··· 1159 1158 /* may be truncated in case it's the last extent in a file */ 1160 1159 u64 extent_len; 1161 1160 1161 + /* data offset in the file extent item */ 1162 + u64 data_offset; 1163 + 1162 1164 /* Just to check for bugs in backref resolving */ 1163 1165 int found_itself; 1164 1166 }; ··· 1225 1221 if (ret < 0) 1226 1222 return ret; 1227 1223 1228 - if (offset + bctx->extent_len > i_size) 1224 + if (offset + bctx->data_offset + bctx->extent_len > i_size) 1229 1225 return 0; 1230 1226 1231 1227 /* ··· 1367 1363 backref_ctx->cur_offset = data_offset; 1368 1364 backref_ctx->found_itself = 0; 1369 1365 backref_ctx->extent_len = num_bytes; 1366 + /* 1367 + * For non-compressed extents iterate_extent_inodes() gives us extent 1368 + * offsets that already take into account the data offset, but not for 1369 + * compressed extents, since the offset is logical and not relative to 1370 + * the physical extent locations. We must take this into account to 1371 + * avoid sending clone offsets that go beyond the source file's size, 1372 + * which would result in the clone ioctl failing with -EINVAL on the 1373 + * receiving end. 1374 + */ 1375 + if (compressed == BTRFS_COMPRESS_NONE) 1376 + backref_ctx->data_offset = 0; 1377 + else 1378 + backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); 1370 1379 1371 1380 /* 1372 1381 * The last extent of a file may be too large due to page alignment. ··· 1917 1900 goto out; 1918 1901 } 1919 1902 1920 - /* we know that it is or will be overwritten. check this now */ 1921 - if (ow_inode < sctx->send_progress) 1903 + /* 1904 + * We know that it is or will be overwritten. Check this now. 1905 + * The current inode being processed might have been the one that caused 1906 + * inode 'ino' to be orphanized, therefore ow_inode can actually be the 1907 + * same as sctx->send_progress. 1908 + */ 1909 + if (ow_inode <= sctx->send_progress) 1922 1910 ret = 1; 1923 1911 else 1924 1912 ret = 0; ··· 2245 2223 fs_path_reset(dest); 2246 2224 2247 2225 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2226 + struct waiting_dir_move *wdm; 2227 + 2248 2228 fs_path_reset(name); 2249 2229 2250 2230 if (is_waiting_for_rm(sctx, ino)) { ··· 2257 2233 break; 2258 2234 } 2259 2235 2260 - if (is_waiting_for_move(sctx, ino)) { 2236 + wdm = get_waiting_dir_move(sctx, ino); 2237 + if (wdm && wdm->orphanized) { 2238 + ret = gen_unique_name(sctx, ino, gen, name); 2239 + stop = 1; 2240 + } else if (wdm) { 2261 2241 ret = get_first_ref(sctx->parent_root, ino, 2262 2242 &parent_inode, &parent_gen, name); 2263 2243 } else { ··· 2356 2328 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, 2357 2329 le64_to_cpu(sctx->send_root->root_item.ctransid)); 2358 2330 if (parent_root) { 2359 - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2360 - sctx->parent_root->root_item.uuid); 2331 + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) 2332 + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2333 + parent_root->root_item.received_uuid); 2334 + else 2335 + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2336 + parent_root->root_item.uuid); 2361 2337 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 2362 2338 le64_to_cpu(sctx->parent_root->root_item.ctransid)); 2363 2339 } ··· 2955 2923 return entry != NULL; 2956 2924 } 2957 2925 2958 - static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2926 + static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) 2959 2927 { 2960 2928 struct rb_node **p = &sctx->waiting_dir_moves.rb_node; 2961 2929 struct rb_node *parent = NULL; ··· 2966 2934 return -ENOMEM; 2967 2935 dm->ino = ino; 2968 2936 dm->rmdir_ino = 0; 2937 + dm->orphanized = orphanized; 2969 2938 2970 2939 while (*p) { 2971 2940 parent = *p; ··· 3063 3030 goto out; 3064 3031 } 3065 3032 3066 - ret = add_waiting_dir_move(sctx, pm->ino); 3033 + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); 3067 3034 if (ret) 3068 3035 goto out; 3069 3036 ··· 3386 3353 return ret; 3387 3354 } 3388 3355 3356 + /* 3357 + * Check if ino ino1 is an ancestor of inode ino2 in the given root. 3358 + * Return 1 if true, 0 if false and < 0 on error. 3359 + */ 3360 + static int is_ancestor(struct btrfs_root *root, 3361 + const u64 ino1, 3362 + const u64 ino1_gen, 3363 + const u64 ino2, 3364 + struct fs_path *fs_path) 3365 + { 3366 + u64 ino = ino2; 3367 + 3368 + while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3369 + int ret; 3370 + u64 parent; 3371 + u64 parent_gen; 3372 + 3373 + fs_path_reset(fs_path); 3374 + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); 3375 + if (ret < 0) { 3376 + if (ret == -ENOENT && ino == ino2) 3377 + ret = 0; 3378 + return ret; 3379 + } 3380 + if (parent == ino1) 3381 + return parent_gen == ino1_gen ? 1 : 0; 3382 + ino = parent; 3383 + } 3384 + return 0; 3385 + } 3386 + 3389 3387 static int wait_for_parent_move(struct send_ctx *sctx, 3390 - struct recorded_ref *parent_ref) 3388 + struct recorded_ref *parent_ref, 3389 + const bool is_orphan) 3391 3390 { 3392 3391 int ret = 0; 3393 3392 u64 ino = parent_ref->dir; ··· 3439 3374 * Our current directory inode may not yet be renamed/moved because some 3440 3375 * ancestor (immediate or not) has to be renamed/moved first. So find if 3441 3376 * such ancestor exists and make sure our own rename/move happens after 3442 - * that ancestor is processed. 3377 + * that ancestor is processed to avoid path build infinite loops (done 3378 + * at get_cur_path()). 3443 3379 */ 3444 3380 while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3445 3381 if (is_waiting_for_move(sctx, ino)) { 3446 - ret = 1; 3382 + /* 3383 + * If the current inode is an ancestor of ino in the 3384 + * parent root, we need to delay the rename of the 3385 + * current inode, otherwise don't delayed the rename 3386 + * because we can end up with a circular dependency 3387 + * of renames, resulting in some directories never 3388 + * getting the respective rename operations issued in 3389 + * the send stream or getting into infinite path build 3390 + * loops. 3391 + */ 3392 + ret = is_ancestor(sctx->parent_root, 3393 + sctx->cur_ino, sctx->cur_inode_gen, 3394 + ino, path_before); 3447 3395 break; 3448 3396 } 3449 3397 ··· 3498 3420 ino, 3499 3421 &sctx->new_refs, 3500 3422 &sctx->deleted_refs, 3501 - false); 3423 + is_orphan); 3502 3424 if (!ret) 3503 3425 ret = 1; 3504 3426 } ··· 3667 3589 } 3668 3590 } 3669 3591 3592 + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && 3593 + can_rename) { 3594 + ret = wait_for_parent_move(sctx, cur, is_orphan); 3595 + if (ret < 0) 3596 + goto out; 3597 + if (ret == 1) { 3598 + can_rename = false; 3599 + *pending_move = 1; 3600 + } 3601 + } 3602 + 3670 3603 /* 3671 3604 * link/move the ref to the new place. If we have an orphan 3672 3605 * inode, move it and update valid_path. If not, link or move ··· 3698 3609 * dirs, we always have one new and one deleted 3699 3610 * ref. The deleted ref is ignored later. 3700 3611 */ 3701 - ret = wait_for_parent_move(sctx, cur); 3702 - if (ret < 0) 3703 - goto out; 3704 - if (ret) { 3705 - *pending_move = 1; 3706 - } else { 3707 - ret = send_rename(sctx, valid_path, 3708 - cur->full_path); 3709 - if (!ret) 3710 - ret = fs_path_copy(valid_path, 3711 - cur->full_path); 3712 - } 3612 + ret = send_rename(sctx, valid_path, 3613 + cur->full_path); 3614 + if (!ret) 3615 + ret = fs_path_copy(valid_path, 3616 + cur->full_path); 3713 3617 if (ret < 0) 3714 3618 goto out; 3715 3619 } else { ··· 4590 4508 if (ret < 0) 4591 4509 goto out; 4592 4510 4593 - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 4594 - clone_root->root->root_item.uuid); 4511 + /* 4512 + * If the parent we're using has a received_uuid set then use that as 4513 + * our clone source as that is what we will look for when doing a 4514 + * receive. 4515 + * 4516 + * This covers the case that we create a snapshot off of a received 4517 + * subvolume and then use that as the parent and try to receive on a 4518 + * different host. 4519 + */ 4520 + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) 4521 + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 4522 + clone_root->root->root_item.received_uuid); 4523 + else 4524 + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 4525 + clone_root->root->root_item.uuid); 4595 4526 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 4596 4527 le64_to_cpu(clone_root->root->root_item.ctransid)); 4597 4528 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);

+260 -149

fs/btrfs/super.c

··· 135 135 * __btrfs_std_error decodes expected errors from the caller and 136 136 * invokes the approciate error response. 137 137 */ 138 + __cold 138 139 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, 139 140 unsigned int line, int errno, const char *fmt, ...) 140 141 { ··· 248 247 * We'll complete the cleanup in btrfs_end_transaction and 249 248 * btrfs_commit_transaction. 250 249 */ 250 + __cold 251 251 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, 252 252 struct btrfs_root *root, const char *function, 253 253 unsigned int line, int errno) 254 254 { 255 - /* 256 - * Report first abort since mount 257 - */ 258 - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, 259 - &root->fs_info->fs_state)) { 260 - WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", 261 - errno); 262 - } 263 255 trans->aborted = errno; 264 256 /* Nothing used. The other threads that have joined this 265 257 * transaction may be able to continue. */ ··· 275 281 * __btrfs_panic decodes unexpected, fatal errors from the caller, 276 282 * issues an alert, and either panics or BUGs, depending on mount options. 277 283 */ 284 + __cold 278 285 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, 279 286 unsigned int line, int errno, const char *fmt, ...) 280 287 { ··· 836 841 return error; 837 842 } 838 843 839 - static struct dentry *get_default_root(struct super_block *sb, 840 - u64 subvol_objectid) 844 + static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, 845 + u64 subvol_objectid) 841 846 { 842 - struct btrfs_fs_info *fs_info = btrfs_sb(sb); 843 847 struct btrfs_root *root = fs_info->tree_root; 844 - struct btrfs_root *new_root; 848 + struct btrfs_root *fs_root; 849 + struct btrfs_root_ref *root_ref; 850 + struct btrfs_inode_ref *inode_ref; 851 + struct btrfs_key key; 852 + struct btrfs_path *path = NULL; 853 + char *name = NULL, *ptr; 854 + u64 dirid; 855 + int len; 856 + int ret; 857 + 858 + path = btrfs_alloc_path(); 859 + if (!path) { 860 + ret = -ENOMEM; 861 + goto err; 862 + } 863 + path->leave_spinning = 1; 864 + 865 + name = kmalloc(PATH_MAX, GFP_NOFS); 866 + if (!name) { 867 + ret = -ENOMEM; 868 + goto err; 869 + } 870 + ptr = name + PATH_MAX - 1; 871 + ptr[0] = '\0'; 872 + 873 + /* 874 + * Walk up the subvolume trees in the tree of tree roots by root 875 + * backrefs until we hit the top-level subvolume. 876 + */ 877 + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { 878 + key.objectid = subvol_objectid; 879 + key.type = BTRFS_ROOT_BACKREF_KEY; 880 + key.offset = (u64)-1; 881 + 882 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 883 + if (ret < 0) { 884 + goto err; 885 + } else if (ret > 0) { 886 + ret = btrfs_previous_item(root, path, subvol_objectid, 887 + BTRFS_ROOT_BACKREF_KEY); 888 + if (ret < 0) { 889 + goto err; 890 + } else if (ret > 0) { 891 + ret = -ENOENT; 892 + goto err; 893 + } 894 + } 895 + 896 + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 897 + subvol_objectid = key.offset; 898 + 899 + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], 900 + struct btrfs_root_ref); 901 + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); 902 + ptr -= len + 1; 903 + if (ptr < name) { 904 + ret = -ENAMETOOLONG; 905 + goto err; 906 + } 907 + read_extent_buffer(path->nodes[0], ptr + 1, 908 + (unsigned long)(root_ref + 1), len); 909 + ptr[0] = '/'; 910 + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); 911 + btrfs_release_path(path); 912 + 913 + key.objectid = subvol_objectid; 914 + key.type = BTRFS_ROOT_ITEM_KEY; 915 + key.offset = (u64)-1; 916 + fs_root = btrfs_read_fs_root_no_name(fs_info, &key); 917 + if (IS_ERR(fs_root)) { 918 + ret = PTR_ERR(fs_root); 919 + goto err; 920 + } 921 + 922 + /* 923 + * Walk up the filesystem tree by inode refs until we hit the 924 + * root directory. 925 + */ 926 + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { 927 + key.objectid = dirid; 928 + key.type = BTRFS_INODE_REF_KEY; 929 + key.offset = (u64)-1; 930 + 931 + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); 932 + if (ret < 0) { 933 + goto err; 934 + } else if (ret > 0) { 935 + ret = btrfs_previous_item(fs_root, path, dirid, 936 + BTRFS_INODE_REF_KEY); 937 + if (ret < 0) { 938 + goto err; 939 + } else if (ret > 0) { 940 + ret = -ENOENT; 941 + goto err; 942 + } 943 + } 944 + 945 + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 946 + dirid = key.offset; 947 + 948 + inode_ref = btrfs_item_ptr(path->nodes[0], 949 + path->slots[0], 950 + struct btrfs_inode_ref); 951 + len = btrfs_inode_ref_name_len(path->nodes[0], 952 + inode_ref); 953 + ptr -= len + 1; 954 + if (ptr < name) { 955 + ret = -ENAMETOOLONG; 956 + goto err; 957 + } 958 + read_extent_buffer(path->nodes[0], ptr + 1, 959 + (unsigned long)(inode_ref + 1), len); 960 + ptr[0] = '/'; 961 + btrfs_release_path(path); 962 + } 963 + } 964 + 965 + btrfs_free_path(path); 966 + if (ptr == name + PATH_MAX - 1) { 967 + name[0] = '/'; 968 + name[1] = '\0'; 969 + } else { 970 + memmove(name, ptr, name + PATH_MAX - ptr); 971 + } 972 + return name; 973 + 974 + err: 975 + btrfs_free_path(path); 976 + kfree(name); 977 + return ERR_PTR(ret); 978 + } 979 + 980 + static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) 981 + { 982 + struct btrfs_root *root = fs_info->tree_root; 845 983 struct btrfs_dir_item *di; 846 984 struct btrfs_path *path; 847 985 struct btrfs_key location; 848 - struct inode *inode; 849 986 u64 dir_id; 850 - int new = 0; 851 - 852 - /* 853 - * We have a specific subvol we want to mount, just setup location and 854 - * go look up the root. 855 - */ 856 - if (subvol_objectid) { 857 - location.objectid = subvol_objectid; 858 - location.type = BTRFS_ROOT_ITEM_KEY; 859 - location.offset = (u64)-1; 860 - goto find_root; 861 - } 862 987 863 988 path = btrfs_alloc_path(); 864 989 if (!path) 865 - return ERR_PTR(-ENOMEM); 990 + return -ENOMEM; 866 991 path->leave_spinning = 1; 867 992 868 993 /* ··· 994 879 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 995 880 if (IS_ERR(di)) { 996 881 btrfs_free_path(path); 997 - return ERR_CAST(di); 882 + return PTR_ERR(di); 998 883 } 999 884 if (!di) { 1000 885 /* 1001 886 * Ok the default dir item isn't there. This is weird since 1002 887 * it's always been there, but don't freak out, just try and 1003 - * mount to root most subvolume. 888 + * mount the top-level subvolume. 1004 889 */ 1005 890 btrfs_free_path(path); 1006 - dir_id = BTRFS_FIRST_FREE_OBJECTID; 1007 - new_root = fs_info->fs_root; 1008 - goto setup_root; 891 + *objectid = BTRFS_FS_TREE_OBJECTID; 892 + return 0; 1009 893 } 1010 894 1011 895 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 1012 896 btrfs_free_path(path); 1013 - 1014 - find_root: 1015 - new_root = btrfs_read_fs_root_no_name(fs_info, &location); 1016 - if (IS_ERR(new_root)) 1017 - return ERR_CAST(new_root); 1018 - 1019 - if (!(sb->s_flags & MS_RDONLY)) { 1020 - int ret; 1021 - down_read(&fs_info->cleanup_work_sem); 1022 - ret = btrfs_orphan_cleanup(new_root); 1023 - up_read(&fs_info->cleanup_work_sem); 1024 - if (ret) 1025 - return ERR_PTR(ret); 1026 - } 1027 - 1028 - dir_id = btrfs_root_dirid(&new_root->root_item); 1029 - setup_root: 1030 - location.objectid = dir_id; 1031 - location.type = BTRFS_INODE_ITEM_KEY; 1032 - location.offset = 0; 1033 - 1034 - inode = btrfs_iget(sb, &location, new_root, &new); 1035 - if (IS_ERR(inode)) 1036 - return ERR_CAST(inode); 1037 - 1038 - /* 1039 - * If we're just mounting the root most subvol put the inode and return 1040 - * a reference to the dentry. We will have already gotten a reference 1041 - * to the inode in btrfs_fill_super so we're good to go. 1042 - */ 1043 - if (!new && d_inode(sb->s_root) == inode) { 1044 - iput(inode); 1045 - return dget(sb->s_root); 1046 - } 1047 - 1048 - return d_obtain_root(inode); 897 + *objectid = location.objectid; 898 + return 0; 1049 899 } 1050 900 1051 901 static int btrfs_fill_super(struct super_block *sb, ··· 1188 1108 seq_puts(seq, ",fatal_errors=panic"); 1189 1109 if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) 1190 1110 seq_printf(seq, ",commit=%d", info->commit_interval); 1111 + seq_printf(seq, ",subvolid=%llu", 1112 + BTRFS_I(d_inode(dentry))->root->root_key.objectid); 1113 + seq_puts(seq, ",subvol="); 1114 + seq_dentry(seq, dentry, " \t\n\\"); 1191 1115 return 0; 1192 1116 } 1193 1117 ··· 1222 1138 } 1223 1139 1224 1140 /* 1225 - * This will strip out the subvol=%s argument for an argument string and add 1226 - * subvolid=0 to make sure we get the actual tree root for path walking to the 1227 - * subvol we want. 1141 + * This will add subvolid=0 to the argument string while removing any subvol= 1142 + * and subvolid= arguments to make sure we get the top-level root for path 1143 + * walking to the subvol we want. 1228 1144 */ 1229 1145 static char *setup_root_args(char *args) 1230 1146 { 1231 - unsigned len = strlen(args) + 2 + 1; 1232 - char *src, *dst, *buf; 1147 + char *buf, *dst, *sep; 1233 1148 1234 - /* 1235 - * We need the same args as before, but with this substitution: 1236 - * s!subvol=[^,]+!subvolid=0! 1237 - * 1238 - * Since the replacement string is up to 2 bytes longer than the 1239 - * original, allocate strlen(args) + 2 + 1 bytes. 1240 - */ 1149 + if (!args) 1150 + return kstrdup("subvolid=0", GFP_NOFS); 1241 1151 1242 - src = strstr(args, "subvol="); 1243 - /* This shouldn't happen, but just in case.. */ 1244 - if (!src) 1245 - return NULL; 1246 - 1247 - buf = dst = kmalloc(len, GFP_NOFS); 1152 + /* The worst case is that we add ",subvolid=0" to the end. */ 1153 + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); 1248 1154 if (!buf) 1249 1155 return NULL; 1250 1156 1251 - /* 1252 - * If the subvol= arg is not at the start of the string, 1253 - * copy whatever precedes it into buf. 1254 - */ 1255 - if (src != args) { 1256 - *src++ = '\0'; 1257 - strcpy(buf, args); 1258 - dst += strlen(args); 1157 + while (1) { 1158 + sep = strchrnul(args, ','); 1159 + if (!strstarts(args, "subvol=") && 1160 + !strstarts(args, "subvolid=")) { 1161 + memcpy(dst, args, sep - args); 1162 + dst += sep - args; 1163 + *dst++ = ','; 1164 + } 1165 + if (*sep) 1166 + args = sep + 1; 1167 + else 1168 + break; 1259 1169 } 1260 - 1261 1170 strcpy(dst, "subvolid=0"); 1262 - dst += strlen("subvolid=0"); 1263 - 1264 - /* 1265 - * If there is a "," after the original subvol=... string, 1266 - * copy that suffix into our buffer. Otherwise, we're done. 1267 - */ 1268 - src = strchr(src, ','); 1269 - if (src) 1270 - strcpy(dst, src); 1271 1171 1272 1172 return buf; 1273 1173 } 1274 1174 1275 - static struct dentry *mount_subvol(const char *subvol_name, int flags, 1276 - const char *device_name, char *data) 1175 + static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, 1176 + int flags, const char *device_name, 1177 + char *data) 1277 1178 { 1278 1179 struct dentry *root; 1279 - struct vfsmount *mnt; 1180 + struct vfsmount *mnt = NULL; 1280 1181 char *newargs; 1182 + int ret; 1281 1183 1282 1184 newargs = setup_root_args(data); 1283 - if (!newargs) 1284 - return ERR_PTR(-ENOMEM); 1285 - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, 1286 - newargs); 1185 + if (!newargs) { 1186 + root = ERR_PTR(-ENOMEM); 1187 + goto out; 1188 + } 1287 1189 1288 - if (PTR_RET(mnt) == -EBUSY) { 1190 + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); 1191 + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { 1289 1192 if (flags & MS_RDONLY) { 1290 - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, 1291 - newargs); 1193 + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, 1194 + device_name, newargs); 1292 1195 } else { 1293 - int r; 1294 - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, 1295 - newargs); 1196 + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, 1197 + device_name, newargs); 1296 1198 if (IS_ERR(mnt)) { 1297 - kfree(newargs); 1298 - return ERR_CAST(mnt); 1199 + root = ERR_CAST(mnt); 1200 + mnt = NULL; 1201 + goto out; 1299 1202 } 1300 1203 1301 - r = btrfs_remount(mnt->mnt_sb, &flags, NULL); 1302 - if (r < 0) { 1303 - /* FIXME: release vfsmount mnt ??*/ 1304 - kfree(newargs); 1305 - return ERR_PTR(r); 1204 + down_write(&mnt->mnt_sb->s_umount); 1205 + ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); 1206 + up_write(&mnt->mnt_sb->s_umount); 1207 + if (ret < 0) { 1208 + root = ERR_PTR(ret); 1209 + goto out; 1306 1210 } 1307 1211 } 1308 1212 } 1309 - 1310 - kfree(newargs); 1311 - 1312 - if (IS_ERR(mnt)) 1313 - return ERR_CAST(mnt); 1314 - 1315 - root = mount_subtree(mnt, subvol_name); 1316 - 1317 - if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) { 1318 - struct super_block *s = root->d_sb; 1319 - dput(root); 1320 - root = ERR_PTR(-EINVAL); 1321 - deactivate_locked_super(s); 1322 - printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", 1323 - subvol_name); 1213 + if (IS_ERR(mnt)) { 1214 + root = ERR_CAST(mnt); 1215 + mnt = NULL; 1216 + goto out; 1324 1217 } 1325 1218 1219 + if (!subvol_name) { 1220 + if (!subvol_objectid) { 1221 + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), 1222 + &subvol_objectid); 1223 + if (ret) { 1224 + root = ERR_PTR(ret); 1225 + goto out; 1226 + } 1227 + } 1228 + subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), 1229 + subvol_objectid); 1230 + if (IS_ERR(subvol_name)) { 1231 + root = ERR_CAST(subvol_name); 1232 + subvol_name = NULL; 1233 + goto out; 1234 + } 1235 + 1236 + } 1237 + 1238 + root = mount_subtree(mnt, subvol_name); 1239 + /* mount_subtree() drops our reference on the vfsmount. */ 1240 + mnt = NULL; 1241 + 1242 + if (!IS_ERR(root)) { 1243 + struct super_block *s = root->d_sb; 1244 + struct inode *root_inode = d_inode(root); 1245 + u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; 1246 + 1247 + ret = 0; 1248 + if (!is_subvolume_inode(root_inode)) { 1249 + pr_err("BTRFS: '%s' is not a valid subvolume\n", 1250 + subvol_name); 1251 + ret = -EINVAL; 1252 + } 1253 + if (subvol_objectid && root_objectid != subvol_objectid) { 1254 + /* 1255 + * This will also catch a race condition where a 1256 + * subvolume which was passed by ID is renamed and 1257 + * another subvolume is renamed over the old location. 1258 + */ 1259 + pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n", 1260 + subvol_name, subvol_objectid); 1261 + ret = -EINVAL; 1262 + } 1263 + if (ret) { 1264 + dput(root); 1265 + root = ERR_PTR(ret); 1266 + deactivate_locked_super(s); 1267 + } 1268 + } 1269 + 1270 + out: 1271 + mntput(mnt); 1272 + kfree(newargs); 1273 + kfree(subvol_name); 1326 1274 return root; 1327 1275 } 1328 1276 ··· 1419 1303 { 1420 1304 struct block_device *bdev = NULL; 1421 1305 struct super_block *s; 1422 - struct dentry *root; 1423 1306 struct btrfs_fs_devices *fs_devices = NULL; 1424 1307 struct btrfs_fs_info *fs_info = NULL; 1425 1308 struct security_mnt_opts new_sec_opts; ··· 1438 1323 return ERR_PTR(error); 1439 1324 } 1440 1325 1441 - if (subvol_name) { 1442 - root = mount_subvol(subvol_name, flags, device_name, data); 1443 - kfree(subvol_name); 1444 - return root; 1326 + if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { 1327 + /* mount_subvol() will free subvol_name. */ 1328 + return mount_subvol(subvol_name, subvol_objectid, flags, 1329 + device_name, data); 1445 1330 } 1446 1331 1447 1332 security_init_mnt_opts(&new_sec_opts); ··· 1507 1392 error = btrfs_fill_super(s, fs_devices, data, 1508 1393 flags & MS_SILENT ? 1 : 0); 1509 1394 } 1510 - 1511 - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); 1512 - if (IS_ERR(root)) { 1395 + if (error) { 1513 1396 deactivate_locked_super(s); 1514 - error = PTR_ERR(root); 1515 1397 goto error_sec_opts; 1516 1398 } 1517 1399 1518 1400 fs_info = btrfs_sb(s); 1519 1401 error = setup_security_options(fs_info, s, &new_sec_opts); 1520 1402 if (error) { 1521 - dput(root); 1522 1403 deactivate_locked_super(s); 1523 1404 goto error_sec_opts; 1524 1405 } 1525 1406 1526 - return root; 1407 + return dget(s->s_root); 1527 1408 1528 1409 error_close_devices: 1529 1410 btrfs_close_devices(fs_devices);

+112 -42

fs/btrfs/sysfs.c

··· 33 33 #include "volumes.h" 34 34 35 35 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); 36 + static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); 36 37 37 38 static u64 get_features(struct btrfs_fs_info *fs_info, 38 39 enum btrfs_feature_set set) ··· 429 428 430 429 BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); 431 430 432 - static struct attribute *btrfs_attrs[] = { 431 + static const struct attribute *btrfs_attrs[] = { 433 432 BTRFS_ATTR_PTR(label), 434 433 BTRFS_ATTR_PTR(nodesize), 435 434 BTRFS_ATTR_PTR(sectorsize), ··· 439 438 440 439 static void btrfs_release_super_kobj(struct kobject *kobj) 441 440 { 442 - struct btrfs_fs_info *fs_info = to_fs_info(kobj); 443 - complete(&fs_info->kobj_unregister); 441 + struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); 442 + 443 + memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); 444 + complete(&fs_devs->kobj_unregister); 444 445 } 445 446 446 447 static struct kobj_type btrfs_ktype = { 447 448 .sysfs_ops = &kobj_sysfs_ops, 448 449 .release = btrfs_release_super_kobj, 449 - .default_attrs = btrfs_attrs, 450 450 }; 451 + 452 + static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) 453 + { 454 + if (kobj->ktype != &btrfs_ktype) 455 + return NULL; 456 + return container_of(kobj, struct btrfs_fs_devices, super_kobj); 457 + } 451 458 452 459 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) 453 460 { 454 461 if (kobj->ktype != &btrfs_ktype) 455 462 return NULL; 456 - return container_of(kobj, struct btrfs_fs_info, super_kobj); 463 + return to_fs_devs(kobj)->fs_info; 457 464 } 458 465 459 466 #define NUM_FEATURE_BITS 64 ··· 502 493 attrs[0] = &fa->kobj_attr.attr; 503 494 if (add) { 504 495 int ret; 505 - ret = sysfs_merge_group(&fs_info->super_kobj, 496 + ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, 506 497 &agroup); 507 498 if (ret) 508 499 return ret; 509 500 } else 510 - sysfs_unmerge_group(&fs_info->super_kobj, 501 + sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, 511 502 &agroup); 512 503 } 513 504 ··· 515 506 return 0; 516 507 } 517 508 518 - static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) 509 + static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) 519 510 { 520 - kobject_del(&fs_info->super_kobj); 521 - kobject_put(&fs_info->super_kobj); 522 - wait_for_completion(&fs_info->kobj_unregister); 511 + if (fs_devs->device_dir_kobj) { 512 + kobject_del(fs_devs->device_dir_kobj); 513 + kobject_put(fs_devs->device_dir_kobj); 514 + fs_devs->device_dir_kobj = NULL; 515 + } 516 + 517 + if (fs_devs->super_kobj.state_initialized) { 518 + kobject_del(&fs_devs->super_kobj); 519 + kobject_put(&fs_devs->super_kobj); 520 + wait_for_completion(&fs_devs->kobj_unregister); 521 + } 522 + } 523 + 524 + /* when fs_devs is NULL it will remove all fsid kobject */ 525 + void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) 526 + { 527 + struct list_head *fs_uuids = btrfs_get_fs_uuids(); 528 + 529 + if (fs_devs) { 530 + __btrfs_sysfs_remove_fsid(fs_devs); 531 + return; 532 + } 533 + 534 + list_for_each_entry(fs_devs, fs_uuids, list) { 535 + __btrfs_sysfs_remove_fsid(fs_devs); 536 + } 523 537 } 524 538 525 539 void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) 526 540 { 541 + btrfs_reset_fs_info_ptr(fs_info); 542 + 527 543 if (fs_info->space_info_kobj) { 528 544 sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); 529 545 kobject_del(fs_info->space_info_kobj); 530 546 kobject_put(fs_info->space_info_kobj); 531 547 } 532 - kobject_del(fs_info->device_dir_kobj); 533 - kobject_put(fs_info->device_dir_kobj); 534 548 addrm_unknown_feature_attrs(fs_info, false); 535 - sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); 536 - __btrfs_sysfs_remove_one(fs_info); 549 + sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); 550 + sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); 551 + btrfs_kobj_rm_device(fs_info->fs_devices, NULL); 537 552 } 538 553 539 554 const char * const btrfs_feature_set_names[3] = { ··· 635 602 } 636 603 } 637 604 638 - int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, 605 + /* when one_device is NULL, it removes all device links */ 606 + 607 + int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, 639 608 struct btrfs_device *one_device) 640 609 { 641 610 struct hd_struct *disk; 642 611 struct kobject *disk_kobj; 643 612 644 - if (!fs_info->device_dir_kobj) 613 + if (!fs_devices->device_dir_kobj) 645 614 return -EINVAL; 646 615 647 616 if (one_device && one_device->bdev) { 648 617 disk = one_device->bdev->bd_part; 649 618 disk_kobj = &part_to_dev(disk)->kobj; 650 619 651 - sysfs_remove_link(fs_info->device_dir_kobj, 620 + sysfs_remove_link(fs_devices->device_dir_kobj, 621 + disk_kobj->name); 622 + } 623 + 624 + if (one_device) 625 + return 0; 626 + 627 + list_for_each_entry(one_device, 628 + &fs_devices->devices, dev_list) { 629 + if (!one_device->bdev) 630 + continue; 631 + disk = one_device->bdev->bd_part; 632 + disk_kobj = &part_to_dev(disk)->kobj; 633 + 634 + sysfs_remove_link(fs_devices->device_dir_kobj, 652 635 disk_kobj->name); 653 636 } 654 637 655 638 return 0; 656 639 } 657 640 658 - int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, 659 - struct btrfs_device *one_device) 641 + int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) 642 + { 643 + if (!fs_devs->device_dir_kobj) 644 + fs_devs->device_dir_kobj = kobject_create_and_add("devices", 645 + &fs_devs->super_kobj); 646 + 647 + if (!fs_devs->device_dir_kobj) 648 + return -ENOMEM; 649 + 650 + return 0; 651 + } 652 + 653 + int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, 654 + struct btrfs_device *one_device) 660 655 { 661 656 int error = 0; 662 - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 663 657 struct btrfs_device *dev; 664 - 665 - if (!fs_info->device_dir_kobj) 666 - fs_info->device_dir_kobj = kobject_create_and_add("devices", 667 - &fs_info->super_kobj); 668 - 669 - if (!fs_info->device_dir_kobj) 670 - return -ENOMEM; 671 658 672 659 list_for_each_entry(dev, &fs_devices->devices, dev_list) { 673 660 struct hd_struct *disk; ··· 702 649 disk = dev->bdev->bd_part; 703 650 disk_kobj = &part_to_dev(disk)->kobj; 704 651 705 - error = sysfs_create_link(fs_info->device_dir_kobj, 652 + error = sysfs_create_link(fs_devices->device_dir_kobj, 706 653 disk_kobj, disk_kobj->name); 707 654 if (error) 708 655 break; ··· 720 667 /* Debugging tunables and exported data */ 721 668 u64 btrfs_debugfs_test; 722 669 723 - int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 670 + /* 671 + * Can be called by the device discovery thread. 672 + * And parent can be specified for seed device 673 + */ 674 + int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, 675 + struct kobject *parent) 724 676 { 725 677 int error; 726 678 727 - init_completion(&fs_info->kobj_unregister); 728 - fs_info->super_kobj.kset = btrfs_kset; 729 - error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, 730 - "%pU", fs_info->fsid); 679 + init_completion(&fs_devs->kobj_unregister); 680 + fs_devs->super_kobj.kset = btrfs_kset; 681 + error = kobject_init_and_add(&fs_devs->super_kobj, 682 + &btrfs_ktype, parent, "%pU", fs_devs->fsid); 683 + return error; 684 + } 685 + 686 + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 687 + { 688 + int error; 689 + struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; 690 + struct kobject *super_kobj = &fs_devs->super_kobj; 691 + 692 + btrfs_set_fs_info_ptr(fs_info); 693 + 694 + error = btrfs_kobj_add_device(fs_devs, NULL); 731 695 if (error) 732 696 return error; 733 697 734 - error = sysfs_create_group(&fs_info->super_kobj, 735 - &btrfs_feature_attr_group); 698 + error = sysfs_create_files(super_kobj, btrfs_attrs); 736 699 if (error) { 737 - __btrfs_sysfs_remove_one(fs_info); 700 + btrfs_kobj_rm_device(fs_devs, NULL); 738 701 return error; 739 702 } 703 + 704 + error = sysfs_create_group(super_kobj, 705 + &btrfs_feature_attr_group); 706 + if (error) 707 + goto failure; 740 708 741 709 error = addrm_unknown_feature_attrs(fs_info, true); 742 710 if (error) 743 711 goto failure; 744 712 745 - error = btrfs_kobj_add_device(fs_info, NULL); 746 - if (error) 747 - goto failure; 748 - 749 713 fs_info->space_info_kobj = kobject_create_and_add("allocation", 750 - &fs_info->super_kobj); 714 + super_kobj); 751 715 if (!fs_info->space_info_kobj) { 752 716 error = -ENOMEM; 753 717 goto failure;

+6 -2

fs/btrfs/sysfs.h

··· 82 82 extern const char * const btrfs_feature_set_names[3]; 83 83 extern struct kobj_type space_info_ktype; 84 84 extern struct kobj_type btrfs_raid_ktype; 85 - int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, 85 + int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, 86 86 struct btrfs_device *one_device); 87 - int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, 87 + int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, 88 88 struct btrfs_device *one_device); 89 + int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, 90 + struct kobject *parent); 91 + int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); 92 + void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); 89 93 #endif /* _BTRFS_SYSFS_H_ */

+83 -26

fs/btrfs/tests/qgroup-tests.c

··· 21 21 #include "../transaction.h" 22 22 #include "../disk-io.h" 23 23 #include "../qgroup.h" 24 + #include "../backref.h" 24 25 25 26 static void init_dummy_trans(struct btrfs_trans_handle *trans) 26 27 { ··· 228 227 { 229 228 struct btrfs_trans_handle trans; 230 229 struct btrfs_fs_info *fs_info = root->fs_info; 230 + struct ulist *old_roots = NULL; 231 + struct ulist *new_roots = NULL; 231 232 int ret; 232 233 233 234 init_dummy_trans(&trans); ··· 241 238 return ret; 242 239 } 243 240 244 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, 245 - BTRFS_QGROUP_OPER_ADD_EXCL, 0); 241 + /* 242 + * Since the test trans doesn't havee the complicated delayed refs, 243 + * we can only call btrfs_qgroup_account_extent() directly to test 244 + * quota. 245 + */ 246 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 246 247 if (ret) { 247 - test_msg("Couldn't add space to a qgroup %d\n", ret); 248 + ulist_free(old_roots); 249 + test_msg("Couldn't find old roots: %d\n", ret); 248 250 return ret; 249 251 } 250 252 ··· 257 249 if (ret) 258 250 return ret; 259 251 260 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 252 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 261 253 if (ret) { 262 - test_msg("Delayed qgroup accounting failed %d\n", ret); 254 + ulist_free(old_roots); 255 + ulist_free(new_roots); 256 + test_msg("Couldn't find old roots: %d\n", ret); 257 + return ret; 258 + } 259 + 260 + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 261 + old_roots, new_roots); 262 + if (ret) { 263 + test_msg("Couldn't account space for a qgroup %d\n", ret); 263 264 return ret; 264 265 } 265 266 ··· 276 259 test_msg("Qgroup counts didn't match expected values\n"); 277 260 return -EINVAL; 278 261 } 262 + old_roots = NULL; 263 + new_roots = NULL; 264 + 265 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 266 + if (ret) { 267 + ulist_free(old_roots); 268 + test_msg("Couldn't find old roots: %d\n", ret); 269 + return ret; 270 + } 279 271 280 272 ret = remove_extent_item(root, 4096, 4096); 281 273 if (ret) 282 274 return -EINVAL; 283 275 284 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, 285 - BTRFS_QGROUP_OPER_SUB_EXCL, 0); 276 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 286 277 if (ret) { 287 - test_msg("Couldn't remove space from the qgroup %d\n", ret); 288 - return -EINVAL; 278 + ulist_free(old_roots); 279 + ulist_free(new_roots); 280 + test_msg("Couldn't find old roots: %d\n", ret); 281 + return ret; 289 282 } 290 283 291 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 284 + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 285 + old_roots, new_roots); 292 286 if (ret) { 293 - test_msg("Qgroup accounting failed %d\n", ret); 287 + test_msg("Couldn't account space for a qgroup %d\n", ret); 294 288 return -EINVAL; 295 289 } 296 290 ··· 322 294 { 323 295 struct btrfs_trans_handle trans; 324 296 struct btrfs_fs_info *fs_info = root->fs_info; 297 + struct ulist *old_roots = NULL; 298 + struct ulist *new_roots = NULL; 325 299 int ret; 326 300 327 301 init_dummy_trans(&trans); ··· 337 307 return ret; 338 308 } 339 309 310 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 311 + if (ret) { 312 + ulist_free(old_roots); 313 + test_msg("Couldn't find old roots: %d\n", ret); 314 + return ret; 315 + } 316 + 340 317 ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); 341 318 if (ret) 342 319 return ret; 343 320 344 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, 345 - BTRFS_QGROUP_OPER_ADD_EXCL, 0); 321 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 346 322 if (ret) { 347 - test_msg("Couldn't add space to a qgroup %d\n", ret); 323 + ulist_free(old_roots); 324 + ulist_free(new_roots); 325 + test_msg("Couldn't find old roots: %d\n", ret); 348 326 return ret; 349 327 } 350 328 351 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 329 + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 330 + old_roots, new_roots); 352 331 if (ret) { 353 - test_msg("Delayed qgroup accounting failed %d\n", ret); 332 + test_msg("Couldn't account space for a qgroup %d\n", ret); 354 333 return ret; 355 334 } 356 335 ··· 368 329 return -EINVAL; 369 330 } 370 331 332 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 333 + if (ret) { 334 + ulist_free(old_roots); 335 + test_msg("Couldn't find old roots: %d\n", ret); 336 + return ret; 337 + } 338 + 371 339 ret = add_tree_ref(root, 4096, 4096, 0, 256); 372 340 if (ret) 373 341 return ret; 374 342 375 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, 376 - BTRFS_QGROUP_OPER_ADD_SHARED, 0); 343 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 377 344 if (ret) { 378 - test_msg("Qgroup record ref failed %d\n", ret); 345 + ulist_free(old_roots); 346 + ulist_free(new_roots); 347 + test_msg("Couldn't find old roots: %d\n", ret); 379 348 return ret; 380 349 } 381 350 382 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 351 + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 352 + old_roots, new_roots); 383 353 if (ret) { 384 - test_msg("Qgroup accounting failed %d\n", ret); 354 + test_msg("Couldn't account space for a qgroup %d\n", ret); 385 355 return ret; 386 356 } 387 357 ··· 404 356 return -EINVAL; 405 357 } 406 358 359 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 360 + if (ret) { 361 + ulist_free(old_roots); 362 + test_msg("Couldn't find old roots: %d\n", ret); 363 + return ret; 364 + } 365 + 407 366 ret = remove_extent_ref(root, 4096, 4096, 0, 256); 408 367 if (ret) 409 368 return ret; 410 369 411 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, 412 - BTRFS_QGROUP_OPER_SUB_SHARED, 0); 370 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 413 371 if (ret) { 414 - test_msg("Qgroup record ref failed %d\n", ret); 372 + ulist_free(old_roots); 373 + ulist_free(new_roots); 374 + test_msg("Couldn't find old roots: %d\n", ret); 415 375 return ret; 416 376 } 417 377 418 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 378 + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 379 + old_roots, new_roots); 419 380 if (ret) { 420 - test_msg("Qgroup accounting failed %d\n", ret); 381 + test_msg("Couldn't account space for a qgroup %d\n", ret); 421 382 return ret; 422 383 } 423 384

+59 -20

fs/btrfs/transaction.c

··· 225 225 cur_trans->dirty_bg_run = 0; 226 226 227 227 cur_trans->delayed_refs.href_root = RB_ROOT; 228 + cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; 228 229 atomic_set(&cur_trans->delayed_refs.num_entries, 0); 229 230 cur_trans->delayed_refs.num_heads_ready = 0; 230 231 cur_trans->delayed_refs.pending_csums = 0; 231 232 cur_trans->delayed_refs.num_heads = 0; 232 233 cur_trans->delayed_refs.flushing = 0; 233 234 cur_trans->delayed_refs.run_delayed_start = 0; 235 + cur_trans->delayed_refs.qgroup_to_skip = 0; 234 236 235 237 /* 236 238 * although the tree mod log is per file system and not per transaction, ··· 511 509 h->transaction = cur_trans; 512 510 h->blocks_used = 0; 513 511 h->bytes_reserved = 0; 512 + h->chunk_bytes_reserved = 0; 514 513 h->root = root; 515 514 h->delayed_ref_updates = 0; 516 515 h->use_count = 1; ··· 794 791 795 792 if (!list_empty(&trans->new_bgs)) 796 793 btrfs_create_pending_block_groups(trans, root); 794 + 795 + btrfs_trans_release_chunk_metadata(trans); 797 796 798 797 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 799 798 should_end_transaction(trans, root) && ··· 1295 1290 if (pending->error) 1296 1291 goto no_free_objectid; 1297 1292 1293 + /* 1294 + * Make qgroup to skip current new snapshot's qgroupid, as it is 1295 + * accounted by later btrfs_qgroup_inherit(). 1296 + */ 1297 + btrfs_set_skip_qgroup(trans, objectid); 1298 + 1298 1299 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1299 1300 1300 1301 if (to_reserve > 0) { ··· 1309 1298 to_reserve, 1310 1299 BTRFS_RESERVE_NO_FLUSH); 1311 1300 if (pending->error) 1312 - goto no_free_objectid; 1301 + goto clear_skip_qgroup; 1313 1302 } 1314 1303 1315 1304 key.objectid = objectid; ··· 1407 1396 btrfs_abort_transaction(trans, root, ret); 1408 1397 goto fail; 1409 1398 } 1410 - 1411 - /* 1412 - * We need to flush delayed refs in order to make sure all of our quota 1413 - * operations have been done before we call btrfs_qgroup_inherit. 1414 - */ 1415 - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1416 - if (ret) { 1417 - btrfs_abort_transaction(trans, root, ret); 1418 - goto fail; 1419 - } 1420 - 1421 - ret = btrfs_qgroup_inherit(trans, fs_info, 1422 - root->root_key.objectid, 1423 - objectid, pending->inherit); 1424 - if (ret) { 1425 - btrfs_abort_transaction(trans, root, ret); 1426 - goto fail; 1427 - } 1428 - 1429 1399 /* see comments in should_cow_block() */ 1430 1400 set_bit(BTRFS_ROOT_FORCE_COW, &root->state); 1431 1401 smp_wmb(); ··· 1489 1497 goto fail; 1490 1498 } 1491 1499 } 1500 + 1501 + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1502 + if (ret) { 1503 + btrfs_abort_transaction(trans, root, ret); 1504 + goto fail; 1505 + } 1506 + 1507 + /* 1508 + * account qgroup counters before qgroup_inherit() 1509 + */ 1510 + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); 1511 + if (ret) 1512 + goto fail; 1513 + ret = btrfs_qgroup_account_extents(trans, fs_info); 1514 + if (ret) 1515 + goto fail; 1516 + ret = btrfs_qgroup_inherit(trans, fs_info, 1517 + root->root_key.objectid, 1518 + objectid, pending->inherit); 1519 + if (ret) { 1520 + btrfs_abort_transaction(trans, root, ret); 1521 + goto fail; 1522 + } 1523 + 1492 1524 fail: 1493 1525 pending->error = ret; 1494 1526 dir_item_existed: 1495 1527 trans->block_rsv = rsv; 1496 1528 trans->bytes_reserved = 0; 1529 + clear_skip_qgroup: 1530 + btrfs_clear_skip_qgroup(trans); 1497 1531 no_free_objectid: 1498 1532 kfree(new_root_item); 1499 1533 root_item_alloc_fail: ··· 1981 1963 goto scrub_continue; 1982 1964 } 1983 1965 1966 + /* Reocrd old roots for later qgroup accounting */ 1967 + ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info); 1968 + if (ret) { 1969 + mutex_unlock(&root->fs_info->reloc_mutex); 1970 + goto scrub_continue; 1971 + } 1972 + 1984 1973 /* 1985 1974 * make sure none of the code above managed to slip in a 1986 1975 * delayed item ··· 2028 2003 * safe to free the root of tree log roots 2029 2004 */ 2030 2005 btrfs_free_log_root_tree(trans, root->fs_info); 2006 + 2007 + /* 2008 + * Since fs roots are all committed, we can get a quite accurate 2009 + * new_roots. So let's do quota accounting. 2010 + */ 2011 + ret = btrfs_qgroup_account_extents(trans, root->fs_info); 2012 + if (ret < 0) { 2013 + mutex_unlock(&root->fs_info->tree_log_mutex); 2014 + mutex_unlock(&root->fs_info->reloc_mutex); 2015 + goto scrub_continue; 2016 + } 2031 2017 2032 2018 ret = commit_cowonly_roots(trans, root); 2033 2019 if (ret) { ··· 2089 2053 2090 2054 clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); 2091 2055 clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); 2056 + 2057 + btrfs_trans_release_chunk_metadata(trans); 2092 2058 2093 2059 spin_lock(&root->fs_info->trans_lock); 2094 2060 cur_trans->state = TRANS_STATE_UNBLOCKED; ··· 2161 2123 btrfs_scrub_continue(root); 2162 2124 cleanup_transaction: 2163 2125 btrfs_trans_release_metadata(trans, root); 2126 + btrfs_trans_release_chunk_metadata(trans); 2164 2127 trans->block_rsv = NULL; 2165 2128 if (trans->qgroup_reserved) { 2166 2129 btrfs_qgroup_free(root, trans->qgroup_reserved);

+24

fs/btrfs/transaction.h

··· 102 102 struct btrfs_trans_handle { 103 103 u64 transid; 104 104 u64 bytes_reserved; 105 + u64 chunk_bytes_reserved; 105 106 u64 qgroup_reserved; 106 107 unsigned long use_count; 107 108 unsigned long blocks_reserved; ··· 152 151 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 153 152 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; 154 153 spin_unlock(&BTRFS_I(inode)->lock); 154 + } 155 + 156 + /* 157 + * Make qgroup codes to skip given qgroupid, means the old/new_roots for 158 + * qgroup won't contain the qgroupid in it. 159 + */ 160 + static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, 161 + u64 qgroupid) 162 + { 163 + struct btrfs_delayed_ref_root *delayed_refs; 164 + 165 + delayed_refs = &trans->transaction->delayed_refs; 166 + WARN_ON(delayed_refs->qgroup_to_skip); 167 + delayed_refs->qgroup_to_skip = qgroupid; 168 + } 169 + 170 + static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) 171 + { 172 + struct btrfs_delayed_ref_root *delayed_refs; 173 + 174 + delayed_refs = &trans->transaction->delayed_refs; 175 + WARN_ON(!delayed_refs->qgroup_to_skip); 176 + delayed_refs->qgroup_to_skip = 0; 155 177 } 156 178 157 179 int btrfs_end_transaction(struct btrfs_trans_handle *trans,

-3

fs/btrfs/tree-defrag.c

··· 52 52 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 53 53 goto out; 54 54 55 - if (btrfs_test_opt(root, SSD)) 56 - goto out; 57 - 58 55 path = btrfs_alloc_path(); 59 56 if (!path) 60 57 return -ENOMEM;

-6

fs/btrfs/tree-log.c

··· 3881 3881 &ordered->flags)) 3882 3882 continue; 3883 3883 3884 - if (ordered->csum_bytes_left) { 3885 - btrfs_start_ordered_extent(inode, ordered, 0); 3886 - wait_event(ordered->wait, 3887 - ordered->csum_bytes_left == 0); 3888 - } 3889 - 3890 3884 list_for_each_entry(sum, &ordered->list, list) { 3891 3885 ret = btrfs_csum_file_blocks(trans, log, sum); 3892 3886 if (ret)

+36 -11

fs/btrfs/ulist.c

··· 132 132 return NULL; 133 133 } 134 134 135 + static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) 136 + { 137 + rb_erase(&node->rb_node, &ulist->root); 138 + list_del(&node->list); 139 + kfree(node); 140 + BUG_ON(ulist->nnodes == 0); 141 + ulist->nnodes--; 142 + } 143 + 135 144 static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) 136 145 { 137 146 struct rb_node **p = &ulist->root.rb_node; ··· 206 197 207 198 node->val = val; 208 199 node->aux = aux; 209 - #ifdef CONFIG_BTRFS_DEBUG 210 - node->seqnum = ulist->nnodes; 211 - #endif 212 200 213 201 ret = ulist_rbtree_insert(ulist, node); 214 202 ASSERT(!ret); ··· 213 207 ulist->nnodes++; 214 208 215 209 return 1; 210 + } 211 + 212 + /* 213 + * ulist_del - delete one node from ulist 214 + * @ulist: ulist to remove node from 215 + * @val: value to delete 216 + * @aux: aux to delete 217 + * 218 + * The deletion will only be done when *BOTH* val and aux matches. 219 + * Return 0 for successful delete. 220 + * Return > 0 for not found. 221 + */ 222 + int ulist_del(struct ulist *ulist, u64 val, u64 aux) 223 + { 224 + struct ulist_node *node; 225 + 226 + node = ulist_rbtree_search(ulist, val); 227 + /* Not found */ 228 + if (!node) 229 + return 1; 230 + 231 + if (node->aux != aux) 232 + return 1; 233 + 234 + /* Found and delete */ 235 + ulist_rbtree_erase(ulist, node); 236 + return 0; 216 237 } 217 238 218 239 /** ··· 270 237 uiter->cur_list = uiter->cur_list->next; 271 238 } else { 272 239 uiter->cur_list = ulist->nodes.next; 273 - #ifdef CONFIG_BTRFS_DEBUG 274 - uiter->i = 0; 275 - #endif 276 240 } 277 241 node = list_entry(uiter->cur_list, struct ulist_node, list); 278 - #ifdef CONFIG_BTRFS_DEBUG 279 - ASSERT(node->seqnum == uiter->i); 280 - ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); 281 - uiter->i++; 282 - #endif 283 242 return node; 284 243 }

+1

fs/btrfs/ulist.h

··· 57 57 int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); 58 58 int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, 59 59 u64 *old_aux, gfp_t gfp_mask); 60 + int ulist_del(struct ulist *ulist, u64 val, u64 aux); 60 61 61 62 /* just like ulist_add_merge() but take a pointer for the aux data */ 62 63 static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,

+163 -23

fs/btrfs/volumes.c

··· 52 52 53 53 DEFINE_MUTEX(uuid_mutex); 54 54 static LIST_HEAD(fs_uuids); 55 + struct list_head *btrfs_get_fs_uuids(void) 56 + { 57 + return &fs_uuids; 58 + } 55 59 56 60 static struct btrfs_fs_devices *__alloc_fs_devices(void) 57 61 { ··· 445 441 run_scheduled_bios(device); 446 442 } 447 443 444 + 445 + void btrfs_free_stale_device(struct btrfs_device *cur_dev) 446 + { 447 + struct btrfs_fs_devices *fs_devs; 448 + struct btrfs_device *dev; 449 + 450 + if (!cur_dev->name) 451 + return; 452 + 453 + list_for_each_entry(fs_devs, &fs_uuids, list) { 454 + int del = 1; 455 + 456 + if (fs_devs->opened) 457 + continue; 458 + if (fs_devs->seeding) 459 + continue; 460 + 461 + list_for_each_entry(dev, &fs_devs->devices, dev_list) { 462 + 463 + if (dev == cur_dev) 464 + continue; 465 + if (!dev->name) 466 + continue; 467 + 468 + /* 469 + * Todo: This won't be enough. What if the same device 470 + * comes back (with new uuid and) with its mapper path? 471 + * But for now, this does help as mostly an admin will 472 + * either use mapper or non mapper path throughout. 473 + */ 474 + rcu_read_lock(); 475 + del = strcmp(rcu_str_deref(dev->name), 476 + rcu_str_deref(cur_dev->name)); 477 + rcu_read_unlock(); 478 + if (!del) 479 + break; 480 + } 481 + 482 + if (!del) { 483 + /* delete the stale device */ 484 + if (fs_devs->num_devices == 1) { 485 + btrfs_sysfs_remove_fsid(fs_devs); 486 + list_del(&fs_devs->list); 487 + free_fs_devices(fs_devs); 488 + } else { 489 + fs_devs->num_devices--; 490 + list_del(&dev->dev_list); 491 + rcu_string_free(dev->name); 492 + kfree(dev); 493 + } 494 + break; 495 + } 496 + } 497 + } 498 + 448 499 /* 449 500 * Add new device to list of registered devices 450 501 * ··· 614 555 */ 615 556 if (!fs_devices->opened) 616 557 device->generation = found_transid; 558 + 559 + /* 560 + * if there is new btrfs on an already registered device, 561 + * then remove the stale device entry. 562 + */ 563 + btrfs_free_stale_device(device); 617 564 618 565 *fs_devices_ret = fs_devices; 619 566 ··· 758 693 759 694 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 760 695 { 761 - struct btrfs_device *device; 696 + struct btrfs_device *device, *tmp; 762 697 763 698 if (--fs_devices->opened > 0) 764 699 return 0; 765 700 766 701 mutex_lock(&fs_devices->device_list_mutex); 767 - list_for_each_entry(device, &fs_devices->devices, dev_list) { 702 + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { 768 703 struct btrfs_device *new_device; 769 704 struct rcu_string *name; 770 705 ··· 1132 1067 1133 1068 map = (struct map_lookup *)em->bdev; 1134 1069 for (i = 0; i < map->num_stripes; i++) { 1070 + u64 end; 1071 + 1135 1072 if (map->stripes[i].dev != device) 1136 1073 continue; 1137 1074 if (map->stripes[i].physical >= physical_start + len || 1138 1075 map->stripes[i].physical + em->orig_block_len <= 1139 1076 physical_start) 1140 1077 continue; 1141 - *start = map->stripes[i].physical + 1142 - em->orig_block_len; 1143 - ret = 1; 1078 + /* 1079 + * Make sure that while processing the pinned list we do 1080 + * not override our *start with a lower value, because 1081 + * we can have pinned chunks that fall within this 1082 + * device hole and that have lower physical addresses 1083 + * than the pending chunks we processed before. If we 1084 + * do not take this special care we can end up getting 1085 + * 2 pending chunks that start at the same physical 1086 + * device offsets because the end offset of a pinned 1087 + * chunk can be equal to the start offset of some 1088 + * pending chunk. 1089 + */ 1090 + end = map->stripes[i].physical + em->orig_block_len; 1091 + if (end > *start) { 1092 + *start = end; 1093 + ret = 1; 1094 + } 1144 1095 } 1145 1096 } 1146 1097 if (search_list == &trans->transaction->pending_chunks) { ··· 1787 1706 if (device->bdev) { 1788 1707 device->fs_devices->open_devices--; 1789 1708 /* remove sysfs entry */ 1790 - btrfs_kobj_rm_device(root->fs_info, device); 1709 + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); 1791 1710 } 1792 1711 1793 1712 call_rcu(&device->rcu, free_device); ··· 1956 1875 mutex_lock(&uuid_mutex); 1957 1876 WARN_ON(!tgtdev); 1958 1877 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1878 + 1879 + btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); 1880 + 1959 1881 if (tgtdev->bdev) { 1960 1882 btrfs_scratch_superblock(tgtdev); 1961 1883 fs_info->fs_devices->open_devices--; ··· 2295 2211 tmp + 1); 2296 2212 2297 2213 /* add sysfs device entry */ 2298 - btrfs_kobj_add_device(root->fs_info, device); 2214 + btrfs_kobj_add_device(root->fs_info->fs_devices, device); 2299 2215 2300 2216 /* 2301 2217 * we've got more storage, clear any full flags on the space ··· 2336 2252 */ 2337 2253 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2338 2254 root->fs_info->fsid); 2339 - if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) 2340 - goto error_trans; 2255 + if (kobject_rename(&root->fs_info->fs_devices->super_kobj, 2256 + fsid_buf)) 2257 + pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); 2341 2258 } 2342 2259 2343 2260 root->fs_info->num_tolerated_disk_barrier_failures = ··· 2374 2289 error_trans: 2375 2290 btrfs_end_transaction(trans, root); 2376 2291 rcu_string_free(device->name); 2377 - btrfs_kobj_rm_device(root->fs_info, device); 2292 + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); 2378 2293 kfree(device); 2379 2294 error: 2380 2295 blkdev_put(bdev, FMODE_EXCL); ··· 2694 2609 return -EINVAL; 2695 2610 } 2696 2611 map = (struct map_lookup *)em->bdev; 2612 + lock_chunks(root->fs_info->chunk_root); 2613 + check_system_chunk(trans, extent_root, map->type); 2614 + unlock_chunks(root->fs_info->chunk_root); 2697 2615 2698 2616 for (i = 0; i < map->num_stripes; i++) { 2699 2617 struct btrfs_device *device = map->stripes[i].dev; ··· 3996 3908 uuid_root = btrfs_create_tree(trans, fs_info, 3997 3909 BTRFS_UUID_TREE_OBJECTID); 3998 3910 if (IS_ERR(uuid_root)) { 3999 - btrfs_abort_transaction(trans, tree_root, 4000 - PTR_ERR(uuid_root)); 4001 - return PTR_ERR(uuid_root); 3911 + ret = PTR_ERR(uuid_root); 3912 + btrfs_abort_transaction(trans, tree_root, ret); 3913 + return ret; 4002 3914 } 4003 3915 4004 3916 fs_info->uuid_root = uuid_root; ··· 4053 3965 int slot; 4054 3966 int failed = 0; 4055 3967 bool retried = false; 3968 + bool checked_pending_chunks = false; 4056 3969 struct extent_buffer *l; 4057 3970 struct btrfs_key key; 4058 3971 struct btrfs_super_block *super_copy = root->fs_info->super_copy; ··· 4134 4045 goto again; 4135 4046 } else if (failed && retried) { 4136 4047 ret = -ENOSPC; 4137 - lock_chunks(root); 4138 - 4139 - btrfs_device_set_total_bytes(device, old_size); 4140 - if (device->writeable) 4141 - device->fs_devices->total_rw_bytes += diff; 4142 - spin_lock(&root->fs_info->free_chunk_lock); 4143 - root->fs_info->free_chunk_space += diff; 4144 - spin_unlock(&root->fs_info->free_chunk_lock); 4145 - unlock_chunks(root); 4146 4048 goto done; 4147 4049 } 4148 4050 ··· 4145 4065 } 4146 4066 4147 4067 lock_chunks(root); 4068 + 4069 + /* 4070 + * We checked in the above loop all device extents that were already in 4071 + * the device tree. However before we have updated the device's 4072 + * total_bytes to the new size, we might have had chunk allocations that 4073 + * have not complete yet (new block groups attached to transaction 4074 + * handles), and therefore their device extents were not yet in the 4075 + * device tree and we missed them in the loop above. So if we have any 4076 + * pending chunk using a device extent that overlaps the device range 4077 + * that we can not use anymore, commit the current transaction and 4078 + * repeat the search on the device tree - this way we guarantee we will 4079 + * not have chunks using device extents that end beyond 'new_size'. 4080 + */ 4081 + if (!checked_pending_chunks) { 4082 + u64 start = new_size; 4083 + u64 len = old_size - new_size; 4084 + 4085 + if (contains_pending_extent(trans, device, &start, len)) { 4086 + unlock_chunks(root); 4087 + checked_pending_chunks = true; 4088 + failed = 0; 4089 + retried = false; 4090 + ret = btrfs_commit_transaction(trans, root); 4091 + if (ret) 4092 + goto done; 4093 + goto again; 4094 + } 4095 + } 4096 + 4148 4097 btrfs_device_set_disk_total_bytes(device, new_size); 4149 4098 if (list_empty(&device->resized_list)) 4150 4099 list_add_tail(&device->resized_list, ··· 4188 4079 btrfs_end_transaction(trans, root); 4189 4080 done: 4190 4081 btrfs_free_path(path); 4082 + if (ret) { 4083 + lock_chunks(root); 4084 + btrfs_device_set_total_bytes(device, old_size); 4085 + if (device->writeable) 4086 + device->fs_devices->total_rw_bytes += diff; 4087 + spin_lock(&root->fs_info->free_chunk_lock); 4088 + root->fs_info->free_chunk_space += diff; 4089 + spin_unlock(&root->fs_info->free_chunk_lock); 4090 + unlock_chunks(root); 4091 + } 4191 4092 return ret; 4192 4093 } 4193 4094 ··· 6191 6072 free_extent_map(em); 6192 6073 return -EIO; 6193 6074 } 6075 + btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing", 6076 + devid, uuid); 6194 6077 } 6195 6078 map->stripes[i].dev->in_fs_metadata = 1; 6196 6079 } ··· 6312 6191 if (!btrfs_test_opt(root, DEGRADED)) 6313 6192 return -EIO; 6314 6193 6315 - btrfs_warn(root->fs_info, "devid %llu missing", devid); 6316 6194 device = add_missing_dev(root, fs_devices, devid, dev_uuid); 6317 6195 if (!device) 6318 6196 return -ENOMEM; 6197 + btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", 6198 + devid, dev_uuid); 6319 6199 } else { 6320 6200 if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) 6321 6201 return -EIO; ··· 6843 6721 } 6844 6722 } 6845 6723 unlock_chunks(root); 6724 + } 6725 + 6726 + void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 6727 + { 6728 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6729 + while (fs_devices) { 6730 + fs_devices->fs_info = fs_info; 6731 + fs_devices = fs_devices->seed; 6732 + } 6733 + } 6734 + 6735 + void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 6736 + { 6737 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6738 + while (fs_devices) { 6739 + fs_devices->fs_info = NULL; 6740 + fs_devices = fs_devices->seed; 6741 + } 6846 6742 }

+9

fs/btrfs/volumes.h

··· 253 253 * nonrot flag set 254 254 */ 255 255 int rotating; 256 + 257 + struct btrfs_fs_info *fs_info; 258 + /* sysfs kobjects */ 259 + struct kobject super_kobj; 260 + struct kobject *device_dir_kobj; 261 + struct completion kobj_unregister; 256 262 }; 257 263 258 264 #define BTRFS_BIO_INLINE_CSUM_SIZE 64 ··· 541 535 mutex_unlock(&root->fs_info->chunk_mutex); 542 536 } 543 537 538 + struct list_head *btrfs_get_fs_uuids(void); 539 + void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); 540 + void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); 544 541 545 542 #endif

+1

fs/seq_file.c

··· 538 538 539 539 return res; 540 540 } 541 + EXPORT_SYMBOL(seq_dentry); 541 542 542 543 static void *single_start(struct seq_file *p, loff_t *pos) 543 544 {

-55

include/trace/events/btrfs.h

··· 1117 1117 TP_ARGS(wq) 1118 1118 ); 1119 1119 1120 - #define show_oper_type(type) \ 1121 - __print_symbolic(type, \ 1122 - { BTRFS_QGROUP_OPER_ADD_EXCL, "OPER_ADD_EXCL" }, \ 1123 - { BTRFS_QGROUP_OPER_ADD_SHARED, "OPER_ADD_SHARED" }, \ 1124 - { BTRFS_QGROUP_OPER_SUB_EXCL, "OPER_SUB_EXCL" }, \ 1125 - { BTRFS_QGROUP_OPER_SUB_SHARED, "OPER_SUB_SHARED" }) 1126 - 1127 - DECLARE_EVENT_CLASS(btrfs_qgroup_oper, 1128 - 1129 - TP_PROTO(struct btrfs_qgroup_operation *oper), 1130 - 1131 - TP_ARGS(oper), 1132 - 1133 - TP_STRUCT__entry( 1134 - __field( u64, ref_root ) 1135 - __field( u64, bytenr ) 1136 - __field( u64, num_bytes ) 1137 - __field( u64, seq ) 1138 - __field( int, type ) 1139 - __field( u64, elem_seq ) 1140 - ), 1141 - 1142 - TP_fast_assign( 1143 - __entry->ref_root = oper->ref_root; 1144 - __entry->bytenr = oper->bytenr, 1145 - __entry->num_bytes = oper->num_bytes; 1146 - __entry->seq = oper->seq; 1147 - __entry->type = oper->type; 1148 - __entry->elem_seq = oper->elem.seq; 1149 - ), 1150 - 1151 - TP_printk("ref_root = %llu, bytenr = %llu, num_bytes = %llu, " 1152 - "seq = %llu, elem.seq = %llu, type = %s", 1153 - (unsigned long long)__entry->ref_root, 1154 - (unsigned long long)__entry->bytenr, 1155 - (unsigned long long)__entry->num_bytes, 1156 - (unsigned long long)__entry->seq, 1157 - (unsigned long long)__entry->elem_seq, 1158 - show_oper_type(__entry->type)) 1159 - ); 1160 - 1161 - DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_account, 1162 - 1163 - TP_PROTO(struct btrfs_qgroup_operation *oper), 1164 - 1165 - TP_ARGS(oper) 1166 - ); 1167 - 1168 - DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_record_ref, 1169 - 1170 - TP_PROTO(struct btrfs_qgroup_operation *oper), 1171 - 1172 - TP_ARGS(oper) 1173 - ); 1174 - 1175 1120 #endif /* _TRACE_BTRFS_H */ 1176 1121 1177 1122 /* This part must be outside protection */

+1

lib/kobject.c

··· 545 545 kfree(devpath); 546 546 return error; 547 547 } 548 + EXPORT_SYMBOL_GPL(kobject_move); 548 549 549 550 /** 550 551 * kobject_del - unlink kobject from hierarchy.