Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs update from Chris Mason:
"A big set of fixes and features.

In terms of line count, most of the code comes from Stefan, who added
the ability to replace a single drive in place. This is different
from how btrfs normally replaces drives, and is much much much faster.

Josef is plowing through our synchronous write performance. This pull
request does not include the DIO_OWN_WAITING patch that was discussed
on the list, but it has a number of other improvements to cut down our
latencies and CPU time during fsync/O_DIRECT writes.

Miao Xie has a big series of fixes and is spreading out ordered
operations over more CPUs. This improves performance and reduces
contention.

I've put in fixes for error handling around hash collisions. These
are going back to individual stable kernels as I test against them.

Otherwise we have a lot of fixes and cleanups, thanks everyone!
raid5/6 is being rebased against the device replacement code. I'll
have it posted this Friday along with a nice series of benchmarks."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (115 commits)
Btrfs: fix a bug of per-file nocow
Btrfs: fix hash overflow handling
Btrfs: don't take inode delalloc mutex if we're a free space inode
Btrfs: fix autodefrag and umount lockup
Btrfs: fix permissions of empty files not affected by umask
Btrfs: put raid properties into global table
Btrfs: fix BUG() in scrub when first superblock reading gives EIO
Btrfs: do not call file_update_time in aio_write
Btrfs: only unlock and relock if we have to
Btrfs: use tokens where we can in the tree log
Btrfs: optimize leaf_space_used
Btrfs: don't memset new tokens
Btrfs: only clear dirty on the buffer if it is marked as dirty
Btrfs: move checks in set_page_dirty under DEBUG
Btrfs: log changed inodes based on the extent map tree
Btrfs: add path->really_keep_locks
Btrfs: do not mark ems as prealloc if we are writing to them
Btrfs: keep track of the extents original block length
Btrfs: inline csums if we're fsyncing
Btrfs: don't bother copying if we're only logging the inode
...

+5259 -1748
+1 -1
fs/btrfs/Makefile
··· 8 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 10 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 11 - reada.o backref.o ulist.o qgroup.o send.o 11 + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o 12 12 13 13 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 14 14 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+2
fs/btrfs/acl.c
··· 121 121 ret = posix_acl_equiv_mode(acl, &inode->i_mode); 122 122 if (ret < 0) 123 123 return ret; 124 + if (ret == 0) 125 + acl = NULL; 124 126 } 125 127 ret = 0; 126 128 break;
+12 -4
fs/btrfs/backref.c
··· 461 461 pos2 = n2, n2 = pos2->next) { 462 462 struct __prelim_ref *ref2; 463 463 struct __prelim_ref *xchg; 464 + struct extent_inode_elem *eie; 464 465 465 466 ref2 = list_entry(pos2, struct __prelim_ref, list); 466 467 ··· 473 472 ref1 = ref2; 474 473 ref2 = xchg; 475 474 } 476 - ref1->count += ref2->count; 477 475 } else { 478 476 if (ref1->parent != ref2->parent) 479 477 continue; 480 - ref1->count += ref2->count; 481 478 } 479 + 480 + eie = ref1->inode_list; 481 + while (eie && eie->next) 482 + eie = eie->next; 483 + if (eie) 484 + eie->next = ref2->inode_list; 485 + else 486 + ref1->inode_list = ref2->inode_list; 487 + ref1->count += ref2->count; 488 + 482 489 list_del(&ref2->list); 483 490 kfree(ref2); 484 491 } ··· 899 890 while (!list_empty(&prefs)) { 900 891 ref = list_first_entry(&prefs, struct __prelim_ref, list); 901 892 list_del(&ref->list); 902 - if (ref->count < 0) 903 - WARN_ON(1); 893 + WARN_ON(ref->count < 0); 904 894 if (ref->count && ref->root_id && ref->parent == 0) { 905 895 /* no parent == root of tree */ 906 896 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+4
fs/btrfs/btrfs_inode.h
··· 39 39 #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 40 40 #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 41 41 #define BTRFS_INODE_NEEDS_FULL_SYNC 7 42 + #define BTRFS_INODE_COPY_EVERYTHING 8 42 43 43 44 /* in memory btrfs inode */ 44 45 struct btrfs_inode { ··· 90 89 struct rb_node rb_node; 91 90 92 91 unsigned long runtime_flags; 92 + 93 + /* Keep track of who's O_SYNC/fsyncing currently */ 94 + atomic_t sync_writers; 93 95 94 96 /* full 64 bit generation number, struct vfs_inode doesn't have a big 95 97 * enough field for this.
+21 -10
fs/btrfs/check-integrity.c
··· 137 137 unsigned int never_written:1; /* block was added because it was 138 138 * referenced, not because it was 139 139 * written */ 140 - unsigned int mirror_num:2; /* large enough to hold 140 + unsigned int mirror_num; /* large enough to hold 141 141 * BTRFS_SUPER_MIRROR_MAX */ 142 142 struct btrfsic_dev_state *dev_state; 143 143 u64 dev_bytenr; /* key, physical byte num on disk */ ··· 723 723 } 724 724 725 725 num_copies = 726 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 726 + btrfs_num_copies(state->root->fs_info, 727 727 next_bytenr, state->metablock_size); 728 728 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 729 729 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 903 903 } 904 904 905 905 num_copies = 906 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 906 + btrfs_num_copies(state->root->fs_info, 907 907 next_bytenr, state->metablock_size); 908 908 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 909 909 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 1287 1287 *next_blockp = NULL; 1288 1288 if (0 == *num_copiesp) { 1289 1289 *num_copiesp = 1290 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 1290 + btrfs_num_copies(state->root->fs_info, 1291 1291 next_bytenr, state->metablock_size); 1292 1292 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1293 1293 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 1489 1489 chunk_len = num_bytes; 1490 1490 1491 1491 num_copies = 1492 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 1492 + btrfs_num_copies(state->root->fs_info, 1493 1493 next_bytenr, state->datablock_size); 1494 1494 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1495 1495 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 1582 1582 struct btrfs_device *device; 1583 1583 1584 1584 length = len; 1585 - ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, 1585 + ret = btrfs_map_block(state->root->fs_info, READ, 1586 1586 bytenr, &length, 
&multi, mirror_num); 1587 + 1588 + if (ret) { 1589 + block_ctx_out->start = 0; 1590 + block_ctx_out->dev_bytenr = 0; 1591 + block_ctx_out->len = 0; 1592 + block_ctx_out->dev = NULL; 1593 + block_ctx_out->datav = NULL; 1594 + block_ctx_out->pagev = NULL; 1595 + block_ctx_out->mem_to_free = NULL; 1596 + 1597 + return ret; 1598 + } 1587 1599 1588 1600 device = multi->stripes[0].dev; 1589 1601 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); ··· 1606 1594 block_ctx_out->pagev = NULL; 1607 1595 block_ctx_out->mem_to_free = NULL; 1608 1596 1609 - if (0 == ret) 1610 - kfree(multi); 1597 + kfree(multi); 1611 1598 if (NULL == block_ctx_out->dev) { 1612 1599 ret = -ENXIO; 1613 1600 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); ··· 2474 2463 } 2475 2464 2476 2465 num_copies = 2477 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 2466 + btrfs_num_copies(state->root->fs_info, 2478 2467 next_bytenr, BTRFS_SUPER_INFO_SIZE); 2479 2468 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 2480 2469 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 2971 2960 struct btrfsic_block_data_ctx block_ctx; 2972 2961 int match = 0; 2973 2962 2974 - num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2963 + num_copies = btrfs_num_copies(state->root->fs_info, 2975 2964 bytenr, state->metablock_size); 2976 2965 2977 2966 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+4 -2
fs/btrfs/compression.c
··· 687 687 688 688 ret = btrfs_map_bio(root, READ, comp_bio, 689 689 mirror_num, 0); 690 - BUG_ON(ret); /* -ENOMEM */ 690 + if (ret) 691 + bio_endio(comp_bio, ret); 691 692 692 693 bio_put(comp_bio); 693 694 ··· 713 712 } 714 713 715 714 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 716 - BUG_ON(ret); /* -ENOMEM */ 715 + if (ret) 716 + bio_endio(comp_bio, ret); 717 717 718 718 bio_put(comp_bio); 719 719 return 0;
+189 -52
fs/btrfs/ctree.c
··· 38 38 struct extent_buffer *dst_buf, 39 39 struct extent_buffer *src_buf); 40 40 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 41 - struct btrfs_path *path, int level, int slot, 42 - int tree_mod_log); 41 + struct btrfs_path *path, int level, int slot); 43 42 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 44 43 struct extent_buffer *eb); 45 44 struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, ··· 775 776 776 777 static noinline void 777 778 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 778 - struct extent_buffer *eb, 779 - struct btrfs_disk_key *disk_key, int slot, int atomic) 779 + struct extent_buffer *eb, int slot, int atomic) 780 780 { 781 781 int ret; 782 782 ··· 1138 1140 switch (tm->op) { 1139 1141 case MOD_LOG_KEY_REMOVE_WHILE_FREEING: 1140 1142 BUG_ON(tm->slot < n); 1141 - case MOD_LOG_KEY_REMOVE_WHILE_MOVING: 1142 1143 case MOD_LOG_KEY_REMOVE: 1144 + n++; 1145 + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: 1143 1146 btrfs_set_node_key(eb, &tm->key, tm->slot); 1144 1147 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 1145 1148 btrfs_set_node_ptr_generation(eb, tm->slot, 1146 1149 tm->generation); 1147 - n++; 1148 1150 break; 1149 1151 case MOD_LOG_KEY_REPLACE: 1150 1152 BUG_ON(tm->slot >= n); ··· 1359 1361 u64 search_start; 1360 1362 int ret; 1361 1363 1362 - if (trans->transaction != root->fs_info->running_transaction) { 1363 - printk(KERN_CRIT "trans %llu running %llu\n", 1364 + if (trans->transaction != root->fs_info->running_transaction) 1365 + WARN(1, KERN_CRIT "trans %llu running %llu\n", 1364 1366 (unsigned long long)trans->transid, 1365 1367 (unsigned long long) 1366 1368 root->fs_info->running_transaction->transid); 1367 - WARN_ON(1); 1368 - } 1369 - if (trans->transid != root->fs_info->generation) { 1370 - printk(KERN_CRIT "trans %llu running %llu\n", 1369 + 1370 + if (trans->transid != root->fs_info->generation) 1371 + WARN(1, KERN_CRIT "trans %llu 
running %llu\n", 1371 1372 (unsigned long long)trans->transid, 1372 1373 (unsigned long long)root->fs_info->generation); 1373 - WARN_ON(1); 1374 - } 1375 1374 1376 1375 if (!should_cow_block(trans, root, buf)) { 1377 1376 *cow_ret = buf; ··· 1464 1469 if (cache_only && parent_level != 1) 1465 1470 return 0; 1466 1471 1467 - if (trans->transaction != root->fs_info->running_transaction) 1468 - WARN_ON(1); 1469 - if (trans->transid != root->fs_info->generation) 1470 - WARN_ON(1); 1472 + WARN_ON(trans->transaction != root->fs_info->running_transaction); 1473 + WARN_ON(trans->transid != root->fs_info->generation); 1471 1474 1472 1475 parent_nritems = btrfs_header_nritems(parent); 1473 1476 blocksize = btrfs_level_size(root, parent_level - 1); ··· 1820 1827 if (btrfs_header_nritems(right) == 0) { 1821 1828 clean_tree_block(trans, root, right); 1822 1829 btrfs_tree_unlock(right); 1823 - del_ptr(trans, root, path, level + 1, pslot + 1, 1); 1830 + del_ptr(trans, root, path, level + 1, pslot + 1); 1824 1831 root_sub_used(root, right->len); 1825 1832 btrfs_free_tree_block(trans, root, right, 0, 1); 1826 1833 free_extent_buffer_stale(right); ··· 1829 1836 struct btrfs_disk_key right_key; 1830 1837 btrfs_node_key(right, &right_key, 0); 1831 1838 tree_mod_log_set_node_key(root->fs_info, parent, 1832 - &right_key, pslot + 1, 0); 1839 + pslot + 1, 0); 1833 1840 btrfs_set_node_key(parent, &right_key, pslot + 1); 1834 1841 btrfs_mark_buffer_dirty(parent); 1835 1842 } ··· 1864 1871 if (btrfs_header_nritems(mid) == 0) { 1865 1872 clean_tree_block(trans, root, mid); 1866 1873 btrfs_tree_unlock(mid); 1867 - del_ptr(trans, root, path, level + 1, pslot, 1); 1874 + del_ptr(trans, root, path, level + 1, pslot); 1868 1875 root_sub_used(root, mid->len); 1869 1876 btrfs_free_tree_block(trans, root, mid, 0, 1); 1870 1877 free_extent_buffer_stale(mid); ··· 1873 1880 /* update the parent key to reflect our changes */ 1874 1881 struct btrfs_disk_key mid_key; 1875 1882 btrfs_node_key(mid, &mid_key, 
0); 1876 - tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, 1883 + tree_mod_log_set_node_key(root->fs_info, parent, 1877 1884 pslot, 0); 1878 1885 btrfs_set_node_key(parent, &mid_key, pslot); 1879 1886 btrfs_mark_buffer_dirty(parent); ··· 1973 1980 orig_slot += left_nr; 1974 1981 btrfs_node_key(mid, &disk_key, 0); 1975 1982 tree_mod_log_set_node_key(root->fs_info, parent, 1976 - &disk_key, pslot, 0); 1983 + pslot, 0); 1977 1984 btrfs_set_node_key(parent, &disk_key, pslot); 1978 1985 btrfs_mark_buffer_dirty(parent); 1979 1986 if (btrfs_header_nritems(left) > orig_slot) { ··· 2026 2033 2027 2034 btrfs_node_key(right, &disk_key, 0); 2028 2035 tree_mod_log_set_node_key(root->fs_info, parent, 2029 - &disk_key, pslot + 1, 0); 2036 + pslot + 1, 0); 2030 2037 btrfs_set_node_key(parent, &disk_key, pslot + 1); 2031 2038 btrfs_mark_buffer_dirty(parent); 2032 2039 ··· 2212 2219 int no_skips = 0; 2213 2220 struct extent_buffer *t; 2214 2221 2222 + if (path->really_keep_locks) 2223 + return; 2224 + 2215 2225 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2216 2226 if (!path->nodes[i]) 2217 2227 break; ··· 2262 2266 { 2263 2267 int i; 2264 2268 2265 - if (path->keep_locks) 2269 + if (path->keep_locks || path->really_keep_locks) 2266 2270 return; 2267 2271 2268 2272 for (i = level; i < BTRFS_MAX_LEVEL; i++) { ··· 2495 2499 if (!cow) 2496 2500 write_lock_level = -1; 2497 2501 2498 - if (cow && (p->keep_locks || p->lowest_level)) 2502 + if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) 2499 2503 write_lock_level = BTRFS_MAX_LEVEL; 2500 2504 2501 2505 min_write_lock_level = write_lock_level; ··· 2564 2568 * must have write locks on this node and the 2565 2569 * parent 2566 2570 */ 2567 - if (level + 1 > write_lock_level) { 2571 + if (level > write_lock_level || 2572 + (level + 1 > write_lock_level && 2573 + level + 1 < BTRFS_MAX_LEVEL && 2574 + p->nodes[level + 1])) { 2568 2575 write_lock_level = level + 1; 2569 2576 btrfs_release_path(p); 2570 2577 goto 
again; ··· 2916 2917 if (!path->nodes[i]) 2917 2918 break; 2918 2919 t = path->nodes[i]; 2919 - tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); 2920 + tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); 2920 2921 btrfs_set_node_key(t, key, tslot); 2921 2922 btrfs_mark_buffer_dirty(path->nodes[i]); 2922 2923 if (tslot != 0) ··· 3301 3302 */ 3302 3303 static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3303 3304 { 3305 + struct btrfs_item *start_item; 3306 + struct btrfs_item *end_item; 3307 + struct btrfs_map_token token; 3304 3308 int data_len; 3305 3309 int nritems = btrfs_header_nritems(l); 3306 3310 int end = min(nritems, start + nr) - 1; 3307 3311 3308 3312 if (!nr) 3309 3313 return 0; 3310 - data_len = btrfs_item_end_nr(l, start); 3311 - data_len = data_len - btrfs_item_offset_nr(l, end); 3314 + btrfs_init_map_token(&token); 3315 + start_item = btrfs_item_nr(l, start); 3316 + end_item = btrfs_item_nr(l, end); 3317 + data_len = btrfs_token_item_offset(l, start_item, &token) + 3318 + btrfs_token_item_size(l, start_item, &token); 3319 + data_len = data_len - btrfs_token_item_offset(l, end_item, &token); 3312 3320 data_len += sizeof(struct btrfs_item) * nr; 3313 3321 WARN_ON(data_len < 0); 3314 3322 return data_len; ··· 3409 3403 if (push_items == 0) 3410 3404 goto out_unlock; 3411 3405 3412 - if (!empty && push_items == left_nritems) 3413 - WARN_ON(1); 3406 + WARN_ON(!empty && push_items == left_nritems); 3414 3407 3415 3408 /* push left to right */ 3416 3409 right_nritems = btrfs_header_nritems(right); ··· 3647 3642 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3648 3643 3649 3644 /* fixup right node */ 3650 - if (push_items > right_nritems) { 3651 - printk(KERN_CRIT "push items %d nr %u\n", push_items, 3645 + if (push_items > right_nritems) 3646 + WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3652 3647 right_nritems); 3653 - WARN_ON(1); 3654 - } 3655 3648 3656 3649 if (push_items < right_nritems) { 
3657 3650 push_space = btrfs_item_offset_nr(right, push_items - 1) - ··· 4605 4602 * empty a node. 4606 4603 */ 4607 4604 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4608 - struct btrfs_path *path, int level, int slot, 4609 - int tree_mod_log) 4605 + struct btrfs_path *path, int level, int slot) 4610 4606 { 4611 4607 struct extent_buffer *parent = path->nodes[level]; 4612 4608 u32 nritems; 4613 4609 int ret; 4614 4610 4611 + if (level) { 4612 + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, 4613 + MOD_LOG_KEY_REMOVE); 4614 + BUG_ON(ret < 0); 4615 + } 4616 + 4615 4617 nritems = btrfs_header_nritems(parent); 4616 4618 if (slot != nritems - 1) { 4617 - if (tree_mod_log && level) 4619 + if (level) 4618 4620 tree_mod_log_eb_move(root->fs_info, parent, slot, 4619 4621 slot + 1, nritems - slot - 1); 4620 4622 memmove_extent_buffer(parent, ··· 4627 4619 btrfs_node_key_ptr_offset(slot + 1), 4628 4620 sizeof(struct btrfs_key_ptr) * 4629 4621 (nritems - slot - 1)); 4630 - } else if (tree_mod_log && level) { 4631 - ret = tree_mod_log_insert_key(root->fs_info, parent, slot, 4632 - MOD_LOG_KEY_REMOVE); 4633 - BUG_ON(ret < 0); 4634 4622 } 4635 4623 4636 4624 nritems--; ··· 4660 4656 struct extent_buffer *leaf) 4661 4657 { 4662 4658 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4663 - del_ptr(trans, root, path, 1, path->slots[1], 1); 4659 + del_ptr(trans, root, path, 1, path->slots[1]); 4664 4660 4665 4661 /* 4666 4662 * btrfs_free_extent is expensive, we want to make sure we ··· 5127 5123 right_path->search_commit_root = 1; 5128 5124 right_path->skip_locking = 1; 5129 5125 5130 - spin_lock(&left_root->root_times_lock); 5126 + spin_lock(&left_root->root_item_lock); 5131 5127 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5132 - spin_unlock(&left_root->root_times_lock); 5128 + spin_unlock(&left_root->root_item_lock); 5133 5129 5134 - spin_lock(&right_root->root_times_lock); 5130 + 
spin_lock(&right_root->root_item_lock); 5135 5131 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5136 - spin_unlock(&right_root->root_times_lock); 5132 + spin_unlock(&right_root->root_item_lock); 5137 5133 5138 5134 trans = btrfs_join_transaction(left_root); 5139 5135 if (IS_ERR(trans)) { ··· 5228 5224 goto out; 5229 5225 } 5230 5226 5231 - spin_lock(&left_root->root_times_lock); 5227 + spin_lock(&left_root->root_item_lock); 5232 5228 ctransid = btrfs_root_ctransid(&left_root->root_item); 5233 - spin_unlock(&left_root->root_times_lock); 5229 + spin_unlock(&left_root->root_item_lock); 5234 5230 if (ctransid != left_start_ctransid) 5235 5231 left_start_ctransid = 0; 5236 5232 5237 - spin_lock(&right_root->root_times_lock); 5233 + spin_lock(&right_root->root_item_lock); 5238 5234 ctransid = btrfs_root_ctransid(&right_root->root_item); 5239 - spin_unlock(&right_root->root_times_lock); 5235 + spin_unlock(&right_root->root_item_lock); 5240 5236 if (ctransid != right_start_ctransid) 5241 5237 right_start_ctransid = 0; 5242 5238 ··· 5498 5494 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 5499 5495 { 5500 5496 return btrfs_next_old_leaf(root, path, 0); 5497 + } 5498 + 5499 + /* Release the path up to but not including the given level */ 5500 + static void btrfs_release_level(struct btrfs_path *path, int level) 5501 + { 5502 + int i; 5503 + 5504 + for (i = 0; i < level; i++) { 5505 + path->slots[i] = 0; 5506 + if (!path->nodes[i]) 5507 + continue; 5508 + if (path->locks[i]) { 5509 + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 5510 + path->locks[i] = 0; 5511 + } 5512 + free_extent_buffer(path->nodes[i]); 5513 + path->nodes[i] = NULL; 5514 + } 5515 + } 5516 + 5517 + /* 5518 + * This function assumes 2 things 5519 + * 5520 + * 1) You are using path->keep_locks 5521 + * 2) You are not inserting items. 5522 + * 5523 + * If either of these are not true do not use this function. 
If you need a next 5524 + * leaf with either of these not being true then this function can be easily 5525 + * adapted to do that, but at the moment these are the limitations. 5526 + */ 5527 + int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, 5528 + struct btrfs_root *root, struct btrfs_path *path, 5529 + int del) 5530 + { 5531 + struct extent_buffer *b; 5532 + struct btrfs_key key; 5533 + u32 nritems; 5534 + int level = 1; 5535 + int slot; 5536 + int ret = 1; 5537 + int write_lock_level = BTRFS_MAX_LEVEL; 5538 + int ins_len = del ? -1 : 0; 5539 + 5540 + WARN_ON(!(path->keep_locks || path->really_keep_locks)); 5541 + 5542 + nritems = btrfs_header_nritems(path->nodes[0]); 5543 + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 5544 + 5545 + while (path->nodes[level]) { 5546 + nritems = btrfs_header_nritems(path->nodes[level]); 5547 + if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { 5548 + search: 5549 + btrfs_release_path(path); 5550 + ret = btrfs_search_slot(trans, root, &key, path, 5551 + ins_len, 1); 5552 + if (ret < 0) 5553 + goto out; 5554 + level = 1; 5555 + continue; 5556 + } 5557 + 5558 + if (path->slots[level] >= nritems - 1) { 5559 + level++; 5560 + continue; 5561 + } 5562 + 5563 + btrfs_release_level(path, level); 5564 + break; 5565 + } 5566 + 5567 + if (!path->nodes[level]) { 5568 + ret = 1; 5569 + goto out; 5570 + } 5571 + 5572 + path->slots[level]++; 5573 + b = path->nodes[level]; 5574 + 5575 + while (b) { 5576 + level = btrfs_header_level(b); 5577 + 5578 + if (!should_cow_block(trans, root, b)) 5579 + goto cow_done; 5580 + 5581 + btrfs_set_path_blocking(path); 5582 + ret = btrfs_cow_block(trans, root, b, 5583 + path->nodes[level + 1], 5584 + path->slots[level + 1], &b); 5585 + if (ret) 5586 + goto out; 5587 + cow_done: 5588 + path->nodes[level] = b; 5589 + btrfs_clear_path_blocking(path, NULL, 0); 5590 + if (level != 0) { 5591 + ret = setup_nodes_for_search(trans, root, path, b, 5592 + level, ins_len, 5593 + &write_lock_level); 5594 
+ if (ret == -EAGAIN) 5595 + goto search; 5596 + if (ret) 5597 + goto out; 5598 + 5599 + b = path->nodes[level]; 5600 + slot = path->slots[level]; 5601 + 5602 + ret = read_block_for_search(trans, root, path, 5603 + &b, level, slot, &key, 0); 5604 + if (ret == -EAGAIN) 5605 + goto search; 5606 + if (ret) 5607 + goto out; 5608 + level = btrfs_header_level(b); 5609 + if (!btrfs_try_tree_write_lock(b)) { 5610 + btrfs_set_path_blocking(path); 5611 + btrfs_tree_lock(b); 5612 + btrfs_clear_path_blocking(path, b, 5613 + BTRFS_WRITE_LOCK); 5614 + } 5615 + path->locks[level] = BTRFS_WRITE_LOCK; 5616 + path->nodes[level] = b; 5617 + path->slots[level] = 0; 5618 + } else { 5619 + path->slots[level] = 0; 5620 + ret = 0; 5621 + break; 5622 + } 5623 + } 5624 + 5625 + out: 5626 + if (ret) 5627 + btrfs_release_path(path); 5628 + 5629 + return ret; 5501 5630 } 5502 5631 5503 5632 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+164 -18
fs/btrfs/ctree.h
··· 48 48 49 49 #define BTRFS_MAGIC "_BHRfS_M" 50 50 51 - #define BTRFS_MAX_MIRRORS 2 51 + #define BTRFS_MAX_MIRRORS 3 52 52 53 53 #define BTRFS_MAX_LEVEL 8 54 54 ··· 142 142 143 143 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 144 144 145 + #define BTRFS_DEV_REPLACE_DEVID 0 146 + 145 147 /* 146 148 * the max metadata block size. This limit is somewhat artificial, 147 149 * but the memmove costs go through the roof for larger blocks. ··· 173 171 174 172 /* four bytes for CRC32 */ 175 173 #define BTRFS_EMPTY_DIR_SIZE 0 174 + 175 + /* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ 176 + #define REQ_GET_READ_MIRRORS (1 << 30) 176 177 177 178 #define BTRFS_FT_UNKNOWN 0 178 179 #define BTRFS_FT_REG_FILE 1 ··· 576 571 unsigned int skip_locking:1; 577 572 unsigned int leave_spinning:1; 578 573 unsigned int search_commit_root:1; 574 + unsigned int really_keep_locks:1; 579 575 }; 580 576 581 577 /* ··· 889 883 * the existing values unchanged 890 884 */ 891 885 __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; 886 + } __attribute__ ((__packed__)); 887 + 888 + #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 889 + #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 890 + #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 891 + #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 892 + #define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 893 + #define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 894 + #define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 895 + 896 + struct btrfs_dev_replace { 897 + u64 replace_state; /* see #define above */ 898 + u64 time_started; /* seconds since 1-Jan-1970 */ 899 + u64 time_stopped; /* seconds since 1-Jan-1970 */ 900 + atomic64_t num_write_errors; 901 + atomic64_t num_uncorrectable_read_errors; 902 + 903 + u64 cursor_left; 904 + u64 committed_cursor_left; 905 + u64 cursor_left_last_write_of_item; 906 + u64 cursor_right; 907 + 908 + u64 cont_reading_from_srcdev_mode; /* see #define above */ 909 + 910 + int 
is_valid; 911 + int item_needs_writeback; 912 + struct btrfs_device *srcdev; 913 + struct btrfs_device *tgtdev; 914 + 915 + pid_t lock_owner; 916 + atomic_t nesting_level; 917 + struct mutex lock_finishing_cancel_unmount; 918 + struct mutex lock_management_lock; 919 + struct mutex lock; 920 + 921 + struct btrfs_scrub_progress scrub_progress; 922 + }; 923 + 924 + struct btrfs_dev_replace_item { 925 + /* 926 + * grow this item struct at the end for future enhancements and keep 927 + * the existing values unchanged 928 + */ 929 + __le64 src_devid; 930 + __le64 cursor_left; 931 + __le64 cursor_right; 932 + __le64 cont_reading_from_srcdev_mode; 933 + 934 + __le64 replace_state; 935 + __le64 time_started; 936 + __le64 time_stopped; 937 + __le64 num_write_errors; 938 + __le64 num_uncorrectable_read_errors; 892 939 } __attribute__ ((__packed__)); 893 940 894 941 /* different types of block groups (and chunks) */ ··· 1392 1333 struct btrfs_workers generic_worker; 1393 1334 struct btrfs_workers workers; 1394 1335 struct btrfs_workers delalloc_workers; 1336 + struct btrfs_workers flush_workers; 1395 1337 struct btrfs_workers endio_workers; 1396 1338 struct btrfs_workers endio_meta_workers; 1397 1339 struct btrfs_workers endio_meta_write_workers; ··· 1489 1429 struct rw_semaphore scrub_super_lock; 1490 1430 int scrub_workers_refcnt; 1491 1431 struct btrfs_workers scrub_workers; 1432 + struct btrfs_workers scrub_wr_completion_workers; 1433 + struct btrfs_workers scrub_nocow_workers; 1492 1434 1493 1435 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1494 1436 u32 check_integrity_print_mask; ··· 1532 1470 int backup_root_index; 1533 1471 1534 1472 int num_tolerated_disk_barrier_failures; 1473 + 1474 + /* device replace state */ 1475 + struct btrfs_dev_replace dev_replace; 1476 + 1477 + atomic_t mutually_exclusive_operation_running; 1535 1478 }; 1536 1479 1537 1480 /* ··· 1646 1579 1647 1580 int force_cow; 1648 1581 1649 - spinlock_t root_times_lock; 1582 + spinlock_t root_item_lock; 1650 
1583 }; 1651 1584 1652 1585 struct btrfs_ioctl_defrag_range_args { ··· 1790 1723 #define BTRFS_DEV_STATS_KEY 249 1791 1724 1792 1725 /* 1726 + * Persistently stores the device replace state in the device tree. 1727 + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). 1728 + */ 1729 + #define BTRFS_DEV_REPLACE_KEY 250 1730 + 1731 + /* 1793 1732 * string items are for debugging. They just store a short string of 1794 1733 * data in the FS 1795 1734 */ ··· 1860 1787 1861 1788 static inline void btrfs_init_map_token (struct btrfs_map_token *token) 1862 1789 { 1863 - memset(token, 0, sizeof(*token)); 1790 + token->kaddr = NULL; 1864 1791 } 1865 1792 1866 1793 /* some macros to generate set/get funcs for the struct fields. This
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, 2781 + struct btrfs_dev_replace_item, src_devid, 64); 2782 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, 2783 + struct btrfs_dev_replace_item, 2784 + cont_reading_from_srcdev_mode, 64); 2785 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, 2786 + struct btrfs_dev_replace_item, replace_state, 64); 2787 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, 2788 + struct btrfs_dev_replace_item, time_started, 64); 2789 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, 2790 + struct btrfs_dev_replace_item, time_stopped, 64); 2791 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, 2792 + struct btrfs_dev_replace_item, num_write_errors, 64); 2793 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, 2794 + struct btrfs_dev_replace_item, 2795 + num_uncorrectable_read_errors, 64); 2796 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, 2797 + struct btrfs_dev_replace_item, cursor_left, 64); 2798 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, 2799 + struct btrfs_dev_replace_item, cursor_right, 64); 2800 + 2831 2801 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2832 2802 { 2833 2803 return sb->s_fs_info; ··· 3016 2900 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 3017 2901 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3018 2902 void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2903 + 2904 + enum btrfs_reserve_flush_enum { 2905 + /* If we are in the transaction, we can't flush anything.*/ 2906 + BTRFS_RESERVE_NO_FLUSH, 2907 + /* 2908 + * Flushing delalloc may cause deadlock somewhere, in this 2909 + * case, use FLUSH LIMIT 2910 + */ 2911 + BTRFS_RESERVE_FLUSH_LIMIT, 2912 + BTRFS_RESERVE_FLUSH_ALL, 2913 + }; 2914 + 3019 2915 int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 3020 2916 void btrfs_free_reserved_data_space(struct 
inode *inode, u64 bytes); 3021 2917 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, ··· 3047 2919 void btrfs_free_block_rsv(struct btrfs_root *root, 3048 2920 struct btrfs_block_rsv *rsv); 3049 2921 int btrfs_block_rsv_add(struct btrfs_root *root, 3050 - struct btrfs_block_rsv *block_rsv, 3051 - u64 num_bytes); 3052 - int btrfs_block_rsv_add_noflush(struct btrfs_root *root, 3053 - struct btrfs_block_rsv *block_rsv, 3054 - u64 num_bytes); 2922 + struct btrfs_block_rsv *block_rsv, u64 num_bytes, 2923 + enum btrfs_reserve_flush_enum flush); 3055 2924 int btrfs_block_rsv_check(struct btrfs_root *root, 3056 2925 struct btrfs_block_rsv *block_rsv, int min_factor); 3057 2926 int btrfs_block_rsv_refill(struct btrfs_root *root, 3058 - struct btrfs_block_rsv *block_rsv, 3059 - u64 min_reserved); 3060 - int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, 3061 - struct btrfs_block_rsv *block_rsv, 3062 - u64 min_reserved); 2927 + struct btrfs_block_rsv *block_rsv, u64 min_reserved, 2928 + enum btrfs_reserve_flush_enum flush); 3063 2929 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3064 2930 struct btrfs_block_rsv *dst_rsv, 3065 2931 u64 num_bytes); ··· 3077 2955 int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 3078 2956 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3079 2957 struct btrfs_fs_info *fs_info); 2958 + int __get_raid_index(u64 flags); 3080 2959 /* ctree.c */ 3081 2960 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3082 2961 int level, int *slot); ··· 3188 3065 } 3189 3066 3190 3067 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 3068 + int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, 3069 + struct btrfs_root *root, struct btrfs_path *path, 3070 + int del); 3191 3071 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 3192 3072 u64 time_seq); 3193 3073 static inline int btrfs_next_old_item(struct btrfs_root 
*root, ··· 3283 3157 struct btrfs_root *root); 3284 3158 3285 3159 /* dir-item.c */ 3160 + int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, 3161 + const char *name, int name_len); 3286 3162 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3287 3163 struct btrfs_root *root, const char *name, 3288 3164 int name_len, struct inode *dir, ··· 3384 3256 struct btrfs_root *root, 3385 3257 struct btrfs_path *path, u64 objectid, 3386 3258 u64 bytenr, int mod); 3259 + u64 btrfs_file_extent_length(struct btrfs_path *path); 3387 3260 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 3388 3261 struct btrfs_root *root, 3389 3262 struct btrfs_ordered_sum *sums); ··· 3400 3271 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 3401 3272 struct list_head *list, int search_commit); 3402 3273 /* inode.c */ 3274 + struct btrfs_delalloc_work { 3275 + struct inode *inode; 3276 + int wait; 3277 + int delay_iput; 3278 + struct completion completion; 3279 + struct list_head list; 3280 + struct btrfs_work work; 3281 + }; 3282 + 3283 + struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, 3284 + int wait, int delay_iput); 3285 + void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); 3286 + 3403 3287 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3404 3288 size_t pg_offset, u64 start, u64 len, 3405 3289 int create); ··· 3512 3370 struct btrfs_ioctl_space_info *space); 3513 3371 3514 3372 /* file.c */ 3373 + int btrfs_auto_defrag_init(void); 3374 + void btrfs_auto_defrag_exit(void); 3515 3375 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3516 3376 struct inode *inode); 3517 3377 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3378 + void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); 3518 3379 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3519 3380 void btrfs_drop_extent_cache(struct 
inode *inode, u64 start, u64 end, 3520 3381 int skip_pinned); ··· 3664 3519 struct btrfs_pending_snapshot *pending); 3665 3520 3666 3521 /* scrub.c */ 3667 - int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 3668 - struct btrfs_scrub_progress *progress, int readonly); 3522 + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 3523 + u64 end, struct btrfs_scrub_progress *progress, 3524 + int readonly, int is_dev_replace); 3669 3525 void btrfs_scrub_pause(struct btrfs_root *root); 3670 3526 void btrfs_scrub_pause_super(struct btrfs_root *root); 3671 3527 void btrfs_scrub_continue(struct btrfs_root *root); 3672 3528 void btrfs_scrub_continue_super(struct btrfs_root *root); 3673 - int __btrfs_scrub_cancel(struct btrfs_fs_info *info); 3674 - int btrfs_scrub_cancel(struct btrfs_root *root); 3675 - int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); 3529 + int btrfs_scrub_cancel(struct btrfs_fs_info *info); 3530 + int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, 3531 + struct btrfs_device *dev); 3676 3532 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); 3677 3533 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 3678 3534 struct btrfs_scrub_progress *progress);
+5 -6
fs/btrfs/delayed-inode.c
··· 651 651 */ 652 652 if (!src_rsv || (!trans->bytes_reserved && 653 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 654 - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, 655 + BTRFS_RESERVE_NO_FLUSH); 655 656 /* 656 657 * Since we're under a transaction reserve_metadata_bytes could 657 658 * try to commit the transaction which will make it return ··· 687 686 * reserve something strictly for us. If not be a pain and try 688 687 * to steal from the delalloc block rsv. 689 688 */ 690 - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 689 + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, 690 + BTRFS_RESERVE_NO_FLUSH); 691 691 if (!ret) 692 692 goto out; 693 693 ··· 1257 1255 struct btrfs_delayed_node *delayed_node = NULL; 1258 1256 struct btrfs_root *root; 1259 1257 struct btrfs_block_rsv *block_rsv; 1260 - unsigned long nr = 0; 1261 1258 int need_requeue = 0; 1262 1259 int ret; 1263 1260 ··· 1317 1316 delayed_node); 1318 1317 mutex_unlock(&delayed_node->mutex); 1319 1318 1320 - nr = trans->blocks_used; 1321 - 1322 1319 trans->block_rsv = block_rsv; 1323 1320 btrfs_end_transaction_dmeta(trans, root); 1324 - __btrfs_btree_balance_dirty(root, nr); 1321 + btrfs_btree_balance_dirty_nodelay(root); 1325 1322 free_path: 1326 1323 btrfs_free_path(path); 1327 1324 out:
+856
fs/btrfs/dev-replace.c
··· 1 + /* 2 + * Copyright (C) STRATO AG 2012. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public 6 + * License v2 as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public 14 + * License along with this program; if not, write to the 15 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 + * Boston, MA 021110-1307, USA. 17 + */ 18 + #include <linux/sched.h> 19 + #include <linux/bio.h> 20 + #include <linux/slab.h> 21 + #include <linux/buffer_head.h> 22 + #include <linux/blkdev.h> 23 + #include <linux/random.h> 24 + #include <linux/iocontext.h> 25 + #include <linux/capability.h> 26 + #include <linux/kthread.h> 27 + #include <linux/math64.h> 28 + #include <asm/div64.h> 29 + #include "compat.h" 30 + #include "ctree.h" 31 + #include "extent_map.h" 32 + #include "disk-io.h" 33 + #include "transaction.h" 34 + #include "print-tree.h" 35 + #include "volumes.h" 36 + #include "async-thread.h" 37 + #include "check-integrity.h" 38 + #include "rcu-string.h" 39 + #include "dev-replace.h" 40 + 41 + static u64 btrfs_get_seconds_since_1970(void); 42 + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 43 + int scrub_ret); 44 + static void btrfs_dev_replace_update_device_in_mapping_tree( 45 + struct btrfs_fs_info *fs_info, 46 + struct btrfs_device *srcdev, 47 + struct btrfs_device *tgtdev); 48 + static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, 49 + char *srcdev_name, 50 + struct btrfs_device **device); 51 + static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); 52 + static int 
btrfs_dev_replace_kthread(void *data); 53 + static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); 54 + 55 + 56 + int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) 57 + { 58 + struct btrfs_key key; 59 + struct btrfs_root *dev_root = fs_info->dev_root; 60 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 61 + struct extent_buffer *eb; 62 + int slot; 63 + int ret = 0; 64 + struct btrfs_path *path = NULL; 65 + int item_size; 66 + struct btrfs_dev_replace_item *ptr; 67 + u64 src_devid; 68 + 69 + path = btrfs_alloc_path(); 70 + if (!path) { 71 + ret = -ENOMEM; 72 + goto out; 73 + } 74 + 75 + key.objectid = 0; 76 + key.type = BTRFS_DEV_REPLACE_KEY; 77 + key.offset = 0; 78 + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 79 + if (ret) { 80 + no_valid_dev_replace_entry_found: 81 + ret = 0; 82 + dev_replace->replace_state = 83 + BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; 84 + dev_replace->cont_reading_from_srcdev_mode = 85 + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; 86 + dev_replace->replace_state = 0; 87 + dev_replace->time_started = 0; 88 + dev_replace->time_stopped = 0; 89 + atomic64_set(&dev_replace->num_write_errors, 0); 90 + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 91 + dev_replace->cursor_left = 0; 92 + dev_replace->committed_cursor_left = 0; 93 + dev_replace->cursor_left_last_write_of_item = 0; 94 + dev_replace->cursor_right = 0; 95 + dev_replace->srcdev = NULL; 96 + dev_replace->tgtdev = NULL; 97 + dev_replace->is_valid = 0; 98 + dev_replace->item_needs_writeback = 0; 99 + goto out; 100 + } 101 + slot = path->slots[0]; 102 + eb = path->nodes[0]; 103 + item_size = btrfs_item_size_nr(eb, slot); 104 + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); 105 + 106 + if (item_size != sizeof(struct btrfs_dev_replace_item)) { 107 + pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); 108 + goto no_valid_dev_replace_entry_found; 109 + } 
110 + 111 + src_devid = btrfs_dev_replace_src_devid(eb, ptr); 112 + dev_replace->cont_reading_from_srcdev_mode = 113 + btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); 114 + dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); 115 + dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); 116 + dev_replace->time_stopped = 117 + btrfs_dev_replace_time_stopped(eb, ptr); 118 + atomic64_set(&dev_replace->num_write_errors, 119 + btrfs_dev_replace_num_write_errors(eb, ptr)); 120 + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 121 + btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); 122 + dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); 123 + dev_replace->committed_cursor_left = dev_replace->cursor_left; 124 + dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; 125 + dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); 126 + dev_replace->is_valid = 1; 127 + 128 + dev_replace->item_needs_writeback = 0; 129 + switch (dev_replace->replace_state) { 130 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 131 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 132 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 133 + dev_replace->srcdev = NULL; 134 + dev_replace->tgtdev = NULL; 135 + break; 136 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 137 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 138 + dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, 139 + NULL, NULL); 140 + dev_replace->tgtdev = btrfs_find_device(fs_info, 141 + BTRFS_DEV_REPLACE_DEVID, 142 + NULL, NULL); 143 + /* 144 + * allow 'btrfs dev replace_cancel' if src/tgt device is 145 + * missing 146 + */ 147 + if (!dev_replace->srcdev && 148 + !btrfs_test_opt(dev_root, DEGRADED)) { 149 + ret = -EIO; 150 + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", 151 + (unsigned long long)src_devid); 152 + } 153 + if 
(!dev_replace->tgtdev && 154 + !btrfs_test_opt(dev_root, DEGRADED)) { 155 + ret = -EIO; 156 + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", 157 + (unsigned long long)BTRFS_DEV_REPLACE_DEVID); 158 + } 159 + if (dev_replace->tgtdev) { 160 + if (dev_replace->srcdev) { 161 + dev_replace->tgtdev->total_bytes = 162 + dev_replace->srcdev->total_bytes; 163 + dev_replace->tgtdev->disk_total_bytes = 164 + dev_replace->srcdev->disk_total_bytes; 165 + dev_replace->tgtdev->bytes_used = 166 + dev_replace->srcdev->bytes_used; 167 + } 168 + dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; 169 + btrfs_init_dev_replace_tgtdev_for_resume(fs_info, 170 + dev_replace->tgtdev); 171 + } 172 + break; 173 + } 174 + 175 + out: 176 + if (path) 177 + btrfs_free_path(path); 178 + return ret; 179 + } 180 + 181 + /* 182 + * called from commit_transaction. Writes changed device replace state to 183 + * disk. 184 + */ 185 + int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, 186 + struct btrfs_fs_info *fs_info) 187 + { 188 + int ret; 189 + struct btrfs_root *dev_root = fs_info->dev_root; 190 + struct btrfs_path *path; 191 + struct btrfs_key key; 192 + struct extent_buffer *eb; 193 + struct btrfs_dev_replace_item *ptr; 194 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 195 + 196 + btrfs_dev_replace_lock(dev_replace); 197 + if (!dev_replace->is_valid || 198 + !dev_replace->item_needs_writeback) { 199 + btrfs_dev_replace_unlock(dev_replace); 200 + return 0; 201 + } 202 + btrfs_dev_replace_unlock(dev_replace); 203 + 204 + key.objectid = 0; 205 + key.type = BTRFS_DEV_REPLACE_KEY; 206 + key.offset = 0; 207 + 208 + path = btrfs_alloc_path(); 209 + if (!path) { 210 + ret = -ENOMEM; 211 + goto out; 212 + } 213 + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 214 + if (ret < 0) { 215 + pr_warn("btrfs: error %d while searching for dev_replace item!\n", 216 + ret); 217 + 
goto out; 218 + } 219 + 220 + if (ret == 0 && 221 + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 222 + /* 223 + * need to delete old one and insert a new one. 224 + * Since no attempt is made to recover any old state, if the 225 + * dev_replace state is 'running', the data on the target 226 + * drive is lost. 227 + * It would be possible to recover the state: just make sure 228 + * that the beginning of the item is never changed and always 229 + * contains all the essential information. Then read this 230 + * minimal set of information and use it as a base for the 231 + * new state. 232 + */ 233 + ret = btrfs_del_item(trans, dev_root, path); 234 + if (ret != 0) { 235 + pr_warn("btrfs: delete too small dev_replace item failed %d!\n", 236 + ret); 237 + goto out; 238 + } 239 + ret = 1; 240 + } 241 + 242 + if (ret == 1) { 243 + /* need to insert a new item */ 244 + btrfs_release_path(path); 245 + ret = btrfs_insert_empty_item(trans, dev_root, path, 246 + &key, sizeof(*ptr)); 247 + if (ret < 0) { 248 + pr_warn("btrfs: insert dev_replace item failed %d!\n", 249 + ret); 250 + goto out; 251 + } 252 + } 253 + 254 + eb = path->nodes[0]; 255 + ptr = btrfs_item_ptr(eb, path->slots[0], 256 + struct btrfs_dev_replace_item); 257 + 258 + btrfs_dev_replace_lock(dev_replace); 259 + if (dev_replace->srcdev) 260 + btrfs_set_dev_replace_src_devid(eb, ptr, 261 + dev_replace->srcdev->devid); 262 + else 263 + btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); 264 + btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, 265 + dev_replace->cont_reading_from_srcdev_mode); 266 + btrfs_set_dev_replace_replace_state(eb, ptr, 267 + dev_replace->replace_state); 268 + btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); 269 + btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); 270 + btrfs_set_dev_replace_num_write_errors(eb, ptr, 271 + atomic64_read(&dev_replace->num_write_errors)); 272 + 
btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, 273 + atomic64_read(&dev_replace->num_uncorrectable_read_errors)); 274 + dev_replace->cursor_left_last_write_of_item = 275 + dev_replace->cursor_left; 276 + btrfs_set_dev_replace_cursor_left(eb, ptr, 277 + dev_replace->cursor_left_last_write_of_item); 278 + btrfs_set_dev_replace_cursor_right(eb, ptr, 279 + dev_replace->cursor_right); 280 + dev_replace->item_needs_writeback = 0; 281 + btrfs_dev_replace_unlock(dev_replace); 282 + 283 + btrfs_mark_buffer_dirty(eb); 284 + 285 + out: 286 + btrfs_free_path(path); 287 + 288 + return ret; 289 + } 290 + 291 + void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) 292 + { 293 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 294 + 295 + dev_replace->committed_cursor_left = 296 + dev_replace->cursor_left_last_write_of_item; 297 + } 298 + 299 + static u64 btrfs_get_seconds_since_1970(void) 300 + { 301 + struct timespec t = CURRENT_TIME_SEC; 302 + 303 + return t.tv_sec; 304 + } 305 + 306 + int btrfs_dev_replace_start(struct btrfs_root *root, 307 + struct btrfs_ioctl_dev_replace_args *args) 308 + { 309 + struct btrfs_trans_handle *trans; 310 + struct btrfs_fs_info *fs_info = root->fs_info; 311 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 312 + int ret; 313 + struct btrfs_device *tgt_device = NULL; 314 + struct btrfs_device *src_device = NULL; 315 + 316 + switch (args->start.cont_reading_from_srcdev_mode) { 317 + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: 318 + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: 319 + break; 320 + default: 321 + return -EINVAL; 322 + } 323 + 324 + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || 325 + args->start.tgtdev_name[0] == '\0') 326 + return -EINVAL; 327 + 328 + mutex_lock(&fs_info->volume_mutex); 329 + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, 330 + &tgt_device); 331 + if (ret) { 332 + 
pr_err("btrfs: target device %s is invalid!\n", 333 + args->start.tgtdev_name); 334 + mutex_unlock(&fs_info->volume_mutex); 335 + return -EINVAL; 336 + } 337 + 338 + ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, 339 + args->start.srcdev_name, 340 + &src_device); 341 + mutex_unlock(&fs_info->volume_mutex); 342 + if (ret) { 343 + ret = -EINVAL; 344 + goto leave_no_lock; 345 + } 346 + 347 + if (tgt_device->total_bytes < src_device->total_bytes) { 348 + pr_err("btrfs: target device is smaller than source device!\n"); 349 + ret = -EINVAL; 350 + goto leave_no_lock; 351 + } 352 + 353 + btrfs_dev_replace_lock(dev_replace); 354 + switch (dev_replace->replace_state) { 355 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 356 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 357 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 358 + break; 359 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 360 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 361 + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; 362 + goto leave; 363 + } 364 + 365 + dev_replace->cont_reading_from_srcdev_mode = 366 + args->start.cont_reading_from_srcdev_mode; 367 + WARN_ON(!src_device); 368 + dev_replace->srcdev = src_device; 369 + WARN_ON(!tgt_device); 370 + dev_replace->tgtdev = tgt_device; 371 + 372 + printk_in_rcu(KERN_INFO 373 + "btrfs: dev_replace from %s (devid %llu) to %s) started\n", 374 + src_device->missing ? "<missing disk>" : 375 + rcu_str_deref(src_device->name), 376 + src_device->devid, 377 + rcu_str_deref(tgt_device->name)); 378 + 379 + tgt_device->total_bytes = src_device->total_bytes; 380 + tgt_device->disk_total_bytes = src_device->disk_total_bytes; 381 + tgt_device->bytes_used = src_device->bytes_used; 382 + 383 + /* 384 + * from now on, the writes to the srcdev are all duplicated to 385 + * go to the tgtdev as well (refer to btrfs_map_block()). 
386 + */ 387 + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 388 + dev_replace->time_started = btrfs_get_seconds_since_1970(); 389 + dev_replace->cursor_left = 0; 390 + dev_replace->committed_cursor_left = 0; 391 + dev_replace->cursor_left_last_write_of_item = 0; 392 + dev_replace->cursor_right = 0; 393 + dev_replace->is_valid = 1; 394 + dev_replace->item_needs_writeback = 1; 395 + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 396 + btrfs_dev_replace_unlock(dev_replace); 397 + 398 + btrfs_wait_ordered_extents(root, 0); 399 + 400 + /* force writing the updated state information to disk */ 401 + trans = btrfs_start_transaction(root, 0); 402 + if (IS_ERR(trans)) { 403 + ret = PTR_ERR(trans); 404 + btrfs_dev_replace_lock(dev_replace); 405 + goto leave; 406 + } 407 + 408 + ret = btrfs_commit_transaction(trans, root); 409 + WARN_ON(ret); 410 + 411 + /* the disk copy procedure reuses the scrub code */ 412 + ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 413 + src_device->total_bytes, 414 + &dev_replace->scrub_progress, 0, 1); 415 + 416 + ret = btrfs_dev_replace_finishing(root->fs_info, ret); 417 + WARN_ON(ret); 418 + 419 + return 0; 420 + 421 + leave: 422 + dev_replace->srcdev = NULL; 423 + dev_replace->tgtdev = NULL; 424 + btrfs_dev_replace_unlock(dev_replace); 425 + leave_no_lock: 426 + if (tgt_device) 427 + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 428 + return ret; 429 + } 430 + 431 + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 432 + int scrub_ret) 433 + { 434 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 435 + struct btrfs_device *tgt_device; 436 + struct btrfs_device *src_device; 437 + struct btrfs_root *root = fs_info->tree_root; 438 + u8 uuid_tmp[BTRFS_UUID_SIZE]; 439 + struct btrfs_trans_handle *trans; 440 + int ret = 0; 441 + 442 + /* don't allow cancel or unmount to disturb the finishing procedure */ 443 + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 444 
+ 445 + btrfs_dev_replace_lock(dev_replace); 446 + /* was the operation canceled, or is it finished? */ 447 + if (dev_replace->replace_state != 448 + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { 449 + btrfs_dev_replace_unlock(dev_replace); 450 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 451 + return 0; 452 + } 453 + 454 + tgt_device = dev_replace->tgtdev; 455 + src_device = dev_replace->srcdev; 456 + btrfs_dev_replace_unlock(dev_replace); 457 + 458 + /* replace old device with new one in mapping tree */ 459 + if (!scrub_ret) 460 + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, 461 + src_device, 462 + tgt_device); 463 + 464 + /* 465 + * flush all outstanding I/O and inode extent mappings before the 466 + * copy operation is declared as being finished 467 + */ 468 + btrfs_start_delalloc_inodes(root, 0); 469 + btrfs_wait_ordered_extents(root, 0); 470 + 471 + trans = btrfs_start_transaction(root, 0); 472 + if (IS_ERR(trans)) { 473 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 474 + return PTR_ERR(trans); 475 + } 476 + ret = btrfs_commit_transaction(trans, root); 477 + WARN_ON(ret); 478 + 479 + /* keep away write_all_supers() during the finishing procedure */ 480 + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 481 + btrfs_dev_replace_lock(dev_replace); 482 + dev_replace->replace_state = 483 + scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 484 + : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; 485 + dev_replace->tgtdev = NULL; 486 + dev_replace->srcdev = NULL; 487 + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); 488 + dev_replace->item_needs_writeback = 1; 489 + 490 + if (scrub_ret) { 491 + printk_in_rcu(KERN_ERR 492 + "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 493 + src_device->missing ? 
"<missing disk>" : 494 + rcu_str_deref(src_device->name), 495 + src_device->devid, 496 + rcu_str_deref(tgt_device->name), scrub_ret); 497 + btrfs_dev_replace_unlock(dev_replace); 498 + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 499 + if (tgt_device) 500 + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 501 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 502 + 503 + return 0; 504 + } 505 + 506 + printk_in_rcu(KERN_INFO 507 + "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", 508 + src_device->missing ? "<missing disk>" : 509 + rcu_str_deref(src_device->name), 510 + src_device->devid, 511 + rcu_str_deref(tgt_device->name)); 512 + tgt_device->is_tgtdev_for_dev_replace = 0; 513 + tgt_device->devid = src_device->devid; 514 + src_device->devid = BTRFS_DEV_REPLACE_DEVID; 515 + tgt_device->bytes_used = src_device->bytes_used; 516 + memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); 517 + memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); 518 + memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); 519 + tgt_device->total_bytes = src_device->total_bytes; 520 + tgt_device->disk_total_bytes = src_device->disk_total_bytes; 521 + tgt_device->bytes_used = src_device->bytes_used; 522 + if (fs_info->sb->s_bdev == src_device->bdev) 523 + fs_info->sb->s_bdev = tgt_device->bdev; 524 + if (fs_info->fs_devices->latest_bdev == src_device->bdev) 525 + fs_info->fs_devices->latest_bdev = tgt_device->bdev; 526 + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 527 + 528 + btrfs_rm_dev_replace_srcdev(fs_info, src_device); 529 + if (src_device->bdev) { 530 + /* zero out the old super */ 531 + btrfs_scratch_superblock(src_device); 532 + } 533 + /* 534 + * this is again a consistent state where no dev_replace procedure 535 + * is running, the target device is part of the filesystem, the 536 + * source device is not part of the filesystem anymore and its 1st 537 + * superblock is scratched out so 
that it is no longer marked to 538 + * belong to this filesystem. 539 + */ 540 + btrfs_dev_replace_unlock(dev_replace); 541 + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 542 + 543 + /* write back the superblocks */ 544 + trans = btrfs_start_transaction(root, 0); 545 + if (!IS_ERR(trans)) 546 + btrfs_commit_transaction(trans, root); 547 + 548 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 549 + 550 + return 0; 551 + } 552 + 553 + static void btrfs_dev_replace_update_device_in_mapping_tree( 554 + struct btrfs_fs_info *fs_info, 555 + struct btrfs_device *srcdev, 556 + struct btrfs_device *tgtdev) 557 + { 558 + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 559 + struct extent_map *em; 560 + struct map_lookup *map; 561 + u64 start = 0; 562 + int i; 563 + 564 + write_lock(&em_tree->lock); 565 + do { 566 + em = lookup_extent_mapping(em_tree, start, (u64)-1); 567 + if (!em) 568 + break; 569 + map = (struct map_lookup *)em->bdev; 570 + for (i = 0; i < map->num_stripes; i++) 571 + if (srcdev == map->stripes[i].dev) 572 + map->stripes[i].dev = tgtdev; 573 + start = em->start + em->len; 574 + free_extent_map(em); 575 + } while (start); 576 + write_unlock(&em_tree->lock); 577 + } 578 + 579 + static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, 580 + char *srcdev_name, 581 + struct btrfs_device **device) 582 + { 583 + int ret; 584 + 585 + if (srcdevid) { 586 + ret = 0; 587 + *device = btrfs_find_device(root->fs_info, srcdevid, NULL, 588 + NULL); 589 + if (!*device) 590 + ret = -ENOENT; 591 + } else { 592 + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, 593 + device); 594 + } 595 + return ret; 596 + } 597 + 598 + void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, 599 + struct btrfs_ioctl_dev_replace_args *args) 600 + { 601 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 602 + 603 + btrfs_dev_replace_lock(dev_replace); 604 + /* even if !dev_replace_is_valid, the 
values are good enough for 605 + * the replace_status ioctl */ 606 + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 607 + args->status.replace_state = dev_replace->replace_state; 608 + args->status.time_started = dev_replace->time_started; 609 + args->status.time_stopped = dev_replace->time_stopped; 610 + args->status.num_write_errors = 611 + atomic64_read(&dev_replace->num_write_errors); 612 + args->status.num_uncorrectable_read_errors = 613 + atomic64_read(&dev_replace->num_uncorrectable_read_errors); 614 + switch (dev_replace->replace_state) { 615 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 616 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 617 + args->status.progress_1000 = 0; 618 + break; 619 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 620 + args->status.progress_1000 = 1000; 621 + break; 622 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 623 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 624 + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, 625 + div64_u64(dev_replace->srcdev->total_bytes, 1000)); 626 + break; 627 + } 628 + btrfs_dev_replace_unlock(dev_replace); 629 + } 630 + 631 + int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, 632 + struct btrfs_ioctl_dev_replace_args *args) 633 + { 634 + args->result = __btrfs_dev_replace_cancel(fs_info); 635 + return 0; 636 + } 637 + 638 + static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) 639 + { 640 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 641 + struct btrfs_device *tgt_device = NULL; 642 + struct btrfs_trans_handle *trans; 643 + struct btrfs_root *root = fs_info->tree_root; 644 + u64 result; 645 + int ret; 646 + 647 + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 648 + btrfs_dev_replace_lock(dev_replace); 649 + switch (dev_replace->replace_state) { 650 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 651 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 652 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 653 + result = 
BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 654 + btrfs_dev_replace_unlock(dev_replace); 655 + goto leave; 656 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 657 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 658 + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 659 + tgt_device = dev_replace->tgtdev; 660 + dev_replace->tgtdev = NULL; 661 + dev_replace->srcdev = NULL; 662 + break; 663 + } 664 + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; 665 + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); 666 + dev_replace->item_needs_writeback = 1; 667 + btrfs_dev_replace_unlock(dev_replace); 668 + btrfs_scrub_cancel(fs_info); 669 + 670 + trans = btrfs_start_transaction(root, 0); 671 + if (IS_ERR(trans)) { 672 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 673 + return PTR_ERR(trans); 674 + } 675 + ret = btrfs_commit_transaction(trans, root); 676 + WARN_ON(ret); 677 + if (tgt_device) 678 + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 679 + 680 + leave: 681 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 682 + return result; 683 + } 684 + 685 + void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) 686 + { 687 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 688 + 689 + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 690 + btrfs_dev_replace_lock(dev_replace); 691 + switch (dev_replace->replace_state) { 692 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 693 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 694 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 695 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 696 + break; 697 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 698 + dev_replace->replace_state = 699 + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 700 + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); 701 + dev_replace->item_needs_writeback = 1; 702 + pr_info("btrfs: suspending dev_replace for unmount\n"); 703 + break; 704 + } 705 + 706 + 
btrfs_dev_replace_unlock(dev_replace); 707 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 708 + } 709 + 710 + /* resume dev_replace procedure that was interrupted by unmount */ 711 + int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) 712 + { 713 + struct task_struct *task; 714 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 715 + 716 + btrfs_dev_replace_lock(dev_replace); 717 + switch (dev_replace->replace_state) { 718 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 719 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 720 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 721 + btrfs_dev_replace_unlock(dev_replace); 722 + return 0; 723 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 724 + break; 725 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 726 + dev_replace->replace_state = 727 + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 728 + break; 729 + } 730 + if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { 731 + pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" 732 + "btrfs: you may cancel the operation after 'mount -o degraded'\n"); 733 + btrfs_dev_replace_unlock(dev_replace); 734 + return 0; 735 + } 736 + btrfs_dev_replace_unlock(dev_replace); 737 + 738 + WARN_ON(atomic_xchg( 739 + &fs_info->mutually_exclusive_operation_running, 1)); 740 + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); 741 + return PTR_RET(task); 742 + } 743 + 744 + static int btrfs_dev_replace_kthread(void *data) 745 + { 746 + struct btrfs_fs_info *fs_info = data; 747 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 748 + struct btrfs_ioctl_dev_replace_args *status_args; 749 + u64 progress; 750 + 751 + status_args = kzalloc(sizeof(*status_args), GFP_NOFS); 752 + if (status_args) { 753 + btrfs_dev_replace_status(fs_info, status_args); 754 + progress = status_args->status.progress_1000; 755 + kfree(status_args); 756 + do_div(progress, 10); 757 + printk_in_rcu(KERN_INFO 758 + "btrfs: continuing 
dev_replace from %s (devid %llu) to %s @%u%%\n", 759 + dev_replace->srcdev->missing ? "<missing disk>" : 760 + rcu_str_deref(dev_replace->srcdev->name), 761 + dev_replace->srcdev->devid, 762 + dev_replace->tgtdev ? 763 + rcu_str_deref(dev_replace->tgtdev->name) : 764 + "<missing target disk>", 765 + (unsigned int)progress); 766 + } 767 + btrfs_dev_replace_continue_on_mount(fs_info); 768 + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 769 + 770 + return 0; 771 + } 772 + 773 + static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) 774 + { 775 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 776 + int ret; 777 + 778 + ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 779 + dev_replace->committed_cursor_left, 780 + dev_replace->srcdev->total_bytes, 781 + &dev_replace->scrub_progress, 0, 1); 782 + ret = btrfs_dev_replace_finishing(fs_info, ret); 783 + WARN_ON(ret); 784 + return 0; 785 + } 786 + 787 + int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) 788 + { 789 + if (!dev_replace->is_valid) 790 + return 0; 791 + 792 + switch (dev_replace->replace_state) { 793 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 794 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 795 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 796 + return 0; 797 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 798 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 799 + /* 800 + * return true even if tgtdev is missing (this is 801 + * something that can happen if the dev_replace 802 + * procedure is suspended by an umount and then 803 + * the tgtdev is missing (or "btrfs dev scan") was 804 + * not called and the the filesystem is remounted 805 + * in degraded state. This does not stop the 806 + * dev_replace procedure. It needs to be canceled 807 + * manually if the cancelation is wanted. 
808 + */ 809 + break; 810 + } 811 + return 1; 812 + } 813 + 814 + void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) 815 + { 816 + /* the beginning is just an optimization for the typical case */ 817 + if (atomic_read(&dev_replace->nesting_level) == 0) { 818 + acquire_lock: 819 + /* this is not a nested case where the same thread 820 + * is trying to acqurire the same lock twice */ 821 + mutex_lock(&dev_replace->lock); 822 + mutex_lock(&dev_replace->lock_management_lock); 823 + dev_replace->lock_owner = current->pid; 824 + atomic_inc(&dev_replace->nesting_level); 825 + mutex_unlock(&dev_replace->lock_management_lock); 826 + return; 827 + } 828 + 829 + mutex_lock(&dev_replace->lock_management_lock); 830 + if (atomic_read(&dev_replace->nesting_level) > 0 && 831 + dev_replace->lock_owner == current->pid) { 832 + WARN_ON(!mutex_is_locked(&dev_replace->lock)); 833 + atomic_inc(&dev_replace->nesting_level); 834 + mutex_unlock(&dev_replace->lock_management_lock); 835 + return; 836 + } 837 + 838 + mutex_unlock(&dev_replace->lock_management_lock); 839 + goto acquire_lock; 840 + } 841 + 842 + void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) 843 + { 844 + WARN_ON(!mutex_is_locked(&dev_replace->lock)); 845 + mutex_lock(&dev_replace->lock_management_lock); 846 + WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); 847 + WARN_ON(dev_replace->lock_owner != current->pid); 848 + atomic_dec(&dev_replace->nesting_level); 849 + if (atomic_read(&dev_replace->nesting_level) == 0) { 850 + dev_replace->lock_owner = 0; 851 + mutex_unlock(&dev_replace->lock_management_lock); 852 + mutex_unlock(&dev_replace->lock); 853 + } else { 854 + mutex_unlock(&dev_replace->lock_management_lock); 855 + } 856 + }
+44
fs/btrfs/dev-replace.h
··· 1 + /* 2 + * Copyright (C) STRATO AG 2012. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public 6 + * License v2 as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public 14 + * License along with this program; if not, write to the 15 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 + * Boston, MA 021110-1307, USA. 17 + */ 18 + 19 + #if !defined(__BTRFS_DEV_REPLACE__) 20 + #define __BTRFS_DEV_REPLACE__ 21 + 22 + struct btrfs_ioctl_dev_replace_args; 23 + 24 + int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); 25 + int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, 26 + struct btrfs_fs_info *fs_info); 27 + void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); 28 + int btrfs_dev_replace_start(struct btrfs_root *root, 29 + struct btrfs_ioctl_dev_replace_args *args); 30 + void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, 31 + struct btrfs_ioctl_dev_replace_args *args); 32 + int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, 33 + struct btrfs_ioctl_dev_replace_args *args); 34 + void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); 35 + int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); 36 + int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); 37 + void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); 38 + void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); 39 + 40 + static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) 41 + { 42 + atomic64_inc(stat_value); 43 + } 44 + #endif
+59
fs/btrfs/dir-item.c
··· 213 213 return btrfs_match_dir_item_name(root, path, name, name_len); 214 214 } 215 215 216 + int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, 217 + const char *name, int name_len) 218 + { 219 + int ret; 220 + struct btrfs_key key; 221 + struct btrfs_dir_item *di; 222 + int data_size; 223 + struct extent_buffer *leaf; 224 + int slot; 225 + struct btrfs_path *path; 226 + 227 + 228 + path = btrfs_alloc_path(); 229 + if (!path) 230 + return -ENOMEM; 231 + 232 + key.objectid = dir; 233 + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 234 + key.offset = btrfs_name_hash(name, name_len); 235 + 236 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 237 + 238 + /* return back any errors */ 239 + if (ret < 0) 240 + goto out; 241 + 242 + /* nothing found, we're safe */ 243 + if (ret > 0) { 244 + ret = 0; 245 + goto out; 246 + } 247 + 248 + /* we found an item, look for our name in the item */ 249 + di = btrfs_match_dir_item_name(root, path, name, name_len); 250 + if (di) { 251 + /* our exact name was found */ 252 + ret = -EEXIST; 253 + goto out; 254 + } 255 + 256 + /* 257 + * see if there is room in the item to insert this 258 + * name 259 + */ 260 + data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); 261 + leaf = path->nodes[0]; 262 + slot = path->slots[0]; 263 + if (data_size + btrfs_item_size_nr(leaf, slot) + 264 + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { 265 + ret = -EOVERFLOW; 266 + } else { 267 + /* plenty of insertion room */ 268 + ret = 0; 269 + } 270 + out: 271 + btrfs_free_path(path); 272 + return ret; 273 + } 274 + 216 275 /* 217 276 * lookup a directory item based on index. 'dir' is the objectid 218 277 * we're searching in, and 'mod' tells us if you plan on deleting the
+92 -50
fs/btrfs/disk-io.c
··· 45 45 #include "inode-map.h" 46 46 #include "check-integrity.h" 47 47 #include "rcu-string.h" 48 + #include "dev-replace.h" 48 49 49 50 #ifdef CONFIG_X86 50 51 #include <asm/cpufeature.h> ··· 388 387 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 389 388 break; 390 389 391 - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 390 + num_copies = btrfs_num_copies(root->fs_info, 392 391 eb->start, eb->len); 393 392 if (num_copies == 1) 394 393 break; ··· 853 852 int mirror_num, unsigned long bio_flags, 854 853 u64 bio_offset) 855 854 { 855 + int ret; 856 + 856 857 /* 857 858 * when we're called for a write, we're already in the async 858 859 * submission context. Just jump into btrfs_map_bio 859 860 */ 860 - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 861 + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 + if (ret) 863 + bio_endio(bio, ret); 864 + return ret; 861 865 } 862 866 863 867 static int check_async_write(struct inode *inode, unsigned long bio_flags) ··· 884 878 int ret; 885 879 886 880 if (!(rw & REQ_WRITE)) { 887 - 888 881 /* 889 882 * called for a read, do the setup so that checksum validation 890 883 * can happen in the async kernel threads ··· 891 886 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 892 887 bio, 1); 893 888 if (ret) 894 - return ret; 895 - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 896 - mirror_num, 0); 889 + goto out_w_error; 890 + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 891 + mirror_num, 0); 897 892 } else if (!async) { 898 893 ret = btree_csum_one_bio(bio); 899 894 if (ret) 900 - return ret; 901 - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 902 - mirror_num, 0); 895 + goto out_w_error; 896 + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 897 + mirror_num, 0); 898 + } else { 899 + /* 900 + * kthread helpers are used to submit writes so that 901 + * checksumming can happen in parallel across all CPUs 902 + */ 903 + ret = 
btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 904 + inode, rw, bio, mirror_num, 0, 905 + bio_offset, 906 + __btree_submit_bio_start, 907 + __btree_submit_bio_done); 903 908 } 904 909 905 - /* 906 - * kthread helpers are used to submit writes so that checksumming 907 - * can happen in parallel across all CPUs 908 - */ 909 - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 910 - inode, rw, bio, mirror_num, 0, 911 - bio_offset, 912 - __btree_submit_bio_start, 913 - __btree_submit_bio_done); 910 + if (ret) { 911 + out_w_error: 912 + bio_endio(bio, ret); 913 + } 914 + return ret; 914 915 } 915 916 916 917 #ifdef CONFIG_MIGRATION ··· 1001 990 1002 991 static int btree_set_page_dirty(struct page *page) 1003 992 { 993 + #ifdef DEBUG 1004 994 struct extent_buffer *eb; 1005 995 1006 996 BUG_ON(!PagePrivate(page)); ··· 1010 998 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1011 999 BUG_ON(!atomic_read(&eb->refs)); 1012 1000 btrfs_assert_tree_locked(eb); 1001 + #endif 1013 1002 return __set_page_dirty_nobuffers(page); 1014 1003 } 1015 1004 ··· 1142 1129 root->fs_info->dirty_metadata_bytes); 1143 1130 } 1144 1131 spin_unlock(&root->fs_info->delalloc_lock); 1145 - } 1146 1132 1147 - /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1148 - btrfs_set_lock_blocking(buf); 1149 - clear_extent_buffer_dirty(buf); 1133 + /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1134 + btrfs_set_lock_blocking(buf); 1135 + clear_extent_buffer_dirty(buf); 1136 + } 1150 1137 } 1151 1138 } 1152 1139 ··· 1206 1193 root->root_key.objectid = objectid; 1207 1194 root->anon_dev = 0; 1208 1195 1209 - spin_lock_init(&root->root_times_lock); 1196 + spin_lock_init(&root->root_item_lock); 1210 1197 } 1211 1198 1212 1199 static int __must_check find_and_setup_root(struct btrfs_root *tree_root, ··· 2144 2131 init_rwsem(&fs_info->extent_commit_sem); 2145 2132 init_rwsem(&fs_info->cleanup_work_sem); 2146 2133 init_rwsem(&fs_info->subvol_sem); 2134 + 
fs_info->dev_replace.lock_owner = 0; 2135 + atomic_set(&fs_info->dev_replace.nesting_level, 0); 2136 + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); 2137 + mutex_init(&fs_info->dev_replace.lock_management_lock); 2138 + mutex_init(&fs_info->dev_replace.lock); 2147 2139 2148 2140 spin_lock_init(&fs_info->qgroup_lock); 2149 2141 fs_info->qgroup_tree = RB_ROOT; ··· 2297 2279 fs_info->thread_pool_size, 2298 2280 &fs_info->generic_worker); 2299 2281 2282 + btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", 2283 + fs_info->thread_pool_size, 2284 + &fs_info->generic_worker); 2285 + 2300 2286 btrfs_init_workers(&fs_info->submit_workers, "submit", 2301 2287 min_t(u64, fs_devices->num_devices, 2302 2288 fs_info->thread_pool_size), ··· 2372 2350 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2373 2351 ret |= btrfs_start_workers(&fs_info->caching_workers); 2374 2352 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2353 + ret |= btrfs_start_workers(&fs_info->flush_workers); 2375 2354 if (ret) { 2376 2355 err = -ENOMEM; 2377 2356 goto fail_sb_buffer; ··· 2441 2418 goto fail_tree_roots; 2442 2419 } 2443 2420 2444 - btrfs_close_extra_devices(fs_devices); 2421 + /* 2422 + * keep the device that is marked to be the target device for the 2423 + * dev_replace procedure 2424 + */ 2425 + btrfs_close_extra_devices(fs_info, fs_devices, 0); 2445 2426 2446 2427 if (!fs_devices->latest_bdev) { 2447 2428 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", ··· 2517 2490 goto fail_block_groups; 2518 2491 } 2519 2492 2493 + ret = btrfs_init_dev_replace(fs_info); 2494 + if (ret) { 2495 + pr_err("btrfs: failed to init dev_replace: %d\n", ret); 2496 + goto fail_block_groups; 2497 + } 2498 + 2499 + btrfs_close_extra_devices(fs_info, fs_devices, 1); 2500 + 2520 2501 ret = btrfs_init_space_info(fs_info); 2521 2502 if (ret) { 2522 2503 printk(KERN_ERR "Failed to initial space info: %d\n", ret); ··· 2538 2503 } 2539 2504 
fs_info->num_tolerated_disk_barrier_failures = 2540 2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2506 + if (fs_info->fs_devices->missing_devices > 2507 + fs_info->num_tolerated_disk_barrier_failures && 2508 + !(sb->s_flags & MS_RDONLY)) { 2509 + printk(KERN_WARNING 2510 + "Btrfs: too many missing devices, writeable mount is not allowed\n"); 2511 + goto fail_block_groups; 2512 + } 2541 2513 2542 2514 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2543 2515 "btrfs-cleaner"); ··· 2673 2631 return ret; 2674 2632 } 2675 2633 2634 + ret = btrfs_resume_dev_replace_async(fs_info); 2635 + if (ret) { 2636 + pr_warn("btrfs: failed to resume dev_replace\n"); 2637 + close_ctree(tree_root); 2638 + return ret; 2639 + } 2640 + 2676 2641 return 0; 2677 2642 2678 2643 fail_qgroup: ··· 2716 2667 btrfs_stop_workers(&fs_info->submit_workers); 2717 2668 btrfs_stop_workers(&fs_info->delayed_workers); 2718 2669 btrfs_stop_workers(&fs_info->caching_workers); 2670 + btrfs_stop_workers(&fs_info->flush_workers); 2719 2671 fail_alloc: 2720 2672 fail_iput: 2721 2673 btrfs_mapping_tree_free(&fs_info->mapping_tree); ··· 3320 3270 smp_mb(); 3321 3271 3322 3272 /* pause restriper - we want to resume on mount */ 3323 - btrfs_pause_balance(root->fs_info); 3273 + btrfs_pause_balance(fs_info); 3324 3274 3325 - btrfs_scrub_cancel(root); 3275 + btrfs_dev_replace_suspend_for_unmount(fs_info); 3276 + 3277 + btrfs_scrub_cancel(fs_info); 3326 3278 3327 3279 /* wait for any defraggers to finish */ 3328 3280 wait_event(fs_info->transaction_wait, 3329 3281 (atomic_read(&fs_info->defrag_running) == 0)); 3330 3282 3331 3283 /* clear out the rbtree of defraggable inodes */ 3332 - btrfs_run_defrag_inodes(fs_info); 3284 + btrfs_cleanup_defrag_inodes(fs_info); 3333 3285 3334 3286 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3335 3287 ret = btrfs_commit_super(root); ··· 3391 3339 btrfs_stop_workers(&fs_info->delayed_workers); 3392 3340 btrfs_stop_workers(&fs_info->caching_workers); 
3393 3341 btrfs_stop_workers(&fs_info->readahead_workers); 3342 + btrfs_stop_workers(&fs_info->flush_workers); 3394 3343 3395 3344 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396 3345 if (btrfs_test_opt(root, CHECK_INTEGRITY)) ··· 3436 3383 int was_dirty; 3437 3384 3438 3385 btrfs_assert_tree_locked(buf); 3439 - if (transid != root->fs_info->generation) { 3440 - printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3386 + if (transid != root->fs_info->generation) 3387 + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " 3441 3388 "found %llu running %llu\n", 3442 3389 (unsigned long long)buf->start, 3443 3390 (unsigned long long)transid, 3444 3391 (unsigned long long)root->fs_info->generation); 3445 - WARN_ON(1); 3446 - } 3447 3392 was_dirty = set_extent_buffer_dirty(buf); 3448 3393 if (!was_dirty) { 3449 3394 spin_lock(&root->fs_info->delalloc_lock); ··· 3450 3399 } 3451 3400 } 3452 3401 3453 - void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3402 + static void __btrfs_btree_balance_dirty(struct btrfs_root *root, 3403 + int flush_delayed) 3454 3404 { 3455 3405 /* 3456 3406 * looks as though older kernels can get into trouble with ··· 3463 3411 if (current->flags & PF_MEMALLOC) 3464 3412 return; 3465 3413 3466 - btrfs_balance_delayed_items(root); 3414 + if (flush_delayed) 3415 + btrfs_balance_delayed_items(root); 3467 3416 3468 3417 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 3418 ··· 3475 3422 return; 3476 3423 } 3477 3424 3478 - void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3425 + void btrfs_btree_balance_dirty(struct btrfs_root *root) 3479 3426 { 3480 - /* 3481 - * looks as though older kernels can get into trouble with 3482 - * this code, they end up stuck in balance_dirty_pages forever 3483 - */ 3484 - u64 num_dirty; 3485 - unsigned long thresh = 32 * 1024 * 1024; 3427 + __btrfs_btree_balance_dirty(root, 1); 3428 + } 3486 3429 3487 - if (current->flags & PF_MEMALLOC) 3488 - return; 3489 - 
3490 - num_dirty = root->fs_info->dirty_metadata_bytes; 3491 - 3492 - if (num_dirty > thresh) { 3493 - balance_dirty_pages_ratelimited( 3494 - root->fs_info->btree_inode->i_mapping); 3495 - } 3496 - return; 3430 + void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) 3431 + { 3432 + __btrfs_btree_balance_dirty(root, 0); 3497 3433 } 3498 3434 3499 3435 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+2 -2
fs/btrfs/disk-io.h
··· 62 62 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 63 63 struct btrfs_key *location); 64 64 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 65 - void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66 - void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65 + void btrfs_btree_balance_dirty(struct btrfs_root *root); 66 + void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); 67 67 void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 68 68 void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 69 69 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+135 -92
fs/btrfs/extent-tree.c
··· 33 33 #include "volumes.h" 34 34 #include "locking.h" 35 35 #include "free-space-cache.h" 36 + #include "math.h" 36 37 37 38 #undef SCRAMBLE_DELAYED_REFS 38 39 ··· 648 647 list_for_each_entry_rcu(found, head, list) 649 648 found->full = 0; 650 649 rcu_read_unlock(); 651 - } 652 - 653 - static u64 div_factor(u64 num, int factor) 654 - { 655 - if (factor == 10) 656 - return num; 657 - num *= factor; 658 - do_div(num, 10); 659 - return num; 660 - } 661 - 662 - static u64 div_factor_fine(u64 num, int factor) 663 - { 664 - if (factor == 100) 665 - return num; 666 - num *= factor; 667 - do_div(num, 100); 668 - return num; 669 650 } 670 651 671 652 u64 btrfs_find_block_group(struct btrfs_root *root, ··· 1818 1835 1819 1836 1820 1837 /* Tell the block device(s) that the sectors can be discarded */ 1821 - ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1838 + ret = btrfs_map_block(root->fs_info, REQ_DISCARD, 1822 1839 bytenr, &num_bytes, &bbio, 0); 1823 1840 /* Error condition is -ENOMEM */ 1824 1841 if (!ret) { ··· 2297 2314 kfree(extent_op); 2298 2315 2299 2316 if (ret) { 2317 + list_del_init(&locked_ref->cluster); 2318 + mutex_unlock(&locked_ref->mutex); 2319 + 2300 2320 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2301 2321 spin_lock(&delayed_refs->lock); 2302 2322 return ret; ··· 2342 2356 count++; 2343 2357 2344 2358 if (ret) { 2359 + if (locked_ref) { 2360 + list_del_init(&locked_ref->cluster); 2361 + mutex_unlock(&locked_ref->mutex); 2362 + } 2345 2363 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2346 2364 spin_lock(&delayed_refs->lock); 2347 2365 return ret; ··· 3651 3661 3652 3662 static int can_overcommit(struct btrfs_root *root, 3653 3663 struct btrfs_space_info *space_info, u64 bytes, 3654 - int flush) 3664 + enum btrfs_reserve_flush_enum flush) 3655 3665 { 3656 3666 u64 profile = btrfs_get_alloc_profile(root, 0); 3657 3667 u64 avail; ··· 3675 3685 avail >>= 1; 3676 3686 3677 3687 /* 3678 - * 
If we aren't flushing don't let us overcommit too much, say 3679 - * 1/8th of the space. If we can flush, let it overcommit up to 3680 - * 1/2 of the space. 3688 + * If we aren't flushing all things, let us overcommit up to 3689 + * 1/2th of the space. If we can flush, don't let us overcommit 3690 + * too much, let it overcommit up to 1/8 of the space. 3681 3691 */ 3682 - if (flush) 3692 + if (flush == BTRFS_RESERVE_FLUSH_ALL) 3683 3693 avail >>= 3; 3684 3694 else 3685 3695 avail >>= 1; 3686 3696 3687 3697 if (used + bytes < space_info->total_bytes + avail) 3688 3698 return 1; 3699 + return 0; 3700 + } 3701 + 3702 + static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, 3703 + unsigned long nr_pages, 3704 + enum wb_reason reason) 3705 + { 3706 + if (!writeback_in_progress(sb->s_bdi) && 3707 + down_read_trylock(&sb->s_umount)) { 3708 + writeback_inodes_sb_nr(sb, nr_pages, reason); 3709 + up_read(&sb->s_umount); 3710 + return 1; 3711 + } 3712 + 3689 3713 return 0; 3690 3714 } 3691 3715 ··· 3717 3713 long time_left; 3718 3714 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3719 3715 int loops = 0; 3716 + enum btrfs_reserve_flush_enum flush; 3720 3717 3721 3718 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 3719 block_rsv = &root->fs_info->delalloc_block_rsv; ··· 3735 3730 while (delalloc_bytes && loops < 3) { 3736 3731 max_reclaim = min(delalloc_bytes, to_reclaim); 3737 3732 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3738 - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3739 - WB_REASON_FS_FREE_SPACE); 3733 + writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, 3734 + nr_pages, 3735 + WB_REASON_FS_FREE_SPACE); 3740 3736 3741 3737 /* 3742 3738 * We need to wait for the async pages to actually start before ··· 3746 3740 wait_event(root->fs_info->async_submit_wait, 3747 3741 !atomic_read(&root->fs_info->async_delalloc_pages)); 3748 3742 3743 + if (!trans) 3744 + flush = BTRFS_RESERVE_FLUSH_ALL; 3745 + else 
3746 + flush = BTRFS_RESERVE_NO_FLUSH; 3749 3747 spin_lock(&space_info->lock); 3750 - if (can_overcommit(root, space_info, orig, !trans)) { 3748 + if (can_overcommit(root, space_info, orig, flush)) { 3751 3749 spin_unlock(&space_info->lock); 3752 3750 break; 3753 3751 } ··· 3909 3899 */ 3910 3900 static int reserve_metadata_bytes(struct btrfs_root *root, 3911 3901 struct btrfs_block_rsv *block_rsv, 3912 - u64 orig_bytes, int flush) 3902 + u64 orig_bytes, 3903 + enum btrfs_reserve_flush_enum flush) 3913 3904 { 3914 3905 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 3906 u64 used; ··· 3923 3912 ret = 0; 3924 3913 spin_lock(&space_info->lock); 3925 3914 /* 3926 - * We only want to wait if somebody other than us is flushing and we are 3927 - * actually alloed to flush. 3915 + * We only want to wait if somebody other than us is flushing and we 3916 + * are actually allowed to flush all things. 3928 3917 */ 3929 - while (flush && !flushing && space_info->flush) { 3918 + while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 3919 + space_info->flush) { 3930 3920 spin_unlock(&space_info->lock); 3931 3921 /* 3932 3922 * If we have a trans handle we can't wait because the flusher ··· 3993 3981 * Couldn't make our reservation, save our place so while we're trying 3994 3982 * to reclaim space we can actually use it instead of somebody else 3995 3983 * stealing it from us. 3984 + * 3985 + * We make the other tasks wait for the flush only when we can flush 3986 + * all things. 
3996 3987 */ 3997 - if (ret && flush) { 3988 + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { 3998 3989 flushing = true; 3999 3990 space_info->flush = 1; 4000 3991 } 4001 3992 4002 3993 spin_unlock(&space_info->lock); 4003 3994 4004 - if (!ret || !flush) 3995 + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4005 3996 goto out; 4006 3997 4007 3998 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4008 3999 flush_state); 4009 4000 flush_state++; 4001 + 4002 + /* 4003 + * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock 4004 + * would happen. So skip delalloc flush. 4005 + */ 4006 + if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4007 + (flush_state == FLUSH_DELALLOC || 4008 + flush_state == FLUSH_DELALLOC_WAIT)) 4009 + flush_state = ALLOC_CHUNK; 4010 + 4010 4011 if (!ret) 4011 4012 goto again; 4012 - else if (flush_state <= COMMIT_TRANS) 4013 + else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4014 + flush_state < COMMIT_TRANS) 4015 + goto again; 4016 + else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4017 + flush_state <= COMMIT_TRANS) 4013 4018 goto again; 4014 4019 4015 4020 out: ··· 4177 4148 kfree(rsv); 4178 4149 } 4179 4150 4180 - static inline int __block_rsv_add(struct btrfs_root *root, 4181 - struct btrfs_block_rsv *block_rsv, 4182 - u64 num_bytes, int flush) 4151 + int btrfs_block_rsv_add(struct btrfs_root *root, 4152 + struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4153 + enum btrfs_reserve_flush_enum flush) 4183 4154 { 4184 4155 int ret; 4185 4156 ··· 4193 4164 } 4194 4165 4195 4166 return ret; 4196 - } 4197 - 4198 - int btrfs_block_rsv_add(struct btrfs_root *root, 4199 - struct btrfs_block_rsv *block_rsv, 4200 - u64 num_bytes) 4201 - { 4202 - return __block_rsv_add(root, block_rsv, num_bytes, 1); 4203 - } 4204 - 4205 - int btrfs_block_rsv_add_noflush(struct btrfs_root *root, 4206 - struct btrfs_block_rsv *block_rsv, 4207 - u64 num_bytes) 4208 - { 4209 - return __block_rsv_add(root, block_rsv, num_bytes, 0); 4210 4167 } 4211 4168 4212 4169 
int btrfs_block_rsv_check(struct btrfs_root *root, ··· 4213 4198 return ret; 4214 4199 } 4215 4200 4216 - static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4217 - struct btrfs_block_rsv *block_rsv, 4218 - u64 min_reserved, int flush) 4201 + int btrfs_block_rsv_refill(struct btrfs_root *root, 4202 + struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4203 + enum btrfs_reserve_flush_enum flush) 4219 4204 { 4220 4205 u64 num_bytes = 0; 4221 4206 int ret = -ENOSPC; ··· 4241 4226 } 4242 4227 4243 4228 return ret; 4244 - } 4245 - 4246 - int btrfs_block_rsv_refill(struct btrfs_root *root, 4247 - struct btrfs_block_rsv *block_rsv, 4248 - u64 min_reserved) 4249 - { 4250 - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); 4251 - } 4252 - 4253 - int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, 4254 - struct btrfs_block_rsv *block_rsv, 4255 - u64 min_reserved) 4256 - { 4257 - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); 4258 4229 } 4259 4230 4260 4231 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, ··· 4533 4532 u64 csum_bytes; 4534 4533 unsigned nr_extents = 0; 4535 4534 int extra_reserve = 0; 4536 - int flush = 1; 4535 + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4537 4536 int ret; 4537 + bool delalloc_lock = true; 4538 4538 4539 - /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 - if (btrfs_is_free_space_inode(inode)) 4541 - flush = 0; 4539 + /* If we are a free space inode we need to not flush since we will be in 4540 + * the middle of a transaction commit. We also don't need the delalloc 4541 + * mutex since we won't race with anybody. We need this mostly to make 4542 + * lockdep shut its filthy mouth. 
4543 + */ 4544 + if (btrfs_is_free_space_inode(inode)) { 4545 + flush = BTRFS_RESERVE_NO_FLUSH; 4546 + delalloc_lock = false; 4547 + } 4542 4548 4543 - if (flush && btrfs_transaction_in_commit(root->fs_info)) 4549 + if (flush != BTRFS_RESERVE_NO_FLUSH && 4550 + btrfs_transaction_in_commit(root->fs_info)) 4544 4551 schedule_timeout(1); 4545 4552 4546 - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4553 + if (delalloc_lock) 4554 + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4555 + 4547 4556 num_bytes = ALIGN(num_bytes, root->sectorsize); 4548 4557 4549 4558 spin_lock(&BTRFS_I(inode)->lock); ··· 4583 4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4584 4573 nr_extents * root->leafsize); 4585 4574 if (ret) { 4586 - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4575 + spin_lock(&BTRFS_I(inode)->lock); 4576 + calc_csum_metadata_size(inode, num_bytes, 0); 4577 + spin_unlock(&BTRFS_I(inode)->lock); 4578 + if (delalloc_lock) 4579 + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4587 4580 return ret; 4588 4581 } 4589 4582 } ··· 4622 4607 btrfs_ino(inode), 4623 4608 to_free, 0); 4624 4609 } 4625 - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4610 + if (root->fs_info->quota_enabled) { 4611 + btrfs_qgroup_free(root, num_bytes + 4612 + nr_extents * root->leafsize); 4613 + } 4614 + if (delalloc_lock) 4615 + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4626 4616 return ret; 4627 4617 } 4628 4618 ··· 4639 4619 } 4640 4620 BTRFS_I(inode)->reserved_extents += nr_extents; 4641 4621 spin_unlock(&BTRFS_I(inode)->lock); 4642 - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4622 + 4623 + if (delalloc_lock) 4624 + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4643 4625 4644 4626 if (to_reserve) 4645 4627 trace_btrfs_space_reservation(root->fs_info,"delalloc", ··· 4991 4969 { 4992 4970 struct btrfs_fs_info *fs_info = root->fs_info; 4993 4971 struct btrfs_block_group_cache *cache = NULL; 4972 + struct btrfs_space_info *space_info; 4973 + struct btrfs_block_rsv *global_rsv = 
&fs_info->global_block_rsv; 4994 4974 u64 len; 4975 + bool readonly; 4995 4976 4996 4977 while (start <= end) { 4978 + readonly = false; 4997 4979 if (!cache || 4998 4980 start >= cache->key.objectid + cache->key.offset) { 4999 4981 if (cache) ··· 5015 4989 } 5016 4990 5017 4991 start += len; 4992 + space_info = cache->space_info; 5018 4993 5019 - spin_lock(&cache->space_info->lock); 4994 + spin_lock(&space_info->lock); 5020 4995 spin_lock(&cache->lock); 5021 4996 cache->pinned -= len; 5022 - cache->space_info->bytes_pinned -= len; 5023 - if (cache->ro) 5024 - cache->space_info->bytes_readonly += len; 4997 + space_info->bytes_pinned -= len; 4998 + if (cache->ro) { 4999 + space_info->bytes_readonly += len; 5000 + readonly = true; 5001 + } 5025 5002 spin_unlock(&cache->lock); 5026 - spin_unlock(&cache->space_info->lock); 5003 + if (!readonly && global_rsv->space_info == space_info) { 5004 + spin_lock(&global_rsv->lock); 5005 + if (!global_rsv->full) { 5006 + len = min(len, global_rsv->size - 5007 + global_rsv->reserved); 5008 + global_rsv->reserved += len; 5009 + space_info->bytes_may_use += len; 5010 + if (global_rsv->reserved >= global_rsv->size) 5011 + global_rsv->full = 1; 5012 + } 5013 + spin_unlock(&global_rsv->lock); 5014 + } 5015 + spin_unlock(&space_info->lock); 5027 5016 } 5028 5017 5029 5018 if (cache) ··· 5507 5466 return 0; 5508 5467 } 5509 5468 5510 - static int __get_block_group_index(u64 flags) 5469 + int __get_raid_index(u64 flags) 5511 5470 { 5512 5471 int index; 5513 5472 ··· 5527 5486 5528 5487 static int get_block_group_index(struct btrfs_block_group_cache *cache) 5529 5488 { 5530 - return __get_block_group_index(cache->flags); 5489 + return __get_raid_index(cache->flags); 5531 5490 } 5532 5491 5533 5492 enum btrfs_loop_type { ··· 6310 6269 block_rsv = get_block_rsv(trans, root); 6311 6270 6312 6271 if (block_rsv->size == 0) { 6313 - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6272 + ret = reserve_metadata_bytes(root, block_rsv, 
blocksize, 6273 + BTRFS_RESERVE_NO_FLUSH); 6314 6274 /* 6315 6275 * If we couldn't reserve metadata bytes try and use some from 6316 6276 * the global reserve. ··· 6334 6292 static DEFINE_RATELIMIT_STATE(_rs, 6335 6293 DEFAULT_RATELIMIT_INTERVAL, 6336 6294 /*DEFAULT_RATELIMIT_BURST*/ 2); 6337 - if (__ratelimit(&_rs)) { 6338 - printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); 6339 - WARN_ON(1); 6340 - } 6341 - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6295 + if (__ratelimit(&_rs)) 6296 + WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", 6297 + ret); 6298 + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6299 + BTRFS_RESERVE_NO_FLUSH); 6342 6300 if (!ret) { 6343 6301 return block_rsv; 6344 6302 } else if (ret && block_rsv != global_rsv) { ··· 7469 7427 */ 7470 7428 target = get_restripe_target(root->fs_info, block_group->flags); 7471 7429 if (target) { 7472 - index = __get_block_group_index(extended_to_chunk(target)); 7430 + index = __get_raid_index(extended_to_chunk(target)); 7473 7431 } else { 7474 7432 /* 7475 7433 * this is just a balance, so if we were marked as full ··· 7503 7461 * check to make sure we can actually find a chunk with enough 7504 7462 * space to fit our block group in. 7505 7463 */ 7506 - if (device->total_bytes > device->bytes_used + min_free) { 7464 + if (device->total_bytes > device->bytes_used + min_free && 7465 + !device->is_tgtdev_for_dev_replace) { 7507 7466 ret = find_free_dev_extent(device, min_free, 7508 7467 &dev_offset, NULL); 7509 7468 if (!ret)
+14 -23
fs/btrfs/extent_io.c
··· 341 341 { 342 342 struct rb_node *node; 343 343 344 - if (end < start) { 345 - printk(KERN_ERR "btrfs end < start %llu %llu\n", 344 + if (end < start) 345 + WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", 346 346 (unsigned long long)end, 347 347 (unsigned long long)start); 348 - WARN_ON(1); 349 - } 350 348 state->start = start; 351 349 state->end = end; 352 350 ··· 1917 1919 * the standard behavior is to write all copies in a raid setup. here we only 1918 1920 * want to write the one bad copy. so we do the mapping for ourselves and issue 1919 1921 * submit_bio directly. 1920 - * to avoid any synchonization issues, wait for the data after writing, which 1922 + * to avoid any synchronization issues, wait for the data after writing, which 1921 1923 * actually prevents the read that triggered the error from finishing. 1922 1924 * currently, there can be no more than two copies of every data bit. thus, 1923 1925 * exactly one rewrite is required. 1924 1926 */ 1925 - int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1927 + int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 1926 1928 u64 length, u64 logical, struct page *page, 1927 1929 int mirror_num) 1928 1930 { ··· 1944 1946 bio->bi_size = 0; 1945 1947 map_length = length; 1946 1948 1947 - ret = btrfs_map_block(map_tree, WRITE, logical, 1949 + ret = btrfs_map_block(fs_info, WRITE, logical, 1948 1950 &map_length, &bbio, mirror_num); 1949 1951 if (ret) { 1950 1952 bio_put(bio); ··· 1982 1984 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1983 1985 int mirror_num) 1984 1986 { 1985 - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 1986 1987 u64 start = eb->start; 1987 1988 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1988 1989 int ret = 0; 1989 1990 1990 1991 for (i = 0; i < num_pages; i++) { 1991 1992 struct page *p = extent_buffer_page(eb, i); 1992 - ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1993 
+ ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, 1993 1994 start, p, mirror_num); 1994 1995 if (ret) 1995 1996 break; ··· 2007 2010 u64 private; 2008 2011 u64 private_failure; 2009 2012 struct io_failure_record *failrec; 2010 - struct btrfs_mapping_tree *map_tree; 2013 + struct btrfs_fs_info *fs_info; 2011 2014 struct extent_state *state; 2012 2015 int num_copies; 2013 2016 int did_repair = 0; ··· 2043 2046 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2044 2047 2045 2048 if (state && state->start == failrec->start) { 2046 - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2047 - num_copies = btrfs_num_copies(map_tree, failrec->logical, 2048 - failrec->len); 2049 + fs_info = BTRFS_I(inode)->root->fs_info; 2050 + num_copies = btrfs_num_copies(fs_info, failrec->logical, 2051 + failrec->len); 2049 2052 if (num_copies > 1) { 2050 - ret = repair_io_failure(map_tree, start, failrec->len, 2053 + ret = repair_io_failure(fs_info, start, failrec->len, 2051 2054 failrec->logical, page, 2052 2055 failrec->failed_mirror); 2053 2056 did_repair = !ret; ··· 2156 2159 * clean_io_failure() clean all those errors at once. 2157 2160 */ 2158 2161 } 2159 - num_copies = btrfs_num_copies( 2160 - &BTRFS_I(inode)->root->fs_info->mapping_tree, 2161 - failrec->logical, failrec->len); 2162 + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, 2163 + failrec->logical, failrec->len); 2162 2164 if (num_copies == 1) { 2163 2165 /* 2164 2166 * we only have a single copy of the data, so don't bother with ··· 2462 2466 return bio; 2463 2467 } 2464 2468 2465 - /* 2466 - * Since writes are async, they will only return -ENOMEM. 2467 - * Reads can return the full range of I/O error conditions. 
2468 - */ 2469 2469 static int __must_check submit_one_bio(int rw, struct bio *bio, 2470 2470 int mirror_num, unsigned long bio_flags) 2471 2471 { ··· 4713 4721 } 4714 4722 4715 4723 if (start + min_len > eb->len) { 4716 - printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4724 + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4717 4725 "wanted %lu %lu\n", (unsigned long long)eb->start, 4718 4726 eb->len, start, min_len); 4719 - WARN_ON(1); 4720 4727 return -EINVAL; 4721 4728 } 4722 4729
+2 -2
fs/btrfs/extent_io.h
··· 337 337 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 338 338 gfp_t gfp_flags); 339 339 340 - struct btrfs_mapping_tree; 340 + struct btrfs_fs_info; 341 341 342 - int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 342 + int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 343 343 u64 length, u64 logical, struct page *page, 344 344 int mirror_num); 345 345 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+10 -14
fs/btrfs/extent_map.c
··· 49 49 struct extent_map *alloc_extent_map(void) 50 50 { 51 51 struct extent_map *em; 52 - em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); 52 + em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); 53 53 if (!em) 54 54 return NULL; 55 55 em->in_tree = 0; ··· 198 198 merge = rb_entry(rb, struct extent_map, rb_node); 199 199 if (rb && mergable_maps(merge, em)) { 200 200 em->start = merge->start; 201 + em->orig_start = merge->orig_start; 201 202 em->len += merge->len; 202 203 em->block_len += merge->block_len; 203 204 em->block_start = merge->block_start; 204 205 merge->in_tree = 0; 205 - if (merge->generation > em->generation) { 206 - em->mod_start = em->start; 207 - em->mod_len = em->len; 208 - em->generation = merge->generation; 209 - list_move(&em->list, &tree->modified_extents); 210 - } 206 + em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; 207 + em->mod_start = merge->mod_start; 208 + em->generation = max(em->generation, merge->generation); 209 + list_move(&em->list, &tree->modified_extents); 211 210 212 211 list_del_init(&merge->list); 213 212 rb_erase(&merge->rb_node, &tree->map); ··· 222 223 em->block_len += merge->len; 223 224 rb_erase(&merge->rb_node, &tree->map); 224 225 merge->in_tree = 0; 225 - if (merge->generation > em->generation) { 226 - em->mod_len = em->len; 227 - em->generation = merge->generation; 228 - list_move(&em->list, &tree->modified_extents); 229 - } 226 + em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; 227 + em->generation = max(em->generation, merge->generation); 230 228 list_del_init(&merge->list); 231 229 free_extent_map(merge); 232 230 } ··· 261 265 em->mod_start = em->start; 262 266 em->mod_len = em->len; 263 267 264 - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 268 + if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { 265 269 prealloc = true; 266 - clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); 270 + clear_bit(EXTENT_FLAG_FILLING, &em->flags); 267 271 } 268 272 269 273 try_merge_map(tree, em);
+2
fs/btrfs/extent_map.h
··· 14 14 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 15 15 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 16 16 #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ 17 + #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ 17 18 18 19 struct extent_map { 19 20 struct rb_node rb_node; ··· 25 24 u64 mod_start; 26 25 u64 mod_len; 27 26 u64 orig_start; 27 + u64 orig_block_len; 28 28 u64 block_start; 29 29 u64 block_len; 30 30 u64 generation;
+20 -1
fs/btrfs/file-item.c
··· 133 133 return ERR_PTR(ret); 134 134 } 135 135 136 - 137 136 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 138 137 struct btrfs_root *root, 139 138 struct btrfs_path *path, u64 objectid, ··· 150 151 return ret; 151 152 } 152 153 154 + u64 btrfs_file_extent_length(struct btrfs_path *path) 155 + { 156 + int extent_type; 157 + struct btrfs_file_extent_item *fi; 158 + u64 len; 159 + 160 + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 161 + struct btrfs_file_extent_item); 162 + extent_type = btrfs_file_extent_type(path->nodes[0], fi); 163 + 164 + if (extent_type == BTRFS_FILE_EXTENT_REG || 165 + extent_type == BTRFS_FILE_EXTENT_PREALLOC) 166 + len = btrfs_file_extent_num_bytes(path->nodes[0], fi); 167 + else if (extent_type == BTRFS_FILE_EXTENT_INLINE) 168 + len = btrfs_file_extent_inline_len(path->nodes[0], fi); 169 + else 170 + BUG(); 171 + 172 + return len; 173 + } 153 174 154 175 static int __btrfs_lookup_bio_sums(struct btrfs_root *root, 155 176 struct inode *inode, struct bio *bio,
+265 -143
fs/btrfs/file.c
··· 41 41 #include "compat.h" 42 42 #include "volumes.h" 43 43 44 + static struct kmem_cache *btrfs_inode_defrag_cachep; 44 45 /* 45 46 * when auto defrag is enabled we 46 47 * queue up these defrag structs to remember which ··· 91 90 * If an existing record is found the defrag item you 92 91 * pass in is freed 93 92 */ 94 - static void __btrfs_add_inode_defrag(struct inode *inode, 93 + static int __btrfs_add_inode_defrag(struct inode *inode, 95 94 struct inode_defrag *defrag) 96 95 { 97 96 struct btrfs_root *root = BTRFS_I(inode)->root; ··· 119 118 entry->transid = defrag->transid; 120 119 if (defrag->last_offset > entry->last_offset) 121 120 entry->last_offset = defrag->last_offset; 122 - goto exists; 121 + return -EEXIST; 123 122 } 124 123 } 125 124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 126 125 rb_link_node(&defrag->rb_node, parent, p); 127 126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 128 - return; 127 + return 0; 128 + } 129 129 130 - exists: 131 - kfree(defrag); 132 - return; 130 + static inline int __need_auto_defrag(struct btrfs_root *root) 131 + { 132 + if (!btrfs_test_opt(root, AUTO_DEFRAG)) 133 + return 0; 133 134 135 + if (btrfs_fs_closing(root->fs_info)) 136 + return 0; 137 + 138 + return 1; 134 139 } 135 140 136 141 /* ··· 149 142 struct btrfs_root *root = BTRFS_I(inode)->root; 150 143 struct inode_defrag *defrag; 151 144 u64 transid; 145 + int ret; 152 146 153 - if (!btrfs_test_opt(root, AUTO_DEFRAG)) 154 - return 0; 155 - 156 - if (btrfs_fs_closing(root->fs_info)) 147 + if (!__need_auto_defrag(root)) 157 148 return 0; 158 149 159 150 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) ··· 162 157 else 163 158 transid = BTRFS_I(inode)->root->last_trans; 164 159 165 - defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 160 + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); 166 161 if (!defrag) 167 162 return -ENOMEM; 168 163 ··· 171 166 defrag->root = root->root_key.objectid; 
172 167 173 168 spin_lock(&root->fs_info->defrag_inodes_lock); 174 - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 175 - __btrfs_add_inode_defrag(inode, defrag); 176 - else 177 - kfree(defrag); 169 + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { 170 + /* 171 + * If we set IN_DEFRAG flag and evict the inode from memory, 172 + * and then re-read this inode, this new inode doesn't have 173 + * IN_DEFRAG flag. At the case, we may find the existed defrag. 174 + */ 175 + ret = __btrfs_add_inode_defrag(inode, defrag); 176 + if (ret) 177 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 178 + } else { 179 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 180 + } 178 181 spin_unlock(&root->fs_info->defrag_inodes_lock); 179 182 return 0; 180 183 } 181 184 182 185 /* 183 - * must be called with the defrag_inodes lock held 186 + * Requeue the defrag object. If there is a defrag object that points to 187 + * the same inode in the tree, we will merge them together (by 188 + * __btrfs_add_inode_defrag()) and free the one that we want to requeue. 184 189 */ 185 - struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 186 - u64 root, u64 ino, 187 - struct rb_node **next) 190 + void btrfs_requeue_inode_defrag(struct inode *inode, 191 + struct inode_defrag *defrag) 192 + { 193 + struct btrfs_root *root = BTRFS_I(inode)->root; 194 + int ret; 195 + 196 + if (!__need_auto_defrag(root)) 197 + goto out; 198 + 199 + /* 200 + * Here we don't check the IN_DEFRAG flag, because we need merge 201 + * them together. 202 + */ 203 + spin_lock(&root->fs_info->defrag_inodes_lock); 204 + ret = __btrfs_add_inode_defrag(inode, defrag); 205 + spin_unlock(&root->fs_info->defrag_inodes_lock); 206 + if (ret) 207 + goto out; 208 + return; 209 + out: 210 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 211 + } 212 + 213 + /* 214 + * pick the defragable inode that we want, if it doesn't exist, we will get 215 + * the next one. 
216 + */ 217 + static struct inode_defrag * 218 + btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) 188 219 { 189 220 struct inode_defrag *entry = NULL; 190 221 struct inode_defrag tmp; ··· 231 190 tmp.ino = ino; 232 191 tmp.root = root; 233 192 234 - p = info->defrag_inodes.rb_node; 193 + spin_lock(&fs_info->defrag_inodes_lock); 194 + p = fs_info->defrag_inodes.rb_node; 235 195 while (p) { 236 196 parent = p; 237 197 entry = rb_entry(parent, struct inode_defrag, rb_node); ··· 243 201 else if (ret > 0) 244 202 p = parent->rb_right; 245 203 else 246 - return entry; 204 + goto out; 247 205 } 248 206 249 - if (next) { 250 - while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 251 - parent = rb_next(parent); 207 + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { 208 + parent = rb_next(parent); 209 + if (parent) 252 210 entry = rb_entry(parent, struct inode_defrag, rb_node); 253 - } 254 - *next = parent; 211 + else 212 + entry = NULL; 255 213 } 256 - return NULL; 214 + out: 215 + if (entry) 216 + rb_erase(parent, &fs_info->defrag_inodes); 217 + spin_unlock(&fs_info->defrag_inodes_lock); 218 + return entry; 219 + } 220 + 221 + void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) 222 + { 223 + struct inode_defrag *defrag; 224 + struct rb_node *node; 225 + 226 + spin_lock(&fs_info->defrag_inodes_lock); 227 + node = rb_first(&fs_info->defrag_inodes); 228 + while (node) { 229 + rb_erase(node, &fs_info->defrag_inodes); 230 + defrag = rb_entry(node, struct inode_defrag, rb_node); 231 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 232 + 233 + if (need_resched()) { 234 + spin_unlock(&fs_info->defrag_inodes_lock); 235 + cond_resched(); 236 + spin_lock(&fs_info->defrag_inodes_lock); 237 + } 238 + 239 + node = rb_first(&fs_info->defrag_inodes); 240 + } 241 + spin_unlock(&fs_info->defrag_inodes_lock); 242 + } 243 + 244 + #define BTRFS_DEFRAG_BATCH 1024 245 + 246 + static int __btrfs_run_defrag_inode(struct btrfs_fs_info 
*fs_info, 247 + struct inode_defrag *defrag) 248 + { 249 + struct btrfs_root *inode_root; 250 + struct inode *inode; 251 + struct btrfs_key key; 252 + struct btrfs_ioctl_defrag_range_args range; 253 + int num_defrag; 254 + 255 + /* get the inode */ 256 + key.objectid = defrag->root; 257 + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 258 + key.offset = (u64)-1; 259 + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 260 + if (IS_ERR(inode_root)) { 261 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 262 + return PTR_ERR(inode_root); 263 + } 264 + 265 + key.objectid = defrag->ino; 266 + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 267 + key.offset = 0; 268 + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 269 + if (IS_ERR(inode)) { 270 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 271 + return PTR_ERR(inode); 272 + } 273 + 274 + /* do a chunk of defrag */ 275 + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 276 + memset(&range, 0, sizeof(range)); 277 + range.len = (u64)-1; 278 + range.start = defrag->last_offset; 279 + 280 + sb_start_write(fs_info->sb); 281 + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 282 + BTRFS_DEFRAG_BATCH); 283 + sb_end_write(fs_info->sb); 284 + /* 285 + * if we filled the whole defrag batch, there 286 + * must be more work to do. Queue this defrag 287 + * again 288 + */ 289 + if (num_defrag == BTRFS_DEFRAG_BATCH) { 290 + defrag->last_offset = range.start; 291 + btrfs_requeue_inode_defrag(inode, defrag); 292 + } else if (defrag->last_offset && !defrag->cycled) { 293 + /* 294 + * we didn't fill our defrag batch, but 295 + * we didn't start at zero. Make sure we loop 296 + * around to the start of the file. 
297 + */ 298 + defrag->last_offset = 0; 299 + defrag->cycled = 1; 300 + btrfs_requeue_inode_defrag(inode, defrag); 301 + } else { 302 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 303 + } 304 + 305 + iput(inode); 306 + return 0; 257 307 } 258 308 259 309 /* ··· 355 221 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 356 222 { 357 223 struct inode_defrag *defrag; 358 - struct btrfs_root *inode_root; 359 - struct inode *inode; 360 - struct rb_node *n; 361 - struct btrfs_key key; 362 - struct btrfs_ioctl_defrag_range_args range; 363 224 u64 first_ino = 0; 364 225 u64 root_objectid = 0; 365 - int num_defrag; 366 - int defrag_batch = 1024; 367 - 368 - memset(&range, 0, sizeof(range)); 369 - range.len = (u64)-1; 370 226 371 227 atomic_inc(&fs_info->defrag_running); 372 - spin_lock(&fs_info->defrag_inodes_lock); 373 228 while(1) { 374 - n = NULL; 229 + if (!__need_auto_defrag(fs_info->tree_root)) 230 + break; 375 231 376 232 /* find an inode to defrag */ 377 - defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 378 - first_ino, &n); 233 + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, 234 + first_ino); 379 235 if (!defrag) { 380 - if (n) { 381 - defrag = rb_entry(n, struct inode_defrag, 382 - rb_node); 383 - } else if (root_objectid || first_ino) { 236 + if (root_objectid || first_ino) { 384 237 root_objectid = 0; 385 238 first_ino = 0; 386 239 continue; ··· 376 255 } 377 256 } 378 257 379 - /* remove it from the rbtree */ 380 258 first_ino = defrag->ino + 1; 381 259 root_objectid = defrag->root; 382 - rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 383 260 384 - if (btrfs_fs_closing(fs_info)) 385 - goto next_free; 386 - 387 - spin_unlock(&fs_info->defrag_inodes_lock); 388 - 389 - /* get the inode */ 390 - key.objectid = defrag->root; 391 - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 392 - key.offset = (u64)-1; 393 - inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 394 - if (IS_ERR(inode_root)) 395 - goto next; 396 - 397 - 
key.objectid = defrag->ino; 398 - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 399 - key.offset = 0; 400 - 401 - inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 402 - if (IS_ERR(inode)) 403 - goto next; 404 - 405 - /* do a chunk of defrag */ 406 - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 407 - range.start = defrag->last_offset; 408 - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 409 - defrag_batch); 410 - /* 411 - * if we filled the whole defrag batch, there 412 - * must be more work to do. Queue this defrag 413 - * again 414 - */ 415 - if (num_defrag == defrag_batch) { 416 - defrag->last_offset = range.start; 417 - __btrfs_add_inode_defrag(inode, defrag); 418 - /* 419 - * we don't want to kfree defrag, we added it back to 420 - * the rbtree 421 - */ 422 - defrag = NULL; 423 - } else if (defrag->last_offset && !defrag->cycled) { 424 - /* 425 - * we didn't fill our defrag batch, but 426 - * we didn't start at zero. Make sure we loop 427 - * around to the start of the file. 
428 - */ 429 - defrag->last_offset = 0; 430 - defrag->cycled = 1; 431 - __btrfs_add_inode_defrag(inode, defrag); 432 - defrag = NULL; 433 - } 434 - 435 - iput(inode); 436 - next: 437 - spin_lock(&fs_info->defrag_inodes_lock); 438 - next_free: 439 - kfree(defrag); 261 + __btrfs_run_defrag_inode(fs_info, defrag); 440 262 } 441 - spin_unlock(&fs_info->defrag_inodes_lock); 442 - 443 263 atomic_dec(&fs_info->defrag_running); 444 264 445 265 /* ··· 588 526 split->block_len = em->block_len; 589 527 else 590 528 split->block_len = split->len; 529 + split->orig_block_len = max(split->block_len, 530 + em->orig_block_len); 591 531 split->generation = gen; 592 532 split->bdev = em->bdev; 593 533 split->flags = flags; ··· 611 547 split->flags = flags; 612 548 split->compress_type = em->compress_type; 613 549 split->generation = gen; 550 + split->orig_block_len = max(em->block_len, 551 + em->orig_block_len); 614 552 615 553 if (compressed) { 616 554 split->block_len = em->block_len; ··· 621 555 } else { 622 556 split->block_len = split->len; 623 557 split->block_start = em->block_start + diff; 624 - split->orig_start = split->start; 558 + split->orig_start = em->orig_start; 625 559 } 626 560 627 561 ret = add_extent_mapping(em_tree, split); ··· 1414 1348 1415 1349 balance_dirty_pages_ratelimited(inode->i_mapping); 1416 1350 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1417 - btrfs_btree_balance_dirty(root, 1); 1351 + btrfs_btree_balance_dirty(root); 1418 1352 1419 1353 pos += copied; 1420 1354 num_written += copied; ··· 1463 1397 return written ? 
written : err; 1464 1398 } 1465 1399 1400 + static void update_time_for_write(struct inode *inode) 1401 + { 1402 + struct timespec now; 1403 + 1404 + if (IS_NOCMTIME(inode)) 1405 + return; 1406 + 1407 + now = current_fs_time(inode->i_sb); 1408 + if (!timespec_equal(&inode->i_mtime, &now)) 1409 + inode->i_mtime = now; 1410 + 1411 + if (!timespec_equal(&inode->i_ctime, &now)) 1412 + inode->i_ctime = now; 1413 + 1414 + if (IS_I_VERSION(inode)) 1415 + inode_inc_iversion(inode); 1416 + } 1417 + 1466 1418 static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1467 1419 const struct iovec *iov, 1468 1420 unsigned long nr_segs, loff_t pos) ··· 1493 1409 ssize_t num_written = 0; 1494 1410 ssize_t err = 0; 1495 1411 size_t count, ocount; 1412 + bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); 1496 1413 1497 1414 sb_start_write(inode->i_sb); 1498 1415 ··· 1536 1451 goto out; 1537 1452 } 1538 1453 1539 - err = file_update_time(file); 1540 - if (err) { 1541 - mutex_unlock(&inode->i_mutex); 1542 - goto out; 1543 - } 1454 + /* 1455 + * We reserve space for updating the inode when we reserve space for the 1456 + * extent we are going to write, so we will enospc out there. We don't 1457 + * need to start yet another transaction to update the inode as we will 1458 + * update the inode when we finish writing whatever data we write. 1459 + */ 1460 + update_time_for_write(inode); 1544 1461 1545 1462 start_pos = round_down(pos, root->sectorsize); 1546 1463 if (start_pos > i_size_read(inode)) { ··· 1552 1465 goto out; 1553 1466 } 1554 1467 } 1468 + 1469 + if (sync) 1470 + atomic_inc(&BTRFS_I(inode)->sync_writers); 1555 1471 1556 1472 if (unlikely(file->f_flags & O_DIRECT)) { 1557 1473 num_written = __btrfs_direct_write(iocb, iov, nr_segs, ··· 1582 1492 * this will either be one more than the running transaction 1583 1493 * or the generation used for the next transaction if there isn't 1584 1494 * one running right now. 
1495 + * 1496 + * We also have to set last_sub_trans to the current log transid, 1497 + * otherwise subsequent syncs to a file that's been synced in this 1498 + * transaction will appear to have already occured. 1585 1499 */ 1586 1500 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1501 + BTRFS_I(inode)->last_sub_trans = root->log_transid; 1587 1502 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1588 1503 err = generic_write_sync(file, pos, num_written); 1589 1504 if (err < 0 && num_written > 0) 1590 1505 num_written = err; 1591 1506 } 1592 1507 out: 1508 + if (sync) 1509 + atomic_dec(&BTRFS_I(inode)->sync_writers); 1593 1510 sb_end_write(inode->i_sb); 1594 1511 current->backing_dev_info = NULL; 1595 1512 return num_written ? num_written : err; ··· 1647 1550 * out of the ->i_mutex. If so, we can flush the dirty pages by 1648 1551 * multi-task, and make the performance up. 1649 1552 */ 1553 + atomic_inc(&BTRFS_I(inode)->sync_writers); 1650 1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1555 + atomic_dec(&BTRFS_I(inode)->sync_writers); 1651 1556 if (ret) 1652 1557 return ret; 1653 1558 ··· 1660 1561 * range being left. 
1661 1562 */ 1662 1563 atomic_inc(&root->log_batch); 1663 - btrfs_wait_ordered_range(inode, start, end); 1564 + btrfs_wait_ordered_range(inode, start, end - start + 1); 1664 1565 atomic_inc(&root->log_batch); 1665 1566 1666 1567 /* ··· 1866 1767 1867 1768 hole_em->block_start = EXTENT_MAP_HOLE; 1868 1769 hole_em->block_len = 0; 1770 + hole_em->orig_block_len = 0; 1869 1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1870 1772 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1871 1773 hole_em->generation = trans->transid; ··· 1896 1796 struct btrfs_path *path; 1897 1797 struct btrfs_block_rsv *rsv; 1898 1798 struct btrfs_trans_handle *trans; 1899 - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1900 - u64 lockstart = (offset + mask) & ~mask; 1901 - u64 lockend = ((offset + len) & ~mask) - 1; 1799 + u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 1800 + u64 lockend = round_down(offset + len, 1801 + BTRFS_I(inode)->root->sectorsize) - 1; 1902 1802 u64 cur_offset = lockstart; 1903 1803 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1904 1804 u64 drop_end; 1905 - unsigned long nr; 1906 1805 int ret = 0; 1907 1806 int err = 0; 1908 - bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1909 - ((offset + len) >> PAGE_CACHE_SHIFT); 1807 + bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 1808 + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 1910 1809 1911 1810 btrfs_wait_ordered_range(inode, offset, len); 1912 1811 1913 1812 mutex_lock(&inode->i_mutex); 1914 - if (offset >= inode->i_size) { 1915 - mutex_unlock(&inode->i_mutex); 1916 - return 0; 1917 - } 1918 - 1813 + /* 1814 + * We needn't truncate any page which is beyond the end of the file 1815 + * because we are sure there is no data there. 1816 + */ 1919 1817 /* 1920 1818 * Only do this if we are in the same page and we aren't doing the 1921 1819 * entire page. 
1922 1820 */ 1923 1821 if (same_page && len < PAGE_CACHE_SIZE) { 1924 - ret = btrfs_truncate_page(inode, offset, len, 0); 1822 + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) 1823 + ret = btrfs_truncate_page(inode, offset, len, 0); 1925 1824 mutex_unlock(&inode->i_mutex); 1926 1825 return ret; 1927 1826 } 1928 1827 1929 1828 /* zero back part of the first page */ 1930 - ret = btrfs_truncate_page(inode, offset, 0, 0); 1931 - if (ret) { 1932 - mutex_unlock(&inode->i_mutex); 1933 - return ret; 1829 + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 1830 + ret = btrfs_truncate_page(inode, offset, 0, 0); 1831 + if (ret) { 1832 + mutex_unlock(&inode->i_mutex); 1833 + return ret; 1834 + } 1934 1835 } 1935 1836 1936 1837 /* zero the front end of the last page */ 1937 - ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1938 - if (ret) { 1939 - mutex_unlock(&inode->i_mutex); 1940 - return ret; 1838 + if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 1839 + ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1840 + if (ret) { 1841 + mutex_unlock(&inode->i_mutex); 1842 + return ret; 1843 + } 1941 1844 } 1942 1845 1943 1846 if (lockend < lockstart) { ··· 2033 1930 break; 2034 1931 } 2035 1932 2036 - nr = trans->blocks_used; 2037 1933 btrfs_end_transaction(trans, root); 2038 - btrfs_btree_balance_dirty(root, nr); 1934 + btrfs_btree_balance_dirty(root); 2039 1935 2040 1936 trans = btrfs_start_transaction(root, 3); 2041 1937 if (IS_ERR(trans)) { ··· 2065 1963 if (!trans) 2066 1964 goto out_free; 2067 1965 1966 + inode_inc_iversion(inode); 1967 + inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1968 + 2068 1969 trans->block_rsv = &root->fs_info->trans_block_rsv; 2069 1970 ret = btrfs_update_inode(trans, root, inode); 2070 - nr = trans->blocks_used; 2071 1971 btrfs_end_transaction(trans, root); 2072 - btrfs_btree_balance_dirty(root, nr); 1972 + btrfs_btree_balance_dirty(root); 2073 1973 out_free: 2074 1974 btrfs_free_path(path); 2075 1975 
btrfs_free_block_rsv(root, rsv); ··· 2095 1991 u64 alloc_end; 2096 1992 u64 alloc_hint = 0; 2097 1993 u64 locked_end; 2098 - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 2099 1994 struct extent_map *em; 1995 + int blocksize = BTRFS_I(inode)->root->sectorsize; 2100 1996 int ret; 2101 1997 2102 - alloc_start = offset & ~mask; 2103 - alloc_end = (offset + len + mask) & ~mask; 1998 + alloc_start = round_down(offset, blocksize); 1999 + alloc_end = round_up(offset + len, blocksize); 2104 2000 2105 2001 /* Make sure we aren't being give some crap mode */ 2106 2002 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) ··· 2113 2009 * Make sure we have enough space before we do the 2114 2010 * allocation. 2115 2011 */ 2116 - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2012 + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); 2117 2013 if (ret) 2118 2014 return ret; 2119 2015 ··· 2181 2077 } 2182 2078 last_byte = min(extent_map_end(em), alloc_end); 2183 2079 actual_end = min_t(u64, extent_map_end(em), offset + len); 2184 - last_byte = (last_byte + mask) & ~mask; 2080 + last_byte = ALIGN(last_byte, blocksize); 2185 2081 2186 2082 if (em->block_start == EXTENT_MAP_HOLE || 2187 2083 (cur_offset >= inode->i_size && ··· 2220 2116 out: 2221 2117 mutex_unlock(&inode->i_mutex); 2222 2118 /* Let go of our reservation. 
*/ 2223 - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2119 + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); 2224 2120 return ret; 2225 2121 } 2226 2122 ··· 2396 2292 .compat_ioctl = btrfs_ioctl, 2397 2293 #endif 2398 2294 }; 2295 + 2296 + void btrfs_auto_defrag_exit(void) 2297 + { 2298 + if (btrfs_inode_defrag_cachep) 2299 + kmem_cache_destroy(btrfs_inode_defrag_cachep); 2300 + } 2301 + 2302 + int btrfs_auto_defrag_init(void) 2303 + { 2304 + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", 2305 + sizeof(struct inode_defrag), 0, 2306 + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 2307 + NULL); 2308 + if (!btrfs_inode_defrag_cachep) 2309 + return -ENOMEM; 2310 + 2311 + return 0; 2312 + }
+19 -32
fs/btrfs/free-space-cache.c
··· 307 307 308 308 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 309 309 { 310 - WARN_ON(io_ctl->cur); 311 310 BUG_ON(io_ctl->index >= io_ctl->num_pages); 312 311 io_ctl->page = io_ctl->pages[io_ctl->index++]; 313 312 io_ctl->cur = kmap(io_ctl->page); ··· 1249 1250 * if previous extent entry covers the offset, 1250 1251 * we should return it instead of the bitmap entry 1251 1252 */ 1252 - n = &entry->offset_index; 1253 - while (1) { 1254 - n = rb_prev(n); 1255 - if (!n) 1256 - break; 1253 + n = rb_prev(&entry->offset_index); 1254 + if (n) { 1257 1255 prev = rb_entry(n, struct btrfs_free_space, 1258 1256 offset_index); 1259 - if (!prev->bitmap) { 1260 - if (prev->offset + prev->bytes > offset) 1261 - entry = prev; 1262 - break; 1263 - } 1257 + if (!prev->bitmap && 1258 + prev->offset + prev->bytes > offset) 1259 + entry = prev; 1264 1260 } 1265 1261 } 1266 1262 return entry; ··· 1281 1287 } 1282 1288 1283 1289 if (entry->bitmap) { 1284 - n = &entry->offset_index; 1285 - while (1) { 1286 - n = rb_prev(n); 1287 - if (!n) 1288 - break; 1290 + n = rb_prev(&entry->offset_index); 1291 + if (n) { 1289 1292 prev = rb_entry(n, struct btrfs_free_space, 1290 1293 offset_index); 1291 - if (!prev->bitmap) { 1292 - if (prev->offset + prev->bytes > offset) 1293 - return prev; 1294 - break; 1295 - } 1294 + if (!prev->bitmap && 1295 + prev->offset + prev->bytes > offset) 1296 + return prev; 1296 1297 } 1297 1298 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1298 1299 return entry; ··· 1353 1364 u64 bitmap_bytes; 1354 1365 u64 extent_bytes; 1355 1366 u64 size = block_group->key.offset; 1356 - u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1367 + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1357 1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1358 1369 1359 1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); ··· 1639 1650 * some block groups are so tiny they can't be enveloped by a bitmap, so 1640 1651 * don't even 
bother to create a bitmap for this 1641 1652 */ 1642 - if (BITS_PER_BITMAP * block_group->sectorsize > 1643 - block_group->key.offset) 1653 + if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) 1644 1654 return false; 1645 1655 1646 1656 return true; ··· 2286 2298 unsigned long total_found = 0; 2287 2299 int ret; 2288 2300 2289 - i = offset_to_bit(entry->offset, block_group->sectorsize, 2301 + i = offset_to_bit(entry->offset, ctl->unit, 2290 2302 max_t(u64, offset, entry->offset)); 2291 - want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2292 - min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2303 + want_bits = bytes_to_bits(bytes, ctl->unit); 2304 + min_bits = bytes_to_bits(min_bytes, ctl->unit); 2293 2305 2294 2306 again: 2295 2307 found_bits = 0; ··· 2313 2325 2314 2326 total_found += found_bits; 2315 2327 2316 - if (cluster->max_size < found_bits * block_group->sectorsize) 2317 - cluster->max_size = found_bits * block_group->sectorsize; 2328 + if (cluster->max_size < found_bits * ctl->unit) 2329 + cluster->max_size = found_bits * ctl->unit; 2318 2330 2319 2331 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2320 2332 i = next_zero + 1; 2321 2333 goto again; 2322 2334 } 2323 2335 2324 - cluster->window_start = start * block_group->sectorsize + 2325 - entry->offset; 2336 + cluster->window_start = start * ctl->unit + entry->offset; 2326 2337 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2327 2338 ret = tree_insert_offset(&cluster->root, entry->offset, 2328 2339 &entry->offset_index, 1); 2329 2340 BUG_ON(ret); /* -EEXIST; Logic error */ 2330 2341 2331 2342 trace_btrfs_setup_cluster(block_group, cluster, 2332 - total_found * block_group->sectorsize, 1); 2343 + total_found * ctl->unit, 1); 2333 2344 return 0; 2334 2345 } 2335 2346
+3 -2
fs/btrfs/inode-map.c
··· 434 434 * 3 items for pre-allocation 435 435 */ 436 436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); 437 - ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, 438 - trans->bytes_reserved); 437 + ret = btrfs_block_rsv_add(root, trans->block_rsv, 438 + trans->bytes_reserved, 439 + BTRFS_RESERVE_NO_FLUSH); 439 440 if (ret) 440 441 goto out; 441 442 trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+294 -190
fs/btrfs/inode.c
··· 71 71 static struct extent_io_ops btrfs_extent_io_ops; 72 72 73 73 static struct kmem_cache *btrfs_inode_cachep; 74 + static struct kmem_cache *btrfs_delalloc_work_cachep; 74 75 struct kmem_cache *btrfs_trans_handle_cachep; 75 76 struct kmem_cache *btrfs_transaction_cachep; 76 77 struct kmem_cache *btrfs_path_cachep; ··· 95 94 struct page *locked_page, 96 95 u64 start, u64 end, int *page_started, 97 96 unsigned long *nr_written, int unlock); 97 + static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 98 + u64 len, u64 orig_start, 99 + u64 block_start, u64 block_len, 100 + u64 orig_block_len, int type); 98 101 99 102 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 100 103 struct inode *inode, struct inode *dir, ··· 703 698 704 699 em->block_start = ins.objectid; 705 700 em->block_len = ins.offset; 701 + em->orig_block_len = ins.offset; 706 702 em->bdev = root->fs_info->fs_devices->latest_bdev; 707 703 em->compress_type = async_extent->compress_type; 708 704 set_bit(EXTENT_FLAG_PINNED, &em->flags); 709 705 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 706 + em->generation = -1; 710 707 711 708 while (1) { 712 709 write_lock(&em_tree->lock); 713 710 ret = add_extent_mapping(em_tree, em); 711 + if (!ret) 712 + list_move(&em->list, 713 + &em_tree->modified_extents); 714 714 write_unlock(&em_tree->lock); 715 715 if (ret != -EEXIST) { 716 716 free_extent_map(em); ··· 813 803 * required to start IO on it. It may be clean and already done with 814 804 * IO when we return. 
815 805 */ 816 - static noinline int cow_file_range(struct inode *inode, 817 - struct page *locked_page, 818 - u64 start, u64 end, int *page_started, 819 - unsigned long *nr_written, 820 - int unlock) 806 + static noinline int __cow_file_range(struct btrfs_trans_handle *trans, 807 + struct inode *inode, 808 + struct btrfs_root *root, 809 + struct page *locked_page, 810 + u64 start, u64 end, int *page_started, 811 + unsigned long *nr_written, 812 + int unlock) 821 813 { 822 - struct btrfs_root *root = BTRFS_I(inode)->root; 823 - struct btrfs_trans_handle *trans; 824 814 u64 alloc_hint = 0; 825 815 u64 num_bytes; 826 816 unsigned long ram_size; ··· 833 823 int ret = 0; 834 824 835 825 BUG_ON(btrfs_is_free_space_inode(inode)); 836 - trans = btrfs_join_transaction(root); 837 - if (IS_ERR(trans)) { 838 - extent_clear_unlock_delalloc(inode, 839 - &BTRFS_I(inode)->io_tree, 840 - start, end, locked_page, 841 - EXTENT_CLEAR_UNLOCK_PAGE | 842 - EXTENT_CLEAR_UNLOCK | 843 - EXTENT_CLEAR_DELALLOC | 844 - EXTENT_CLEAR_DIRTY | 845 - EXTENT_SET_WRITEBACK | 846 - EXTENT_END_WRITEBACK); 847 - return PTR_ERR(trans); 848 - } 849 - trans->block_rsv = &root->fs_info->delalloc_block_rsv; 850 826 851 827 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 852 828 num_bytes = max(blocksize, num_bytes); 853 829 disk_num_bytes = num_bytes; 854 - ret = 0; 855 830 856 831 /* if this is a small write inside eof, kick off defrag */ 857 832 if (num_bytes < 64 * 1024 && ··· 895 900 896 901 em->block_start = ins.objectid; 897 902 em->block_len = ins.offset; 903 + em->orig_block_len = ins.offset; 898 904 em->bdev = root->fs_info->fs_devices->latest_bdev; 899 905 set_bit(EXTENT_FLAG_PINNED, &em->flags); 906 + em->generation = -1; 900 907 901 908 while (1) { 902 909 write_lock(&em_tree->lock); 903 910 ret = add_extent_mapping(em_tree, em); 911 + if (!ret) 912 + list_move(&em->list, 913 + &em_tree->modified_extents); 904 914 write_unlock(&em_tree->lock); 905 915 if (ret != -EEXIST) { 906 916 
free_extent_map(em); ··· 952 952 alloc_hint = ins.objectid + ins.offset; 953 953 start += cur_alloc_size; 954 954 } 955 - ret = 0; 956 955 out: 957 - btrfs_end_transaction(trans, root); 958 - 959 956 return ret; 957 + 960 958 out_unlock: 961 959 extent_clear_unlock_delalloc(inode, 962 960 &BTRFS_I(inode)->io_tree, ··· 967 969 EXTENT_END_WRITEBACK); 968 970 969 971 goto out; 972 + } 973 + 974 + static noinline int cow_file_range(struct inode *inode, 975 + struct page *locked_page, 976 + u64 start, u64 end, int *page_started, 977 + unsigned long *nr_written, 978 + int unlock) 979 + { 980 + struct btrfs_trans_handle *trans; 981 + struct btrfs_root *root = BTRFS_I(inode)->root; 982 + int ret; 983 + 984 + trans = btrfs_join_transaction(root); 985 + if (IS_ERR(trans)) { 986 + extent_clear_unlock_delalloc(inode, 987 + &BTRFS_I(inode)->io_tree, 988 + start, end, locked_page, 989 + EXTENT_CLEAR_UNLOCK_PAGE | 990 + EXTENT_CLEAR_UNLOCK | 991 + EXTENT_CLEAR_DELALLOC | 992 + EXTENT_CLEAR_DIRTY | 993 + EXTENT_SET_WRITEBACK | 994 + EXTENT_END_WRITEBACK); 995 + return PTR_ERR(trans); 996 + } 997 + trans->block_rsv = &root->fs_info->delalloc_block_rsv; 998 + 999 + ret = __cow_file_range(trans, inode, root, locked_page, start, end, 1000 + page_started, nr_written, unlock); 1001 + 1002 + btrfs_end_transaction(trans, root); 1003 + 1004 + return ret; 970 1005 } 971 1006 972 1007 /* ··· 1157 1126 u64 extent_offset; 1158 1127 u64 disk_bytenr; 1159 1128 u64 num_bytes; 1129 + u64 disk_num_bytes; 1160 1130 int extent_type; 1161 1131 int ret, err; 1162 1132 int type; ··· 1260 1228 extent_offset = btrfs_file_extent_offset(leaf, fi); 1261 1229 extent_end = found_key.offset + 1262 1230 btrfs_file_extent_num_bytes(leaf, fi); 1231 + disk_num_bytes = 1232 + btrfs_file_extent_disk_num_bytes(leaf, fi); 1263 1233 if (extent_end <= start) { 1264 1234 path->slots[0]++; 1265 1235 goto next_slot; ··· 1315 1281 1316 1282 btrfs_release_path(path); 1317 1283 if (cow_start != (u64)-1) { 1318 - ret = 
cow_file_range(inode, locked_page, cow_start, 1319 - found_key.offset - 1, page_started, 1320 - nr_written, 1); 1284 + ret = __cow_file_range(trans, inode, root, locked_page, 1285 + cow_start, found_key.offset - 1, 1286 + page_started, nr_written, 1); 1321 1287 if (ret) { 1322 1288 btrfs_abort_transaction(trans, root, ret); 1323 1289 goto error; ··· 1332 1298 em = alloc_extent_map(); 1333 1299 BUG_ON(!em); /* -ENOMEM */ 1334 1300 em->start = cur_offset; 1335 - em->orig_start = em->start; 1301 + em->orig_start = found_key.offset - extent_offset; 1336 1302 em->len = num_bytes; 1337 1303 em->block_len = num_bytes; 1338 1304 em->block_start = disk_bytenr; 1305 + em->orig_block_len = disk_num_bytes; 1339 1306 em->bdev = root->fs_info->fs_devices->latest_bdev; 1340 1307 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1341 - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 1308 + set_bit(EXTENT_FLAG_FILLING, &em->flags); 1309 + em->generation = -1; 1342 1310 while (1) { 1343 1311 write_lock(&em_tree->lock); 1344 1312 ret = add_extent_mapping(em_tree, em); 1313 + if (!ret) 1314 + list_move(&em->list, 1315 + &em_tree->modified_extents); 1345 1316 write_unlock(&em_tree->lock); 1346 1317 if (ret != -EEXIST) { 1347 1318 free_extent_map(em); ··· 1391 1352 } 1392 1353 1393 1354 if (cow_start != (u64)-1) { 1394 - ret = cow_file_range(inode, locked_page, cow_start, end, 1395 - page_started, nr_written, 1); 1355 + ret = __cow_file_range(trans, inode, root, locked_page, 1356 + cow_start, end, 1357 + page_started, nr_written, 1); 1396 1358 if (ret) { 1397 1359 btrfs_abort_transaction(trans, root, ret); 1398 1360 goto error; ··· 1571 1531 unsigned long bio_flags) 1572 1532 { 1573 1533 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1574 - struct btrfs_mapping_tree *map_tree; 1575 1534 u64 logical = (u64)bio->bi_sector << 9; 1576 1535 u64 length = 0; 1577 1536 u64 map_length; ··· 1580 1541 return 0; 1581 1542 1582 1543 length = bio->bi_size; 1583 - map_tree = 
&root->fs_info->mapping_tree; 1584 1544 map_length = length; 1585 - ret = btrfs_map_block(map_tree, READ, logical, 1545 + ret = btrfs_map_block(root->fs_info, READ, logical, 1586 1546 &map_length, NULL, 0); 1587 - /* Will always return 0 or 1 with map_multi == NULL */ 1547 + /* Will always return 0 with map_multi == NULL */ 1588 1548 BUG_ON(ret < 0); 1589 1549 if (map_length < length + size) 1590 1550 return 1; ··· 1624 1586 u64 bio_offset) 1625 1587 { 1626 1588 struct btrfs_root *root = BTRFS_I(inode)->root; 1627 - return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1589 + int ret; 1590 + 1591 + ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); 1592 + if (ret) 1593 + bio_endio(bio, ret); 1594 + return ret; 1628 1595 } 1629 1596 1630 1597 /* ··· 1644 1601 int ret = 0; 1645 1602 int skip_sum; 1646 1603 int metadata = 0; 1604 + int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 1647 1605 1648 1606 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1649 1607 ··· 1654 1610 if (!(rw & REQ_WRITE)) { 1655 1611 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1656 1612 if (ret) 1657 - return ret; 1613 + goto out; 1658 1614 1659 1615 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1660 - return btrfs_submit_compressed_read(inode, bio, 1661 - mirror_num, bio_flags); 1616 + ret = btrfs_submit_compressed_read(inode, bio, 1617 + mirror_num, 1618 + bio_flags); 1619 + goto out; 1662 1620 } else if (!skip_sum) { 1663 1621 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1664 1622 if (ret) 1665 - return ret; 1623 + goto out; 1666 1624 } 1667 1625 goto mapit; 1668 - } else if (!skip_sum) { 1626 + } else if (async && !skip_sum) { 1669 1627 /* csum items have already been cloned */ 1670 1628 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1671 1629 goto mapit; 1672 1630 /* we're doing a write, do the async checksumming */ 1673 - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1631 + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 
1674 1632 inode, rw, bio, mirror_num, 1675 1633 bio_flags, bio_offset, 1676 1634 __btrfs_submit_bio_start, 1677 1635 __btrfs_submit_bio_done); 1636 + goto out; 1637 + } else if (!skip_sum) { 1638 + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); 1639 + if (ret) 1640 + goto out; 1678 1641 } 1679 1642 1680 1643 mapit: 1681 - return btrfs_map_bio(root, rw, bio, mirror_num, 0); 1644 + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); 1645 + 1646 + out: 1647 + if (ret < 0) 1648 + bio_endio(bio, ret); 1649 + return ret; 1682 1650 } 1683 1651 1684 1652 /* ··· 1713 1657 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1714 1658 struct extent_state **cached_state) 1715 1659 { 1716 - if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1717 - WARN_ON(1); 1660 + WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); 1718 1661 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1719 1662 cached_state, GFP_NOFS); 1720 1663 } ··· 1922 1867 1923 1868 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1924 1869 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1925 - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1926 - if (!ret) { 1927 - if (nolock) 1928 - trans = btrfs_join_transaction_nolock(root); 1929 - else 1930 - trans = btrfs_join_transaction(root); 1931 - if (IS_ERR(trans)) { 1932 - ret = PTR_ERR(trans); 1933 - trans = NULL; 1934 - goto out; 1935 - } 1936 - trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1937 - ret = btrfs_update_inode_fallback(trans, root, inode); 1938 - if (ret) /* -ENOMEM or corruption */ 1939 - btrfs_abort_transaction(trans, root, ret); 1870 + btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1871 + if (nolock) 1872 + trans = btrfs_join_transaction_nolock(root); 1873 + else 1874 + trans = btrfs_join_transaction(root); 1875 + if (IS_ERR(trans)) { 1876 + ret = PTR_ERR(trans); 1877 + trans = NULL; 1878 + goto out; 1940 1879 } 1880 + trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1881 
+ ret = btrfs_update_inode_fallback(trans, root, inode); 1882 + if (ret) /* -ENOMEM or corruption */ 1883 + btrfs_abort_transaction(trans, root, ret); 1941 1884 goto out; 1942 1885 } 1943 1886 ··· 1984 1931 add_pending_csums(trans, inode, ordered_extent->file_offset, 1985 1932 &ordered_extent->list); 1986 1933 1987 - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1988 - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1989 - ret = btrfs_update_inode_fallback(trans, root, inode); 1990 - if (ret) { /* -ENOMEM or corruption */ 1991 - btrfs_abort_transaction(trans, root, ret); 1992 - goto out_unlock; 1993 - } 1994 - } else { 1995 - btrfs_set_inode_last_trans(trans, inode); 1934 + btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1935 + ret = btrfs_update_inode_fallback(trans, root, inode); 1936 + if (ret) { /* -ENOMEM or corruption */ 1937 + btrfs_abort_transaction(trans, root, ret); 1938 + goto out_unlock; 1996 1939 } 1997 1940 ret = 0; 1998 1941 out_unlock: ··· 3123 3074 struct btrfs_trans_handle *trans; 3124 3075 struct inode *inode = dentry->d_inode; 3125 3076 int ret; 3126 - unsigned long nr = 0; 3127 3077 3128 3078 trans = __unlink_start_trans(dir, dentry); 3129 3079 if (IS_ERR(trans)) ··· 3142 3094 } 3143 3095 3144 3096 out: 3145 - nr = trans->blocks_used; 3146 3097 __unlink_end_trans(trans, root); 3147 - btrfs_btree_balance_dirty(root, nr); 3098 + btrfs_btree_balance_dirty(root); 3148 3099 return ret; 3149 3100 } 3150 3101 ··· 3233 3186 int err = 0; 3234 3187 struct btrfs_root *root = BTRFS_I(dir)->root; 3235 3188 struct btrfs_trans_handle *trans; 3236 - unsigned long nr = 0; 3237 3189 3238 3190 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 3239 3191 return -ENOTEMPTY; ··· 3261 3215 if (!err) 3262 3216 btrfs_i_size_write(inode, 0); 3263 3217 out: 3264 - nr = trans->blocks_used; 3265 3218 __unlink_end_trans(trans, root); 3266 - btrfs_btree_balance_dirty(root, nr); 3219 + btrfs_btree_balance_dirty(root); 3267 3220 3268 3221 
return err; 3269 3222 } ··· 3542 3497 if (ret) 3543 3498 goto out; 3544 3499 3545 - ret = -ENOMEM; 3546 3500 again: 3547 3501 page = find_or_create_page(mapping, index, mask); 3548 3502 if (!page) { 3549 3503 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3504 + ret = -ENOMEM; 3550 3505 goto out; 3551 3506 } 3552 3507 ··· 3595 3550 goto out_unlock; 3596 3551 } 3597 3552 3598 - ret = 0; 3599 3553 if (offset != PAGE_CACHE_SIZE) { 3600 3554 if (!len) 3601 3555 len = PAGE_CACHE_SIZE - offset; ··· 3712 3668 3713 3669 hole_em->block_start = EXTENT_MAP_HOLE; 3714 3670 hole_em->block_len = 0; 3671 + hole_em->orig_block_len = 0; 3715 3672 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 3716 3673 hole_em->compress_type = BTRFS_COMPRESS_NONE; 3717 3674 hole_em->generation = trans->transid; ··· 3828 3783 struct btrfs_root *root = BTRFS_I(inode)->root; 3829 3784 struct btrfs_block_rsv *rsv, *global_rsv; 3830 3785 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3831 - unsigned long nr; 3832 3786 int ret; 3833 3787 3834 3788 trace_btrfs_inode_evict(inode); ··· 3873 3829 * inode item when doing the truncate. 
3874 3830 */ 3875 3831 while (1) { 3876 - ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3832 + ret = btrfs_block_rsv_refill(root, rsv, min_size, 3833 + BTRFS_RESERVE_FLUSH_LIMIT); 3877 3834 3878 3835 /* 3879 3836 * Try and steal from the global reserve since we will ··· 3892 3847 goto no_delete; 3893 3848 } 3894 3849 3895 - trans = btrfs_start_transaction_noflush(root, 1); 3850 + trans = btrfs_start_transaction_lflush(root, 1); 3896 3851 if (IS_ERR(trans)) { 3897 3852 btrfs_orphan_del(NULL, inode); 3898 3853 btrfs_free_block_rsv(root, rsv); ··· 3909 3864 ret = btrfs_update_inode(trans, root, inode); 3910 3865 BUG_ON(ret); 3911 3866 3912 - nr = trans->blocks_used; 3913 3867 btrfs_end_transaction(trans, root); 3914 3868 trans = NULL; 3915 - btrfs_btree_balance_dirty(root, nr); 3869 + btrfs_btree_balance_dirty(root); 3916 3870 } 3917 3871 3918 3872 btrfs_free_block_rsv(root, rsv); ··· 3927 3883 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3928 3884 btrfs_return_ino(root, btrfs_ino(inode)); 3929 3885 3930 - nr = trans->blocks_used; 3931 3886 btrfs_end_transaction(trans, root); 3932 - btrfs_btree_balance_dirty(root, nr); 3887 + btrfs_btree_balance_dirty(root); 3933 3888 no_delete: 3934 3889 clear_inode(inode); 3935 3890 return; ··· 4818 4775 if (S_ISREG(mode)) { 4819 4776 if (btrfs_test_opt(root, NODATASUM)) 4820 4777 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4821 - if (btrfs_test_opt(root, NODATACOW) || 4822 - (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) 4778 + if (btrfs_test_opt(root, NODATACOW)) 4823 4779 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4824 4780 } 4825 4781 ··· 4884 4842 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4885 4843 parent_inode, &key, 4886 4844 btrfs_inode_type(inode), index); 4887 - if (ret == -EEXIST) 4845 + if (ret == -EEXIST || ret == -EOVERFLOW) 4888 4846 goto fail_dir_item; 4889 4847 else if (ret) { 4890 4848 btrfs_abort_transaction(trans, root, ret); ··· 4939 4897 int err; 4940 4898 int 
drop_inode = 0; 4941 4899 u64 objectid; 4942 - unsigned long nr = 0; 4943 4900 u64 index = 0; 4944 4901 4945 4902 if (!new_valid_dev(rdev)) ··· 4971 4930 goto out_unlock; 4972 4931 } 4973 4932 4933 + err = btrfs_update_inode(trans, root, inode); 4934 + if (err) { 4935 + drop_inode = 1; 4936 + goto out_unlock; 4937 + } 4938 + 4974 4939 /* 4975 4940 * If the active LSM wants to access the inode during 4976 4941 * d_instantiate it needs these. Smack checks to see ··· 4994 4947 d_instantiate(dentry, inode); 4995 4948 } 4996 4949 out_unlock: 4997 - nr = trans->blocks_used; 4998 4950 btrfs_end_transaction(trans, root); 4999 - btrfs_btree_balance_dirty(root, nr); 4951 + btrfs_btree_balance_dirty(root); 5000 4952 if (drop_inode) { 5001 4953 inode_dec_link_count(inode); 5002 4954 iput(inode); ··· 5009 4963 struct btrfs_trans_handle *trans; 5010 4964 struct btrfs_root *root = BTRFS_I(dir)->root; 5011 4965 struct inode *inode = NULL; 5012 - int drop_inode = 0; 4966 + int drop_inode_on_err = 0; 5013 4967 int err; 5014 - unsigned long nr = 0; 5015 4968 u64 objectid; 5016 4969 u64 index = 0; 5017 4970 ··· 5034 4989 err = PTR_ERR(inode); 5035 4990 goto out_unlock; 5036 4991 } 4992 + drop_inode_on_err = 1; 5037 4993 5038 4994 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5039 - if (err) { 5040 - drop_inode = 1; 4995 + if (err) 5041 4996 goto out_unlock; 5042 - } 4997 + 4998 + err = btrfs_update_inode(trans, root, inode); 4999 + if (err) 5000 + goto out_unlock; 5043 5001 5044 5002 /* 5045 5003 * If the active LSM wants to access the inode during ··· 5055 5007 5056 5008 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5057 5009 if (err) 5058 - drop_inode = 1; 5059 - else { 5060 - inode->i_mapping->a_ops = &btrfs_aops; 5061 - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5062 - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5063 - d_instantiate(dentry, inode); 5064 - } 5010 + goto out_unlock; 5011 + 5012 + inode->i_mapping->a_ops = 
&btrfs_aops; 5013 + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5014 + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5015 + d_instantiate(dentry, inode); 5016 + 5065 5017 out_unlock: 5066 - nr = trans->blocks_used; 5067 5018 btrfs_end_transaction(trans, root); 5068 - if (drop_inode) { 5019 + if (err && drop_inode_on_err) { 5069 5020 inode_dec_link_count(inode); 5070 5021 iput(inode); 5071 5022 } 5072 - btrfs_btree_balance_dirty(root, nr); 5023 + btrfs_btree_balance_dirty(root); 5073 5024 return err; 5074 5025 } 5075 5026 ··· 5079 5032 struct btrfs_root *root = BTRFS_I(dir)->root; 5080 5033 struct inode *inode = old_dentry->d_inode; 5081 5034 u64 index; 5082 - unsigned long nr = 0; 5083 5035 int err; 5084 5036 int drop_inode = 0; 5085 5037 ··· 5108 5062 inode_inc_iversion(inode); 5109 5063 inode->i_ctime = CURRENT_TIME; 5110 5064 ihold(inode); 5065 + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 5111 5066 5112 5067 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5113 5068 ··· 5123 5076 btrfs_log_new_name(trans, inode, NULL, parent); 5124 5077 } 5125 5078 5126 - nr = trans->blocks_used; 5127 5079 btrfs_end_transaction(trans, root); 5128 5080 fail: 5129 5081 if (drop_inode) { 5130 5082 inode_dec_link_count(inode); 5131 5083 iput(inode); 5132 5084 } 5133 - btrfs_btree_balance_dirty(root, nr); 5085 + btrfs_btree_balance_dirty(root); 5134 5086 return err; 5135 5087 } 5136 5088 ··· 5142 5096 int drop_on_err = 0; 5143 5097 u64 objectid = 0; 5144 5098 u64 index = 0; 5145 - unsigned long nr = 1; 5146 5099 5147 5100 /* 5148 5101 * 2 items for inode and ref ··· 5187 5142 drop_on_err = 0; 5188 5143 5189 5144 out_fail: 5190 - nr = trans->blocks_used; 5191 5145 btrfs_end_transaction(trans, root); 5192 5146 if (drop_on_err) 5193 5147 iput(inode); 5194 - btrfs_btree_balance_dirty(root, nr); 5148 + btrfs_btree_balance_dirty(root); 5195 5149 return err; 5196 5150 } 5197 5151 ··· 5384 5340 if (start + len <= found_key.offset) 
5385 5341 goto not_found; 5386 5342 em->start = start; 5343 + em->orig_start = start; 5387 5344 em->len = found_key.offset - start; 5388 5345 goto not_found_em; 5389 5346 } ··· 5395 5350 em->len = extent_end - extent_start; 5396 5351 em->orig_start = extent_start - 5397 5352 btrfs_file_extent_offset(leaf, item); 5353 + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, 5354 + item); 5398 5355 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5399 5356 if (bytenr == 0) { 5400 5357 em->block_start = EXTENT_MAP_HOLE; ··· 5406 5359 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5407 5360 em->compress_type = compress_type; 5408 5361 em->block_start = bytenr; 5409 - em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5410 - item); 5362 + em->block_len = em->orig_block_len; 5411 5363 } else { 5412 5364 bytenr += btrfs_file_extent_offset(leaf, item); 5413 5365 em->block_start = bytenr; ··· 5436 5390 em->start = extent_start + extent_offset; 5437 5391 em->len = (copy_size + root->sectorsize - 1) & 5438 5392 ~((u64)root->sectorsize - 1); 5439 - em->orig_start = EXTENT_MAP_INLINE; 5393 + em->orig_block_len = em->len; 5394 + em->orig_start = em->start; 5440 5395 if (compress_type) { 5441 5396 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5442 5397 em->compress_type = compress_type; ··· 5486 5439 extent_map_end(em) - 1, NULL, GFP_NOFS); 5487 5440 goto insert; 5488 5441 } else { 5489 - printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5490 - WARN_ON(1); 5442 + WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); 5491 5443 } 5492 5444 not_found: 5493 5445 em->start = start; 5446 + em->orig_start = start; 5494 5447 em->len = len; 5495 5448 not_found_em: 5496 5449 em->block_start = EXTENT_MAP_HOLE; ··· 5692 5645 } 5693 5646 5694 5647 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5695 - struct extent_map *em, 5696 5648 u64 start, u64 len) 5697 5649 { 5698 5650 struct btrfs_root *root = BTRFS_I(inode)->root; 5699 5651 struct 
btrfs_trans_handle *trans; 5700 - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5652 + struct extent_map *em; 5701 5653 struct btrfs_key ins; 5702 5654 u64 alloc_hint; 5703 5655 int ret; 5704 - bool insert = false; 5705 - 5706 - /* 5707 - * Ok if the extent map we looked up is a hole and is for the exact 5708 - * range we want, there is no reason to allocate a new one, however if 5709 - * it is not right then we need to free this one and drop the cache for 5710 - * our range. 5711 - */ 5712 - if (em->block_start != EXTENT_MAP_HOLE || em->start != start || 5713 - em->len != len) { 5714 - free_extent_map(em); 5715 - em = NULL; 5716 - insert = true; 5717 - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5718 - } 5719 5656 5720 5657 trans = btrfs_join_transaction(root); 5721 5658 if (IS_ERR(trans)) 5722 5659 return ERR_CAST(trans); 5723 - 5724 - if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) 5725 - btrfs_add_inode_defrag(trans, inode); 5726 5660 5727 5661 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5728 5662 ··· 5715 5687 goto out; 5716 5688 } 5717 5689 5718 - if (!em) { 5719 - em = alloc_extent_map(); 5720 - if (!em) { 5721 - em = ERR_PTR(-ENOMEM); 5722 - goto out; 5723 - } 5724 - } 5725 - 5726 - em->start = start; 5727 - em->orig_start = em->start; 5728 - em->len = ins.offset; 5729 - 5730 - em->block_start = ins.objectid; 5731 - em->block_len = ins.offset; 5732 - em->bdev = root->fs_info->fs_devices->latest_bdev; 5733 - 5734 - /* 5735 - * We need to do this because if we're using the original em we searched 5736 - * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 
5737 - */ 5738 - em->flags = 0; 5739 - set_bit(EXTENT_FLAG_PINNED, &em->flags); 5740 - 5741 - while (insert) { 5742 - write_lock(&em_tree->lock); 5743 - ret = add_extent_mapping(em_tree, em); 5744 - write_unlock(&em_tree->lock); 5745 - if (ret != -EEXIST) 5746 - break; 5747 - btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); 5748 - } 5690 + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, 5691 + ins.offset, ins.offset, 0); 5692 + if (IS_ERR(em)) 5693 + goto out; 5749 5694 5750 5695 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5751 5696 ins.offset, ins.offset, 0); ··· 5895 5894 static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 5896 5895 u64 len, u64 orig_start, 5897 5896 u64 block_start, u64 block_len, 5898 - int type) 5897 + u64 orig_block_len, int type) 5899 5898 { 5900 5899 struct extent_map_tree *em_tree; 5901 5900 struct extent_map *em; ··· 5913 5912 em->block_len = block_len; 5914 5913 em->block_start = block_start; 5915 5914 em->bdev = root->fs_info->fs_devices->latest_bdev; 5915 + em->orig_block_len = orig_block_len; 5916 + em->generation = -1; 5916 5917 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5917 5918 if (type == BTRFS_ORDERED_PREALLOC) 5918 - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5919 + set_bit(EXTENT_FLAG_FILLING, &em->flags); 5919 5920 5920 5921 do { 5921 5922 btrfs_drop_extent_cache(inode, em->start, 5922 5923 em->start + em->len - 1, 0); 5923 5924 write_lock(&em_tree->lock); 5924 5925 ret = add_extent_mapping(em_tree, em); 5926 + if (!ret) 5927 + list_move(&em->list, 5928 + &em_tree->modified_extents); 5925 5929 write_unlock(&em_tree->lock); 5926 5930 } while (ret == -EEXIST); 5927 5931 ··· 6053 6047 goto must_cow; 6054 6048 6055 6049 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6056 - u64 orig_start = em->start; 6050 + u64 orig_start = em->orig_start; 6051 + u64 orig_block_len = em->orig_block_len; 6057 6052 6058 6053 if (type == BTRFS_ORDERED_PREALLOC) { 
6059 6054 free_extent_map(em); 6060 6055 em = create_pinned_em(inode, start, len, 6061 6056 orig_start, 6062 - block_start, len, type); 6057 + block_start, len, 6058 + orig_block_len, type); 6063 6059 if (IS_ERR(em)) { 6064 6060 btrfs_end_transaction(trans, root); 6065 6061 goto unlock_err; ··· 6085 6077 * it above 6086 6078 */ 6087 6079 len = bh_result->b_size; 6088 - em = btrfs_new_extent_direct(inode, em, start, len); 6080 + free_extent_map(em); 6081 + em = btrfs_new_extent_direct(inode, start, len); 6089 6082 if (IS_ERR(em)) { 6090 6083 ret = PTR_ERR(em); 6091 6084 goto unlock_err; ··· 6327 6318 struct btrfs_root *root = BTRFS_I(inode)->root; 6328 6319 int ret; 6329 6320 6321 + if (async_submit) 6322 + async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); 6323 + 6330 6324 bio_get(bio); 6331 6325 6332 6326 if (!write) { ··· 6374 6362 { 6375 6363 struct inode *inode = dip->inode; 6376 6364 struct btrfs_root *root = BTRFS_I(inode)->root; 6377 - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 6378 6365 struct bio *bio; 6379 6366 struct bio *orig_bio = dip->orig_bio; 6380 6367 struct bio_vec *bvec = orig_bio->bi_io_vec; ··· 6386 6375 int async_submit = 0; 6387 6376 6388 6377 map_length = orig_bio->bi_size; 6389 - ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6378 + ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, 6390 6379 &map_length, NULL, 0); 6391 6380 if (ret) { 6392 6381 bio_put(orig_bio); ··· 6440 6429 bio->bi_end_io = btrfs_end_dio_bio; 6441 6430 6442 6431 map_length = orig_bio->bi_size; 6443 - ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6432 + ret = btrfs_map_block(root->fs_info, READ, 6433 + start_sector << 9, 6444 6434 &map_length, NULL, 0); 6445 6435 if (ret) { 6446 6436 bio_put(bio); ··· 6594 6582 btrfs_submit_direct, 0); 6595 6583 } 6596 6584 6585 + #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 6586 + 6597 6587 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 
6598 6588 __u64 start, __u64 len) 6599 6589 { 6590 + int ret; 6591 + 6592 + ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); 6593 + if (ret) 6594 + return ret; 6595 + 6600 6596 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6601 6597 } 6602 6598 ··· 6875 6855 int ret; 6876 6856 int err = 0; 6877 6857 struct btrfs_trans_handle *trans; 6878 - unsigned long nr; 6879 6858 u64 mask = root->sectorsize - 1; 6880 6859 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6881 6860 ··· 6997 6978 break; 6998 6979 } 6999 6980 7000 - nr = trans->blocks_used; 7001 6981 btrfs_end_transaction(trans, root); 7002 - btrfs_btree_balance_dirty(root, nr); 6982 + btrfs_btree_balance_dirty(root); 7003 6983 7004 6984 trans = btrfs_start_transaction(root, 2); 7005 6985 if (IS_ERR(trans)) { ··· 7032 7014 if (ret && !err) 7033 7015 err = ret; 7034 7016 7035 - nr = trans->blocks_used; 7036 7017 ret = btrfs_end_transaction(trans, root); 7037 - btrfs_btree_balance_dirty(root, nr); 7018 + btrfs_btree_balance_dirty(root); 7038 7019 } 7039 7020 7040 7021 out: ··· 7110 7093 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7111 7094 ei->io_tree.track_uptodate = 1; 7112 7095 ei->io_failure_tree.track_uptodate = 1; 7096 + atomic_set(&ei->sync_writers, 0); 7113 7097 mutex_init(&ei->log_mutex); 7114 7098 mutex_init(&ei->delalloc_mutex); 7115 7099 btrfs_ordered_inode_tree_init(&ei->ordered_tree); ··· 7221 7203 kmem_cache_destroy(btrfs_path_cachep); 7222 7204 if (btrfs_free_space_cachep) 7223 7205 kmem_cache_destroy(btrfs_free_space_cachep); 7206 + if (btrfs_delalloc_work_cachep) 7207 + kmem_cache_destroy(btrfs_delalloc_work_cachep); 7224 7208 } 7225 7209 7226 7210 int btrfs_init_cachep(void) ··· 7255 7235 sizeof(struct btrfs_free_space), 0, 7256 7236 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7257 7237 if (!btrfs_free_space_cachep) 7238 + goto fail; 7239 + 7240 + btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", 7241 + sizeof(struct 
btrfs_delalloc_work), 0, 7242 + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 7243 + NULL); 7244 + if (!btrfs_delalloc_work_cachep) 7258 7245 goto fail; 7259 7246 7260 7247 return 0; ··· 7335 7308 if (S_ISDIR(old_inode->i_mode) && new_inode && 7336 7309 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7337 7310 return -ENOTEMPTY; 7311 + 7312 + 7313 + /* check for collisions, even if the name isn't there */ 7314 + ret = btrfs_check_dir_item_collision(root, new_dir->i_ino, 7315 + new_dentry->d_name.name, 7316 + new_dentry->d_name.len); 7317 + 7318 + if (ret) { 7319 + if (ret == -EEXIST) { 7320 + /* we shouldn't get 7321 + * eexist without a new_inode */ 7322 + if (!new_inode) { 7323 + WARN_ON(1); 7324 + return ret; 7325 + } 7326 + } else { 7327 + /* maybe -EOVERFLOW */ 7328 + return ret; 7329 + } 7330 + } 7331 + ret = 0; 7332 + 7338 7333 /* 7339 7334 * we're using rename to replace one file with another. 7340 7335 * and the replacement file is large. Start IO on it now so ··· 7496 7447 return ret; 7497 7448 } 7498 7449 7450 + static void btrfs_run_delalloc_work(struct btrfs_work *work) 7451 + { 7452 + struct btrfs_delalloc_work *delalloc_work; 7453 + 7454 + delalloc_work = container_of(work, struct btrfs_delalloc_work, 7455 + work); 7456 + if (delalloc_work->wait) 7457 + btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); 7458 + else 7459 + filemap_flush(delalloc_work->inode->i_mapping); 7460 + 7461 + if (delalloc_work->delay_iput) 7462 + btrfs_add_delayed_iput(delalloc_work->inode); 7463 + else 7464 + iput(delalloc_work->inode); 7465 + complete(&delalloc_work->completion); 7466 + } 7467 + 7468 + struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, 7469 + int wait, int delay_iput) 7470 + { 7471 + struct btrfs_delalloc_work *work; 7472 + 7473 + work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); 7474 + if (!work) 7475 + return NULL; 7476 + 7477 + init_completion(&work->completion); 7478 + INIT_LIST_HEAD(&work->list); 7479 + work->inode = 
inode; 7480 + work->wait = wait; 7481 + work->delay_iput = delay_iput; 7482 + work->work.func = btrfs_run_delalloc_work; 7483 + 7484 + return work; 7485 + } 7486 + 7487 + void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) 7488 + { 7489 + wait_for_completion(&work->completion); 7490 + kmem_cache_free(btrfs_delalloc_work_cachep, work); 7491 + } 7492 + 7499 7493 /* 7500 7494 * some fairly slow code that needs optimization. This walks the list 7501 7495 * of all the inodes with pending delalloc and forces them to disk. ··· 7548 7456 struct list_head *head = &root->fs_info->delalloc_inodes; 7549 7457 struct btrfs_inode *binode; 7550 7458 struct inode *inode; 7459 + struct btrfs_delalloc_work *work, *next; 7460 + struct list_head works; 7461 + int ret = 0; 7551 7462 7552 7463 if (root->fs_info->sb->s_flags & MS_RDONLY) 7553 7464 return -EROFS; 7465 + 7466 + INIT_LIST_HEAD(&works); 7554 7467 7555 7468 spin_lock(&root->fs_info->delalloc_lock); 7556 7469 while (!list_empty(head)) { ··· 7566 7469 list_del_init(&binode->delalloc_inodes); 7567 7470 spin_unlock(&root->fs_info->delalloc_lock); 7568 7471 if (inode) { 7569 - filemap_flush(inode->i_mapping); 7570 - if (delay_iput) 7571 - btrfs_add_delayed_iput(inode); 7572 - else 7573 - iput(inode); 7472 + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 7473 + if (!work) { 7474 + ret = -ENOMEM; 7475 + goto out; 7476 + } 7477 + list_add_tail(&work->list, &works); 7478 + btrfs_queue_worker(&root->fs_info->flush_workers, 7479 + &work->work); 7574 7480 } 7575 7481 cond_resched(); 7576 7482 spin_lock(&root->fs_info->delalloc_lock); ··· 7592 7492 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 7593 7493 } 7594 7494 atomic_dec(&root->fs_info->async_submit_draining); 7595 - return 0; 7495 + out: 7496 + list_for_each_entry_safe(work, next, &works, list) { 7497 + list_del_init(&work->list); 7498 + btrfs_wait_and_free_delalloc_work(work); 7499 + } 7500 + return ret; 7596 7501 } 7597 7502 7598 7503 
static int btrfs_symlink(struct inode *dir, struct dentry *dentry, ··· 7617 7512 unsigned long ptr; 7618 7513 struct btrfs_file_extent_item *ei; 7619 7514 struct extent_buffer *leaf; 7620 - unsigned long nr = 0; 7621 7515 7622 7516 name_len = strlen(symname) + 1; 7623 7517 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) ··· 7714 7610 out_unlock: 7715 7611 if (!err) 7716 7612 d_instantiate(dentry, inode); 7717 - nr = trans->blocks_used; 7718 7613 btrfs_end_transaction(trans, root); 7719 7614 if (drop_inode) { 7720 7615 inode_dec_link_count(inode); 7721 7616 iput(inode); 7722 7617 } 7723 - btrfs_btree_balance_dirty(root, nr); 7618 + btrfs_btree_balance_dirty(root); 7724 7619 return err; 7725 7620 } 7726 7621 ··· 7782 7679 em->len = ins.offset; 7783 7680 em->block_start = ins.objectid; 7784 7681 em->block_len = ins.offset; 7682 + em->orig_block_len = ins.offset; 7785 7683 em->bdev = root->fs_info->fs_devices->latest_bdev; 7786 7684 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7787 7685 em->generation = trans->transid;
+236 -81
fs/btrfs/ioctl.c
··· 55 55 #include "backref.h" 56 56 #include "rcu-string.h" 57 57 #include "send.h" 58 + #include "dev-replace.h" 58 59 59 60 /* Mask out flags that are inappropriate for the given type of inode. */ 60 61 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) ··· 141 140 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 142 141 } 143 142 144 - if (flags & BTRFS_INODE_NODATACOW) 143 + if (flags & BTRFS_INODE_NODATACOW) { 145 144 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 145 + if (S_ISREG(inode->i_mode)) 146 + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 147 + } 146 148 147 149 btrfs_update_iflags(inode); 148 150 } ··· 575 571 ret = btrfs_commit_transaction(trans, 576 572 root->fs_info->extent_root); 577 573 } 578 - if (ret) 574 + if (ret) { 575 + /* cleanup_transaction has freed this for us */ 576 + if (trans->aborted) 577 + pending_snapshot = NULL; 579 578 goto fail; 579 + } 580 580 581 581 ret = pending_snapshot->error; 582 582 if (ret) ··· 710 702 goto out_dput; 711 703 712 704 error = btrfs_may_create(dir, dentry); 705 + if (error) 706 + goto out_dput; 707 + 708 + /* 709 + * even if this name doesn't exist, we may get hash collisions. 
710 + * check for them now when we can safely fail 711 + */ 712 + error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, 713 + dir->i_ino, name, 714 + namelen); 713 715 if (error) 714 716 goto out_dput; 715 717 ··· 1311 1293 return ret; 1312 1294 } 1313 1295 1314 - static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 1296 + static noinline int btrfs_ioctl_resize(struct file *file, 1315 1297 void __user *arg) 1316 1298 { 1317 1299 u64 new_size; 1318 1300 u64 old_size; 1319 1301 u64 devid = 1; 1302 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 1320 1303 struct btrfs_ioctl_vol_args *vol_args; 1321 1304 struct btrfs_trans_handle *trans; 1322 1305 struct btrfs_device *device = NULL; ··· 1332 1313 if (!capable(CAP_SYS_ADMIN)) 1333 1314 return -EPERM; 1334 1315 1335 - mutex_lock(&root->fs_info->volume_mutex); 1336 - if (root->fs_info->balance_ctl) { 1337 - printk(KERN_INFO "btrfs: balance in progress\n"); 1338 - ret = -EINVAL; 1339 - goto out; 1316 + ret = mnt_want_write_file(file); 1317 + if (ret) 1318 + return ret; 1319 + 1320 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1321 + 1)) { 1322 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 1323 + return -EINPROGRESS; 1340 1324 } 1341 1325 1326 + mutex_lock(&root->fs_info->volume_mutex); 1342 1327 vol_args = memdup_user(arg, sizeof(*vol_args)); 1343 1328 if (IS_ERR(vol_args)) { 1344 1329 ret = PTR_ERR(vol_args); ··· 1362 1339 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1363 1340 (unsigned long long)devid); 1364 1341 } 1365 - device = btrfs_find_device(root, devid, NULL, NULL); 1342 + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); 1366 1343 if (!device) { 1367 1344 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1368 1345 (unsigned long long)devid); ··· 1392 1369 ret = -EINVAL; 1393 1370 goto out_free; 1394 1371 } 1372 + } 1373 + 1374 + if (device->is_tgtdev_for_dev_replace) { 1375 + ret = 
-EINVAL; 1376 + goto out_free; 1395 1377 } 1396 1378 1397 1379 old_size = device->total_bytes; ··· 1437 1409 btrfs_commit_transaction(trans, root); 1438 1410 } else if (new_size < old_size) { 1439 1411 ret = btrfs_shrink_device(device, new_size); 1440 - } 1412 + } /* equal, nothing need to do */ 1441 1413 1442 1414 out_free: 1443 1415 kfree(vol_args); 1444 1416 out: 1445 1417 mutex_unlock(&root->fs_info->volume_mutex); 1418 + mnt_drop_write_file(file); 1419 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 1446 1420 return ret; 1447 1421 } 1448 1422 ··· 2186 2156 if (btrfs_root_readonly(root)) 2187 2157 return -EROFS; 2188 2158 2159 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2160 + 1)) { 2161 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 2162 + return -EINPROGRESS; 2163 + } 2189 2164 ret = mnt_want_write_file(file); 2190 - if (ret) 2165 + if (ret) { 2166 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 2167 + 0); 2191 2168 return ret; 2169 + } 2192 2170 2193 2171 switch (inode->i_mode & S_IFMT) { 2194 2172 case S_IFDIR: ··· 2248 2210 } 2249 2211 out: 2250 2212 mnt_drop_write_file(file); 2213 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2251 2214 return ret; 2252 2215 } 2253 2216 ··· 2260 2221 if (!capable(CAP_SYS_ADMIN)) 2261 2222 return -EPERM; 2262 2223 2263 - mutex_lock(&root->fs_info->volume_mutex); 2264 - if (root->fs_info->balance_ctl) { 2265 - printk(KERN_INFO "btrfs: balance in progress\n"); 2266 - ret = -EINVAL; 2267 - goto out; 2224 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2225 + 1)) { 2226 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 2227 + return -EINPROGRESS; 2268 2228 } 2269 2229 2230 + mutex_lock(&root->fs_info->volume_mutex); 2270 2231 vol_args = memdup_user(arg, sizeof(*vol_args)); 2271 2232 if (IS_ERR(vol_args)) { 2272 2233 ret = PTR_ERR(vol_args); 
··· 2279 2240 kfree(vol_args); 2280 2241 out: 2281 2242 mutex_unlock(&root->fs_info->volume_mutex); 2243 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2282 2244 return ret; 2283 2245 } 2284 2246 2285 - static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2247 + static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 2286 2248 { 2249 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 2287 2250 struct btrfs_ioctl_vol_args *vol_args; 2288 2251 int ret; 2289 2252 2290 2253 if (!capable(CAP_SYS_ADMIN)) 2291 2254 return -EPERM; 2292 2255 2293 - if (root->fs_info->sb->s_flags & MS_RDONLY) 2294 - return -EROFS; 2256 + ret = mnt_want_write_file(file); 2257 + if (ret) 2258 + return ret; 2295 2259 2296 - mutex_lock(&root->fs_info->volume_mutex); 2297 - if (root->fs_info->balance_ctl) { 2298 - printk(KERN_INFO "btrfs: balance in progress\n"); 2299 - ret = -EINVAL; 2300 - goto out; 2260 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2261 + 1)) { 2262 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 2263 + mnt_drop_write_file(file); 2264 + return -EINPROGRESS; 2301 2265 } 2302 2266 2267 + mutex_lock(&root->fs_info->volume_mutex); 2303 2268 vol_args = memdup_user(arg, sizeof(*vol_args)); 2304 2269 if (IS_ERR(vol_args)) { 2305 2270 ret = PTR_ERR(vol_args); ··· 2316 2273 kfree(vol_args); 2317 2274 out: 2318 2275 mutex_unlock(&root->fs_info->volume_mutex); 2276 + mnt_drop_write_file(file); 2277 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2319 2278 return ret; 2320 2279 } 2321 2280 ··· 2373 2328 s_uuid = di_args->uuid; 2374 2329 2375 2330 mutex_lock(&fs_devices->device_list_mutex); 2376 - dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2331 + dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); 2377 2332 mutex_unlock(&fs_devices->device_list_mutex); 2378 2333 2379 2334 if (!dev) { ··· 2866 2821 
struct btrfs_disk_key disk_key; 2867 2822 u64 objectid = 0; 2868 2823 u64 dir_id; 2824 + int ret; 2869 2825 2870 2826 if (!capable(CAP_SYS_ADMIN)) 2871 2827 return -EPERM; 2872 2828 2873 - if (copy_from_user(&objectid, argp, sizeof(objectid))) 2874 - return -EFAULT; 2829 + ret = mnt_want_write_file(file); 2830 + if (ret) 2831 + return ret; 2832 + 2833 + if (copy_from_user(&objectid, argp, sizeof(objectid))) { 2834 + ret = -EFAULT; 2835 + goto out; 2836 + } 2875 2837 2876 2838 if (!objectid) 2877 2839 objectid = root->root_key.objectid; ··· 2888 2836 location.offset = (u64)-1; 2889 2837 2890 2838 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2891 - if (IS_ERR(new_root)) 2892 - return PTR_ERR(new_root); 2839 + if (IS_ERR(new_root)) { 2840 + ret = PTR_ERR(new_root); 2841 + goto out; 2842 + } 2893 2843 2894 - if (btrfs_root_refs(&new_root->root_item) == 0) 2895 - return -ENOENT; 2844 + if (btrfs_root_refs(&new_root->root_item) == 0) { 2845 + ret = -ENOENT; 2846 + goto out; 2847 + } 2896 2848 2897 2849 path = btrfs_alloc_path(); 2898 - if (!path) 2899 - return -ENOMEM; 2850 + if (!path) { 2851 + ret = -ENOMEM; 2852 + goto out; 2853 + } 2900 2854 path->leave_spinning = 1; 2901 2855 2902 2856 trans = btrfs_start_transaction(root, 1); 2903 2857 if (IS_ERR(trans)) { 2904 2858 btrfs_free_path(path); 2905 - return PTR_ERR(trans); 2859 + ret = PTR_ERR(trans); 2860 + goto out; 2906 2861 } 2907 2862 2908 2863 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); ··· 2920 2861 btrfs_end_transaction(trans, root); 2921 2862 printk(KERN_ERR "Umm, you don't have the default dir item, " 2922 2863 "this isn't going to work\n"); 2923 - return -ENOENT; 2864 + ret = -ENOENT; 2865 + goto out; 2924 2866 } 2925 2867 2926 2868 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); ··· 2931 2871 2932 2872 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2933 2873 btrfs_end_transaction(trans, root); 2934 - 2935 - return 0; 2874 + out: 2875 + 
mnt_drop_write_file(file); 2876 + return ret; 2936 2877 } 2937 2878 2938 2879 void btrfs_get_block_group_info(struct list_head *groups_list, ··· 3097 3036 return 0; 3098 3037 } 3099 3038 3100 - static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3039 + static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, 3040 + void __user *argp) 3101 3041 { 3102 - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; 3103 3042 struct btrfs_trans_handle *trans; 3104 3043 u64 transid; 3105 3044 int ret; 3106 3045 3107 - trans = btrfs_start_transaction(root, 0); 3108 - if (IS_ERR(trans)) 3109 - return PTR_ERR(trans); 3046 + trans = btrfs_attach_transaction(root); 3047 + if (IS_ERR(trans)) { 3048 + if (PTR_ERR(trans) != -ENOENT) 3049 + return PTR_ERR(trans); 3050 + 3051 + /* No running transaction, don't bother */ 3052 + transid = root->fs_info->last_trans_committed; 3053 + goto out; 3054 + } 3110 3055 transid = trans->transid; 3111 3056 ret = btrfs_commit_transaction_async(trans, root, 0); 3112 3057 if (ret) { 3113 3058 btrfs_end_transaction(trans, root); 3114 3059 return ret; 3115 3060 } 3116 - 3061 + out: 3117 3062 if (argp) 3118 3063 if (copy_to_user(argp, &transid, sizeof(transid))) 3119 3064 return -EFAULT; 3120 3065 return 0; 3121 3066 } 3122 3067 3123 - static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3068 + static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, 3069 + void __user *argp) 3124 3070 { 3125 - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; 3126 3071 u64 transid; 3127 3072 3128 3073 if (argp) { ··· 3140 3073 return btrfs_wait_for_commit(root, transid); 3141 3074 } 3142 3075 3143 - static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3076 + static long btrfs_ioctl_scrub(struct file *file, void __user *arg) 3144 3077 { 3145 - int ret; 3078 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3146 3079 struct 
btrfs_ioctl_scrub_args *sa; 3080 + int ret; 3147 3081 3148 3082 if (!capable(CAP_SYS_ADMIN)) 3149 3083 return -EPERM; ··· 3153 3085 if (IS_ERR(sa)) 3154 3086 return PTR_ERR(sa); 3155 3087 3156 - ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3157 - &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3088 + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { 3089 + ret = mnt_want_write_file(file); 3090 + if (ret) 3091 + goto out; 3092 + } 3093 + 3094 + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, 3095 + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 3096 + 0); 3158 3097 3159 3098 if (copy_to_user(arg, sa, sizeof(*sa))) 3160 3099 ret = -EFAULT; 3161 3100 3101 + if (!(sa->flags & BTRFS_SCRUB_READONLY)) 3102 + mnt_drop_write_file(file); 3103 + out: 3162 3104 kfree(sa); 3163 3105 return ret; 3164 3106 } ··· 3178 3100 if (!capable(CAP_SYS_ADMIN)) 3179 3101 return -EPERM; 3180 3102 3181 - return btrfs_scrub_cancel(root); 3103 + return btrfs_scrub_cancel(root->fs_info); 3182 3104 } 3183 3105 3184 3106 static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, ··· 3224 3146 ret = -EFAULT; 3225 3147 3226 3148 kfree(sa); 3149 + return ret; 3150 + } 3151 + 3152 + static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) 3153 + { 3154 + struct btrfs_ioctl_dev_replace_args *p; 3155 + int ret; 3156 + 3157 + if (!capable(CAP_SYS_ADMIN)) 3158 + return -EPERM; 3159 + 3160 + p = memdup_user(arg, sizeof(*p)); 3161 + if (IS_ERR(p)) 3162 + return PTR_ERR(p); 3163 + 3164 + switch (p->cmd) { 3165 + case BTRFS_IOCTL_DEV_REPLACE_CMD_START: 3166 + if (atomic_xchg( 3167 + &root->fs_info->mutually_exclusive_operation_running, 3168 + 1)) { 3169 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 3170 + ret = -EINPROGRESS; 3171 + } else { 3172 + ret = btrfs_dev_replace_start(root, p); 3173 + atomic_set( 3174 + &root->fs_info->mutually_exclusive_operation_running, 3175 + 0); 3176 + } 3177 + break; 3178 + case 
BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: 3179 + btrfs_dev_replace_status(root->fs_info, p); 3180 + ret = 0; 3181 + break; 3182 + case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: 3183 + ret = btrfs_dev_replace_cancel(root->fs_info, p); 3184 + break; 3185 + default: 3186 + ret = -EINVAL; 3187 + break; 3188 + } 3189 + 3190 + if (copy_to_user(arg, p, sizeof(*p))) 3191 + ret = -EFAULT; 3192 + 3193 + kfree(p); 3227 3194 return ret; 3228 3195 } 3229 3196 ··· 3438 3315 struct btrfs_ioctl_balance_args *bargs; 3439 3316 struct btrfs_balance_control *bctl; 3440 3317 int ret; 3318 + int need_to_clear_lock = 0; 3441 3319 3442 3320 if (!capable(CAP_SYS_ADMIN)) 3443 3321 return -EPERM; ··· 3474 3350 bargs = NULL; 3475 3351 } 3476 3352 3477 - if (fs_info->balance_ctl) { 3353 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 3354 + 1)) { 3355 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 3478 3356 ret = -EINPROGRESS; 3479 3357 goto out_bargs; 3480 3358 } 3359 + need_to_clear_lock = 1; 3481 3360 3482 3361 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3483 3362 if (!bctl) { ··· 3514 3387 out_bargs: 3515 3388 kfree(bargs); 3516 3389 out: 3390 + if (need_to_clear_lock) 3391 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 3392 + 0); 3517 3393 mutex_unlock(&fs_info->balance_mutex); 3518 3394 mutex_unlock(&fs_info->volume_mutex); 3519 3395 mnt_drop_write_file(file); ··· 3571 3441 return ret; 3572 3442 } 3573 3443 3574 - static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3444 + static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) 3575 3445 { 3446 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3576 3447 struct btrfs_ioctl_quota_ctl_args *sa; 3577 3448 struct btrfs_trans_handle *trans = NULL; 3578 3449 int ret; ··· 3582 3451 if (!capable(CAP_SYS_ADMIN)) 3583 3452 return -EPERM; 3584 3453 3585 - if (root->fs_info->sb->s_flags & MS_RDONLY) 3586 - return -EROFS; 3454 + ret 
= mnt_want_write_file(file); 3455 + if (ret) 3456 + return ret; 3587 3457 3588 3458 sa = memdup_user(arg, sizeof(*sa)); 3589 - if (IS_ERR(sa)) 3590 - return PTR_ERR(sa); 3459 + if (IS_ERR(sa)) { 3460 + ret = PTR_ERR(sa); 3461 + goto drop_write; 3462 + } 3591 3463 3592 3464 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3593 3465 trans = btrfs_start_transaction(root, 2); ··· 3623 3489 if (err && !ret) 3624 3490 ret = err; 3625 3491 } 3626 - 3627 3492 out: 3628 3493 kfree(sa); 3494 + drop_write: 3495 + mnt_drop_write_file(file); 3629 3496 return ret; 3630 3497 } 3631 3498 3632 - static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3499 + static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) 3633 3500 { 3501 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3634 3502 struct btrfs_ioctl_qgroup_assign_args *sa; 3635 3503 struct btrfs_trans_handle *trans; 3636 3504 int ret; ··· 3641 3505 if (!capable(CAP_SYS_ADMIN)) 3642 3506 return -EPERM; 3643 3507 3644 - if (root->fs_info->sb->s_flags & MS_RDONLY) 3645 - return -EROFS; 3508 + ret = mnt_want_write_file(file); 3509 + if (ret) 3510 + return ret; 3646 3511 3647 3512 sa = memdup_user(arg, sizeof(*sa)); 3648 - if (IS_ERR(sa)) 3649 - return PTR_ERR(sa); 3513 + if (IS_ERR(sa)) { 3514 + ret = PTR_ERR(sa); 3515 + goto drop_write; 3516 + } 3650 3517 3651 3518 trans = btrfs_join_transaction(root); 3652 3519 if (IS_ERR(trans)) { ··· 3672 3533 3673 3534 out: 3674 3535 kfree(sa); 3536 + drop_write: 3537 + mnt_drop_write_file(file); 3675 3538 return ret; 3676 3539 } 3677 3540 3678 - static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3541 + static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) 3679 3542 { 3543 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3680 3544 struct btrfs_ioctl_qgroup_create_args *sa; 3681 3545 struct btrfs_trans_handle *trans; 3682 3546 int ret; ··· 3688 3546 if 
(!capable(CAP_SYS_ADMIN)) 3689 3547 return -EPERM; 3690 3548 3691 - if (root->fs_info->sb->s_flags & MS_RDONLY) 3692 - return -EROFS; 3549 + ret = mnt_want_write_file(file); 3550 + if (ret) 3551 + return ret; 3693 3552 3694 3553 sa = memdup_user(arg, sizeof(*sa)); 3695 - if (IS_ERR(sa)) 3696 - return PTR_ERR(sa); 3554 + if (IS_ERR(sa)) { 3555 + ret = PTR_ERR(sa); 3556 + goto drop_write; 3557 + } 3697 3558 3698 3559 trans = btrfs_join_transaction(root); 3699 3560 if (IS_ERR(trans)) { ··· 3718 3573 3719 3574 out: 3720 3575 kfree(sa); 3576 + drop_write: 3577 + mnt_drop_write_file(file); 3721 3578 return ret; 3722 3579 } 3723 3580 3724 - static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3581 + static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) 3725 3582 { 3583 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3726 3584 struct btrfs_ioctl_qgroup_limit_args *sa; 3727 3585 struct btrfs_trans_handle *trans; 3728 3586 int ret; ··· 3735 3587 if (!capable(CAP_SYS_ADMIN)) 3736 3588 return -EPERM; 3737 3589 3738 - if (root->fs_info->sb->s_flags & MS_RDONLY) 3739 - return -EROFS; 3590 + ret = mnt_want_write_file(file); 3591 + if (ret) 3592 + return ret; 3740 3593 3741 3594 sa = memdup_user(arg, sizeof(*sa)); 3742 - if (IS_ERR(sa)) 3743 - return PTR_ERR(sa); 3595 + if (IS_ERR(sa)) { 3596 + ret = PTR_ERR(sa); 3597 + goto drop_write; 3598 + } 3744 3599 3745 3600 trans = btrfs_join_transaction(root); 3746 3601 if (IS_ERR(trans)) { ··· 3766 3615 3767 3616 out: 3768 3617 kfree(sa); 3618 + drop_write: 3619 + mnt_drop_write_file(file); 3769 3620 return ret; 3770 3621 } 3771 3622 ··· 3888 3735 case BTRFS_IOC_DEFRAG_RANGE: 3889 3736 return btrfs_ioctl_defrag(file, argp); 3890 3737 case BTRFS_IOC_RESIZE: 3891 - return btrfs_ioctl_resize(root, argp); 3738 + return btrfs_ioctl_resize(file, argp); 3892 3739 case BTRFS_IOC_ADD_DEV: 3893 3740 return btrfs_ioctl_add_dev(root, argp); 3894 3741 case BTRFS_IOC_RM_DEV: 3895 - 
return btrfs_ioctl_rm_dev(root, argp); 3742 + return btrfs_ioctl_rm_dev(file, argp); 3896 3743 case BTRFS_IOC_FS_INFO: 3897 3744 return btrfs_ioctl_fs_info(root, argp); 3898 3745 case BTRFS_IOC_DEV_INFO: ··· 3921 3768 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3922 3769 return 0; 3923 3770 case BTRFS_IOC_START_SYNC: 3924 - return btrfs_ioctl_start_sync(file, argp); 3771 + return btrfs_ioctl_start_sync(root, argp); 3925 3772 case BTRFS_IOC_WAIT_SYNC: 3926 - return btrfs_ioctl_wait_sync(file, argp); 3773 + return btrfs_ioctl_wait_sync(root, argp); 3927 3774 case BTRFS_IOC_SCRUB: 3928 - return btrfs_ioctl_scrub(root, argp); 3775 + return btrfs_ioctl_scrub(file, argp); 3929 3776 case BTRFS_IOC_SCRUB_CANCEL: 3930 3777 return btrfs_ioctl_scrub_cancel(root, argp); 3931 3778 case BTRFS_IOC_SCRUB_PROGRESS: ··· 3943 3790 case BTRFS_IOC_GET_DEV_STATS: 3944 3791 return btrfs_ioctl_get_dev_stats(root, argp); 3945 3792 case BTRFS_IOC_QUOTA_CTL: 3946 - return btrfs_ioctl_quota_ctl(root, argp); 3793 + return btrfs_ioctl_quota_ctl(file, argp); 3947 3794 case BTRFS_IOC_QGROUP_ASSIGN: 3948 - return btrfs_ioctl_qgroup_assign(root, argp); 3795 + return btrfs_ioctl_qgroup_assign(file, argp); 3949 3796 case BTRFS_IOC_QGROUP_CREATE: 3950 - return btrfs_ioctl_qgroup_create(root, argp); 3797 + return btrfs_ioctl_qgroup_create(file, argp); 3951 3798 case BTRFS_IOC_QGROUP_LIMIT: 3952 - return btrfs_ioctl_qgroup_limit(root, argp); 3799 + return btrfs_ioctl_qgroup_limit(file, argp); 3800 + case BTRFS_IOC_DEV_REPLACE: 3801 + return btrfs_ioctl_dev_replace(root, argp); 3953 3802 } 3954 3803 3955 3804 return -ENOTTY;
+47 -1
fs/btrfs/ioctl.h
··· 30 30 char name[BTRFS_PATH_NAME_MAX + 1]; 31 31 }; 32 32 33 + #define BTRFS_DEVICE_PATH_NAME_MAX 1024 34 + 33 35 #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 34 36 #define BTRFS_SUBVOL_RDONLY (1ULL << 1) 35 37 #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) ··· 125 123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 126 124 }; 127 125 128 - #define BTRFS_DEVICE_PATH_NAME_MAX 1024 126 + #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 127 + #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 128 + struct btrfs_ioctl_dev_replace_start_params { 129 + __u64 srcdevid; /* in, if 0, use srcdev_name instead */ 130 + __u64 cont_reading_from_srcdev_mode; /* in, see #define 131 + * above */ 132 + __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ 133 + __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ 134 + }; 135 + 136 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 137 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 138 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 139 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 140 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 141 + struct btrfs_ioctl_dev_replace_status_params { 142 + __u64 replace_state; /* out, see #define above */ 143 + __u64 progress_1000; /* out, 0 <= x <= 1000 */ 144 + __u64 time_started; /* out, seconds since 1-Jan-1970 */ 145 + __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ 146 + __u64 num_write_errors; /* out */ 147 + __u64 num_uncorrectable_read_errors; /* out */ 148 + }; 149 + 150 + #define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 151 + #define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 152 + #define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 153 + #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 154 + #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 155 + #define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 156 + struct btrfs_ioctl_dev_replace_args { 157 + __u64 cmd; /* in */ 158 + __u64 result; /* out */ 
159 + 160 + union { 161 + struct btrfs_ioctl_dev_replace_start_params start; 162 + struct btrfs_ioctl_dev_replace_status_params status; 163 + }; /* in/out */ 164 + 165 + __u64 spare[64]; 166 + }; 167 + 129 168 struct btrfs_ioctl_dev_info_args { 130 169 __u64 devid; /* in/out */ 131 170 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ ··· 496 453 struct btrfs_ioctl_qgroup_limit_args) 497 454 #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 498 455 struct btrfs_ioctl_get_dev_stats) 456 + #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ 457 + struct btrfs_ioctl_dev_replace_args) 458 + 499 459 #endif
+44
fs/btrfs/math.h
··· 1 + 2 + /* 3 + * Copyright (C) 2012 Fujitsu. All rights reserved. 4 + * Written by Miao Xie <miaox@cn.fujitsu.com> 5 + * 6 + * This program is free software; you can redistribute it and/or 7 + * modify it under the terms of the GNU General Public 8 + * License v2 as published by the Free Software Foundation. 9 + * 10 + * This program is distributed in the hope that it will be useful, 11 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 + * General Public License for more details. 14 + * 15 + * You should have received a copy of the GNU General Public 16 + * License along with this program; if not, write to the 17 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 18 + * Boston, MA 021110-1307, USA. 19 + */ 20 + 21 + #ifndef __BTRFS_MATH_H 22 + #define __BTRFS_MATH_H 23 + 24 + #include <asm/div64.h> 25 + 26 + static inline u64 div_factor(u64 num, int factor) 27 + { 28 + if (factor == 10) 29 + return num; 30 + num *= factor; 31 + do_div(num, 10); 32 + return num; 33 + } 34 + 35 + static inline u64 div_factor_fine(u64 num, int factor) 36 + { 37 + if (factor == 100) 38 + return num; 39 + num *= factor; 40 + do_div(num, 100); 41 + return num; 42 + } 43 + 44 + #endif
+63 -27
fs/btrfs/ordered-data.c
··· 211 211 init_waitqueue_head(&entry->wait); 212 212 INIT_LIST_HEAD(&entry->list); 213 213 INIT_LIST_HEAD(&entry->root_extent_list); 214 + INIT_LIST_HEAD(&entry->work_list); 215 + init_completion(&entry->completion); 214 216 215 217 trace_btrfs_ordered_extent_add(inode, entry); 216 218 ··· 466 464 wake_up(&entry->wait); 467 465 } 468 466 467 + static void btrfs_run_ordered_extent_work(struct btrfs_work *work) 468 + { 469 + struct btrfs_ordered_extent *ordered; 470 + 471 + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); 472 + btrfs_start_ordered_extent(ordered->inode, ordered, 1); 473 + complete(&ordered->completion); 474 + } 475 + 469 476 /* 470 477 * wait for all the ordered extents in a root. This is done when balancing 471 478 * space between drives. 472 479 */ 473 480 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 474 481 { 475 - struct list_head splice; 482 + struct list_head splice, works; 476 483 struct list_head *cur; 477 - struct btrfs_ordered_extent *ordered; 484 + struct btrfs_ordered_extent *ordered, *next; 478 485 struct inode *inode; 479 486 480 487 INIT_LIST_HEAD(&splice); 488 + INIT_LIST_HEAD(&works); 481 489 482 490 spin_lock(&root->fs_info->ordered_extent_lock); 483 491 list_splice_init(&root->fs_info->ordered_extents, &splice); ··· 506 494 spin_unlock(&root->fs_info->ordered_extent_lock); 507 495 508 496 if (inode) { 509 - btrfs_start_ordered_extent(inode, ordered, 1); 510 - btrfs_put_ordered_extent(ordered); 511 - if (delay_iput) 512 - btrfs_add_delayed_iput(inode); 513 - else 514 - iput(inode); 497 + ordered->flush_work.func = btrfs_run_ordered_extent_work; 498 + list_add_tail(&ordered->work_list, &works); 499 + btrfs_queue_worker(&root->fs_info->flush_workers, 500 + &ordered->flush_work); 515 501 } else { 516 502 btrfs_put_ordered_extent(ordered); 517 503 } 518 504 505 + cond_resched(); 519 506 spin_lock(&root->fs_info->ordered_extent_lock); 520 507 } 521 508 
spin_unlock(&root->fs_info->ordered_extent_lock); 509 + 510 + list_for_each_entry_safe(ordered, next, &works, work_list) { 511 + list_del_init(&ordered->work_list); 512 + wait_for_completion(&ordered->completion); 513 + 514 + inode = ordered->inode; 515 + btrfs_put_ordered_extent(ordered); 516 + if (delay_iput) 517 + btrfs_add_delayed_iput(inode); 518 + else 519 + iput(inode); 520 + 521 + cond_resched(); 522 + } 522 523 } 523 524 524 525 /* ··· 544 519 * extra check to make sure the ordered operation list really is empty 545 520 * before we return 546 521 */ 547 - void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 522 + int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 548 523 { 549 524 struct btrfs_inode *btrfs_inode; 550 525 struct inode *inode; 551 526 struct list_head splice; 527 + struct list_head works; 528 + struct btrfs_delalloc_work *work, *next; 529 + int ret = 0; 552 530 553 531 INIT_LIST_HEAD(&splice); 532 + INIT_LIST_HEAD(&works); 554 533 555 534 mutex_lock(&root->fs_info->ordered_operations_mutex); 556 535 spin_lock(&root->fs_info->ordered_extent_lock); ··· 562 533 list_splice_init(&root->fs_info->ordered_operations, &splice); 563 534 564 535 while (!list_empty(&splice)) { 536 + 565 537 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 538 ordered_operations); 567 539 ··· 579 549 list_add_tail(&BTRFS_I(inode)->ordered_operations, 580 550 &root->fs_info->ordered_operations); 581 551 } 552 + 553 + if (!inode) 554 + continue; 582 555 spin_unlock(&root->fs_info->ordered_extent_lock); 583 556 584 - if (inode) { 585 - if (wait) 586 - btrfs_wait_ordered_range(inode, 0, (u64)-1); 587 - else 588 - filemap_flush(inode->i_mapping); 589 - btrfs_add_delayed_iput(inode); 557 + work = btrfs_alloc_delalloc_work(inode, wait, 1); 558 + if (!work) { 559 + if (list_empty(&BTRFS_I(inode)->ordered_operations)) 560 + list_add_tail(&btrfs_inode->ordered_operations, 561 + &splice); 562 + 
spin_lock(&root->fs_info->ordered_extent_lock); 563 + list_splice_tail(&splice, 564 + &root->fs_info->ordered_operations); 565 + spin_unlock(&root->fs_info->ordered_extent_lock); 566 + ret = -ENOMEM; 567 + goto out; 590 568 } 569 + list_add_tail(&work->list, &works); 570 + btrfs_queue_worker(&root->fs_info->flush_workers, 571 + &work->work); 591 572 592 573 cond_resched(); 593 574 spin_lock(&root->fs_info->ordered_extent_lock); ··· 607 566 goto again; 608 567 609 568 spin_unlock(&root->fs_info->ordered_extent_lock); 569 + out: 570 + list_for_each_entry_safe(work, next, &works, list) { 571 + list_del_init(&work->list); 572 + btrfs_wait_and_free_delalloc_work(work); 573 + } 610 574 mutex_unlock(&root->fs_info->ordered_operations_mutex); 575 + return ret; 611 576 } 612 577 613 578 /* ··· 653 606 u64 end; 654 607 u64 orig_end; 655 608 struct btrfs_ordered_extent *ordered; 656 - int found; 657 609 658 610 if (start + len < start) { 659 611 orig_end = INT_LIMIT(loff_t); ··· 688 642 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 689 643 690 644 end = orig_end; 691 - found = 0; 692 645 while (1) { 693 646 ordered = btrfs_lookup_first_ordered_extent(inode, end); 694 647 if (!ordered) ··· 700 655 btrfs_put_ordered_extent(ordered); 701 656 break; 702 657 } 703 - found++; 704 658 btrfs_start_ordered_extent(inode, ordered, 1); 705 659 end = ordered->file_offset; 706 660 btrfs_put_ordered_extent(ordered); ··· 978 934 if (last_mod < root->fs_info->last_trans_committed) 979 935 return; 980 936 981 - /* 982 - * the transaction is already committing. 
Just start the IO and 983 - * don't bother with all of this list nonsense 984 - */ 985 - if (trans && root->fs_info->running_transaction->blocked) { 986 - btrfs_wait_ordered_range(inode, 0, (u64)-1); 987 - return; 988 - } 989 - 990 937 spin_lock(&root->fs_info->ordered_extent_lock); 991 938 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 992 939 list_add_tail(&BTRFS_I(inode)->ordered_operations, ··· 994 959 NULL); 995 960 if (!btrfs_ordered_extent_cache) 996 961 return -ENOMEM; 962 + 997 963 return 0; 998 964 } 999 965
+5 -2
fs/btrfs/ordered-data.h
··· 128 128 struct list_head root_extent_list; 129 129 130 130 struct btrfs_work work; 131 - }; 132 131 132 + struct completion completion; 133 + struct btrfs_work flush_work; 134 + struct list_head work_list; 135 + }; 133 136 134 137 /* 135 138 * calculates the total size you need to allocate for an ordered sum ··· 189 186 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 190 187 struct btrfs_ordered_extent *ordered); 191 188 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 192 - void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 189 + int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 193 190 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 194 191 struct btrfs_root *root, 195 192 struct inode *inode);
+3
fs/btrfs/print-tree.c
··· 297 297 case BTRFS_DEV_STATS_KEY: 298 298 printk(KERN_INFO "\t\tdevice stats\n"); 299 299 break; 300 + case BTRFS_DEV_REPLACE_KEY: 301 + printk(KERN_INFO "\t\tdev replace\n"); 302 + break; 300 303 }; 301 304 } 302 305 }
+28 -3
fs/btrfs/reada.c
··· 27 27 #include "volumes.h" 28 28 #include "disk-io.h" 29 29 #include "transaction.h" 30 + #include "dev-replace.h" 30 31 31 32 #undef DEBUG 32 33 ··· 324 323 struct reada_extent *re = NULL; 325 324 struct reada_extent *re_exist = NULL; 326 325 struct btrfs_fs_info *fs_info = root->fs_info; 327 - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 328 326 struct btrfs_bio *bbio = NULL; 329 327 struct btrfs_device *dev; 330 328 struct btrfs_device *prev_dev; ··· 332 332 int nzones = 0; 333 333 int i; 334 334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 335 + int dev_replace_is_ongoing; 335 336 336 337 spin_lock(&fs_info->reada_lock); 337 338 re = radix_tree_lookup(&fs_info->reada_tree, index); ··· 359 358 * map block 360 359 */ 361 360 length = blocksize; 362 - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 361 + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, 362 + &bbio, 0); 363 363 if (ret || !bbio || length < blocksize) 364 364 goto error; 365 365 ··· 395 393 } 396 394 397 395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 396 + btrfs_dev_replace_lock(&fs_info->dev_replace); 398 397 spin_lock(&fs_info->reada_lock); 399 398 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 399 if (ret == -EEXIST) { ··· 403 400 BUG_ON(!re_exist); 404 401 re_exist->refcnt++; 405 402 spin_unlock(&fs_info->reada_lock); 403 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 406 404 goto error; 407 405 } 408 406 if (ret) { 409 407 spin_unlock(&fs_info->reada_lock); 408 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 410 409 goto error; 411 410 } 412 411 prev_dev = NULL; 412 + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( 413 + &fs_info->dev_replace); 413 414 for (i = 0; i < nzones; ++i) { 414 415 dev = bbio->stripes[i].dev; 415 416 if (dev == prev_dev) { ··· 426 419 */ 427 420 continue; 428 421 } 422 + if (!dev->bdev) { 423 + /* cannot read ahead on missing device */ 424 
+ continue; 425 + } 426 + if (dev_replace_is_ongoing && 427 + dev == fs_info->dev_replace.tgtdev) { 428 + /* 429 + * as this device is selected for reading only as 430 + * a last resort, skip it for read ahead. 431 + */ 432 + continue; 433 + } 429 434 prev_dev = dev; 430 435 ret = radix_tree_insert(&dev->reada_extents, index, re); 431 436 if (ret) { 432 437 while (--i >= 0) { 433 438 dev = bbio->stripes[i].dev; 434 439 BUG_ON(dev == NULL); 440 + /* ignore whether the entry was inserted */ 435 441 radix_tree_delete(&dev->reada_extents, index); 436 442 } 437 443 BUG_ON(fs_info == NULL); 438 444 radix_tree_delete(&fs_info->reada_tree, index); 439 445 spin_unlock(&fs_info->reada_lock); 446 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 440 447 goto error; 441 448 } 442 449 } 443 450 spin_unlock(&fs_info->reada_lock); 451 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 444 452 445 453 kfree(bbio); 446 454 return re; ··· 937 915 generation = btrfs_header_generation(node); 938 916 free_extent_buffer(node); 939 917 940 - reada_add_block(rc, start, &max_key, level, generation); 918 + if (reada_add_block(rc, start, &max_key, level, generation)) { 919 + kfree(rc); 920 + return ERR_PTR(-ENOMEM); 921 + } 941 922 942 923 reada_start_machine(root->fs_info); 943 924
+19 -21
fs/btrfs/relocation.c
··· 2025 2025 struct btrfs_root_item *root_item; 2026 2026 struct btrfs_path *path; 2027 2027 struct extent_buffer *leaf; 2028 - unsigned long nr; 2029 2028 int level; 2030 2029 int max_level; 2031 2030 int replaced = 0; ··· 2073 2074 BUG_ON(IS_ERR(trans)); 2074 2075 trans->block_rsv = rc->block_rsv; 2075 2076 2076 - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2077 + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, 2078 + BTRFS_RESERVE_FLUSH_ALL); 2077 2079 if (ret) { 2078 2080 BUG_ON(ret != -EAGAIN); 2079 2081 ret = btrfs_commit_transaction(trans, root); ··· 2125 2125 path->slots[level]); 2126 2126 root_item->drop_level = level; 2127 2127 2128 - nr = trans->blocks_used; 2129 2128 btrfs_end_transaction_throttle(trans, root); 2130 2129 2131 - btrfs_btree_balance_dirty(root, nr); 2130 + btrfs_btree_balance_dirty(root); 2132 2131 2133 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2134 2133 invalidate_extent_cache(root, &key, &next_key); ··· 2154 2155 btrfs_update_reloc_root(trans, root); 2155 2156 } 2156 2157 2157 - nr = trans->blocks_used; 2158 2158 btrfs_end_transaction_throttle(trans, root); 2159 2159 2160 - btrfs_btree_balance_dirty(root, nr); 2160 + btrfs_btree_balance_dirty(root); 2161 2161 2162 2162 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2163 2163 invalidate_extent_cache(root, &key, &next_key); ··· 2182 2184 again: 2183 2185 if (!err) { 2184 2186 num_bytes = rc->merging_rsv_size; 2185 - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2187 + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2188 + BTRFS_RESERVE_FLUSH_ALL); 2186 2189 if (ret) 2187 2190 err = ret; 2188 2191 } ··· 2458 2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2459 2460 2460 2461 trans->block_rsv = rc->block_rsv; 2461 - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2462 + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2463 + BTRFS_RESERVE_FLUSH_ALL); 2462 2464 if (ret) { 2463 2465 if (ret == 
-EAGAIN) 2464 2466 rc->commit_transaction = 1; ··· 3259 3259 struct btrfs_path *path; 3260 3260 struct btrfs_root *root = fs_info->tree_root; 3261 3261 struct btrfs_trans_handle *trans; 3262 - unsigned long nr; 3263 3262 int ret = 0; 3264 3263 3265 3264 if (inode) ··· 3292 3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3293 3294 3294 3295 btrfs_free_path(path); 3295 - nr = trans->blocks_used; 3296 3296 btrfs_end_transaction(trans, root); 3297 - btrfs_btree_balance_dirty(root, nr); 3297 + btrfs_btree_balance_dirty(root); 3298 3298 out: 3299 3299 iput(inode); 3300 3300 return ret; ··· 3683 3685 * is no reservation in transaction handle. 3684 3686 */ 3685 3687 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3686 - rc->extent_root->nodesize * 256); 3688 + rc->extent_root->nodesize * 256, 3689 + BTRFS_RESERVE_FLUSH_ALL); 3687 3690 if (ret) 3688 3691 return ret; 3689 3692 ··· 3710 3711 struct btrfs_trans_handle *trans = NULL; 3711 3712 struct btrfs_path *path; 3712 3713 struct btrfs_extent_item *ei; 3713 - unsigned long nr; 3714 3714 u64 flags; 3715 3715 u32 item_size; 3716 3716 int ret; ··· 3826 3828 ret = btrfs_commit_transaction(trans, rc->extent_root); 3827 3829 BUG_ON(ret); 3828 3830 } else { 3829 - nr = trans->blocks_used; 3830 3831 btrfs_end_transaction_throttle(trans, rc->extent_root); 3831 - btrfs_btree_balance_dirty(rc->extent_root, nr); 3832 + btrfs_btree_balance_dirty(rc->extent_root); 3832 3833 } 3833 3834 trans = NULL; 3834 3835 ··· 3857 3860 GFP_NOFS); 3858 3861 3859 3862 if (trans) { 3860 - nr = trans->blocks_used; 3861 3863 btrfs_end_transaction_throttle(trans, rc->extent_root); 3862 - btrfs_btree_balance_dirty(rc->extent_root, nr); 3864 + btrfs_btree_balance_dirty(rc->extent_root); 3863 3865 } 3864 3866 3865 3867 if (!err) { ··· 3937 3941 struct btrfs_trans_handle *trans; 3938 3942 struct btrfs_root *root; 3939 3943 struct btrfs_key key; 3940 - unsigned long nr; 3941 3944 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3942 
3945 int err = 0; 3943 3946 ··· 3964 3969 3965 3970 err = btrfs_orphan_add(trans, inode); 3966 3971 out: 3967 - nr = trans->blocks_used; 3968 3972 btrfs_end_transaction(trans, root); 3969 - btrfs_btree_balance_dirty(root, nr); 3973 + btrfs_btree_balance_dirty(root); 3970 3974 if (err) { 3971 3975 if (inode) 3972 3976 iput(inode); ··· 4051 4057 (unsigned long long)rc->block_group->key.objectid, 4052 4058 (unsigned long long)rc->block_group->flags); 4053 4059 4054 - btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4060 + ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4061 + if (ret < 0) { 4062 + err = ret; 4063 + goto out; 4064 + } 4055 4065 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4056 4066 4057 4067 while (1) {
+2 -2
fs/btrfs/root-tree.c
··· 548 548 struct btrfs_root_item *item = &root->root_item; 549 549 struct timespec ct = CURRENT_TIME; 550 550 551 - spin_lock(&root->root_times_lock); 551 + spin_lock(&root->root_item_lock); 552 552 item->ctransid = cpu_to_le64(trans->transid); 553 553 item->ctime.sec = cpu_to_le64(ct.tv_sec); 554 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 555 - spin_unlock(&root->root_times_lock); 555 + spin_unlock(&root->root_item_lock); 556 556 }
+1333 -503
fs/btrfs/scrub.c
··· 1 1 /* 2 - * Copyright (C) 2011 STRATO. All rights reserved. 2 + * Copyright (C) 2011, 2012 STRATO. All rights reserved. 3 3 * 4 4 * This program is free software; you can redistribute it and/or 5 5 * modify it under the terms of the GNU General Public ··· 25 25 #include "transaction.h" 26 26 #include "backref.h" 27 27 #include "extent_io.h" 28 + #include "dev-replace.h" 28 29 #include "check-integrity.h" 29 30 #include "rcu-string.h" 30 31 ··· 43 42 */ 44 43 45 44 struct scrub_block; 46 - struct scrub_dev; 45 + struct scrub_ctx; 47 46 48 - #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 49 - #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ 47 + /* 48 + * the following three values only influence the performance. 49 + * The last one configures the number of parallel and outstanding I/O 50 + * operations. The first two values configure an upper limit for the number 51 + * of (dynamically allocated) pages that are added to a bio. 52 + */ 53 + #define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ 54 + #define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ 55 + #define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ 56 + 57 + /* 58 + * the following value times PAGE_SIZE needs to be large enough to match the 59 + * largest node/leaf/sector size that shall be supported. 60 + * Values larger than BTRFS_STRIPE_LEN are not supported. 
61 + */ 50 62 #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 51 63 52 64 struct scrub_page { ··· 70 56 u64 generation; 71 57 u64 logical; 72 58 u64 physical; 59 + u64 physical_for_dev_replace; 60 + atomic_t ref_count; 73 61 struct { 74 62 unsigned int mirror_num:8; 75 63 unsigned int have_csum:1; ··· 82 66 83 67 struct scrub_bio { 84 68 int index; 85 - struct scrub_dev *sdev; 69 + struct scrub_ctx *sctx; 70 + struct btrfs_device *dev; 86 71 struct bio *bio; 87 72 int err; 88 73 u64 logical; 89 74 u64 physical; 90 - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 75 + #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO 76 + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; 77 + #else 78 + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; 79 + #endif 91 80 int page_count; 92 81 int next_free; 93 82 struct btrfs_work work; 94 83 }; 95 84 96 85 struct scrub_block { 97 - struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 86 + struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 98 87 int page_count; 99 88 atomic_t outstanding_pages; 100 89 atomic_t ref_count; /* free mem on transition to zero */ 101 - struct scrub_dev *sdev; 90 + struct scrub_ctx *sctx; 102 91 struct { 103 92 unsigned int header_error:1; 104 93 unsigned int checksum_error:1; ··· 112 91 }; 113 92 }; 114 93 115 - struct scrub_dev { 116 - struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; 117 - struct btrfs_device *dev; 94 + struct scrub_wr_ctx { 95 + struct scrub_bio *wr_curr_bio; 96 + struct btrfs_device *tgtdev; 97 + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ 98 + atomic_t flush_all_writes; 99 + struct mutex wr_lock; 100 + }; 101 + 102 + struct scrub_ctx { 103 + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; 104 + struct btrfs_root *dev_root; 118 105 int first_free; 119 106 int curr; 120 - atomic_t in_flight; 121 - atomic_t fixup_cnt; 107 + atomic_t bios_in_flight; 108 + atomic_t workers_pending; 122 109 spinlock_t list_lock; 123 110 wait_queue_head_t list_wait; 124 111 u16 
csum_size; 125 112 struct list_head csum_list; 126 113 atomic_t cancel_req; 127 114 int readonly; 128 - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ 115 + int pages_per_rd_bio; 129 116 u32 sectorsize; 130 117 u32 nodesize; 131 118 u32 leafsize; 119 + 120 + int is_dev_replace; 121 + struct scrub_wr_ctx wr_ctx; 122 + 132 123 /* 133 124 * statistics 134 125 */ ··· 149 116 }; 150 117 151 118 struct scrub_fixup_nodatasum { 152 - struct scrub_dev *sdev; 119 + struct scrub_ctx *sctx; 120 + struct btrfs_device *dev; 153 121 u64 logical; 154 122 struct btrfs_root *root; 155 123 struct btrfs_work work; 156 124 int mirror_num; 125 + }; 126 + 127 + struct scrub_copy_nocow_ctx { 128 + struct scrub_ctx *sctx; 129 + u64 logical; 130 + u64 len; 131 + int mirror_num; 132 + u64 physical_for_dev_replace; 133 + struct btrfs_work work; 157 134 }; 158 135 159 136 struct scrub_warning { ··· 180 137 }; 181 138 182 139 140 + static void scrub_pending_bio_inc(struct scrub_ctx *sctx); 141 + static void scrub_pending_bio_dec(struct scrub_ctx *sctx); 142 + static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); 143 + static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); 183 144 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 184 - static int scrub_setup_recheck_block(struct scrub_dev *sdev, 185 - struct btrfs_mapping_tree *map_tree, 145 + static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 146 + struct btrfs_fs_info *fs_info, 147 + struct scrub_block *original_sblock, 186 148 u64 length, u64 logical, 187 - struct scrub_block *sblock); 188 - static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 189 - struct scrub_block *sblock, int is_metadata, 190 - int have_csum, u8 *csum, u64 generation, 191 - u16 csum_size); 149 + struct scrub_block *sblocks_for_recheck); 150 + static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 151 + struct scrub_block *sblock, int is_metadata, 152 + int have_csum, u8 *csum, u64 
generation, 153 + u16 csum_size); 192 154 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 193 155 struct scrub_block *sblock, 194 156 int is_metadata, int have_csum, ··· 206 158 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 207 159 struct scrub_block *sblock_good, 208 160 int page_num, int force_write); 161 + static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); 162 + static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, 163 + int page_num); 209 164 static int scrub_checksum_data(struct scrub_block *sblock); 210 165 static int scrub_checksum_tree_block(struct scrub_block *sblock); 211 166 static int scrub_checksum_super(struct scrub_block *sblock); 212 167 static void scrub_block_get(struct scrub_block *sblock); 213 168 static void scrub_block_put(struct scrub_block *sblock); 214 - static int scrub_add_page_to_bio(struct scrub_dev *sdev, 215 - struct scrub_page *spage); 216 - static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 217 - u64 physical, u64 flags, u64 gen, int mirror_num, 218 - u8 *csum, int force); 169 + static void scrub_page_get(struct scrub_page *spage); 170 + static void scrub_page_put(struct scrub_page *spage); 171 + static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 172 + struct scrub_page *spage); 173 + static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 174 + u64 physical, struct btrfs_device *dev, u64 flags, 175 + u64 gen, int mirror_num, u8 *csum, int force, 176 + u64 physical_for_dev_replace); 219 177 static void scrub_bio_end_io(struct bio *bio, int err); 220 178 static void scrub_bio_end_io_worker(struct btrfs_work *work); 221 179 static void scrub_block_complete(struct scrub_block *sblock); 180 + static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 181 + u64 extent_logical, u64 extent_len, 182 + u64 *extent_physical, 183 + struct btrfs_device **extent_dev, 184 + int *extent_mirror_num); 185 + static 
int scrub_setup_wr_ctx(struct scrub_ctx *sctx, 186 + struct scrub_wr_ctx *wr_ctx, 187 + struct btrfs_fs_info *fs_info, 188 + struct btrfs_device *dev, 189 + int is_dev_replace); 190 + static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); 191 + static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 192 + struct scrub_page *spage); 193 + static void scrub_wr_submit(struct scrub_ctx *sctx); 194 + static void scrub_wr_bio_end_io(struct bio *bio, int err); 195 + static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); 196 + static int write_page_nocow(struct scrub_ctx *sctx, 197 + u64 physical_for_dev_replace, struct page *page); 198 + static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, 199 + void *ctx); 200 + static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 201 + int mirror_num, u64 physical_for_dev_replace); 202 + static void copy_nocow_pages_worker(struct btrfs_work *work); 222 203 223 204 224 - static void scrub_free_csums(struct scrub_dev *sdev) 205 + static void scrub_pending_bio_inc(struct scrub_ctx *sctx) 225 206 { 226 - while (!list_empty(&sdev->csum_list)) { 207 + atomic_inc(&sctx->bios_in_flight); 208 + } 209 + 210 + static void scrub_pending_bio_dec(struct scrub_ctx *sctx) 211 + { 212 + atomic_dec(&sctx->bios_in_flight); 213 + wake_up(&sctx->list_wait); 214 + } 215 + 216 + /* 217 + * used for workers that require transaction commits (i.e., for the 218 + * NOCOW case) 219 + */ 220 + static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) 221 + { 222 + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 223 + 224 + /* 225 + * increment scrubs_running to prevent cancel requests from 226 + * completing as long as a worker is running. we must also 227 + * increment scrubs_paused to prevent deadlocking on pause 228 + * requests used for transactions commits (as the worker uses a 229 + * transaction context). it is safe to regard the worker 230 + * as paused for all matters practical. 
effectively, we only 231 + * avoid cancellation requests from completing. 232 + */ 233 + mutex_lock(&fs_info->scrub_lock); 234 + atomic_inc(&fs_info->scrubs_running); 235 + atomic_inc(&fs_info->scrubs_paused); 236 + mutex_unlock(&fs_info->scrub_lock); 237 + atomic_inc(&sctx->workers_pending); 238 + } 239 + 240 + /* used for workers that require transaction commits */ 241 + static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) 242 + { 243 + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 244 + 245 + /* 246 + * see scrub_pending_trans_workers_inc() why we're pretending 247 + * to be paused in the scrub counters 248 + */ 249 + mutex_lock(&fs_info->scrub_lock); 250 + atomic_dec(&fs_info->scrubs_running); 251 + atomic_dec(&fs_info->scrubs_paused); 252 + mutex_unlock(&fs_info->scrub_lock); 253 + atomic_dec(&sctx->workers_pending); 254 + wake_up(&fs_info->scrub_pause_wait); 255 + wake_up(&sctx->list_wait); 256 + } 257 + 258 + static void scrub_free_csums(struct scrub_ctx *sctx) 259 + { 260 + while (!list_empty(&sctx->csum_list)) { 227 261 struct btrfs_ordered_sum *sum; 228 - sum = list_first_entry(&sdev->csum_list, 262 + sum = list_first_entry(&sctx->csum_list, 229 263 struct btrfs_ordered_sum, list); 230 264 list_del(&sum->list); 231 265 kfree(sum); 232 266 } 233 267 } 234 268 235 - static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 269 + static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) 236 270 { 237 271 int i; 238 272 239 - if (!sdev) 273 + if (!sctx) 240 274 return; 241 275 276 + scrub_free_wr_ctx(&sctx->wr_ctx); 277 + 242 278 /* this can happen when scrub is cancelled */ 243 - if (sdev->curr != -1) { 244 - struct scrub_bio *sbio = sdev->bios[sdev->curr]; 279 + if (sctx->curr != -1) { 280 + struct scrub_bio *sbio = sctx->bios[sctx->curr]; 245 281 246 282 for (i = 0; i < sbio->page_count; i++) { 247 - BUG_ON(!sbio->pagev[i]); 248 - BUG_ON(!sbio->pagev[i]->page); 283 + WARN_ON(!sbio->pagev[i]->page); 249 284 
scrub_block_put(sbio->pagev[i]->sblock); 250 285 } 251 286 bio_put(sbio->bio); 252 287 } 253 288 254 - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 255 - struct scrub_bio *sbio = sdev->bios[i]; 289 + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 290 + struct scrub_bio *sbio = sctx->bios[i]; 256 291 257 292 if (!sbio) 258 293 break; 259 294 kfree(sbio); 260 295 } 261 296 262 - scrub_free_csums(sdev); 263 - kfree(sdev); 297 + scrub_free_csums(sctx); 298 + kfree(sctx); 264 299 } 265 300 266 301 static noinline_for_stack 267 - struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) 302 + struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) 268 303 { 269 - struct scrub_dev *sdev; 304 + struct scrub_ctx *sctx; 270 305 int i; 271 306 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 272 - int pages_per_bio; 307 + int pages_per_rd_bio; 308 + int ret; 273 309 274 - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, 275 - bio_get_nr_vecs(dev->bdev)); 276 - sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 277 - if (!sdev) 310 + /* 311 + * the setting of pages_per_rd_bio is correct for scrub but might 312 + * be wrong for the dev_replace code where we might read from 313 + * different devices in the initial huge bios. However, that 314 + * code is able to correctly handle the case when adding a page 315 + * to a bio fails. 
316 + */ 317 + if (dev->bdev) 318 + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, 319 + bio_get_nr_vecs(dev->bdev)); 320 + else 321 + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; 322 + sctx = kzalloc(sizeof(*sctx), GFP_NOFS); 323 + if (!sctx) 278 324 goto nomem; 279 - sdev->dev = dev; 280 - sdev->pages_per_bio = pages_per_bio; 281 - sdev->curr = -1; 282 - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 325 + sctx->is_dev_replace = is_dev_replace; 326 + sctx->pages_per_rd_bio = pages_per_rd_bio; 327 + sctx->curr = -1; 328 + sctx->dev_root = dev->dev_root; 329 + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 283 330 struct scrub_bio *sbio; 284 331 285 332 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 286 333 if (!sbio) 287 334 goto nomem; 288 - sdev->bios[i] = sbio; 335 + sctx->bios[i] = sbio; 289 336 290 337 sbio->index = i; 291 - sbio->sdev = sdev; 338 + sbio->sctx = sctx; 292 339 sbio->page_count = 0; 293 340 sbio->work.func = scrub_bio_end_io_worker; 294 341 295 - if (i != SCRUB_BIOS_PER_DEV-1) 296 - sdev->bios[i]->next_free = i + 1; 342 + if (i != SCRUB_BIOS_PER_SCTX - 1) 343 + sctx->bios[i]->next_free = i + 1; 297 344 else 298 - sdev->bios[i]->next_free = -1; 345 + sctx->bios[i]->next_free = -1; 299 346 } 300 - sdev->first_free = 0; 301 - sdev->nodesize = dev->dev_root->nodesize; 302 - sdev->leafsize = dev->dev_root->leafsize; 303 - sdev->sectorsize = dev->dev_root->sectorsize; 304 - atomic_set(&sdev->in_flight, 0); 305 - atomic_set(&sdev->fixup_cnt, 0); 306 - atomic_set(&sdev->cancel_req, 0); 307 - sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); 308 - INIT_LIST_HEAD(&sdev->csum_list); 347 + sctx->first_free = 0; 348 + sctx->nodesize = dev->dev_root->nodesize; 349 + sctx->leafsize = dev->dev_root->leafsize; 350 + sctx->sectorsize = dev->dev_root->sectorsize; 351 + atomic_set(&sctx->bios_in_flight, 0); 352 + atomic_set(&sctx->workers_pending, 0); 353 + atomic_set(&sctx->cancel_req, 0); 354 + sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); 
355 + INIT_LIST_HEAD(&sctx->csum_list); 309 356 310 - spin_lock_init(&sdev->list_lock); 311 - spin_lock_init(&sdev->stat_lock); 312 - init_waitqueue_head(&sdev->list_wait); 313 - return sdev; 357 + spin_lock_init(&sctx->list_lock); 358 + spin_lock_init(&sctx->stat_lock); 359 + init_waitqueue_head(&sctx->list_wait); 360 + 361 + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, 362 + fs_info->dev_replace.tgtdev, is_dev_replace); 363 + if (ret) { 364 + scrub_free_ctx(sctx); 365 + return ERR_PTR(ret); 366 + } 367 + return sctx; 314 368 315 369 nomem: 316 - scrub_free_dev(sdev); 370 + scrub_free_ctx(sctx); 317 371 return ERR_PTR(-ENOMEM); 318 372 } 319 373 320 - static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) 374 + static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, 375 + void *warn_ctx) 321 376 { 322 377 u64 isize; 323 378 u32 nlink; ··· 428 277 int i; 429 278 struct extent_buffer *eb; 430 279 struct btrfs_inode_item *inode_item; 431 - struct scrub_warning *swarn = ctx; 280 + struct scrub_warning *swarn = warn_ctx; 432 281 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; 433 282 struct inode_fs_paths *ipath = NULL; 434 283 struct btrfs_root *local_root; ··· 496 345 497 346 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 498 347 { 499 - struct btrfs_device *dev = sblock->sdev->dev; 500 - struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 348 + struct btrfs_device *dev; 349 + struct btrfs_fs_info *fs_info; 501 350 struct btrfs_path *path; 502 351 struct btrfs_key found_key; 503 352 struct extent_buffer *eb; ··· 512 361 const int bufsize = 4096; 513 362 int ret; 514 363 364 + WARN_ON(sblock->page_count < 1); 365 + dev = sblock->pagev[0]->dev; 366 + fs_info = sblock->sctx->dev_root->fs_info; 367 + 515 368 path = btrfs_alloc_path(); 516 369 517 370 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 518 371 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 519 - 
BUG_ON(sblock->page_count < 1); 520 - swarn.sector = (sblock->pagev[0].physical) >> 9; 521 - swarn.logical = sblock->pagev[0].logical; 372 + swarn.sector = (sblock->pagev[0]->physical) >> 9; 373 + swarn.logical = sblock->pagev[0]->logical; 522 374 swarn.errstr = errstr; 523 - swarn.dev = dev; 375 + swarn.dev = NULL; 524 376 swarn.msg_bufsize = bufsize; 525 377 swarn.scratch_bufsize = bufsize; 526 378 ··· 559 405 } while (ret != 1); 560 406 } else { 561 407 swarn.path = path; 408 + swarn.dev = dev; 562 409 iterate_extent_inodes(fs_info, found_key.objectid, 563 410 extent_item_pos, 1, 564 411 scrub_print_warning_inode, &swarn); ··· 571 416 kfree(swarn.msg_buf); 572 417 } 573 418 574 - static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) 419 + static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) 575 420 { 576 421 struct page *page = NULL; 577 422 unsigned long index; 578 - struct scrub_fixup_nodatasum *fixup = ctx; 423 + struct scrub_fixup_nodatasum *fixup = fixup_ctx; 579 424 int ret; 580 425 int corrected = 0; 581 426 struct btrfs_key key; ··· 606 451 } 607 452 608 453 if (PageUptodate(page)) { 609 - struct btrfs_mapping_tree *map_tree; 454 + struct btrfs_fs_info *fs_info; 610 455 if (PageDirty(page)) { 611 456 /* 612 457 * we need to write the data to the defect sector. 
the ··· 627 472 ret = -EIO; 628 473 goto out; 629 474 } 630 - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 631 - ret = repair_io_failure(map_tree, offset, PAGE_SIZE, 475 + fs_info = BTRFS_I(inode)->root->fs_info; 476 + ret = repair_io_failure(fs_info, offset, PAGE_SIZE, 632 477 fixup->logical, page, 633 478 fixup->mirror_num); 634 479 unlock_page(page); ··· 685 530 { 686 531 int ret; 687 532 struct scrub_fixup_nodatasum *fixup; 688 - struct scrub_dev *sdev; 533 + struct scrub_ctx *sctx; 689 534 struct btrfs_trans_handle *trans = NULL; 690 535 struct btrfs_fs_info *fs_info; 691 536 struct btrfs_path *path; 692 537 int uncorrectable = 0; 693 538 694 539 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 695 - sdev = fixup->sdev; 540 + sctx = fixup->sctx; 696 541 fs_info = fixup->root->fs_info; 697 542 698 543 path = btrfs_alloc_path(); 699 544 if (!path) { 700 - spin_lock(&sdev->stat_lock); 701 - ++sdev->stat.malloc_errors; 702 - spin_unlock(&sdev->stat_lock); 545 + spin_lock(&sctx->stat_lock); 546 + ++sctx->stat.malloc_errors; 547 + spin_unlock(&sctx->stat_lock); 703 548 uncorrectable = 1; 704 549 goto out; 705 550 } ··· 728 573 } 729 574 WARN_ON(ret != 1); 730 575 731 - spin_lock(&sdev->stat_lock); 732 - ++sdev->stat.corrected_errors; 733 - spin_unlock(&sdev->stat_lock); 576 + spin_lock(&sctx->stat_lock); 577 + ++sctx->stat.corrected_errors; 578 + spin_unlock(&sctx->stat_lock); 734 579 735 580 out: 736 581 if (trans && !IS_ERR(trans)) 737 582 btrfs_end_transaction(trans, fixup->root); 738 583 if (uncorrectable) { 739 - spin_lock(&sdev->stat_lock); 740 - ++sdev->stat.uncorrectable_errors; 741 - spin_unlock(&sdev->stat_lock); 742 - 584 + spin_lock(&sctx->stat_lock); 585 + ++sctx->stat.uncorrectable_errors; 586 + spin_unlock(&sctx->stat_lock); 587 + btrfs_dev_replace_stats_inc( 588 + &sctx->dev_root->fs_info->dev_replace. 
589 + num_uncorrectable_read_errors); 743 590 printk_ratelimited_in_rcu(KERN_ERR 744 591 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", 745 592 (unsigned long long)fixup->logical, 746 - rcu_str_deref(sdev->dev->name)); 593 + rcu_str_deref(fixup->dev->name)); 747 594 } 748 595 749 596 btrfs_free_path(path); 750 597 kfree(fixup); 751 598 752 - /* see caller why we're pretending to be paused in the scrub counters */ 753 - mutex_lock(&fs_info->scrub_lock); 754 - atomic_dec(&fs_info->scrubs_running); 755 - atomic_dec(&fs_info->scrubs_paused); 756 - mutex_unlock(&fs_info->scrub_lock); 757 - atomic_dec(&sdev->fixup_cnt); 758 - wake_up(&fs_info->scrub_pause_wait); 759 - wake_up(&sdev->list_wait); 599 + scrub_pending_trans_workers_dec(sctx); 760 600 } 761 601 762 602 /* ··· 764 614 */ 765 615 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) 766 616 { 767 - struct scrub_dev *sdev = sblock_to_check->sdev; 617 + struct scrub_ctx *sctx = sblock_to_check->sctx; 618 + struct btrfs_device *dev; 768 619 struct btrfs_fs_info *fs_info; 769 620 u64 length; 770 621 u64 logical; ··· 784 633 DEFAULT_RATELIMIT_BURST); 785 634 786 635 BUG_ON(sblock_to_check->page_count < 1); 787 - fs_info = sdev->dev->dev_root->fs_info; 636 + fs_info = sctx->dev_root->fs_info; 637 + if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { 638 + /* 639 + * if we find an error in a super block, we just report it. 
640 + * They will get written with the next transaction commit 641 + * anyway 642 + */ 643 + spin_lock(&sctx->stat_lock); 644 + ++sctx->stat.super_errors; 645 + spin_unlock(&sctx->stat_lock); 646 + return 0; 647 + } 788 648 length = sblock_to_check->page_count * PAGE_SIZE; 789 - logical = sblock_to_check->pagev[0].logical; 790 - generation = sblock_to_check->pagev[0].generation; 791 - BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); 792 - failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; 793 - is_metadata = !(sblock_to_check->pagev[0].flags & 649 + logical = sblock_to_check->pagev[0]->logical; 650 + generation = sblock_to_check->pagev[0]->generation; 651 + BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); 652 + failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; 653 + is_metadata = !(sblock_to_check->pagev[0]->flags & 794 654 BTRFS_EXTENT_FLAG_DATA); 795 - have_csum = sblock_to_check->pagev[0].have_csum; 796 - csum = sblock_to_check->pagev[0].csum; 655 + have_csum = sblock_to_check->pagev[0]->have_csum; 656 + csum = sblock_to_check->pagev[0]->csum; 657 + dev = sblock_to_check->pagev[0]->dev; 658 + 659 + if (sctx->is_dev_replace && !is_metadata && !have_csum) { 660 + sblocks_for_recheck = NULL; 661 + goto nodatasum_case; 662 + } 797 663 798 664 /* 799 665 * read all mirrors one after the other. 
This includes to ··· 845 677 sizeof(*sblocks_for_recheck), 846 678 GFP_NOFS); 847 679 if (!sblocks_for_recheck) { 848 - spin_lock(&sdev->stat_lock); 849 - sdev->stat.malloc_errors++; 850 - sdev->stat.read_errors++; 851 - sdev->stat.uncorrectable_errors++; 852 - spin_unlock(&sdev->stat_lock); 853 - btrfs_dev_stat_inc_and_print(sdev->dev, 854 - BTRFS_DEV_STAT_READ_ERRS); 680 + spin_lock(&sctx->stat_lock); 681 + sctx->stat.malloc_errors++; 682 + sctx->stat.read_errors++; 683 + sctx->stat.uncorrectable_errors++; 684 + spin_unlock(&sctx->stat_lock); 685 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 855 686 goto out; 856 687 } 857 688 858 689 /* setup the context, map the logical blocks and alloc the pages */ 859 - ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, 690 + ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, 860 691 logical, sblocks_for_recheck); 861 692 if (ret) { 862 - spin_lock(&sdev->stat_lock); 863 - sdev->stat.read_errors++; 864 - sdev->stat.uncorrectable_errors++; 865 - spin_unlock(&sdev->stat_lock); 866 - btrfs_dev_stat_inc_and_print(sdev->dev, 867 - BTRFS_DEV_STAT_READ_ERRS); 693 + spin_lock(&sctx->stat_lock); 694 + sctx->stat.read_errors++; 695 + sctx->stat.uncorrectable_errors++; 696 + spin_unlock(&sctx->stat_lock); 697 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 868 698 goto out; 869 699 } 870 700 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 871 701 sblock_bad = sblocks_for_recheck + failed_mirror_index; 872 702 873 703 /* build and submit the bios for the failed mirror, check checksums */ 874 - ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 875 - csum, generation, sdev->csum_size); 876 - if (ret) { 877 - spin_lock(&sdev->stat_lock); 878 - sdev->stat.read_errors++; 879 - sdev->stat.uncorrectable_errors++; 880 - spin_unlock(&sdev->stat_lock); 881 - btrfs_dev_stat_inc_and_print(sdev->dev, 882 - BTRFS_DEV_STAT_READ_ERRS); 883 - goto out; 884 - } 
704 + scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 705 + csum, generation, sctx->csum_size); 885 706 886 707 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 887 708 sblock_bad->no_io_error_seen) { ··· 882 725 * different bio (usually one of the two latter cases is 883 726 * the cause) 884 727 */ 885 - spin_lock(&sdev->stat_lock); 886 - sdev->stat.unverified_errors++; 887 - spin_unlock(&sdev->stat_lock); 728 + spin_lock(&sctx->stat_lock); 729 + sctx->stat.unverified_errors++; 730 + spin_unlock(&sctx->stat_lock); 888 731 732 + if (sctx->is_dev_replace) 733 + scrub_write_block_to_dev_replace(sblock_bad); 889 734 goto out; 890 735 } 891 736 892 737 if (!sblock_bad->no_io_error_seen) { 893 - spin_lock(&sdev->stat_lock); 894 - sdev->stat.read_errors++; 895 - spin_unlock(&sdev->stat_lock); 738 + spin_lock(&sctx->stat_lock); 739 + sctx->stat.read_errors++; 740 + spin_unlock(&sctx->stat_lock); 896 741 if (__ratelimit(&_rs)) 897 742 scrub_print_warning("i/o error", sblock_to_check); 898 - btrfs_dev_stat_inc_and_print(sdev->dev, 899 - BTRFS_DEV_STAT_READ_ERRS); 743 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 900 744 } else if (sblock_bad->checksum_error) { 901 - spin_lock(&sdev->stat_lock); 902 - sdev->stat.csum_errors++; 903 - spin_unlock(&sdev->stat_lock); 745 + spin_lock(&sctx->stat_lock); 746 + sctx->stat.csum_errors++; 747 + spin_unlock(&sctx->stat_lock); 904 748 if (__ratelimit(&_rs)) 905 749 scrub_print_warning("checksum error", sblock_to_check); 906 - btrfs_dev_stat_inc_and_print(sdev->dev, 750 + btrfs_dev_stat_inc_and_print(dev, 907 751 BTRFS_DEV_STAT_CORRUPTION_ERRS); 908 752 } else if (sblock_bad->header_error) { 909 - spin_lock(&sdev->stat_lock); 910 - sdev->stat.verify_errors++; 911 - spin_unlock(&sdev->stat_lock); 753 + spin_lock(&sctx->stat_lock); 754 + sctx->stat.verify_errors++; 755 + spin_unlock(&sctx->stat_lock); 912 756 if (__ratelimit(&_rs)) 913 757 scrub_print_warning("checksum/header error", 914 758 
sblock_to_check); 915 759 if (sblock_bad->generation_error) 916 - btrfs_dev_stat_inc_and_print(sdev->dev, 760 + btrfs_dev_stat_inc_and_print(dev, 917 761 BTRFS_DEV_STAT_GENERATION_ERRS); 918 762 else 919 - btrfs_dev_stat_inc_and_print(sdev->dev, 763 + btrfs_dev_stat_inc_and_print(dev, 920 764 BTRFS_DEV_STAT_CORRUPTION_ERRS); 921 765 } 922 766 923 - if (sdev->readonly) 767 + if (sctx->readonly && !sctx->is_dev_replace) 924 768 goto did_not_correct_error; 925 769 926 770 if (!is_metadata && !have_csum) { 927 771 struct scrub_fixup_nodatasum *fixup_nodatasum; 772 + 773 + nodatasum_case: 774 + WARN_ON(sctx->is_dev_replace); 928 775 929 776 /* 930 777 * !is_metadata and !have_csum, this means that the data ··· 940 779 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); 941 780 if (!fixup_nodatasum) 942 781 goto did_not_correct_error; 943 - fixup_nodatasum->sdev = sdev; 782 + fixup_nodatasum->sctx = sctx; 783 + fixup_nodatasum->dev = dev; 944 784 fixup_nodatasum->logical = logical; 945 785 fixup_nodatasum->root = fs_info->extent_root; 946 786 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 947 - /* 948 - * increment scrubs_running to prevent cancel requests from 949 - * completing as long as a fixup worker is running. we must also 950 - * increment scrubs_paused to prevent deadlocking on pause 951 - * requests used for transactions commits (as the worker uses a 952 - * transaction context). it is safe to regard the fixup worker 953 - * as paused for all matters practical. effectively, we only 954 - * avoid cancellation requests from completing. 
955 - */ 956 - mutex_lock(&fs_info->scrub_lock); 957 - atomic_inc(&fs_info->scrubs_running); 958 - atomic_inc(&fs_info->scrubs_paused); 959 - mutex_unlock(&fs_info->scrub_lock); 960 - atomic_inc(&sdev->fixup_cnt); 787 + scrub_pending_trans_workers_inc(sctx); 961 788 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 962 789 btrfs_queue_worker(&fs_info->scrub_workers, 963 790 &fixup_nodatasum->work); ··· 954 805 955 806 /* 956 807 * now build and submit the bios for the other mirrors, check 957 - * checksums 958 - */ 959 - for (mirror_index = 0; 960 - mirror_index < BTRFS_MAX_MIRRORS && 961 - sblocks_for_recheck[mirror_index].page_count > 0; 962 - mirror_index++) { 963 - if (mirror_index == failed_mirror_index) 964 - continue; 965 - 966 - /* build and submit the bios, check checksums */ 967 - ret = scrub_recheck_block(fs_info, 968 - sblocks_for_recheck + mirror_index, 969 - is_metadata, have_csum, csum, 970 - generation, sdev->csum_size); 971 - if (ret) 972 - goto did_not_correct_error; 973 - } 974 - 975 - /* 976 - * first try to pick the mirror which is completely without I/O 808 + * checksums. 809 + * First try to pick the mirror which is completely without I/O 977 810 * errors and also does not have a checksum error. 978 811 * If one is found, and if a checksum is present, the full block 979 812 * that is known to contain an error is rewritten. 
Afterwards ··· 971 840 mirror_index < BTRFS_MAX_MIRRORS && 972 841 sblocks_for_recheck[mirror_index].page_count > 0; 973 842 mirror_index++) { 974 - struct scrub_block *sblock_other = sblocks_for_recheck + 975 - mirror_index; 843 + struct scrub_block *sblock_other; 844 + 845 + if (mirror_index == failed_mirror_index) 846 + continue; 847 + sblock_other = sblocks_for_recheck + mirror_index; 848 + 849 + /* build and submit the bios, check checksums */ 850 + scrub_recheck_block(fs_info, sblock_other, is_metadata, 851 + have_csum, csum, generation, 852 + sctx->csum_size); 976 853 977 854 if (!sblock_other->header_error && 978 855 !sblock_other->checksum_error && 979 856 sblock_other->no_io_error_seen) { 980 - int force_write = is_metadata || have_csum; 857 + if (sctx->is_dev_replace) { 858 + scrub_write_block_to_dev_replace(sblock_other); 859 + } else { 860 + int force_write = is_metadata || have_csum; 981 861 982 - ret = scrub_repair_block_from_good_copy(sblock_bad, 983 - sblock_other, 984 - force_write); 862 + ret = scrub_repair_block_from_good_copy( 863 + sblock_bad, sblock_other, 864 + force_write); 865 + } 985 866 if (0 == ret) 986 867 goto corrected_error; 987 868 } 988 869 } 989 870 990 871 /* 991 - * in case of I/O errors in the area that is supposed to be 872 + * for dev_replace, pick good pages and write to the target device. 
873 + */ 874 + if (sctx->is_dev_replace) { 875 + success = 1; 876 + for (page_num = 0; page_num < sblock_bad->page_count; 877 + page_num++) { 878 + int sub_success; 879 + 880 + sub_success = 0; 881 + for (mirror_index = 0; 882 + mirror_index < BTRFS_MAX_MIRRORS && 883 + sblocks_for_recheck[mirror_index].page_count > 0; 884 + mirror_index++) { 885 + struct scrub_block *sblock_other = 886 + sblocks_for_recheck + mirror_index; 887 + struct scrub_page *page_other = 888 + sblock_other->pagev[page_num]; 889 + 890 + if (!page_other->io_error) { 891 + ret = scrub_write_page_to_dev_replace( 892 + sblock_other, page_num); 893 + if (ret == 0) { 894 + /* succeeded for this page */ 895 + sub_success = 1; 896 + break; 897 + } else { 898 + btrfs_dev_replace_stats_inc( 899 + &sctx->dev_root-> 900 + fs_info->dev_replace. 901 + num_write_errors); 902 + } 903 + } 904 + } 905 + 906 + if (!sub_success) { 907 + /* 908 + * did not find a mirror to fetch the page 909 + * from. scrub_write_page_to_dev_replace() 910 + * handles this case (page->io_error), by 911 + * filling the block with zeros before 912 + * submitting the write request 913 + */ 914 + success = 0; 915 + ret = scrub_write_page_to_dev_replace( 916 + sblock_bad, page_num); 917 + if (ret) 918 + btrfs_dev_replace_stats_inc( 919 + &sctx->dev_root->fs_info-> 920 + dev_replace.num_write_errors); 921 + } 922 + } 923 + 924 + goto out; 925 + } 926 + 927 + /* 928 + * for regular scrub, repair those pages that are errored. 929 + * In case of I/O errors in the area that is supposed to be 992 930 * repaired, continue by picking good copies of those pages. 993 931 * Select the good pages from mirrors to rewrite bad pages from 994 932 * the area to fix. 
Afterwards verify the checksum of the block ··· 1087 887 1088 888 success = 1; 1089 889 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1090 - struct scrub_page *page_bad = sblock_bad->pagev + page_num; 890 + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; 1091 891 1092 892 if (!page_bad->io_error) 1093 893 continue; ··· 1098 898 mirror_index++) { 1099 899 struct scrub_block *sblock_other = sblocks_for_recheck + 1100 900 mirror_index; 1101 - struct scrub_page *page_other = sblock_other->pagev + 1102 - page_num; 901 + struct scrub_page *page_other = sblock_other->pagev[ 902 + page_num]; 1103 903 1104 904 if (!page_other->io_error) { 1105 905 ret = scrub_repair_page_from_good_copy( ··· 1128 928 * is verified, but most likely the data comes out 1129 929 * of the page cache. 1130 930 */ 1131 - ret = scrub_recheck_block(fs_info, sblock_bad, 1132 - is_metadata, have_csum, csum, 1133 - generation, sdev->csum_size); 1134 - if (!ret && !sblock_bad->header_error && 931 + scrub_recheck_block(fs_info, sblock_bad, 932 + is_metadata, have_csum, csum, 933 + generation, sctx->csum_size); 934 + if (!sblock_bad->header_error && 1135 935 !sblock_bad->checksum_error && 1136 936 sblock_bad->no_io_error_seen) 1137 937 goto corrected_error; ··· 1139 939 goto did_not_correct_error; 1140 940 } else { 1141 941 corrected_error: 1142 - spin_lock(&sdev->stat_lock); 1143 - sdev->stat.corrected_errors++; 1144 - spin_unlock(&sdev->stat_lock); 942 + spin_lock(&sctx->stat_lock); 943 + sctx->stat.corrected_errors++; 944 + spin_unlock(&sctx->stat_lock); 1145 945 printk_ratelimited_in_rcu(KERN_ERR 1146 946 "btrfs: fixed up error at logical %llu on dev %s\n", 1147 947 (unsigned long long)logical, 1148 - rcu_str_deref(sdev->dev->name)); 948 + rcu_str_deref(dev->name)); 1149 949 } 1150 950 } else { 1151 951 did_not_correct_error: 1152 - spin_lock(&sdev->stat_lock); 1153 - sdev->stat.uncorrectable_errors++; 1154 - spin_unlock(&sdev->stat_lock); 952 + 
spin_lock(&sctx->stat_lock); 953 + sctx->stat.uncorrectable_errors++; 954 + spin_unlock(&sctx->stat_lock); 1155 955 printk_ratelimited_in_rcu(KERN_ERR 1156 956 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1157 957 (unsigned long long)logical, 1158 - rcu_str_deref(sdev->dev->name)); 958 + rcu_str_deref(dev->name)); 1159 959 } 1160 960 1161 961 out: ··· 1166 966 mirror_index; 1167 967 int page_index; 1168 968 1169 - for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; 1170 - page_index++) 1171 - if (sblock->pagev[page_index].page) 1172 - __free_page( 1173 - sblock->pagev[page_index].page); 969 + for (page_index = 0; page_index < sblock->page_count; 970 + page_index++) { 971 + sblock->pagev[page_index]->sblock = NULL; 972 + scrub_page_put(sblock->pagev[page_index]); 973 + } 1174 974 } 1175 975 kfree(sblocks_for_recheck); 1176 976 } ··· 1178 978 return 0; 1179 979 } 1180 980 1181 - static int scrub_setup_recheck_block(struct scrub_dev *sdev, 1182 - struct btrfs_mapping_tree *map_tree, 981 + static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 982 + struct btrfs_fs_info *fs_info, 983 + struct scrub_block *original_sblock, 1183 984 u64 length, u64 logical, 1184 985 struct scrub_block *sblocks_for_recheck) 1185 986 { ··· 1189 988 int ret; 1190 989 1191 990 /* 1192 - * note: the three members sdev, ref_count and outstanding_pages 991 + * note: the two members ref_count and outstanding_pages 1193 992 * are not used (and not set) in the blocks that are used for 1194 993 * the recheck procedure 1195 994 */ ··· 1204 1003 * with a length of PAGE_SIZE, each returned stripe 1205 1004 * represents one mirror 1206 1005 */ 1207 - ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, 1208 - &bbio, 0); 1006 + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, 1007 + &mapped_length, &bbio, 0); 1209 1008 if (ret || !bbio || mapped_length < sublen) { 1210 1009 kfree(bbio); 1211 1010 return -EIO; 1212 1011 } 1213 1012 1214 - 
BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); 1013 + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1215 1014 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1216 1015 mirror_index++) { 1217 1016 struct scrub_block *sblock; ··· 1221 1020 continue; 1222 1021 1223 1022 sblock = sblocks_for_recheck + mirror_index; 1224 - page = sblock->pagev + page_index; 1225 - page->logical = logical; 1226 - page->physical = bbio->stripes[mirror_index].physical; 1227 - /* for missing devices, dev->bdev is NULL */ 1228 - page->dev = bbio->stripes[mirror_index].dev; 1229 - page->mirror_num = mirror_index + 1; 1230 - page->page = alloc_page(GFP_NOFS); 1231 - if (!page->page) { 1232 - spin_lock(&sdev->stat_lock); 1233 - sdev->stat.malloc_errors++; 1234 - spin_unlock(&sdev->stat_lock); 1023 + sblock->sctx = sctx; 1024 + page = kzalloc(sizeof(*page), GFP_NOFS); 1025 + if (!page) { 1026 + leave_nomem: 1027 + spin_lock(&sctx->stat_lock); 1028 + sctx->stat.malloc_errors++; 1029 + spin_unlock(&sctx->stat_lock); 1235 1030 kfree(bbio); 1236 1031 return -ENOMEM; 1237 1032 } 1033 + scrub_page_get(page); 1034 + sblock->pagev[page_index] = page; 1035 + page->logical = logical; 1036 + page->physical = bbio->stripes[mirror_index].physical; 1037 + BUG_ON(page_index >= original_sblock->page_count); 1038 + page->physical_for_dev_replace = 1039 + original_sblock->pagev[page_index]-> 1040 + physical_for_dev_replace; 1041 + /* for missing devices, dev->bdev is NULL */ 1042 + page->dev = bbio->stripes[mirror_index].dev; 1043 + page->mirror_num = mirror_index + 1; 1238 1044 sblock->page_count++; 1045 + page->page = alloc_page(GFP_NOFS); 1046 + if (!page->page) 1047 + goto leave_nomem; 1239 1048 } 1240 1049 kfree(bbio); 1241 1050 length -= sublen; ··· 1263 1052 * to take those pages that are not errored from all the mirrors so that 1264 1053 * the pages that are errored in the just handled mirror can be repaired. 
1265 1054 */ 1266 - static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1267 - struct scrub_block *sblock, int is_metadata, 1268 - int have_csum, u8 *csum, u64 generation, 1269 - u16 csum_size) 1055 + static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1056 + struct scrub_block *sblock, int is_metadata, 1057 + int have_csum, u8 *csum, u64 generation, 1058 + u16 csum_size) 1270 1059 { 1271 1060 int page_num; 1272 1061 ··· 1276 1065 1277 1066 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1278 1067 struct bio *bio; 1279 - int ret; 1280 - struct scrub_page *page = sblock->pagev + page_num; 1068 + struct scrub_page *page = sblock->pagev[page_num]; 1281 1069 DECLARE_COMPLETION_ONSTACK(complete); 1282 1070 1283 1071 if (page->dev->bdev == NULL) { ··· 1285 1075 continue; 1286 1076 } 1287 1077 1288 - BUG_ON(!page->page); 1078 + WARN_ON(!page->page); 1289 1079 bio = bio_alloc(GFP_NOFS, 1); 1290 - if (!bio) 1291 - return -EIO; 1080 + if (!bio) { 1081 + page->io_error = 1; 1082 + sblock->no_io_error_seen = 0; 1083 + continue; 1084 + } 1292 1085 bio->bi_bdev = page->dev->bdev; 1293 1086 bio->bi_sector = page->physical >> 9; 1294 1087 bio->bi_end_io = scrub_complete_bio_end_io; 1295 1088 bio->bi_private = &complete; 1296 1089 1297 - ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1298 - if (PAGE_SIZE != ret) { 1299 - bio_put(bio); 1300 - return -EIO; 1301 - } 1090 + bio_add_page(bio, page->page, PAGE_SIZE, 0); 1302 1091 btrfsic_submit_bio(READ, bio); 1303 1092 1304 1093 /* this will also unplug the queue */ ··· 1314 1105 have_csum, csum, generation, 1315 1106 csum_size); 1316 1107 1317 - return 0; 1108 + return; 1318 1109 } 1319 1110 1320 1111 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, ··· 1329 1120 struct btrfs_root *root = fs_info->extent_root; 1330 1121 void *mapped_buffer; 1331 1122 1332 - BUG_ON(!sblock->pagev[0].page); 1123 + WARN_ON(!sblock->pagev[0]->page); 1333 1124 if (is_metadata) { 1334 1125 
struct btrfs_header *h; 1335 1126 1336 - mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1127 + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); 1337 1128 h = (struct btrfs_header *)mapped_buffer; 1338 1129 1339 - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1130 + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) || 1340 1131 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1341 1132 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1342 1133 BTRFS_UUID_SIZE)) { ··· 1350 1141 if (!have_csum) 1351 1142 return; 1352 1143 1353 - mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1144 + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); 1354 1145 } 1355 1146 1356 1147 for (page_num = 0;;) { ··· 1366 1157 page_num++; 1367 1158 if (page_num >= sblock->page_count) 1368 1159 break; 1369 - BUG_ON(!sblock->pagev[page_num].page); 1160 + WARN_ON(!sblock->pagev[page_num]->page); 1370 1161 1371 - mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1162 + mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page); 1372 1163 } 1373 1164 1374 1165 btrfs_csum_final(crc, calculated_csum); ··· 1406 1197 struct scrub_block *sblock_good, 1407 1198 int page_num, int force_write) 1408 1199 { 1409 - struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1410 - struct scrub_page *page_good = sblock_good->pagev + page_num; 1200 + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; 1201 + struct scrub_page *page_good = sblock_good->pagev[page_num]; 1411 1202 1412 - BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1413 - BUG_ON(sblock_good->pagev[page_num].page == NULL); 1203 + BUG_ON(page_bad->page == NULL); 1204 + BUG_ON(page_good->page == NULL); 1414 1205 if (force_write || sblock_bad->header_error || 1415 1206 sblock_bad->checksum_error || page_bad->io_error) { 1416 1207 struct bio *bio; 1417 1208 int ret; 1418 1209 DECLARE_COMPLETION_ONSTACK(complete); 1210 + 1211 + if (!page_bad->dev->bdev) { 1212 + printk_ratelimited(KERN_WARNING 
1213 + "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); 1214 + return -EIO; 1215 + } 1419 1216 1420 1217 bio = bio_alloc(GFP_NOFS, 1); 1421 1218 if (!bio) ··· 1443 1228 if (!bio_flagged(bio, BIO_UPTODATE)) { 1444 1229 btrfs_dev_stat_inc_and_print(page_bad->dev, 1445 1230 BTRFS_DEV_STAT_WRITE_ERRS); 1231 + btrfs_dev_replace_stats_inc( 1232 + &sblock_bad->sctx->dev_root->fs_info-> 1233 + dev_replace.num_write_errors); 1446 1234 bio_put(bio); 1447 1235 return -EIO; 1448 1236 } ··· 1455 1237 return 0; 1456 1238 } 1457 1239 1458 - static void scrub_checksum(struct scrub_block *sblock) 1240 + static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) 1241 + { 1242 + int page_num; 1243 + 1244 + for (page_num = 0; page_num < sblock->page_count; page_num++) { 1245 + int ret; 1246 + 1247 + ret = scrub_write_page_to_dev_replace(sblock, page_num); 1248 + if (ret) 1249 + btrfs_dev_replace_stats_inc( 1250 + &sblock->sctx->dev_root->fs_info->dev_replace. 1251 + num_write_errors); 1252 + } 1253 + } 1254 + 1255 + static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, 1256 + int page_num) 1257 + { 1258 + struct scrub_page *spage = sblock->pagev[page_num]; 1259 + 1260 + BUG_ON(spage->page == NULL); 1261 + if (spage->io_error) { 1262 + void *mapped_buffer = kmap_atomic(spage->page); 1263 + 1264 + memset(mapped_buffer, 0, PAGE_CACHE_SIZE); 1265 + flush_dcache_page(spage->page); 1266 + kunmap_atomic(mapped_buffer); 1267 + } 1268 + return scrub_add_page_to_wr_bio(sblock->sctx, spage); 1269 + } 1270 + 1271 + static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 1272 + struct scrub_page *spage) 1273 + { 1274 + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; 1275 + struct scrub_bio *sbio; 1276 + int ret; 1277 + 1278 + mutex_lock(&wr_ctx->wr_lock); 1279 + again: 1280 + if (!wr_ctx->wr_curr_bio) { 1281 + wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), 1282 + GFP_NOFS); 1283 + if (!wr_ctx->wr_curr_bio) { 1284 + 
mutex_unlock(&wr_ctx->wr_lock); 1285 + return -ENOMEM; 1286 + } 1287 + wr_ctx->wr_curr_bio->sctx = sctx; 1288 + wr_ctx->wr_curr_bio->page_count = 0; 1289 + } 1290 + sbio = wr_ctx->wr_curr_bio; 1291 + if (sbio->page_count == 0) { 1292 + struct bio *bio; 1293 + 1294 + sbio->physical = spage->physical_for_dev_replace; 1295 + sbio->logical = spage->logical; 1296 + sbio->dev = wr_ctx->tgtdev; 1297 + bio = sbio->bio; 1298 + if (!bio) { 1299 + bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); 1300 + if (!bio) { 1301 + mutex_unlock(&wr_ctx->wr_lock); 1302 + return -ENOMEM; 1303 + } 1304 + sbio->bio = bio; 1305 + } 1306 + 1307 + bio->bi_private = sbio; 1308 + bio->bi_end_io = scrub_wr_bio_end_io; 1309 + bio->bi_bdev = sbio->dev->bdev; 1310 + bio->bi_sector = sbio->physical >> 9; 1311 + sbio->err = 0; 1312 + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1313 + spage->physical_for_dev_replace || 1314 + sbio->logical + sbio->page_count * PAGE_SIZE != 1315 + spage->logical) { 1316 + scrub_wr_submit(sctx); 1317 + goto again; 1318 + } 1319 + 1320 + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); 1321 + if (ret != PAGE_SIZE) { 1322 + if (sbio->page_count < 1) { 1323 + bio_put(sbio->bio); 1324 + sbio->bio = NULL; 1325 + mutex_unlock(&wr_ctx->wr_lock); 1326 + return -EIO; 1327 + } 1328 + scrub_wr_submit(sctx); 1329 + goto again; 1330 + } 1331 + 1332 + sbio->pagev[sbio->page_count] = spage; 1333 + scrub_page_get(spage); 1334 + sbio->page_count++; 1335 + if (sbio->page_count == wr_ctx->pages_per_wr_bio) 1336 + scrub_wr_submit(sctx); 1337 + mutex_unlock(&wr_ctx->wr_lock); 1338 + 1339 + return 0; 1340 + } 1341 + 1342 + static void scrub_wr_submit(struct scrub_ctx *sctx) 1343 + { 1344 + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; 1345 + struct scrub_bio *sbio; 1346 + 1347 + if (!wr_ctx->wr_curr_bio) 1348 + return; 1349 + 1350 + sbio = wr_ctx->wr_curr_bio; 1351 + wr_ctx->wr_curr_bio = NULL; 1352 + WARN_ON(!sbio->bio->bi_bdev); 1353 + 
scrub_pending_bio_inc(sctx); 1354 + /* process all writes in a single worker thread. Then the block layer 1355 + * orders the requests before sending them to the driver which 1356 + * doubled the write performance on spinning disks when measured 1357 + * with Linux 3.5 */ 1358 + btrfsic_submit_bio(WRITE, sbio->bio); 1359 + } 1360 + 1361 + static void scrub_wr_bio_end_io(struct bio *bio, int err) 1362 + { 1363 + struct scrub_bio *sbio = bio->bi_private; 1364 + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; 1365 + 1366 + sbio->err = err; 1367 + sbio->bio = bio; 1368 + 1369 + sbio->work.func = scrub_wr_bio_end_io_worker; 1370 + btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); 1371 + } 1372 + 1373 + static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1374 + { 1375 + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 1376 + struct scrub_ctx *sctx = sbio->sctx; 1377 + int i; 1378 + 1379 + WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); 1380 + if (sbio->err) { 1381 + struct btrfs_dev_replace *dev_replace = 1382 + &sbio->sctx->dev_root->fs_info->dev_replace; 1383 + 1384 + for (i = 0; i < sbio->page_count; i++) { 1385 + struct scrub_page *spage = sbio->pagev[i]; 1386 + 1387 + spage->io_error = 1; 1388 + btrfs_dev_replace_stats_inc(&dev_replace-> 1389 + num_write_errors); 1390 + } 1391 + } 1392 + 1393 + for (i = 0; i < sbio->page_count; i++) 1394 + scrub_page_put(sbio->pagev[i]); 1395 + 1396 + bio_put(sbio->bio); 1397 + kfree(sbio); 1398 + scrub_pending_bio_dec(sctx); 1399 + } 1400 + 1401 + static int scrub_checksum(struct scrub_block *sblock) 1459 1402 { 1460 1403 u64 flags; 1461 1404 int ret; 1462 1405 1463 - BUG_ON(sblock->page_count < 1); 1464 - flags = sblock->pagev[0].flags; 1406 + WARN_ON(sblock->page_count < 1); 1407 + flags = sblock->pagev[0]->flags; 1465 1408 ret = 0; 1466 1409 if (flags & BTRFS_EXTENT_FLAG_DATA) 1467 1410 ret = scrub_checksum_data(sblock); ··· 1634 1255 WARN_ON(1); 1635 1256 
if (ret) 1636 1257 scrub_handle_errored_block(sblock); 1258 + 1259 + return ret; 1637 1260 } 1638 1261 1639 1262 static int scrub_checksum_data(struct scrub_block *sblock) 1640 1263 { 1641 - struct scrub_dev *sdev = sblock->sdev; 1264 + struct scrub_ctx *sctx = sblock->sctx; 1642 1265 u8 csum[BTRFS_CSUM_SIZE]; 1643 1266 u8 *on_disk_csum; 1644 1267 struct page *page; 1645 1268 void *buffer; 1646 1269 u32 crc = ~(u32)0; 1647 1270 int fail = 0; 1648 - struct btrfs_root *root = sdev->dev->dev_root; 1271 + struct btrfs_root *root = sctx->dev_root; 1649 1272 u64 len; 1650 1273 int index; 1651 1274 1652 1275 BUG_ON(sblock->page_count < 1); 1653 - if (!sblock->pagev[0].have_csum) 1276 + if (!sblock->pagev[0]->have_csum) 1654 1277 return 0; 1655 1278 1656 - on_disk_csum = sblock->pagev[0].csum; 1657 - page = sblock->pagev[0].page; 1279 + on_disk_csum = sblock->pagev[0]->csum; 1280 + page = sblock->pagev[0]->page; 1658 1281 buffer = kmap_atomic(page); 1659 1282 1660 - len = sdev->sectorsize; 1283 + len = sctx->sectorsize; 1661 1284 index = 0; 1662 1285 for (;;) { 1663 1286 u64 l = min_t(u64, len, PAGE_SIZE); ··· 1671 1290 break; 1672 1291 index++; 1673 1292 BUG_ON(index >= sblock->page_count); 1674 - BUG_ON(!sblock->pagev[index].page); 1675 - page = sblock->pagev[index].page; 1293 + BUG_ON(!sblock->pagev[index]->page); 1294 + page = sblock->pagev[index]->page; 1676 1295 buffer = kmap_atomic(page); 1677 1296 } 1678 1297 1679 1298 btrfs_csum_final(crc, csum); 1680 - if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1299 + if (memcmp(csum, on_disk_csum, sctx->csum_size)) 1681 1300 fail = 1; 1682 1301 1683 1302 return fail; ··· 1685 1304 1686 1305 static int scrub_checksum_tree_block(struct scrub_block *sblock) 1687 1306 { 1688 - struct scrub_dev *sdev = sblock->sdev; 1307 + struct scrub_ctx *sctx = sblock->sctx; 1689 1308 struct btrfs_header *h; 1690 - struct btrfs_root *root = sdev->dev->dev_root; 1309 + struct btrfs_root *root = sctx->dev_root; 1691 1310 struct btrfs_fs_info 
*fs_info = root->fs_info; 1692 1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1693 1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; ··· 1702 1321 int index; 1703 1322 1704 1323 BUG_ON(sblock->page_count < 1); 1705 - page = sblock->pagev[0].page; 1324 + page = sblock->pagev[0]->page; 1706 1325 mapped_buffer = kmap_atomic(page); 1707 1326 h = (struct btrfs_header *)mapped_buffer; 1708 - memcpy(on_disk_csum, h->csum, sdev->csum_size); 1327 + memcpy(on_disk_csum, h->csum, sctx->csum_size); 1709 1328 1710 1329 /* 1711 1330 * we don't use the getter functions here, as we ··· 1713 1332 * b) the page is already kmapped 1714 1333 */ 1715 1334 1716 - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1335 + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr)) 1717 1336 ++fail; 1718 1337 1719 - if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1338 + if (sblock->pagev[0]->generation != le64_to_cpu(h->generation)) 1720 1339 ++fail; 1721 1340 1722 1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) ··· 1726 1345 BTRFS_UUID_SIZE)) 1727 1346 ++fail; 1728 1347 1729 - BUG_ON(sdev->nodesize != sdev->leafsize); 1730 - len = sdev->nodesize - BTRFS_CSUM_SIZE; 1348 + WARN_ON(sctx->nodesize != sctx->leafsize); 1349 + len = sctx->nodesize - BTRFS_CSUM_SIZE; 1731 1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1732 1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1733 1352 index = 0; ··· 1741 1360 break; 1742 1361 index++; 1743 1362 BUG_ON(index >= sblock->page_count); 1744 - BUG_ON(!sblock->pagev[index].page); 1745 - page = sblock->pagev[index].page; 1363 + BUG_ON(!sblock->pagev[index]->page); 1364 + page = sblock->pagev[index]->page; 1746 1365 mapped_buffer = kmap_atomic(page); 1747 1366 mapped_size = PAGE_SIZE; 1748 1367 p = mapped_buffer; 1749 1368 } 1750 1369 1751 1370 btrfs_csum_final(crc, calculated_csum); 1752 - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1371 + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1753 1372 ++crc_fail; 1754 1373 
1755 1374 return fail || crc_fail; ··· 1758 1377 static int scrub_checksum_super(struct scrub_block *sblock) 1759 1378 { 1760 1379 struct btrfs_super_block *s; 1761 - struct scrub_dev *sdev = sblock->sdev; 1762 - struct btrfs_root *root = sdev->dev->dev_root; 1380 + struct scrub_ctx *sctx = sblock->sctx; 1381 + struct btrfs_root *root = sctx->dev_root; 1763 1382 struct btrfs_fs_info *fs_info = root->fs_info; 1764 1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1765 1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; ··· 1774 1393 int index; 1775 1394 1776 1395 BUG_ON(sblock->page_count < 1); 1777 - page = sblock->pagev[0].page; 1396 + page = sblock->pagev[0]->page; 1778 1397 mapped_buffer = kmap_atomic(page); 1779 1398 s = (struct btrfs_super_block *)mapped_buffer; 1780 - memcpy(on_disk_csum, s->csum, sdev->csum_size); 1399 + memcpy(on_disk_csum, s->csum, sctx->csum_size); 1781 1400 1782 - if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1401 + if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr)) 1783 1402 ++fail_cor; 1784 1403 1785 - if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1404 + if (sblock->pagev[0]->generation != le64_to_cpu(s->generation)) 1786 1405 ++fail_gen; 1787 1406 1788 1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) ··· 1802 1421 break; 1803 1422 index++; 1804 1423 BUG_ON(index >= sblock->page_count); 1805 - BUG_ON(!sblock->pagev[index].page); 1806 - page = sblock->pagev[index].page; 1424 + BUG_ON(!sblock->pagev[index]->page); 1425 + page = sblock->pagev[index]->page; 1807 1426 mapped_buffer = kmap_atomic(page); 1808 1427 mapped_size = PAGE_SIZE; 1809 1428 p = mapped_buffer; 1810 1429 } 1811 1430 1812 1431 btrfs_csum_final(crc, calculated_csum); 1813 - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1432 + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1814 1433 ++fail_cor; 1815 1434 1816 1435 if (fail_cor + fail_gen) { ··· 1819 1438 * They will get written with the next transaction commit 1820 1439 * 
anyway 1821 1440 */ 1822 - spin_lock(&sdev->stat_lock); 1823 - ++sdev->stat.super_errors; 1824 - spin_unlock(&sdev->stat_lock); 1441 + spin_lock(&sctx->stat_lock); 1442 + ++sctx->stat.super_errors; 1443 + spin_unlock(&sctx->stat_lock); 1825 1444 if (fail_cor) 1826 - btrfs_dev_stat_inc_and_print(sdev->dev, 1445 + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, 1827 1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1828 1447 else 1829 - btrfs_dev_stat_inc_and_print(sdev->dev, 1448 + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, 1830 1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1831 1450 } 1832 1451 ··· 1844 1463 int i; 1845 1464 1846 1465 for (i = 0; i < sblock->page_count; i++) 1847 - if (sblock->pagev[i].page) 1848 - __free_page(sblock->pagev[i].page); 1466 + scrub_page_put(sblock->pagev[i]); 1849 1467 kfree(sblock); 1850 1468 } 1851 1469 } 1852 1470 1853 - static void scrub_submit(struct scrub_dev *sdev) 1471 + static void scrub_page_get(struct scrub_page *spage) 1472 + { 1473 + atomic_inc(&spage->ref_count); 1474 + } 1475 + 1476 + static void scrub_page_put(struct scrub_page *spage) 1477 + { 1478 + if (atomic_dec_and_test(&spage->ref_count)) { 1479 + if (spage->page) 1480 + __free_page(spage->page); 1481 + kfree(spage); 1482 + } 1483 + } 1484 + 1485 + static void scrub_submit(struct scrub_ctx *sctx) 1854 1486 { 1855 1487 struct scrub_bio *sbio; 1856 1488 1857 - if (sdev->curr == -1) 1489 + if (sctx->curr == -1) 1858 1490 return; 1859 1491 1860 - sbio = sdev->bios[sdev->curr]; 1861 - sdev->curr = -1; 1862 - atomic_inc(&sdev->in_flight); 1492 + sbio = sctx->bios[sctx->curr]; 1493 + sctx->curr = -1; 1494 + scrub_pending_bio_inc(sctx); 1863 1495 1864 - btrfsic_submit_bio(READ, sbio->bio); 1496 + if (!sbio->bio->bi_bdev) { 1497 + /* 1498 + * this case should not happen. If btrfs_map_block() is 1499 + * wrong, it could happen for dev-replace operations on 1500 + * missing devices when no mirrors are available, but in 1501 + * this case it should already fail the mount. 
1502 + * This case is handled correctly (but _very_ slowly). 1503 + */ 1504 + printk_ratelimited(KERN_WARNING 1505 + "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); 1506 + bio_endio(sbio->bio, -EIO); 1507 + } else { 1508 + btrfsic_submit_bio(READ, sbio->bio); 1509 + } 1865 1510 } 1866 1511 1867 - static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1868 - struct scrub_page *spage) 1512 + static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 1513 + struct scrub_page *spage) 1869 1514 { 1870 1515 struct scrub_block *sblock = spage->sblock; 1871 1516 struct scrub_bio *sbio; ··· 1901 1494 /* 1902 1495 * grab a fresh bio or wait for one to become available 1903 1496 */ 1904 - while (sdev->curr == -1) { 1905 - spin_lock(&sdev->list_lock); 1906 - sdev->curr = sdev->first_free; 1907 - if (sdev->curr != -1) { 1908 - sdev->first_free = sdev->bios[sdev->curr]->next_free; 1909 - sdev->bios[sdev->curr]->next_free = -1; 1910 - sdev->bios[sdev->curr]->page_count = 0; 1911 - spin_unlock(&sdev->list_lock); 1497 + while (sctx->curr == -1) { 1498 + spin_lock(&sctx->list_lock); 1499 + sctx->curr = sctx->first_free; 1500 + if (sctx->curr != -1) { 1501 + sctx->first_free = sctx->bios[sctx->curr]->next_free; 1502 + sctx->bios[sctx->curr]->next_free = -1; 1503 + sctx->bios[sctx->curr]->page_count = 0; 1504 + spin_unlock(&sctx->list_lock); 1912 1505 } else { 1913 - spin_unlock(&sdev->list_lock); 1914 - wait_event(sdev->list_wait, sdev->first_free != -1); 1506 + spin_unlock(&sctx->list_lock); 1507 + wait_event(sctx->list_wait, sctx->first_free != -1); 1915 1508 } 1916 1509 } 1917 - sbio = sdev->bios[sdev->curr]; 1510 + sbio = sctx->bios[sctx->curr]; 1918 1511 if (sbio->page_count == 0) { 1919 1512 struct bio *bio; 1920 1513 1921 1514 sbio->physical = spage->physical; 1922 1515 sbio->logical = spage->logical; 1516 + sbio->dev = spage->dev; 1923 1517 bio = sbio->bio; 1924 1518 if (!bio) { 1925 - bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1519 + bio = 
bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); 1926 1520 if (!bio) 1927 1521 return -ENOMEM; 1928 1522 sbio->bio = bio; ··· 1931 1523 1932 1524 bio->bi_private = sbio; 1933 1525 bio->bi_end_io = scrub_bio_end_io; 1934 - bio->bi_bdev = sdev->dev->bdev; 1935 - bio->bi_sector = spage->physical >> 9; 1526 + bio->bi_bdev = sbio->dev->bdev; 1527 + bio->bi_sector = sbio->physical >> 9; 1936 1528 sbio->err = 0; 1937 1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1938 1530 spage->physical || 1939 1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1940 - spage->logical) { 1941 - scrub_submit(sdev); 1532 + spage->logical || 1533 + sbio->dev != spage->dev) { 1534 + scrub_submit(sctx); 1942 1535 goto again; 1943 1536 } 1944 1537 ··· 1951 1542 sbio->bio = NULL; 1952 1543 return -EIO; 1953 1544 } 1954 - scrub_submit(sdev); 1545 + scrub_submit(sctx); 1955 1546 goto again; 1956 1547 } 1957 1548 1958 - scrub_block_get(sblock); /* one for the added page */ 1549 + scrub_block_get(sblock); /* one for the page added to the bio */ 1959 1550 atomic_inc(&sblock->outstanding_pages); 1960 1551 sbio->page_count++; 1961 - if (sbio->page_count == sdev->pages_per_bio) 1962 - scrub_submit(sdev); 1552 + if (sbio->page_count == sctx->pages_per_rd_bio) 1553 + scrub_submit(sctx); 1963 1554 1964 1555 return 0; 1965 1556 } 1966 1557 1967 - static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1968 - u64 physical, u64 flags, u64 gen, int mirror_num, 1969 - u8 *csum, int force) 1558 + static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 1559 + u64 physical, struct btrfs_device *dev, u64 flags, 1560 + u64 gen, int mirror_num, u8 *csum, int force, 1561 + u64 physical_for_dev_replace) 1970 1562 { 1971 1563 struct scrub_block *sblock; 1972 1564 int index; 1973 1565 1974 1566 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1975 1567 if (!sblock) { 1976 - spin_lock(&sdev->stat_lock); 1977 - sdev->stat.malloc_errors++; 1978 - spin_unlock(&sdev->stat_lock); 1568 + 
spin_lock(&sctx->stat_lock); 1569 + sctx->stat.malloc_errors++; 1570 + spin_unlock(&sctx->stat_lock); 1979 1571 return -ENOMEM; 1980 1572 } 1981 1573 1982 - /* one ref inside this function, plus one for each page later on */ 1574 + /* one ref inside this function, plus one for each page added to 1575 + * a bio later on */ 1983 1576 atomic_set(&sblock->ref_count, 1); 1984 - sblock->sdev = sdev; 1577 + sblock->sctx = sctx; 1985 1578 sblock->no_io_error_seen = 1; 1986 1579 1987 1580 for (index = 0; len > 0; index++) { 1988 - struct scrub_page *spage = sblock->pagev + index; 1581 + struct scrub_page *spage; 1989 1582 u64 l = min_t(u64, len, PAGE_SIZE); 1990 1583 1991 - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1992 - spage->page = alloc_page(GFP_NOFS); 1993 - if (!spage->page) { 1994 - spin_lock(&sdev->stat_lock); 1995 - sdev->stat.malloc_errors++; 1996 - spin_unlock(&sdev->stat_lock); 1997 - while (index > 0) { 1998 - index--; 1999 - __free_page(sblock->pagev[index].page); 2000 - } 2001 - kfree(sblock); 1584 + spage = kzalloc(sizeof(*spage), GFP_NOFS); 1585 + if (!spage) { 1586 + leave_nomem: 1587 + spin_lock(&sctx->stat_lock); 1588 + sctx->stat.malloc_errors++; 1589 + spin_unlock(&sctx->stat_lock); 1590 + scrub_block_put(sblock); 2002 1591 return -ENOMEM; 2003 1592 } 1593 + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1594 + scrub_page_get(spage); 1595 + sblock->pagev[index] = spage; 2004 1596 spage->sblock = sblock; 2005 - spage->dev = sdev->dev; 1597 + spage->dev = dev; 2006 1598 spage->flags = flags; 2007 1599 spage->generation = gen; 2008 1600 spage->logical = logical; 2009 1601 spage->physical = physical; 1602 + spage->physical_for_dev_replace = physical_for_dev_replace; 2010 1603 spage->mirror_num = mirror_num; 2011 1604 if (csum) { 2012 1605 spage->have_csum = 1; 2013 - memcpy(spage->csum, csum, sdev->csum_size); 1606 + memcpy(spage->csum, csum, sctx->csum_size); 2014 1607 } else { 2015 1608 spage->have_csum = 0; 2016 1609 } 2017 1610 sblock->page_count++; 
1611 + spage->page = alloc_page(GFP_NOFS); 1612 + if (!spage->page) 1613 + goto leave_nomem; 2018 1614 len -= l; 2019 1615 logical += l; 2020 1616 physical += l; 1617 + physical_for_dev_replace += l; 2021 1618 } 2022 1619 2023 - BUG_ON(sblock->page_count == 0); 1620 + WARN_ON(sblock->page_count == 0); 2024 1621 for (index = 0; index < sblock->page_count; index++) { 2025 - struct scrub_page *spage = sblock->pagev + index; 1622 + struct scrub_page *spage = sblock->pagev[index]; 2026 1623 int ret; 2027 1624 2028 - ret = scrub_add_page_to_bio(sdev, spage); 1625 + ret = scrub_add_page_to_rd_bio(sctx, spage); 2029 1626 if (ret) { 2030 1627 scrub_block_put(sblock); 2031 1628 return ret; ··· 2039 1624 } 2040 1625 2041 1626 if (force) 2042 - scrub_submit(sdev); 1627 + scrub_submit(sctx); 2043 1628 2044 1629 /* last one frees, either here or in bio completion for last page */ 2045 1630 scrub_block_put(sblock); ··· 2049 1634 static void scrub_bio_end_io(struct bio *bio, int err) 2050 1635 { 2051 1636 struct scrub_bio *sbio = bio->bi_private; 2052 - struct scrub_dev *sdev = sbio->sdev; 2053 - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 1637 + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; 2054 1638 2055 1639 sbio->err = err; 2056 1640 sbio->bio = bio; ··· 2060 1646 static void scrub_bio_end_io_worker(struct btrfs_work *work) 2061 1647 { 2062 1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2063 - struct scrub_dev *sdev = sbio->sdev; 1649 + struct scrub_ctx *sctx = sbio->sctx; 2064 1650 int i; 2065 1651 2066 - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 1652 + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); 2067 1653 if (sbio->err) { 2068 1654 for (i = 0; i < sbio->page_count; i++) { 2069 1655 struct scrub_page *spage = sbio->pagev[i]; ··· 2085 1671 2086 1672 bio_put(sbio->bio); 2087 1673 sbio->bio = NULL; 2088 - spin_lock(&sdev->list_lock); 2089 - sbio->next_free = sdev->first_free; 2090 - sdev->first_free = 
sbio->index; 2091 - spin_unlock(&sdev->list_lock); 2092 - atomic_dec(&sdev->in_flight); 2093 - wake_up(&sdev->list_wait); 1674 + spin_lock(&sctx->list_lock); 1675 + sbio->next_free = sctx->first_free; 1676 + sctx->first_free = sbio->index; 1677 + spin_unlock(&sctx->list_lock); 1678 + 1679 + if (sctx->is_dev_replace && 1680 + atomic_read(&sctx->wr_ctx.flush_all_writes)) { 1681 + mutex_lock(&sctx->wr_ctx.wr_lock); 1682 + scrub_wr_submit(sctx); 1683 + mutex_unlock(&sctx->wr_ctx.wr_lock); 1684 + } 1685 + 1686 + scrub_pending_bio_dec(sctx); 2094 1687 } 2095 1688 2096 1689 static void scrub_block_complete(struct scrub_block *sblock) 2097 1690 { 2098 - if (!sblock->no_io_error_seen) 1691 + if (!sblock->no_io_error_seen) { 2099 1692 scrub_handle_errored_block(sblock); 2100 - else 2101 - scrub_checksum(sblock); 1693 + } else { 1694 + /* 1695 + * if has checksum error, write via repair mechanism in 1696 + * dev replace case, otherwise write here in dev replace 1697 + * case. 1698 + */ 1699 + if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) 1700 + scrub_write_block_to_dev_replace(sblock); 1701 + } 2102 1702 } 2103 1703 2104 - static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 1704 + static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 2105 1705 u8 *csum) 2106 1706 { 2107 1707 struct btrfs_ordered_sum *sum = NULL; ··· 2123 1695 unsigned long i; 2124 1696 unsigned long num_sectors; 2125 1697 2126 - while (!list_empty(&sdev->csum_list)) { 2127 - sum = list_first_entry(&sdev->csum_list, 1698 + while (!list_empty(&sctx->csum_list)) { 1699 + sum = list_first_entry(&sctx->csum_list, 2128 1700 struct btrfs_ordered_sum, list); 2129 1701 if (sum->bytenr > logical) 2130 1702 return 0; 2131 1703 if (sum->bytenr + sum->len > logical) 2132 1704 break; 2133 1705 2134 - ++sdev->stat.csum_discards; 1706 + ++sctx->stat.csum_discards; 2135 1707 list_del(&sum->list); 2136 1708 kfree(sum); 2137 1709 sum = NULL; ··· 2139 1711 if (!sum) 2140 
1712 return 0; 2141 1713 2142 - num_sectors = sum->len / sdev->sectorsize; 1714 + num_sectors = sum->len / sctx->sectorsize; 2143 1715 for (i = 0; i < num_sectors; ++i) { 2144 1716 if (sum->sums[i].bytenr == logical) { 2145 - memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 1717 + memcpy(csum, &sum->sums[i].sum, sctx->csum_size); 2146 1718 ret = 1; 2147 1719 break; 2148 1720 } ··· 2155 1727 } 2156 1728 2157 1729 /* scrub extent tries to collect up to 64 kB for each bio */ 2158 - static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2159 - u64 physical, u64 flags, u64 gen, int mirror_num) 1730 + static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, 1731 + u64 physical, struct btrfs_device *dev, u64 flags, 1732 + u64 gen, int mirror_num, u64 physical_for_dev_replace) 2160 1733 { 2161 1734 int ret; 2162 1735 u8 csum[BTRFS_CSUM_SIZE]; 2163 1736 u32 blocksize; 2164 1737 2165 1738 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2166 - blocksize = sdev->sectorsize; 2167 - spin_lock(&sdev->stat_lock); 2168 - sdev->stat.data_extents_scrubbed++; 2169 - sdev->stat.data_bytes_scrubbed += len; 2170 - spin_unlock(&sdev->stat_lock); 1739 + blocksize = sctx->sectorsize; 1740 + spin_lock(&sctx->stat_lock); 1741 + sctx->stat.data_extents_scrubbed++; 1742 + sctx->stat.data_bytes_scrubbed += len; 1743 + spin_unlock(&sctx->stat_lock); 2171 1744 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2172 - BUG_ON(sdev->nodesize != sdev->leafsize); 2173 - blocksize = sdev->nodesize; 2174 - spin_lock(&sdev->stat_lock); 2175 - sdev->stat.tree_extents_scrubbed++; 2176 - sdev->stat.tree_bytes_scrubbed += len; 2177 - spin_unlock(&sdev->stat_lock); 1745 + WARN_ON(sctx->nodesize != sctx->leafsize); 1746 + blocksize = sctx->nodesize; 1747 + spin_lock(&sctx->stat_lock); 1748 + sctx->stat.tree_extents_scrubbed++; 1749 + sctx->stat.tree_bytes_scrubbed += len; 1750 + spin_unlock(&sctx->stat_lock); 2178 1751 } else { 2179 - blocksize = sdev->sectorsize; 2180 - BUG_ON(1); 1752 + 
blocksize = sctx->sectorsize; 1753 + WARN_ON(1); 2181 1754 } 2182 1755 2183 1756 while (len) { ··· 2187 1758 2188 1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2189 1760 /* push csums to sbio */ 2190 - have_csum = scrub_find_csum(sdev, logical, l, csum); 1761 + have_csum = scrub_find_csum(sctx, logical, l, csum); 2191 1762 if (have_csum == 0) 2192 - ++sdev->stat.no_csum; 1763 + ++sctx->stat.no_csum; 1764 + if (sctx->is_dev_replace && !have_csum) { 1765 + ret = copy_nocow_pages(sctx, logical, l, 1766 + mirror_num, 1767 + physical_for_dev_replace); 1768 + goto behind_scrub_pages; 1769 + } 2193 1770 } 2194 - ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2195 - mirror_num, have_csum ? csum : NULL, 0); 1771 + ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, 1772 + mirror_num, have_csum ? csum : NULL, 0, 1773 + physical_for_dev_replace); 1774 + behind_scrub_pages: 2196 1775 if (ret) 2197 1776 return ret; 2198 1777 len -= l; 2199 1778 logical += l; 2200 1779 physical += l; 1780 + physical_for_dev_replace += l; 2201 1781 } 2202 1782 return 0; 2203 1783 } 2204 1784 2205 - static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2206 - struct map_lookup *map, int num, u64 base, u64 length) 1785 + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 1786 + struct map_lookup *map, 1787 + struct btrfs_device *scrub_dev, 1788 + int num, u64 base, u64 length, 1789 + int is_dev_replace) 2207 1790 { 2208 1791 struct btrfs_path *path; 2209 - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 1792 + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2210 1793 struct btrfs_root *root = fs_info->extent_root; 2211 1794 struct btrfs_root *csum_root = fs_info->csum_root; 2212 1795 struct btrfs_extent_item *extent; ··· 2238 1797 struct reada_control *reada2; 2239 1798 struct btrfs_key key_start; 2240 1799 struct btrfs_key key_end; 2241 - 2242 1800 u64 increment = map->stripe_len; 2243 1801 u64 offset; 1802 + u64 
extent_logical; 1803 + u64 extent_physical; 1804 + u64 extent_len; 1805 + struct btrfs_device *extent_dev; 1806 + int extent_mirror_num; 2244 1807 2245 1808 nstripes = length; 2246 1809 offset = 0; ··· 2288 1843 */ 2289 1844 logical = base + offset; 2290 1845 2291 - wait_event(sdev->list_wait, 2292 - atomic_read(&sdev->in_flight) == 0); 1846 + wait_event(sctx->list_wait, 1847 + atomic_read(&sctx->bios_in_flight) == 0); 2293 1848 atomic_inc(&fs_info->scrubs_paused); 2294 1849 wake_up(&fs_info->scrub_pause_wait); 2295 1850 ··· 2343 1898 * canceled? 2344 1899 */ 2345 1900 if (atomic_read(&fs_info->scrub_cancel_req) || 2346 - atomic_read(&sdev->cancel_req)) { 1901 + atomic_read(&sctx->cancel_req)) { 2347 1902 ret = -ECANCELED; 2348 1903 goto out; 2349 1904 } ··· 2352 1907 */ 2353 1908 if (atomic_read(&fs_info->scrub_pause_req)) { 2354 1909 /* push queued extents */ 2355 - scrub_submit(sdev); 2356 - wait_event(sdev->list_wait, 2357 - atomic_read(&sdev->in_flight) == 0); 1910 + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 1911 + scrub_submit(sctx); 1912 + mutex_lock(&sctx->wr_ctx.wr_lock); 1913 + scrub_wr_submit(sctx); 1914 + mutex_unlock(&sctx->wr_ctx.wr_lock); 1915 + wait_event(sctx->list_wait, 1916 + atomic_read(&sctx->bios_in_flight) == 0); 1917 + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2358 1918 atomic_inc(&fs_info->scrubs_paused); 2359 1919 wake_up(&fs_info->scrub_pause_wait); 2360 1920 mutex_lock(&fs_info->scrub_lock); ··· 2376 1926 2377 1927 ret = btrfs_lookup_csums_range(csum_root, logical, 2378 1928 logical + map->stripe_len - 1, 2379 - &sdev->csum_list, 1); 1929 + &sctx->csum_list, 1); 2380 1930 if (ret) 2381 1931 goto out; 2382 1932 ··· 2454 2004 key.objectid; 2455 2005 } 2456 2006 2457 - ret = scrub_extent(sdev, key.objectid, key.offset, 2458 - key.objectid - logical + physical, 2459 - flags, generation, mirror_num); 2007 + extent_logical = key.objectid; 2008 + extent_physical = key.objectid - logical + physical; 2009 + extent_len = key.offset; 2010 
+ extent_dev = scrub_dev; 2011 + extent_mirror_num = mirror_num; 2012 + if (is_dev_replace) 2013 + scrub_remap_extent(fs_info, extent_logical, 2014 + extent_len, &extent_physical, 2015 + &extent_dev, 2016 + &extent_mirror_num); 2017 + ret = scrub_extent(sctx, extent_logical, extent_len, 2018 + extent_physical, extent_dev, flags, 2019 + generation, extent_mirror_num, 2020 + key.objectid - logical + physical); 2460 2021 if (ret) 2461 2022 goto out; 2462 2023 ··· 2477 2016 btrfs_release_path(path); 2478 2017 logical += increment; 2479 2018 physical += map->stripe_len; 2480 - spin_lock(&sdev->stat_lock); 2481 - sdev->stat.last_physical = physical; 2482 - spin_unlock(&sdev->stat_lock); 2019 + spin_lock(&sctx->stat_lock); 2020 + sctx->stat.last_physical = physical; 2021 + spin_unlock(&sctx->stat_lock); 2483 2022 } 2484 - /* push queued extents */ 2485 - scrub_submit(sdev); 2486 - 2487 2023 out: 2024 + /* push queued extents */ 2025 + scrub_submit(sctx); 2026 + mutex_lock(&sctx->wr_ctx.wr_lock); 2027 + scrub_wr_submit(sctx); 2028 + mutex_unlock(&sctx->wr_ctx.wr_lock); 2029 + 2488 2030 blk_finish_plug(&plug); 2489 2031 btrfs_free_path(path); 2490 2032 return ret < 0 ? 
ret : 0; 2491 2033 } 2492 2034 2493 - static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2494 - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2495 - u64 dev_offset) 2035 + static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 2036 + struct btrfs_device *scrub_dev, 2037 + u64 chunk_tree, u64 chunk_objectid, 2038 + u64 chunk_offset, u64 length, 2039 + u64 dev_offset, int is_dev_replace) 2496 2040 { 2497 2041 struct btrfs_mapping_tree *map_tree = 2498 - &sdev->dev->dev_root->fs_info->mapping_tree; 2042 + &sctx->dev_root->fs_info->mapping_tree; 2499 2043 struct map_lookup *map; 2500 2044 struct extent_map *em; 2501 2045 int i; 2502 - int ret = -EINVAL; 2046 + int ret = 0; 2503 2047 2504 2048 read_lock(&map_tree->map_tree.lock); 2505 2049 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); ··· 2521 2055 goto out; 2522 2056 2523 2057 for (i = 0; i < map->num_stripes; ++i) { 2524 - if (map->stripes[i].dev == sdev->dev && 2058 + if (map->stripes[i].dev->bdev == scrub_dev->bdev && 2525 2059 map->stripes[i].physical == dev_offset) { 2526 - ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2060 + ret = scrub_stripe(sctx, map, scrub_dev, i, 2061 + chunk_offset, length, 2062 + is_dev_replace); 2527 2063 if (ret) 2528 2064 goto out; 2529 2065 } ··· 2537 2069 } 2538 2070 2539 2071 static noinline_for_stack 2540 - int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2072 + int scrub_enumerate_chunks(struct scrub_ctx *sctx, 2073 + struct btrfs_device *scrub_dev, u64 start, u64 end, 2074 + int is_dev_replace) 2541 2075 { 2542 2076 struct btrfs_dev_extent *dev_extent = NULL; 2543 2077 struct btrfs_path *path; 2544 - struct btrfs_root *root = sdev->dev->dev_root; 2078 + struct btrfs_root *root = sctx->dev_root; 2545 2079 struct btrfs_fs_info *fs_info = root->fs_info; 2546 2080 u64 length; 2547 2081 u64 chunk_tree; ··· 2555 2085 struct btrfs_key key; 2556 2086 struct btrfs_key found_key; 2557 2087 
struct btrfs_block_group_cache *cache; 2088 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 2558 2089 2559 2090 path = btrfs_alloc_path(); 2560 2091 if (!path) ··· 2565 2094 path->search_commit_root = 1; 2566 2095 path->skip_locking = 1; 2567 2096 2568 - key.objectid = sdev->dev->devid; 2097 + key.objectid = scrub_dev->devid; 2569 2098 key.offset = 0ull; 2570 2099 key.type = BTRFS_DEV_EXTENT_KEY; 2571 - 2572 2100 2573 2101 while (1) { 2574 2102 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ··· 2587 2117 2588 2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2589 2119 2590 - if (found_key.objectid != sdev->dev->devid) 2120 + if (found_key.objectid != scrub_dev->devid) 2591 2121 break; 2592 2122 2593 2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) ··· 2621 2151 ret = -ENOENT; 2622 2152 break; 2623 2153 } 2624 - ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2625 - chunk_offset, length, found_key.offset); 2154 + dev_replace->cursor_right = found_key.offset + length; 2155 + dev_replace->cursor_left = found_key.offset; 2156 + dev_replace->item_needs_writeback = 1; 2157 + ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, 2158 + chunk_offset, length, found_key.offset, 2159 + is_dev_replace); 2160 + 2161 + /* 2162 + * flush, submit all pending read and write bios, afterwards 2163 + * wait for them. 2164 + * Note that in the dev replace case, a read request causes 2165 + * write requests that are submitted in the read completion 2166 + * worker. Therefore in the current situation, it is required 2167 + * that all write requests are flushed, so that all read and 2168 + * write requests are really completed when bios_in_flight 2169 + * changes to 0. 
2170 + */ 2171 + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 2172 + scrub_submit(sctx); 2173 + mutex_lock(&sctx->wr_ctx.wr_lock); 2174 + scrub_wr_submit(sctx); 2175 + mutex_unlock(&sctx->wr_ctx.wr_lock); 2176 + 2177 + wait_event(sctx->list_wait, 2178 + atomic_read(&sctx->bios_in_flight) == 0); 2179 + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2180 + atomic_inc(&fs_info->scrubs_paused); 2181 + wake_up(&fs_info->scrub_pause_wait); 2182 + wait_event(sctx->list_wait, 2183 + atomic_read(&sctx->workers_pending) == 0); 2184 + 2185 + mutex_lock(&fs_info->scrub_lock); 2186 + while (atomic_read(&fs_info->scrub_pause_req)) { 2187 + mutex_unlock(&fs_info->scrub_lock); 2188 + wait_event(fs_info->scrub_pause_wait, 2189 + atomic_read(&fs_info->scrub_pause_req) == 0); 2190 + mutex_lock(&fs_info->scrub_lock); 2191 + } 2192 + atomic_dec(&fs_info->scrubs_paused); 2193 + mutex_unlock(&fs_info->scrub_lock); 2194 + wake_up(&fs_info->scrub_pause_wait); 2195 + 2196 + dev_replace->cursor_left = dev_replace->cursor_right; 2197 + dev_replace->item_needs_writeback = 1; 2626 2198 btrfs_put_block_group(cache); 2627 2199 if (ret) 2628 2200 break; 2201 + if (is_dev_replace && 2202 + atomic64_read(&dev_replace->num_write_errors) > 0) { 2203 + ret = -EIO; 2204 + break; 2205 + } 2206 + if (sctx->stat.malloc_errors > 0) { 2207 + ret = -ENOMEM; 2208 + break; 2209 + } 2629 2210 2630 2211 key.offset = found_key.offset + length; 2631 2212 btrfs_release_path(path); ··· 2691 2170 return ret < 0 ? 
ret : 0; 2692 2171 } 2693 2172 2694 - static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2173 + static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 2174 + struct btrfs_device *scrub_dev) 2695 2175 { 2696 2176 int i; 2697 2177 u64 bytenr; 2698 2178 u64 gen; 2699 2179 int ret; 2700 - struct btrfs_device *device = sdev->dev; 2701 - struct btrfs_root *root = device->dev_root; 2180 + struct btrfs_root *root = sctx->dev_root; 2702 2181 2703 2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2704 2183 return -EIO; ··· 2707 2186 2708 2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2709 2188 bytenr = btrfs_sb_offset(i); 2710 - if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2189 + if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) 2711 2190 break; 2712 2191 2713 - ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2714 - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2192 + ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2193 + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, 2194 + NULL, 1, bytenr); 2715 2195 if (ret) 2716 2196 return ret; 2717 2197 } 2718 - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2198 + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 2719 2199 2720 2200 return 0; 2721 2201 } ··· 2724 2202 /* 2725 2203 * get a reference count on fs_info->scrub_workers. 
start worker if necessary 2726 2204 */ 2727 - static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2205 + static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, 2206 + int is_dev_replace) 2728 2207 { 2729 - struct btrfs_fs_info *fs_info = root->fs_info; 2730 2208 int ret = 0; 2731 2209 2732 2210 mutex_lock(&fs_info->scrub_lock); 2733 2211 if (fs_info->scrub_workers_refcnt == 0) { 2734 - btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2735 - fs_info->thread_pool_size, &fs_info->generic_worker); 2212 + if (is_dev_replace) 2213 + btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, 2214 + &fs_info->generic_worker); 2215 + else 2216 + btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2217 + fs_info->thread_pool_size, 2218 + &fs_info->generic_worker); 2736 2219 fs_info->scrub_workers.idle_thresh = 4; 2737 2220 ret = btrfs_start_workers(&fs_info->scrub_workers); 2221 + if (ret) 2222 + goto out; 2223 + btrfs_init_workers(&fs_info->scrub_wr_completion_workers, 2224 + "scrubwrc", 2225 + fs_info->thread_pool_size, 2226 + &fs_info->generic_worker); 2227 + fs_info->scrub_wr_completion_workers.idle_thresh = 2; 2228 + ret = btrfs_start_workers( 2229 + &fs_info->scrub_wr_completion_workers); 2230 + if (ret) 2231 + goto out; 2232 + btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, 2233 + &fs_info->generic_worker); 2234 + ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); 2738 2235 if (ret) 2739 2236 goto out; 2740 2237 } ··· 2764 2223 return ret; 2765 2224 } 2766 2225 2767 - static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2226 + static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 2768 2227 { 2769 - struct btrfs_fs_info *fs_info = root->fs_info; 2770 - 2771 2228 mutex_lock(&fs_info->scrub_lock); 2772 - if (--fs_info->scrub_workers_refcnt == 0) 2229 + if (--fs_info->scrub_workers_refcnt == 0) { 2773 2230 btrfs_stop_workers(&fs_info->scrub_workers); 2231 + 
btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); 2232 + btrfs_stop_workers(&fs_info->scrub_nocow_workers); 2233 + } 2774 2234 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2775 2235 mutex_unlock(&fs_info->scrub_lock); 2776 2236 } 2777 2237 2778 - 2779 - int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2780 - struct btrfs_scrub_progress *progress, int readonly) 2238 + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 2239 + u64 end, struct btrfs_scrub_progress *progress, 2240 + int readonly, int is_dev_replace) 2781 2241 { 2782 - struct scrub_dev *sdev; 2783 - struct btrfs_fs_info *fs_info = root->fs_info; 2242 + struct scrub_ctx *sctx; 2784 2243 int ret; 2785 2244 struct btrfs_device *dev; 2786 2245 2787 - if (btrfs_fs_closing(root->fs_info)) 2246 + if (btrfs_fs_closing(fs_info)) 2788 2247 return -EINVAL; 2789 2248 2790 2249 /* 2791 2250 * check some assumptions 2792 2251 */ 2793 - if (root->nodesize != root->leafsize) { 2252 + if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { 2794 2253 printk(KERN_ERR 2795 2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2796 - root->nodesize, root->leafsize); 2255 + fs_info->chunk_root->nodesize, 2256 + fs_info->chunk_root->leafsize); 2797 2257 return -EINVAL; 2798 2258 } 2799 2259 2800 - if (root->nodesize > BTRFS_STRIPE_LEN) { 2260 + if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { 2801 2261 /* 2802 2262 * in this case scrub is unable to calculate the checksum 2803 2263 * the way scrub is implemented. 
Do not handle this ··· 2806 2264 */ 2807 2265 printk(KERN_ERR 2808 2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2809 - root->nodesize, BTRFS_STRIPE_LEN); 2267 + fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); 2810 2268 return -EINVAL; 2811 2269 } 2812 2270 2813 - if (root->sectorsize != PAGE_SIZE) { 2271 + if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { 2814 2272 /* not supported for data w/o checksums */ 2815 2273 printk(KERN_ERR 2816 2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2817 - root->sectorsize, (unsigned long long)PAGE_SIZE); 2275 + fs_info->chunk_root->sectorsize, 2276 + (unsigned long long)PAGE_SIZE); 2818 2277 return -EINVAL; 2819 2278 } 2820 2279 2821 - ret = scrub_workers_get(root); 2280 + if (fs_info->chunk_root->nodesize > 2281 + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || 2282 + fs_info->chunk_root->sectorsize > 2283 + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { 2284 + /* 2285 + * would exhaust the array bounds of pagev member in 2286 + * struct scrub_block 2287 + */ 2288 + pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", 2289 + fs_info->chunk_root->nodesize, 2290 + SCRUB_MAX_PAGES_PER_BLOCK, 2291 + fs_info->chunk_root->sectorsize, 2292 + SCRUB_MAX_PAGES_PER_BLOCK); 2293 + return -EINVAL; 2294 + } 2295 + 2296 + ret = scrub_workers_get(fs_info, is_dev_replace); 2822 2297 if (ret) 2823 2298 return ret; 2824 2299 2825 - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2826 - dev = btrfs_find_device(root, devid, NULL, NULL); 2827 - if (!dev || dev->missing) { 2828 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2829 - scrub_workers_put(root); 2300 + mutex_lock(&fs_info->fs_devices->device_list_mutex); 2301 + dev = btrfs_find_device(fs_info, devid, NULL, NULL); 2302 + if (!dev || (dev->missing && !is_dev_replace)) { 2303 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 
2304 + scrub_workers_put(fs_info); 2830 2305 return -ENODEV; 2831 2306 } 2832 2307 mutex_lock(&fs_info->scrub_lock); 2833 2308 2834 - if (!dev->in_fs_metadata) { 2309 + if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { 2835 2310 mutex_unlock(&fs_info->scrub_lock); 2836 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2837 - scrub_workers_put(root); 2838 - return -ENODEV; 2311 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2312 + scrub_workers_put(fs_info); 2313 + return -EIO; 2839 2314 } 2840 2315 2841 - if (dev->scrub_device) { 2316 + btrfs_dev_replace_lock(&fs_info->dev_replace); 2317 + if (dev->scrub_device || 2318 + (!is_dev_replace && 2319 + btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 2320 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 2842 2321 mutex_unlock(&fs_info->scrub_lock); 2843 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2844 - scrub_workers_put(root); 2322 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2323 + scrub_workers_put(fs_info); 2845 2324 return -EINPROGRESS; 2846 2325 } 2847 - sdev = scrub_setup_dev(dev); 2848 - if (IS_ERR(sdev)) { 2326 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 2327 + sctx = scrub_setup_ctx(dev, is_dev_replace); 2328 + if (IS_ERR(sctx)) { 2849 2329 mutex_unlock(&fs_info->scrub_lock); 2850 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2851 - scrub_workers_put(root); 2852 - return PTR_ERR(sdev); 2330 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2331 + scrub_workers_put(fs_info); 2332 + return PTR_ERR(sctx); 2853 2333 } 2854 - sdev->readonly = readonly; 2855 - dev->scrub_device = sdev; 2334 + sctx->readonly = readonly; 2335 + dev->scrub_device = sctx; 2856 2336 2857 2337 atomic_inc(&fs_info->scrubs_running); 2858 2338 mutex_unlock(&fs_info->scrub_lock); 2859 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2339 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2860 2340 2861 - 
down_read(&fs_info->scrub_super_lock); 2862 - ret = scrub_supers(sdev); 2863 - up_read(&fs_info->scrub_super_lock); 2341 + if (!is_dev_replace) { 2342 + down_read(&fs_info->scrub_super_lock); 2343 + ret = scrub_supers(sctx, dev); 2344 + up_read(&fs_info->scrub_super_lock); 2345 + } 2864 2346 2865 2347 if (!ret) 2866 - ret = scrub_enumerate_chunks(sdev, start, end); 2348 + ret = scrub_enumerate_chunks(sctx, dev, start, end, 2349 + is_dev_replace); 2867 2350 2868 - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2351 + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 2869 2352 atomic_dec(&fs_info->scrubs_running); 2870 2353 wake_up(&fs_info->scrub_pause_wait); 2871 2354 2872 - wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2355 + wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); 2873 2356 2874 2357 if (progress) 2875 - memcpy(progress, &sdev->stat, sizeof(*progress)); 2358 + memcpy(progress, &sctx->stat, sizeof(*progress)); 2876 2359 2877 2360 mutex_lock(&fs_info->scrub_lock); 2878 2361 dev->scrub_device = NULL; 2879 2362 mutex_unlock(&fs_info->scrub_lock); 2880 2363 2881 - scrub_free_dev(sdev); 2882 - scrub_workers_put(root); 2364 + scrub_free_ctx(sctx); 2365 + scrub_workers_put(fs_info); 2883 2366 2884 2367 return ret; 2885 2368 } ··· 2944 2377 up_write(&root->fs_info->scrub_super_lock); 2945 2378 } 2946 2379 2947 - int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2380 + int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2948 2381 { 2949 - 2950 2382 mutex_lock(&fs_info->scrub_lock); 2951 2383 if (!atomic_read(&fs_info->scrubs_running)) { 2952 2384 mutex_unlock(&fs_info->scrub_lock); ··· 2965 2399 return 0; 2966 2400 } 2967 2401 2968 - int btrfs_scrub_cancel(struct btrfs_root *root) 2402 + int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, 2403 + struct btrfs_device *dev) 2969 2404 { 2970 - return __btrfs_scrub_cancel(root->fs_info); 2971 - } 2972 - 2973 - int 
btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) 2974 - { 2975 - struct btrfs_fs_info *fs_info = root->fs_info; 2976 - struct scrub_dev *sdev; 2405 + struct scrub_ctx *sctx; 2977 2406 2978 2407 mutex_lock(&fs_info->scrub_lock); 2979 - sdev = dev->scrub_device; 2980 - if (!sdev) { 2408 + sctx = dev->scrub_device; 2409 + if (!sctx) { 2981 2410 mutex_unlock(&fs_info->scrub_lock); 2982 2411 return -ENOTCONN; 2983 2412 } 2984 - atomic_inc(&sdev->cancel_req); 2413 + atomic_inc(&sctx->cancel_req); 2985 2414 while (dev->scrub_device) { 2986 2415 mutex_unlock(&fs_info->scrub_lock); 2987 2416 wait_event(fs_info->scrub_pause_wait, ··· 2999 2438 * does not go away in cancel_dev. FIXME: find a better solution 3000 2439 */ 3001 2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3002 - dev = btrfs_find_device(root, devid, NULL, NULL); 2441 + dev = btrfs_find_device(fs_info, devid, NULL, NULL); 3003 2442 if (!dev) { 3004 2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3005 2444 return -ENODEV; 3006 2445 } 3007 - ret = btrfs_scrub_cancel_dev(root, dev); 2446 + ret = btrfs_scrub_cancel_dev(fs_info, dev); 3008 2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3009 2448 3010 2449 return ret; ··· 3014 2453 struct btrfs_scrub_progress *progress) 3015 2454 { 3016 2455 struct btrfs_device *dev; 3017 - struct scrub_dev *sdev = NULL; 2456 + struct scrub_ctx *sctx = NULL; 3018 2457 3019 2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3020 - dev = btrfs_find_device(root, devid, NULL, NULL); 2459 + dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); 3021 2460 if (dev) 3022 - sdev = dev->scrub_device; 3023 - if (sdev) 3024 - memcpy(progress, &sdev->stat, sizeof(*progress)); 2461 + sctx = dev->scrub_device; 2462 + if (sctx) 2463 + memcpy(progress, &sctx->stat, sizeof(*progress)); 3025 2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3026 2465 3027 - return dev ? (sdev ? 
0 : -ENOTCONN) : -ENODEV; 2466 + return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; 2467 + } 2468 + 2469 + static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 2470 + u64 extent_logical, u64 extent_len, 2471 + u64 *extent_physical, 2472 + struct btrfs_device **extent_dev, 2473 + int *extent_mirror_num) 2474 + { 2475 + u64 mapped_length; 2476 + struct btrfs_bio *bbio = NULL; 2477 + int ret; 2478 + 2479 + mapped_length = extent_len; 2480 + ret = btrfs_map_block(fs_info, READ, extent_logical, 2481 + &mapped_length, &bbio, 0); 2482 + if (ret || !bbio || mapped_length < extent_len || 2483 + !bbio->stripes[0].dev->bdev) { 2484 + kfree(bbio); 2485 + return; 2486 + } 2487 + 2488 + *extent_physical = bbio->stripes[0].physical; 2489 + *extent_mirror_num = bbio->mirror_num; 2490 + *extent_dev = bbio->stripes[0].dev; 2491 + kfree(bbio); 2492 + } 2493 + 2494 + static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, 2495 + struct scrub_wr_ctx *wr_ctx, 2496 + struct btrfs_fs_info *fs_info, 2497 + struct btrfs_device *dev, 2498 + int is_dev_replace) 2499 + { 2500 + WARN_ON(wr_ctx->wr_curr_bio != NULL); 2501 + 2502 + mutex_init(&wr_ctx->wr_lock); 2503 + wr_ctx->wr_curr_bio = NULL; 2504 + if (!is_dev_replace) 2505 + return 0; 2506 + 2507 + WARN_ON(!dev->bdev); 2508 + wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, 2509 + bio_get_nr_vecs(dev->bdev)); 2510 + wr_ctx->tgtdev = dev; 2511 + atomic_set(&wr_ctx->flush_all_writes, 0); 2512 + return 0; 2513 + } 2514 + 2515 + static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) 2516 + { 2517 + mutex_lock(&wr_ctx->wr_lock); 2518 + kfree(wr_ctx->wr_curr_bio); 2519 + wr_ctx->wr_curr_bio = NULL; 2520 + mutex_unlock(&wr_ctx->wr_lock); 2521 + } 2522 + 2523 + static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 2524 + int mirror_num, u64 physical_for_dev_replace) 2525 + { 2526 + struct scrub_copy_nocow_ctx *nocow_ctx; 2527 + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2528 + 2529 + nocow_ctx = 
kzalloc(sizeof(*nocow_ctx), GFP_NOFS); 2530 + if (!nocow_ctx) { 2531 + spin_lock(&sctx->stat_lock); 2532 + sctx->stat.malloc_errors++; 2533 + spin_unlock(&sctx->stat_lock); 2534 + return -ENOMEM; 2535 + } 2536 + 2537 + scrub_pending_trans_workers_inc(sctx); 2538 + 2539 + nocow_ctx->sctx = sctx; 2540 + nocow_ctx->logical = logical; 2541 + nocow_ctx->len = len; 2542 + nocow_ctx->mirror_num = mirror_num; 2543 + nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 2544 + nocow_ctx->work.func = copy_nocow_pages_worker; 2545 + btrfs_queue_worker(&fs_info->scrub_nocow_workers, 2546 + &nocow_ctx->work); 2547 + 2548 + return 0; 2549 + } 2550 + 2551 + static void copy_nocow_pages_worker(struct btrfs_work *work) 2552 + { 2553 + struct scrub_copy_nocow_ctx *nocow_ctx = 2554 + container_of(work, struct scrub_copy_nocow_ctx, work); 2555 + struct scrub_ctx *sctx = nocow_ctx->sctx; 2556 + u64 logical = nocow_ctx->logical; 2557 + u64 len = nocow_ctx->len; 2558 + int mirror_num = nocow_ctx->mirror_num; 2559 + u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 2560 + int ret; 2561 + struct btrfs_trans_handle *trans = NULL; 2562 + struct btrfs_fs_info *fs_info; 2563 + struct btrfs_path *path; 2564 + struct btrfs_root *root; 2565 + int not_written = 0; 2566 + 2567 + fs_info = sctx->dev_root->fs_info; 2568 + root = fs_info->extent_root; 2569 + 2570 + path = btrfs_alloc_path(); 2571 + if (!path) { 2572 + spin_lock(&sctx->stat_lock); 2573 + sctx->stat.malloc_errors++; 2574 + spin_unlock(&sctx->stat_lock); 2575 + not_written = 1; 2576 + goto out; 2577 + } 2578 + 2579 + trans = btrfs_join_transaction(root); 2580 + if (IS_ERR(trans)) { 2581 + not_written = 1; 2582 + goto out; 2583 + } 2584 + 2585 + ret = iterate_inodes_from_logical(logical, fs_info, path, 2586 + copy_nocow_pages_for_inode, 2587 + nocow_ctx); 2588 + if (ret != 0 && ret != -ENOENT) { 2589 + pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", 2590 + 
(unsigned long long)logical, 2591 + (unsigned long long)physical_for_dev_replace, 2592 + (unsigned long long)len, 2593 + (unsigned long long)mirror_num, ret); 2594 + not_written = 1; 2595 + goto out; 2596 + } 2597 + 2598 + out: 2599 + if (trans && !IS_ERR(trans)) 2600 + btrfs_end_transaction(trans, root); 2601 + if (not_written) 2602 + btrfs_dev_replace_stats_inc(&fs_info->dev_replace. 2603 + num_uncorrectable_read_errors); 2604 + 2605 + btrfs_free_path(path); 2606 + kfree(nocow_ctx); 2607 + 2608 + scrub_pending_trans_workers_dec(sctx); 2609 + } 2610 + 2611 + static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) 2612 + { 2613 + unsigned long index; 2614 + struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 2615 + int ret = 0; 2616 + struct btrfs_key key; 2617 + struct inode *inode = NULL; 2618 + struct btrfs_root *local_root; 2619 + u64 physical_for_dev_replace; 2620 + u64 len; 2621 + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 2622 + 2623 + key.objectid = root; 2624 + key.type = BTRFS_ROOT_ITEM_KEY; 2625 + key.offset = (u64)-1; 2626 + local_root = btrfs_read_fs_root_no_name(fs_info, &key); 2627 + if (IS_ERR(local_root)) 2628 + return PTR_ERR(local_root); 2629 + 2630 + key.type = BTRFS_INODE_ITEM_KEY; 2631 + key.objectid = inum; 2632 + key.offset = 0; 2633 + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); 2634 + if (IS_ERR(inode)) 2635 + return PTR_ERR(inode); 2636 + 2637 + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 2638 + len = nocow_ctx->len; 2639 + while (len >= PAGE_CACHE_SIZE) { 2640 + struct page *page = NULL; 2641 + int ret_sub; 2642 + 2643 + index = offset >> PAGE_CACHE_SHIFT; 2644 + 2645 + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 2646 + if (!page) { 2647 + pr_err("find_or_create_page() failed\n"); 2648 + ret = -ENOMEM; 2649 + goto next_page; 2650 + } 2651 + 2652 + if (PageUptodate(page)) { 2653 + if (PageDirty(page)) 2654 + goto next_page; 2655 + } else { 2656 
+ ClearPageError(page); 2657 + ret_sub = extent_read_full_page(&BTRFS_I(inode)-> 2658 + io_tree, 2659 + page, btrfs_get_extent, 2660 + nocow_ctx->mirror_num); 2661 + if (ret_sub) { 2662 + ret = ret_sub; 2663 + goto next_page; 2664 + } 2665 + wait_on_page_locked(page); 2666 + if (!PageUptodate(page)) { 2667 + ret = -EIO; 2668 + goto next_page; 2669 + } 2670 + } 2671 + ret_sub = write_page_nocow(nocow_ctx->sctx, 2672 + physical_for_dev_replace, page); 2673 + if (ret_sub) { 2674 + ret = ret_sub; 2675 + goto next_page; 2676 + } 2677 + 2678 + next_page: 2679 + if (page) { 2680 + unlock_page(page); 2681 + put_page(page); 2682 + } 2683 + offset += PAGE_CACHE_SIZE; 2684 + physical_for_dev_replace += PAGE_CACHE_SIZE; 2685 + len -= PAGE_CACHE_SIZE; 2686 + } 2687 + 2688 + if (inode) 2689 + iput(inode); 2690 + return ret; 2691 + } 2692 + 2693 + static int write_page_nocow(struct scrub_ctx *sctx, 2694 + u64 physical_for_dev_replace, struct page *page) 2695 + { 2696 + struct bio *bio; 2697 + struct btrfs_device *dev; 2698 + int ret; 2699 + DECLARE_COMPLETION_ONSTACK(compl); 2700 + 2701 + dev = sctx->wr_ctx.tgtdev; 2702 + if (!dev) 2703 + return -EIO; 2704 + if (!dev->bdev) { 2705 + printk_ratelimited(KERN_WARNING 2706 + "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); 2707 + return -EIO; 2708 + } 2709 + bio = bio_alloc(GFP_NOFS, 1); 2710 + if (!bio) { 2711 + spin_lock(&sctx->stat_lock); 2712 + sctx->stat.malloc_errors++; 2713 + spin_unlock(&sctx->stat_lock); 2714 + return -ENOMEM; 2715 + } 2716 + bio->bi_private = &compl; 2717 + bio->bi_end_io = scrub_complete_bio_end_io; 2718 + bio->bi_size = 0; 2719 + bio->bi_sector = physical_for_dev_replace >> 9; 2720 + bio->bi_bdev = dev->bdev; 2721 + ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 2722 + if (ret != PAGE_CACHE_SIZE) { 2723 + leave_with_eio: 2724 + bio_put(bio); 2725 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2726 + return -EIO; 2727 + } 2728 + btrfsic_submit_bio(WRITE_SYNC, bio); 2729 
+ wait_for_completion(&compl); 2730 + 2731 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 2732 + goto leave_with_eio; 2733 + 2734 + bio_put(bio); 2735 + return 0; 3028 2736 }
+4 -4
fs/btrfs/send.c
··· 4397 4397 if (!path) 4398 4398 return -ENOMEM; 4399 4399 4400 - spin_lock(&send_root->root_times_lock); 4400 + spin_lock(&send_root->root_item_lock); 4401 4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4402 - spin_unlock(&send_root->root_times_lock); 4402 + spin_unlock(&send_root->root_item_lock); 4403 4403 4404 4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4405 4405 key.type = BTRFS_INODE_ITEM_KEY; ··· 4422 4422 * Make sure the tree has not changed after re-joining. We detect this 4423 4423 * by comparing start_ctransid and ctransid. They should always match. 4424 4424 */ 4425 - spin_lock(&send_root->root_times_lock); 4425 + spin_lock(&send_root->root_item_lock); 4426 4426 ctransid = btrfs_root_ctransid(&send_root->root_item); 4427 - spin_unlock(&send_root->root_times_lock); 4427 + spin_unlock(&send_root->root_item_lock); 4428 4428 4429 4429 if (ctransid != start_ctransid) { 4430 4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
+44 -4
fs/btrfs/super.c
··· 55 55 #include "export.h" 56 56 #include "compression.h" 57 57 #include "rcu-string.h" 58 + #include "dev-replace.h" 58 59 59 60 #define CREATE_TRACE_POINTS 60 61 #include <trace/events/btrfs.h> ··· 117 116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 118 117 sb->s_flags |= MS_RDONLY; 119 118 printk(KERN_INFO "btrfs is forced readonly\n"); 120 - __btrfs_scrub_cancel(fs_info); 119 + /* 120 + * Note that a running device replace operation is not 121 + * canceled here although there is no way to update 122 + * the progress. It would add the risk of a deadlock, 123 + * therefore the canceling is ommited. The only penalty 124 + * is that some I/O remains active until the procedure 125 + * completes. The next time when the filesystem is 126 + * mounted writeable again, the device replace 127 + * operation continues. 128 + */ 121 129 // WARN_ON(1); 122 130 } 123 131 } ··· 1196 1186 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1197 1187 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1198 1188 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1199 - btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1189 + btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, 1190 + new_pool_size); 1200 1191 } 1201 1192 1202 1193 static int btrfs_remount(struct super_block *sb, int *flags, char *data) ··· 1226 1215 return 0; 1227 1216 1228 1217 if (*flags & MS_RDONLY) { 1218 + /* 1219 + * this also happens on 'umount -rf' or on shutdown, when 1220 + * the filesystem is busy. 
1221 + */ 1229 1222 sb->s_flags |= MS_RDONLY; 1223 + 1224 + btrfs_dev_replace_suspend_for_unmount(fs_info); 1225 + btrfs_scrub_cancel(fs_info); 1230 1226 1231 1227 ret = btrfs_commit_super(root); 1232 1228 if (ret) 1233 1229 goto restore; 1234 1230 } else { 1235 1231 if (fs_info->fs_devices->rw_devices == 0) { 1232 + ret = -EACCES; 1233 + goto restore; 1234 + } 1235 + 1236 + if (fs_info->fs_devices->missing_devices > 1237 + fs_info->num_tolerated_disk_barrier_failures && 1238 + !(*flags & MS_RDONLY)) { 1239 + printk(KERN_WARNING 1240 + "Btrfs: too many missing devices, writeable remount is not allowed\n"); 1236 1241 ret = -EACCES; 1237 1242 goto restore; 1238 1243 } ··· 1271 1244 if (ret) 1272 1245 goto restore; 1273 1246 1247 + ret = btrfs_resume_dev_replace_async(fs_info); 1248 + if (ret) { 1249 + pr_warn("btrfs: failed to resume dev_replace\n"); 1250 + goto restore; 1251 + } 1274 1252 sb->s_flags &= ~MS_RDONLY; 1275 1253 } 1276 1254 ··· 1368 1336 min_stripe_size = BTRFS_STRIPE_LEN; 1369 1337 1370 1338 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1371 - if (!device->in_fs_metadata || !device->bdev) 1339 + if (!device->in_fs_metadata || !device->bdev || 1340 + device->is_tgtdev_for_dev_replace) 1372 1341 continue; 1373 1342 1374 1343 avail_space = device->total_bytes - device->bytes_used; ··· 1680 1647 if (err) 1681 1648 goto free_ordered_data; 1682 1649 1683 - err = btrfs_interface_init(); 1650 + err = btrfs_auto_defrag_init(); 1684 1651 if (err) 1685 1652 goto free_delayed_inode; 1653 + 1654 + err = btrfs_interface_init(); 1655 + if (err) 1656 + goto free_auto_defrag; 1686 1657 1687 1658 err = register_filesystem(&btrfs_fs_type); 1688 1659 if (err) ··· 1699 1662 1700 1663 unregister_ioctl: 1701 1664 btrfs_interface_exit(); 1665 + free_auto_defrag: 1666 + btrfs_auto_defrag_exit(); 1702 1667 free_delayed_inode: 1703 1668 btrfs_delayed_inode_exit(); 1704 1669 free_ordered_data: ··· 1720 1681 static void __exit exit_btrfs_fs(void) 1721 1682 { 1722 
1683 btrfs_destroy_cachep(); 1684 + btrfs_auto_defrag_exit(); 1723 1685 btrfs_delayed_inode_exit(); 1724 1686 ordered_data_exit(); 1725 1687 extent_map_exit();
+100 -72
fs/btrfs/transaction.c
··· 30 30 #include "tree-log.h" 31 31 #include "inode-map.h" 32 32 #include "volumes.h" 33 + #include "dev-replace.h" 33 34 34 35 #define BTRFS_ROOT_TRANS_TAG 0 35 36 ··· 146 145 * the log must never go across transaction boundaries. 147 146 */ 148 147 smp_mb(); 149 - if (!list_empty(&fs_info->tree_mod_seq_list)) { 150 - printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 148 + if (!list_empty(&fs_info->tree_mod_seq_list)) 149 + WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " 151 150 "creating a fresh transaction\n"); 152 - WARN_ON(1); 153 - } 154 - if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { 155 - printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " 151 + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) 152 + WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " 156 153 "creating a fresh transaction\n"); 157 - WARN_ON(1); 158 - } 159 154 atomic_set(&fs_info->tree_mod_seq, 0); 160 155 161 156 spin_lock_init(&cur_trans->commit_lock); ··· 292 295 return 0; 293 296 } 294 297 295 - static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 296 - u64 num_items, int type, 297 - int noflush) 298 + static struct btrfs_trans_handle * 299 + start_transaction(struct btrfs_root *root, u64 num_items, int type, 300 + enum btrfs_reserve_flush_enum flush) 298 301 { 299 302 struct btrfs_trans_handle *h; 300 303 struct btrfs_transaction *cur_trans; ··· 309 312 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 310 313 h = current->journal_info; 311 314 h->use_count++; 315 + WARN_ON(h->use_count > 2); 312 316 h->orig_rsv = h->block_rsv; 313 317 h->block_rsv = NULL; 314 318 goto got_it; ··· 329 331 } 330 332 331 333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 332 - if (noflush) 333 - ret = btrfs_block_rsv_add_noflush(root, 334 - &root->fs_info->trans_block_rsv, 335 - num_bytes); 336 - else 337 - ret = btrfs_block_rsv_add(root, 338 - &root->fs_info->trans_block_rsv, 339 - num_bytes); 334 + ret = 
btrfs_block_rsv_add(root, 335 + &root->fs_info->trans_block_rsv, 336 + num_bytes, flush); 340 337 if (ret) 341 338 return ERR_PTR(ret); 342 339 } ··· 415 422 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 416 423 int num_items) 417 424 { 418 - return start_transaction(root, num_items, TRANS_START, 0); 425 + return start_transaction(root, num_items, TRANS_START, 426 + BTRFS_RESERVE_FLUSH_ALL); 419 427 } 420 428 421 - struct btrfs_trans_handle *btrfs_start_transaction_noflush( 429 + struct btrfs_trans_handle *btrfs_start_transaction_lflush( 422 430 struct btrfs_root *root, int num_items) 423 431 { 424 - return start_transaction(root, num_items, TRANS_START, 1); 432 + return start_transaction(root, num_items, TRANS_START, 433 + BTRFS_RESERVE_FLUSH_LIMIT); 425 434 } 426 435 427 436 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) ··· 456 461 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 457 462 { 458 463 struct btrfs_transaction *cur_trans = NULL, *t; 459 - int ret; 464 + int ret = 0; 460 465 461 - ret = 0; 462 466 if (transid) { 463 467 if (transid <= root->fs_info->last_trans_committed) 464 468 goto out; 465 469 470 + ret = -EINVAL; 466 471 /* find specified transaction */ 467 472 spin_lock(&root->fs_info->trans_lock); 468 473 list_for_each_entry(t, &root->fs_info->trans_list, list) { 469 474 if (t->transid == transid) { 470 475 cur_trans = t; 471 476 atomic_inc(&cur_trans->use_count); 477 + ret = 0; 472 478 break; 473 479 } 474 - if (t->transid > transid) 480 + if (t->transid > transid) { 481 + ret = 0; 475 482 break; 483 + } 476 484 } 477 485 spin_unlock(&root->fs_info->trans_lock); 478 - ret = -EINVAL; 486 + /* The specified transaction doesn't exist */ 479 487 if (!cur_trans) 480 - goto out; /* bad transid */ 488 + goto out; 481 489 } else { 482 490 /* find newest transaction that is committing | committed */ 483 491 spin_lock(&root->fs_info->trans_lock); ··· 500 502 } 501 503 502 504 
wait_for_commit(root, cur_trans); 503 - 504 505 put_transaction(cur_trans); 505 - ret = 0; 506 506 out: 507 507 return ret; 508 508 } ··· 847 851 return ret; 848 852 849 853 ret = btrfs_run_dev_stats(trans, root->fs_info); 850 - BUG_ON(ret); 854 + WARN_ON(ret); 855 + ret = btrfs_run_dev_replace(trans, root->fs_info); 856 + WARN_ON(ret); 851 857 852 858 ret = btrfs_run_qgroups(trans, root->fs_info); 853 859 BUG_ON(ret); ··· 871 873 down_write(&fs_info->extent_commit_sem); 872 874 switch_commit_root(fs_info->extent_root); 873 875 up_write(&fs_info->extent_commit_sem); 876 + 877 + btrfs_after_dev_replace_commit(fs_info); 874 878 875 879 return 0; 876 880 } ··· 958 958 struct btrfs_fs_info *info = root->fs_info; 959 959 struct btrfs_trans_handle *trans; 960 960 int ret; 961 - unsigned long nr; 962 961 963 962 if (xchg(&root->defrag_running, 1)) 964 963 return 0; ··· 969 970 970 971 ret = btrfs_defrag_leaves(trans, root, cacheonly); 971 972 972 - nr = trans->blocks_used; 973 973 btrfs_end_transaction(trans, root); 974 - btrfs_btree_balance_dirty(info->tree_root, nr); 974 + btrfs_btree_balance_dirty(info->tree_root); 975 975 cond_resched(); 976 976 977 977 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) ··· 1030 1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1031 1033 1032 1034 if (to_reserve > 0) { 1033 - ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1034 - to_reserve); 1035 + ret = btrfs_block_rsv_add(root, &pending->block_rsv, 1036 + to_reserve, 1037 + BTRFS_RESERVE_NO_FLUSH); 1035 1038 if (ret) { 1036 1039 pending->error = ret; 1037 1040 goto no_free_objectid; ··· 1190 1191 parent_inode, &key, 1191 1192 BTRFS_FT_DIR, index); 1192 1193 /* We have check then name at the beginning, so it is impossible. 
*/ 1193 - BUG_ON(ret == -EEXIST); 1194 + BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); 1194 1195 if (ret) { 1195 1196 btrfs_abort_transaction(trans, root, ret); 1196 1197 goto fail; ··· 1308 1309 * We've got freeze protection passed with the transaction. 1309 1310 * Tell lockdep about it. 1310 1311 */ 1311 - rwsem_acquire_read( 1312 - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1313 - 0, 1, _THIS_IP_); 1312 + if (ac->newtrans->type < TRANS_JOIN_NOLOCK) 1313 + rwsem_acquire_read( 1314 + &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1315 + 0, 1, _THIS_IP_); 1314 1316 1315 1317 current->journal_info = ac->newtrans; 1316 1318 ··· 1349 1349 * Tell lockdep we've released the freeze rwsem, since the 1350 1350 * async commit thread will be the one to unlock it. 1351 1351 */ 1352 - rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1353 - 1, _THIS_IP_); 1352 + if (trans->type < TRANS_JOIN_NOLOCK) 1353 + rwsem_release( 1354 + &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1355 + 1, _THIS_IP_); 1354 1356 1355 1357 schedule_delayed_work(&ac->work, 0); 1356 1358 ··· 1402 1400 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1403 1401 } 1404 1402 1403 + static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, 1404 + struct btrfs_root *root) 1405 + { 1406 + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); 1407 + int snap_pending = 0; 1408 + int ret; 1409 + 1410 + if (!flush_on_commit) { 1411 + spin_lock(&root->fs_info->trans_lock); 1412 + if (!list_empty(&trans->transaction->pending_snapshots)) 1413 + snap_pending = 1; 1414 + spin_unlock(&root->fs_info->trans_lock); 1415 + } 1416 + 1417 + if (flush_on_commit || snap_pending) { 1418 + btrfs_start_delalloc_inodes(root, 1); 1419 + btrfs_wait_ordered_extents(root, 1); 1420 + } 1421 + 1422 + ret = btrfs_run_delayed_items(trans, root); 1423 + if (ret) 1424 + return ret; 1425 + 1426 + /* 1427 + * running the delayed items may have added new refs. 
account 1428 + * them now so that they hinder processing of more delayed refs 1429 + * as little as possible. 1430 + */ 1431 + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 1432 + 1433 + /* 1434 + * rename don't use btrfs_join_transaction, so, once we 1435 + * set the transaction to blocked above, we aren't going 1436 + * to get any new ordered operations. We can safely run 1437 + * it here and no for sure that nothing new will be added 1438 + * to the list 1439 + */ 1440 + btrfs_run_ordered_operations(root, 1); 1441 + 1442 + return 0; 1443 + } 1444 + 1405 1445 /* 1406 1446 * btrfs_transaction state sequence: 1407 1447 * in_commit = 0, blocked = 0 (initial) ··· 1458 1414 struct btrfs_transaction *cur_trans = trans->transaction; 1459 1415 struct btrfs_transaction *prev_trans = NULL; 1460 1416 DEFINE_WAIT(wait); 1461 - int ret = -EIO; 1417 + int ret; 1462 1418 int should_grow = 0; 1463 1419 unsigned long now = get_seconds(); 1464 - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); 1465 1420 1466 - btrfs_run_ordered_operations(root, 0); 1467 - 1468 - if (cur_trans->aborted) 1421 + ret = btrfs_run_ordered_operations(root, 0); 1422 + if (ret) { 1423 + btrfs_abort_transaction(trans, root, ret); 1469 1424 goto cleanup_transaction; 1425 + } 1426 + 1427 + if (cur_trans->aborted) { 1428 + ret = cur_trans->aborted; 1429 + goto cleanup_transaction; 1430 + } 1470 1431 1471 1432 /* make a pass through all the delayed refs we have so far 1472 1433 * any runnings procs may add more while we are here ··· 1539 1490 should_grow = 1; 1540 1491 1541 1492 do { 1542 - int snap_pending = 0; 1543 - 1544 1493 joined = cur_trans->num_joined; 1545 - if (!list_empty(&trans->transaction->pending_snapshots)) 1546 - snap_pending = 1; 1547 1494 1548 1495 WARN_ON(cur_trans != trans->transaction); 1549 1496 1550 - if (flush_on_commit || snap_pending) { 1551 - btrfs_start_delalloc_inodes(root, 1); 1552 - btrfs_wait_ordered_extents(root, 1); 1553 - } 1554 - 1555 - ret = 
btrfs_run_delayed_items(trans, root); 1497 + ret = btrfs_flush_all_pending_stuffs(trans, root); 1556 1498 if (ret) 1557 1499 goto cleanup_transaction; 1558 - 1559 - /* 1560 - * running the delayed items may have added new refs. account 1561 - * them now so that they hinder processing of more delayed refs 1562 - * as little as possible. 1563 - */ 1564 - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 1565 - 1566 - /* 1567 - * rename don't use btrfs_join_transaction, so, once we 1568 - * set the transaction to blocked above, we aren't going 1569 - * to get any new ordered operations. We can safely run 1570 - * it here and no for sure that nothing new will be added 1571 - * to the list 1572 - */ 1573 - btrfs_run_ordered_operations(root, 1); 1574 1500 1575 1501 prepare_to_wait(&cur_trans->writer_wait, &wait, 1576 1502 TASK_UNINTERRUPTIBLE); ··· 1558 1534 finish_wait(&cur_trans->writer_wait, &wait); 1559 1535 } while (atomic_read(&cur_trans->num_writers) > 1 || 1560 1536 (should_grow && cur_trans->num_joined != joined)); 1537 + 1538 + ret = btrfs_flush_all_pending_stuffs(trans, root); 1539 + if (ret) 1540 + goto cleanup_transaction; 1561 1541 1562 1542 /* 1563 1543 * Ok now we need to make sure to block out any other joins while we
+1 -1
fs/btrfs/transaction.h
··· 105 105 struct btrfs_root *root); 106 106 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 107 107 int num_items); 108 - struct btrfs_trans_handle *btrfs_start_transaction_noflush( 108 + struct btrfs_trans_handle *btrfs_start_transaction_lflush( 109 109 struct btrfs_root *root, int num_items); 110 110 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 111 111 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+291 -188
fs/btrfs/tree-log.c
··· 2952 2952 struct btrfs_inode_item *item, 2953 2953 struct inode *inode, int log_inode_only) 2954 2954 { 2955 - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2956 - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2957 - btrfs_set_inode_mode(leaf, item, inode->i_mode); 2958 - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2955 + struct btrfs_map_token token; 2959 2956 2960 - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2961 - inode->i_atime.tv_sec); 2962 - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2963 - inode->i_atime.tv_nsec); 2964 - 2965 - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2966 - inode->i_mtime.tv_sec); 2967 - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2968 - inode->i_mtime.tv_nsec); 2969 - 2970 - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2971 - inode->i_ctime.tv_sec); 2972 - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2973 - inode->i_ctime.tv_nsec); 2974 - 2975 - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2976 - 2977 - btrfs_set_inode_sequence(leaf, item, inode->i_version); 2978 - btrfs_set_inode_transid(leaf, item, trans->transid); 2979 - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2980 - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2981 - btrfs_set_inode_block_group(leaf, item, 0); 2957 + btrfs_init_map_token(&token); 2982 2958 2983 2959 if (log_inode_only) { 2984 2960 /* set the generation to zero so the recover code ··· 2962 2986 * just to say 'this inode exists' and a logging 2963 2987 * to say 'update this inode with these values' 2964 2988 */ 2965 - btrfs_set_inode_generation(leaf, item, 0); 2966 - btrfs_set_inode_size(leaf, item, 0); 2989 + btrfs_set_token_inode_generation(leaf, item, 0, &token); 2990 + btrfs_set_token_inode_size(leaf, item, 0, &token); 2967 2991 } else { 2968 - btrfs_set_inode_generation(leaf, item, 2969 - BTRFS_I(inode)->generation); 2970 - btrfs_set_inode_size(leaf, item, inode->i_size); 2992 + 
btrfs_set_token_inode_generation(leaf, item, 2993 + BTRFS_I(inode)->generation, 2994 + &token); 2995 + btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 2971 2996 } 2972 2997 2998 + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 2999 + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3000 + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3001 + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3002 + 3003 + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3004 + inode->i_atime.tv_sec, &token); 3005 + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3006 + inode->i_atime.tv_nsec, &token); 3007 + 3008 + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3009 + inode->i_mtime.tv_sec, &token); 3010 + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3011 + inode->i_mtime.tv_nsec, &token); 3012 + 3013 + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3014 + inode->i_ctime.tv_sec, &token); 3015 + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3016 + inode->i_ctime.tv_nsec, &token); 3017 + 3018 + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3019 + &token); 3020 + 3021 + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3022 + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3023 + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3024 + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3025 + btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3026 + } 3027 + 3028 + static int log_inode_item(struct btrfs_trans_handle *trans, 3029 + struct btrfs_root *log, struct btrfs_path *path, 3030 + struct inode *inode) 3031 + { 3032 + struct btrfs_inode_item *inode_item; 3033 + struct btrfs_key key; 3034 + int ret; 3035 + 3036 + memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); 3037 + ret = btrfs_insert_empty_item(trans, log, path, &key, 
3038 + sizeof(*inode_item)); 3039 + if (ret && ret != -EEXIST) 3040 + return ret; 3041 + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3042 + struct btrfs_inode_item); 3043 + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); 3044 + btrfs_release_path(path); 3045 + return 0; 2973 3046 } 2974 3047 2975 3048 static noinline int copy_items(struct btrfs_trans_handle *trans, ··· 3155 3130 return 0; 3156 3131 } 3157 3132 3158 - struct log_args { 3159 - struct extent_buffer *src; 3160 - u64 next_offset; 3161 - int start_slot; 3162 - int nr; 3163 - }; 3133 + static int drop_adjacent_extents(struct btrfs_trans_handle *trans, 3134 + struct btrfs_root *root, struct inode *inode, 3135 + struct extent_map *em, 3136 + struct btrfs_path *path) 3137 + { 3138 + struct btrfs_file_extent_item *fi; 3139 + struct extent_buffer *leaf; 3140 + struct btrfs_key key, new_key; 3141 + struct btrfs_map_token token; 3142 + u64 extent_end; 3143 + u64 extent_offset = 0; 3144 + int extent_type; 3145 + int del_slot = 0; 3146 + int del_nr = 0; 3147 + int ret = 0; 3148 + 3149 + while (1) { 3150 + btrfs_init_map_token(&token); 3151 + leaf = path->nodes[0]; 3152 + path->slots[0]++; 3153 + if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3154 + if (del_nr) { 3155 + ret = btrfs_del_items(trans, root, path, 3156 + del_slot, del_nr); 3157 + if (ret) 3158 + return ret; 3159 + del_nr = 0; 3160 + } 3161 + 3162 + ret = btrfs_next_leaf_write(trans, root, path, 1); 3163 + if (ret < 0) 3164 + return ret; 3165 + if (ret > 0) 3166 + return 0; 3167 + leaf = path->nodes[0]; 3168 + } 3169 + 3170 + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3171 + if (key.objectid != btrfs_ino(inode) || 3172 + key.type != BTRFS_EXTENT_DATA_KEY || 3173 + key.offset >= em->start + em->len) 3174 + break; 3175 + 3176 + fi = btrfs_item_ptr(leaf, path->slots[0], 3177 + struct btrfs_file_extent_item); 3178 + extent_type = btrfs_token_file_extent_type(leaf, fi, &token); 3179 + if (extent_type == 
BTRFS_FILE_EXTENT_REG || 3180 + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 3181 + extent_offset = btrfs_token_file_extent_offset(leaf, 3182 + fi, &token); 3183 + extent_end = key.offset + 3184 + btrfs_token_file_extent_num_bytes(leaf, fi, 3185 + &token); 3186 + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3187 + extent_end = key.offset + 3188 + btrfs_file_extent_inline_len(leaf, fi); 3189 + } else { 3190 + BUG(); 3191 + } 3192 + 3193 + if (extent_end <= em->len + em->start) { 3194 + if (!del_nr) { 3195 + del_slot = path->slots[0]; 3196 + } 3197 + del_nr++; 3198 + continue; 3199 + } 3200 + 3201 + /* 3202 + * Ok so we'll ignore previous items if we log a new extent, 3203 + * which can lead to overlapping extents, so if we have an 3204 + * existing extent we want to adjust we _have_ to check the next 3205 + * guy to make sure we even need this extent anymore, this keeps 3206 + * us from panicing in set_item_key_safe. 3207 + */ 3208 + if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { 3209 + struct btrfs_key tmp_key; 3210 + 3211 + btrfs_item_key_to_cpu(leaf, &tmp_key, 3212 + path->slots[0] + 1); 3213 + if (tmp_key.objectid == btrfs_ino(inode) && 3214 + tmp_key.type == BTRFS_EXTENT_DATA_KEY && 3215 + tmp_key.offset <= em->start + em->len) { 3216 + if (!del_nr) 3217 + del_slot = path->slots[0]; 3218 + del_nr++; 3219 + continue; 3220 + } 3221 + } 3222 + 3223 + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 3224 + memcpy(&new_key, &key, sizeof(new_key)); 3225 + new_key.offset = em->start + em->len; 3226 + btrfs_set_item_key_safe(trans, root, path, &new_key); 3227 + extent_offset += em->start + em->len - key.offset; 3228 + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, 3229 + &token); 3230 + btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - 3231 + (em->start + em->len), 3232 + &token); 3233 + btrfs_mark_buffer_dirty(leaf); 3234 + } 3235 + 3236 + if (del_nr) 3237 + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 3238 + 
3239 + return ret; 3240 + } 3164 3241 3165 3242 static int log_one_extent(struct btrfs_trans_handle *trans, 3166 3243 struct inode *inode, struct btrfs_root *root, 3167 - struct extent_map *em, struct btrfs_path *path, 3168 - struct btrfs_path *dst_path, struct log_args *args) 3244 + struct extent_map *em, struct btrfs_path *path) 3169 3245 { 3170 3246 struct btrfs_root *log = root->log_root; 3171 3247 struct btrfs_file_extent_item *fi; 3248 + struct extent_buffer *leaf; 3249 + struct list_head ordered_sums; 3250 + struct btrfs_map_token token; 3172 3251 struct btrfs_key key; 3173 - u64 start = em->mod_start; 3174 - u64 search_start = start; 3175 - u64 len = em->mod_len; 3176 - u64 num_bytes; 3177 - int nritems; 3252 + u64 csum_offset = em->mod_start - em->start; 3253 + u64 csum_len = em->mod_len; 3254 + u64 extent_offset = em->start - em->orig_start; 3255 + u64 block_len; 3178 3256 int ret; 3257 + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3179 3258 3180 - if (BTRFS_I(inode)->logged_trans == trans->transid) { 3181 - ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, 3182 - start + len, NULL, 0); 3183 - if (ret) 3184 - return ret; 3259 + INIT_LIST_HEAD(&ordered_sums); 3260 + btrfs_init_map_token(&token); 3261 + key.objectid = btrfs_ino(inode); 3262 + key.type = BTRFS_EXTENT_DATA_KEY; 3263 + key.offset = em->start; 3264 + path->really_keep_locks = 1; 3265 + 3266 + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); 3267 + if (ret && ret != -EEXIST) { 3268 + path->really_keep_locks = 0; 3269 + return ret; 3270 + } 3271 + leaf = path->nodes[0]; 3272 + fi = btrfs_item_ptr(leaf, path->slots[0], 3273 + struct btrfs_file_extent_item); 3274 + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3275 + &token); 3276 + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3277 + skip_csum = true; 3278 + btrfs_set_token_file_extent_type(leaf, fi, 3279 + BTRFS_FILE_EXTENT_PREALLOC, 3280 + &token); 3281 + } else { 3282 + 
btrfs_set_token_file_extent_type(leaf, fi, 3283 + BTRFS_FILE_EXTENT_REG, 3284 + &token); 3285 + if (em->block_start == 0) 3286 + skip_csum = true; 3185 3287 } 3186 3288 3187 - while (len) { 3188 - if (args->nr) 3189 - goto next_slot; 3190 - again: 3191 - key.objectid = btrfs_ino(inode); 3192 - key.type = BTRFS_EXTENT_DATA_KEY; 3193 - key.offset = search_start; 3194 - 3195 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3196 - if (ret < 0) 3197 - return ret; 3198 - 3199 - if (ret) { 3200 - /* 3201 - * A rare case were we can have an em for a section of a 3202 - * larger extent so we need to make sure that this em 3203 - * falls within the extent we've found. If not we just 3204 - * bail and go back to ye-olde way of doing things but 3205 - * it happens often enough in testing that we need to do 3206 - * this dance to make sure. 3207 - */ 3208 - do { 3209 - if (path->slots[0] == 0) { 3210 - btrfs_release_path(path); 3211 - if (search_start == 0) 3212 - return -ENOENT; 3213 - search_start--; 3214 - goto again; 3215 - } 3216 - 3217 - path->slots[0]--; 3218 - btrfs_item_key_to_cpu(path->nodes[0], &key, 3219 - path->slots[0]); 3220 - if (key.objectid != btrfs_ino(inode) || 3221 - key.type != BTRFS_EXTENT_DATA_KEY) { 3222 - btrfs_release_path(path); 3223 - return -ENOENT; 3224 - } 3225 - } while (key.offset > start); 3226 - 3227 - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3228 - struct btrfs_file_extent_item); 3229 - num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], 3230 - fi); 3231 - if (key.offset + num_bytes <= start) { 3232 - btrfs_release_path(path); 3233 - return -ENOENT; 3234 - } 3235 - } 3236 - args->src = path->nodes[0]; 3237 - next_slot: 3238 - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3239 - fi = btrfs_item_ptr(args->src, path->slots[0], 3240 - struct btrfs_file_extent_item); 3241 - if (args->nr && 3242 - args->start_slot + args->nr == path->slots[0]) { 3243 - args->nr++; 3244 - } else if (args->nr) { 3245 - ret = 
copy_items(trans, inode, dst_path, args->src, 3246 - args->start_slot, args->nr, 3247 - LOG_INODE_ALL); 3248 - if (ret) 3249 - return ret; 3250 - args->nr = 1; 3251 - args->start_slot = path->slots[0]; 3252 - } else if (!args->nr) { 3253 - args->nr = 1; 3254 - args->start_slot = path->slots[0]; 3255 - } 3256 - nritems = btrfs_header_nritems(path->nodes[0]); 3257 - path->slots[0]++; 3258 - num_bytes = btrfs_file_extent_num_bytes(args->src, fi); 3259 - if (len < num_bytes) { 3260 - /* I _think_ this is ok, envision we write to a 3261 - * preallocated space that is adjacent to a previously 3262 - * written preallocated space that gets merged when we 3263 - * mark this preallocated space written. If we do not 3264 - * have the adjacent extent in cache then when we copy 3265 - * this extent it could end up being larger than our EM 3266 - * thinks it is, which is a-ok, so just set len to 0. 3267 - */ 3268 - len = 0; 3269 - } else { 3270 - len -= num_bytes; 3271 - } 3272 - start = key.offset + num_bytes; 3273 - args->next_offset = start; 3274 - search_start = start; 3275 - 3276 - if (path->slots[0] < nritems) { 3277 - if (len) 3278 - goto next_slot; 3279 - break; 3280 - } 3281 - 3282 - if (args->nr) { 3283 - ret = copy_items(trans, inode, dst_path, args->src, 3284 - args->start_slot, args->nr, 3285 - LOG_INODE_ALL); 3286 - if (ret) 3287 - return ret; 3288 - args->nr = 0; 3289 - btrfs_release_path(path); 3290 - } 3289 + block_len = max(em->block_len, em->orig_block_len); 3290 + if (em->compress_type != BTRFS_COMPRESS_NONE) { 3291 + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3292 + em->block_start, 3293 + &token); 3294 + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3295 + &token); 3296 + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3297 + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3298 + em->block_start - 3299 + extent_offset, &token); 3300 + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3301 + &token); 3302 + } 
else { 3303 + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 3304 + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 3305 + &token); 3291 3306 } 3292 3307 3293 - return 0; 3308 + btrfs_set_token_file_extent_offset(leaf, fi, 3309 + em->start - em->orig_start, 3310 + &token); 3311 + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 3312 + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); 3313 + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 3314 + &token); 3315 + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 3316 + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 3317 + btrfs_mark_buffer_dirty(leaf); 3318 + 3319 + /* 3320 + * Have to check the extent to the right of us to make sure it doesn't 3321 + * fall in our current range. We're ok if the previous extent is in our 3322 + * range since the recovery stuff will run us in key order and thus just 3323 + * drop the part we overwrote. 3324 + */ 3325 + ret = drop_adjacent_extents(trans, log, inode, em, path); 3326 + btrfs_release_path(path); 3327 + path->really_keep_locks = 0; 3328 + if (ret) { 3329 + return ret; 3330 + } 3331 + 3332 + if (skip_csum) 3333 + return 0; 3334 + 3335 + /* block start is already adjusted for the file extent offset. 
*/ 3336 + ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3337 + em->block_start + csum_offset, 3338 + em->block_start + csum_offset + 3339 + csum_len - 1, &ordered_sums, 0); 3340 + if (ret) 3341 + return ret; 3342 + 3343 + while (!list_empty(&ordered_sums)) { 3344 + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3345 + struct btrfs_ordered_sum, 3346 + list); 3347 + if (!ret) 3348 + ret = btrfs_csum_file_blocks(trans, log, sums); 3349 + list_del(&sums->list); 3350 + kfree(sums); 3351 + } 3352 + 3353 + return ret; 3294 3354 } 3295 3355 3296 3356 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3297 3357 struct btrfs_root *root, 3298 3358 struct inode *inode, 3299 - struct btrfs_path *path, 3300 - struct btrfs_path *dst_path) 3359 + struct btrfs_path *path) 3301 3360 { 3302 - struct log_args args; 3303 3361 struct extent_map *em, *n; 3304 3362 struct list_head extents; 3305 3363 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; ··· 3390 3282 int ret = 0; 3391 3283 3392 3284 INIT_LIST_HEAD(&extents); 3393 - 3394 - memset(&args, 0, sizeof(args)); 3395 3285 3396 3286 write_lock(&tree->lock); 3397 3287 test_gen = root->fs_info->last_trans_committed; ··· 3423 3317 3424 3318 write_unlock(&tree->lock); 3425 3319 3426 - /* 3427 - * If the previous EM and the last extent we left off on aren't 3428 - * sequential then we need to copy the items we have and redo 3429 - * our search 3430 - */ 3431 - if (args.nr && em->mod_start != args.next_offset) { 3432 - ret = copy_items(trans, inode, dst_path, args.src, 3433 - args.start_slot, args.nr, 3434 - LOG_INODE_ALL); 3435 - if (ret) { 3436 - free_extent_map(em); 3437 - write_lock(&tree->lock); 3438 - continue; 3439 - } 3440 - btrfs_release_path(path); 3441 - args.nr = 0; 3442 - } 3443 - 3444 - ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); 3320 + ret = log_one_extent(trans, inode, root, em, path); 3445 3321 free_extent_map(em); 3446 3322 
write_lock(&tree->lock); 3447 3323 } 3448 3324 WARN_ON(!list_empty(&extents)); 3449 3325 write_unlock(&tree->lock); 3450 3326 3451 - if (!ret && args.nr) 3452 - ret = copy_items(trans, inode, dst_path, args.src, 3453 - args.start_slot, args.nr, LOG_INODE_ALL); 3454 3327 btrfs_release_path(path); 3455 3328 return ret; 3456 3329 } ··· 3485 3400 3486 3401 3487 3402 /* today the code can only do partial logging of directories */ 3488 - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3403 + if (S_ISDIR(inode->i_mode) || 3404 + (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3405 + &BTRFS_I(inode)->runtime_flags) && 3406 + inode_only == LOG_INODE_EXISTS)) 3489 3407 max_key.type = BTRFS_XATTR_ITEM_KEY; 3490 3408 else 3491 3409 max_key.type = (u8)-1; ··· 3520 3432 } else { 3521 3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3522 3434 &BTRFS_I(inode)->runtime_flags)) { 3435 + clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3436 + &BTRFS_I(inode)->runtime_flags); 3523 3437 ret = btrfs_truncate_inode_items(trans, log, 3524 3438 inode, 0, 0); 3525 - } else { 3526 - fast_search = true; 3439 + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3440 + &BTRFS_I(inode)->runtime_flags)) { 3441 + if (inode_only == LOG_INODE_ALL) 3442 + fast_search = true; 3527 3443 max_key.type = BTRFS_XATTR_ITEM_KEY; 3528 3444 ret = drop_objectid_items(trans, log, path, ino, 3529 - BTRFS_XATTR_ITEM_KEY); 3445 + max_key.type); 3446 + } else { 3447 + if (inode_only == LOG_INODE_ALL) 3448 + fast_search = true; 3449 + ret = log_inode_item(trans, log, dst_path, inode); 3450 + if (ret) { 3451 + err = ret; 3452 + goto out_unlock; 3453 + } 3454 + goto log_extents; 3530 3455 } 3456 + 3531 3457 } 3532 3458 if (ret) { 3533 3459 err = ret; ··· 3620 3518 ins_nr = 0; 3621 3519 } 3622 3520 3521 + log_extents: 3623 3522 if (fast_search) { 3624 - btrfs_release_path(path); 3625 3523 btrfs_release_path(dst_path); 3626 - ret = btrfs_log_changed_extents(trans, root, inode, path, 3627 - dst_path); 3524 + 
ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 3628 3525 if (ret) { 3629 3526 err = ret; 3630 3527 goto out_unlock; ··· 3632 3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3633 3532 struct extent_map *em, *n; 3634 3533 3534 + write_lock(&tree->lock); 3635 3535 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3636 3536 list_del_init(&em->list); 3537 + write_unlock(&tree->lock); 3637 3538 } 3638 3539 3639 3540 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+776 -188
fs/btrfs/volumes.c
··· 25 25 #include <linux/capability.h> 26 26 #include <linux/ratelimit.h> 27 27 #include <linux/kthread.h> 28 - #include <asm/div64.h> 29 28 #include "compat.h" 30 29 #include "ctree.h" 31 30 #include "extent_map.h" ··· 35 36 #include "async-thread.h" 36 37 #include "check-integrity.h" 37 38 #include "rcu-string.h" 39 + #include "math.h" 40 + #include "dev-replace.h" 38 41 39 42 static int init_first_rw_device(struct btrfs_trans_handle *trans, 40 43 struct btrfs_root *root, ··· 70 69 kfree(device); 71 70 } 72 71 kfree(fs_devices); 72 + } 73 + 74 + static void btrfs_kobject_uevent(struct block_device *bdev, 75 + enum kobject_action action) 76 + { 77 + int ret; 78 + 79 + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 80 + if (ret) 81 + pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", 82 + action, 83 + kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 84 + &disk_to_dev(bdev->bd_disk)->kobj); 73 85 } 74 86 75 87 void btrfs_cleanup_fs_uuids(void) ··· 120 106 return fs_devices; 121 107 } 122 108 return NULL; 109 + } 110 + 111 + static int 112 + btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 113 + int flush, struct block_device **bdev, 114 + struct buffer_head **bh) 115 + { 116 + int ret; 117 + 118 + *bdev = blkdev_get_by_path(device_path, flags, holder); 119 + 120 + if (IS_ERR(*bdev)) { 121 + ret = PTR_ERR(*bdev); 122 + printk(KERN_INFO "btrfs: open %s failed\n", device_path); 123 + goto error; 124 + } 125 + 126 + if (flush) 127 + filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 128 + ret = set_blocksize(*bdev, 4096); 129 + if (ret) { 130 + blkdev_put(*bdev, flags); 131 + goto error; 132 + } 133 + invalidate_bdev(*bdev); 134 + *bh = btrfs_read_dev_super(*bdev); 135 + if (!*bh) { 136 + ret = -EINVAL; 137 + blkdev_put(*bdev, flags); 138 + goto error; 139 + } 140 + 141 + return 0; 142 + 143 + error: 144 + *bdev = NULL; 145 + *bh = NULL; 146 + return ret; 123 147 } 124 148 125 149 static void 
requeue_list(struct btrfs_pending_bios *pending_bios, ··· 519 467 return ERR_PTR(-ENOMEM); 520 468 } 521 469 522 - void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 470 + void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, 471 + struct btrfs_fs_devices *fs_devices, int step) 523 472 { 524 473 struct btrfs_device *device, *next; 525 474 ··· 533 480 /* This is the initialized path, it is safe to release the devices. */ 534 481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 535 482 if (device->in_fs_metadata) { 536 - if (!latest_transid || 537 - device->generation > latest_transid) { 483 + if (!device->is_tgtdev_for_dev_replace && 484 + (!latest_transid || 485 + device->generation > latest_transid)) { 538 486 latest_devid = device->devid; 539 487 latest_transid = device->generation; 540 488 latest_bdev = device->bdev; ··· 543 489 continue; 544 490 } 545 491 492 + if (device->devid == BTRFS_DEV_REPLACE_DEVID) { 493 + /* 494 + * In the first step, keep the device which has 495 + * the correct fsid and the devid that is used 496 + * for the dev_replace procedure. 497 + * In the second step, the dev_replace state is 498 + * read from the device tree and it is known 499 + * whether the procedure is really active or 500 + * not, which means whether this device is 501 + * used or whether it should be removed. 
502 + */ 503 + if (step == 0 || device->is_tgtdev_for_dev_replace) { 504 + continue; 505 + } 506 + } 546 507 if (device->bdev) { 547 508 blkdev_put(device->bdev, device->mode); 548 509 device->bdev = NULL; ··· 566 497 if (device->writeable) { 567 498 list_del_init(&device->dev_alloc_list); 568 499 device->writeable = 0; 569 - fs_devices->rw_devices--; 500 + if (!device->is_tgtdev_for_dev_replace) 501 + fs_devices->rw_devices--; 570 502 } 571 503 list_del_init(&device->dev_list); 572 504 fs_devices->num_devices--; ··· 625 555 if (device->bdev) 626 556 fs_devices->open_devices--; 627 557 628 - if (device->writeable) { 558 + if (device->writeable && !device->is_tgtdev_for_dev_replace) { 629 559 list_del_init(&device->dev_alloc_list); 630 560 fs_devices->rw_devices--; 631 561 } ··· 707 637 if (!device->name) 708 638 continue; 709 639 710 - bdev = blkdev_get_by_path(device->name->str, flags, holder); 711 - if (IS_ERR(bdev)) { 712 - printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); 713 - goto error; 714 - } 715 - filemap_write_and_wait(bdev->bd_inode->i_mapping); 716 - invalidate_bdev(bdev); 717 - set_blocksize(bdev, 4096); 718 - 719 - bh = btrfs_read_dev_super(bdev); 720 - if (!bh) 721 - goto error_close; 640 + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 641 + &bdev, &bh); 642 + if (ret) 643 + continue; 722 644 723 645 disk_super = (struct btrfs_super_block *)bh->b_data; 724 646 devid = btrfs_stack_device_id(&disk_super->dev_item); ··· 749 687 fs_devices->rotating = 1; 750 688 751 689 fs_devices->open_devices++; 752 - if (device->writeable) { 690 + if (device->writeable && !device->is_tgtdev_for_dev_replace) { 753 691 fs_devices->rw_devices++; 754 692 list_add(&device->dev_alloc_list, 755 693 &fs_devices->alloc_list); ··· 759 697 760 698 error_brelse: 761 699 brelse(bh); 762 - error_close: 763 700 blkdev_put(bdev, flags); 764 - error: 765 701 continue; 766 702 } 767 703 if (fs_devices->open_devices == 0) { ··· 804 744 u64 
total_devices; 805 745 806 746 flags |= FMODE_EXCL; 807 - bdev = blkdev_get_by_path(path, flags, holder); 808 - 809 - if (IS_ERR(bdev)) { 810 - ret = PTR_ERR(bdev); 811 - goto error; 812 - } 813 - 814 747 mutex_lock(&uuid_mutex); 815 - ret = set_blocksize(bdev, 4096); 748 + ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); 816 749 if (ret) 817 - goto error_close; 818 - bh = btrfs_read_dev_super(bdev); 819 - if (!bh) { 820 - ret = -EINVAL; 821 - goto error_close; 822 - } 750 + goto error; 823 751 disk_super = (struct btrfs_super_block *)bh->b_data; 824 752 devid = btrfs_stack_device_id(&disk_super->dev_item); 825 753 transid = btrfs_super_generation(disk_super); 826 754 total_devices = btrfs_super_num_devices(disk_super); 827 - if (disk_super->label[0]) 755 + if (disk_super->label[0]) { 756 + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 757 + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 828 758 printk(KERN_INFO "device label %s ", disk_super->label); 829 - else 759 + } else { 830 760 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 761 + } 831 762 printk(KERN_CONT "devid %llu transid %llu %s\n", 832 763 (unsigned long long)devid, (unsigned long long)transid, path); 833 764 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 834 765 if (!ret && fs_devices_ret) 835 766 (*fs_devices_ret)->total_devices = total_devices; 836 767 brelse(bh); 837 - error_close: 838 - mutex_unlock(&uuid_mutex); 839 768 blkdev_put(bdev, flags); 840 769 error: 770 + mutex_unlock(&uuid_mutex); 841 771 return ret; 842 772 } 843 773 ··· 846 796 847 797 *length = 0; 848 798 849 - if (start >= device->total_bytes) 799 + if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) 850 800 return 0; 851 801 852 802 path = btrfs_alloc_path(); ··· 963 913 max_hole_size = 0; 964 914 hole_size = 0; 965 915 966 - if (search_start >= search_end) { 916 + if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 967 917 ret = -ENOSPC; 968 918 goto 
error; 969 919 } ··· 1146 1096 struct btrfs_key key; 1147 1097 1148 1098 WARN_ON(!device->in_fs_metadata); 1099 + WARN_ON(device->is_tgtdev_for_dev_replace); 1149 1100 path = btrfs_alloc_path(); 1150 1101 if (!path) 1151 1102 return -ENOMEM; ··· 1381 1330 root->fs_info->avail_system_alloc_bits | 1382 1331 root->fs_info->avail_metadata_alloc_bits; 1383 1332 1384 - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1385 - root->fs_info->fs_devices->num_devices <= 4) { 1333 + num_devices = root->fs_info->fs_devices->num_devices; 1334 + btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1335 + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1336 + WARN_ON(num_devices < 1); 1337 + num_devices--; 1338 + } 1339 + btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1340 + 1341 + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1386 1342 printk(KERN_ERR "btrfs: unable to go below four devices " 1387 1343 "on raid10\n"); 1388 1344 ret = -EINVAL; 1389 1345 goto out; 1390 1346 } 1391 1347 1392 - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1393 - root->fs_info->fs_devices->num_devices <= 2) { 1348 + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1394 1349 printk(KERN_ERR "btrfs: unable to go below two " 1395 1350 "devices on raid1\n"); 1396 1351 ret = -EINVAL; ··· 1414 1357 * is held. 
1415 1358 */ 1416 1359 list_for_each_entry(tmp, devices, dev_list) { 1417 - if (tmp->in_fs_metadata && !tmp->bdev) { 1360 + if (tmp->in_fs_metadata && 1361 + !tmp->is_tgtdev_for_dev_replace && 1362 + !tmp->bdev) { 1418 1363 device = tmp; 1419 1364 break; 1420 1365 } ··· 1430 1371 goto out; 1431 1372 } 1432 1373 } else { 1433 - bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1434 - root->fs_info->bdev_holder); 1435 - if (IS_ERR(bdev)) { 1436 - ret = PTR_ERR(bdev); 1374 + ret = btrfs_get_bdev_and_sb(device_path, 1375 + FMODE_READ | FMODE_EXCL, 1376 + root->fs_info->bdev_holder, 0, 1377 + &bdev, &bh); 1378 + if (ret) 1437 1379 goto out; 1438 - } 1439 - 1440 - set_blocksize(bdev, 4096); 1441 - invalidate_bdev(bdev); 1442 - bh = btrfs_read_dev_super(bdev); 1443 - if (!bh) { 1444 - ret = -EINVAL; 1445 - goto error_close; 1446 - } 1447 1380 disk_super = (struct btrfs_super_block *)bh->b_data; 1448 1381 devid = btrfs_stack_device_id(&disk_super->dev_item); 1449 1382 dev_uuid = disk_super->dev_item.uuid; 1450 - device = btrfs_find_device(root, devid, dev_uuid, 1383 + device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1451 1384 disk_super->fsid); 1452 1385 if (!device) { 1453 1386 ret = -ENOENT; 1454 1387 goto error_brelse; 1455 1388 } 1389 + } 1390 + 1391 + if (device->is_tgtdev_for_dev_replace) { 1392 + pr_err("btrfs: unable to remove the dev_replace target dev\n"); 1393 + ret = -EINVAL; 1394 + goto error_brelse; 1456 1395 } 1457 1396 1458 1397 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { ··· 1472 1415 if (ret) 1473 1416 goto error_undo; 1474 1417 1418 + /* 1419 + * TODO: the superblock still includes this device in its num_devices 1420 + * counter although write_all_supers() is not locked out. This 1421 + * could give a filesystem state which requires a degraded mount. 
1422 + */ 1475 1423 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1476 1424 if (ret) 1477 1425 goto error_undo; ··· 1487 1425 spin_unlock(&root->fs_info->free_chunk_lock); 1488 1426 1489 1427 device->in_fs_metadata = 0; 1490 - btrfs_scrub_cancel_dev(root, device); 1428 + btrfs_scrub_cancel_dev(root->fs_info, device); 1491 1429 1492 1430 /* 1493 1431 * the device list mutex makes sure that we don't change ··· 1544 1482 * at this point, the device is zero sized. We want to 1545 1483 * remove it from the devices list and zero out the old super 1546 1484 */ 1547 - if (clear_super) { 1485 + if (clear_super && disk_super) { 1548 1486 /* make sure this device isn't detected as part of 1549 1487 * the FS anymore 1550 1488 */ ··· 1555 1493 1556 1494 ret = 0; 1557 1495 1496 + /* Notify udev that device has changed */ 1497 + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 1498 + 1558 1499 error_brelse: 1559 1500 brelse(bh); 1560 - error_close: 1561 1501 if (bdev) 1562 1502 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1563 1503 out: ··· 1574 1510 root->fs_info->fs_devices->rw_devices++; 1575 1511 } 1576 1512 goto error_brelse; 1513 + } 1514 + 1515 + void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1516 + struct btrfs_device *srcdev) 1517 + { 1518 + WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 1519 + list_del_rcu(&srcdev->dev_list); 1520 + list_del_rcu(&srcdev->dev_alloc_list); 1521 + fs_info->fs_devices->num_devices--; 1522 + if (srcdev->missing) { 1523 + fs_info->fs_devices->missing_devices--; 1524 + fs_info->fs_devices->rw_devices++; 1525 + } 1526 + if (srcdev->can_discard) 1527 + fs_info->fs_devices->num_can_discard--; 1528 + if (srcdev->bdev) 1529 + fs_info->fs_devices->open_devices--; 1530 + 1531 + call_rcu(&srcdev->rcu, free_device); 1532 + } 1533 + 1534 + void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 1535 + struct btrfs_device *tgtdev) 1536 + { 1537 + struct btrfs_device *next_device; 1538 + 1539 + 
WARN_ON(!tgtdev); 1540 + mutex_lock(&fs_info->fs_devices->device_list_mutex); 1541 + if (tgtdev->bdev) { 1542 + btrfs_scratch_superblock(tgtdev); 1543 + fs_info->fs_devices->open_devices--; 1544 + } 1545 + fs_info->fs_devices->num_devices--; 1546 + if (tgtdev->can_discard) 1547 + fs_info->fs_devices->num_can_discard++; 1548 + 1549 + next_device = list_entry(fs_info->fs_devices->devices.next, 1550 + struct btrfs_device, dev_list); 1551 + if (tgtdev->bdev == fs_info->sb->s_bdev) 1552 + fs_info->sb->s_bdev = next_device->bdev; 1553 + if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) 1554 + fs_info->fs_devices->latest_bdev = next_device->bdev; 1555 + list_del_rcu(&tgtdev->dev_list); 1556 + 1557 + call_rcu(&tgtdev->rcu, free_device); 1558 + 1559 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1560 + } 1561 + 1562 + int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1563 + struct btrfs_device **device) 1564 + { 1565 + int ret = 0; 1566 + struct btrfs_super_block *disk_super; 1567 + u64 devid; 1568 + u8 *dev_uuid; 1569 + struct block_device *bdev; 1570 + struct buffer_head *bh; 1571 + 1572 + *device = NULL; 1573 + ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 1574 + root->fs_info->bdev_holder, 0, &bdev, &bh); 1575 + if (ret) 1576 + return ret; 1577 + disk_super = (struct btrfs_super_block *)bh->b_data; 1578 + devid = btrfs_stack_device_id(&disk_super->dev_item); 1579 + dev_uuid = disk_super->dev_item.uuid; 1580 + *device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1581 + disk_super->fsid); 1582 + brelse(bh); 1583 + if (!*device) 1584 + ret = -ENOENT; 1585 + blkdev_put(bdev, FMODE_READ); 1586 + return ret; 1587 + } 1588 + 1589 + int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 1590 + char *device_path, 1591 + struct btrfs_device **device) 1592 + { 1593 + *device = NULL; 1594 + if (strcmp(device_path, "missing") == 0) { 1595 + struct list_head *devices; 1596 + struct btrfs_device *tmp; 1597 + 1598 + devices 
= &root->fs_info->fs_devices->devices; 1599 + /* 1600 + * It is safe to read the devices since the volume_mutex 1601 + * is held by the caller. 1602 + */ 1603 + list_for_each_entry(tmp, devices, dev_list) { 1604 + if (tmp->in_fs_metadata && !tmp->bdev) { 1605 + *device = tmp; 1606 + break; 1607 + } 1608 + } 1609 + 1610 + if (!*device) { 1611 + pr_err("btrfs: no missing device found\n"); 1612 + return -ENOENT; 1613 + } 1614 + 1615 + return 0; 1616 + } else { 1617 + return btrfs_find_device_by_path(root, device_path, device); 1618 + } 1577 1619 } 1578 1620 1579 1621 /* ··· 1800 1630 read_extent_buffer(leaf, fs_uuid, 1801 1631 (unsigned long)btrfs_device_fsid(dev_item), 1802 1632 BTRFS_UUID_SIZE); 1803 - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1633 + device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1634 + fs_uuid); 1804 1635 BUG_ON(!device); /* Logic error */ 1805 1636 1806 1637 if (device->fs_devices->seeding) { ··· 1849 1678 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1850 1679 1851 1680 devices = &root->fs_info->fs_devices->devices; 1852 - /* 1853 - * we have the volume lock, so we don't need the extra 1854 - * device list mutex while reading the list here. 
1855 - */ 1681 + 1682 + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1856 1683 list_for_each_entry(device, devices, dev_list) { 1857 1684 if (device->bdev == bdev) { 1858 1685 ret = -EEXIST; 1686 + mutex_unlock( 1687 + &root->fs_info->fs_devices->device_list_mutex); 1859 1688 goto error; 1860 1689 } 1861 1690 } 1691 + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1862 1692 1863 1693 device = kzalloc(sizeof(*device), GFP_NOFS); 1864 1694 if (!device) { ··· 1909 1737 device->dev_root = root->fs_info->dev_root; 1910 1738 device->bdev = bdev; 1911 1739 device->in_fs_metadata = 1; 1740 + device->is_tgtdev_for_dev_replace = 0; 1912 1741 device->mode = FMODE_EXCL; 1913 1742 set_blocksize(device->bdev, 4096); 1914 1743 ··· 2017 1844 return ret; 2018 1845 } 2019 1846 1847 + int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 1848 + struct btrfs_device **device_out) 1849 + { 1850 + struct request_queue *q; 1851 + struct btrfs_device *device; 1852 + struct block_device *bdev; 1853 + struct btrfs_fs_info *fs_info = root->fs_info; 1854 + struct list_head *devices; 1855 + struct rcu_string *name; 1856 + int ret = 0; 1857 + 1858 + *device_out = NULL; 1859 + if (fs_info->fs_devices->seeding) 1860 + return -EINVAL; 1861 + 1862 + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1863 + fs_info->bdev_holder); 1864 + if (IS_ERR(bdev)) 1865 + return PTR_ERR(bdev); 1866 + 1867 + filemap_write_and_wait(bdev->bd_inode->i_mapping); 1868 + 1869 + devices = &fs_info->fs_devices->devices; 1870 + list_for_each_entry(device, devices, dev_list) { 1871 + if (device->bdev == bdev) { 1872 + ret = -EEXIST; 1873 + goto error; 1874 + } 1875 + } 1876 + 1877 + device = kzalloc(sizeof(*device), GFP_NOFS); 1878 + if (!device) { 1879 + ret = -ENOMEM; 1880 + goto error; 1881 + } 1882 + 1883 + name = rcu_string_strdup(device_path, GFP_NOFS); 1884 + if (!name) { 1885 + kfree(device); 1886 + ret = -ENOMEM; 1887 + goto error; 1888 + } 1889 + 
rcu_assign_pointer(device->name, name); 1890 + 1891 + q = bdev_get_queue(bdev); 1892 + if (blk_queue_discard(q)) 1893 + device->can_discard = 1; 1894 + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1895 + device->writeable = 1; 1896 + device->work.func = pending_bios_fn; 1897 + generate_random_uuid(device->uuid); 1898 + device->devid = BTRFS_DEV_REPLACE_DEVID; 1899 + spin_lock_init(&device->io_lock); 1900 + device->generation = 0; 1901 + device->io_width = root->sectorsize; 1902 + device->io_align = root->sectorsize; 1903 + device->sector_size = root->sectorsize; 1904 + device->total_bytes = i_size_read(bdev->bd_inode); 1905 + device->disk_total_bytes = device->total_bytes; 1906 + device->dev_root = fs_info->dev_root; 1907 + device->bdev = bdev; 1908 + device->in_fs_metadata = 1; 1909 + device->is_tgtdev_for_dev_replace = 1; 1910 + device->mode = FMODE_EXCL; 1911 + set_blocksize(device->bdev, 4096); 1912 + device->fs_devices = fs_info->fs_devices; 1913 + list_add(&device->dev_list, &fs_info->fs_devices->devices); 1914 + fs_info->fs_devices->num_devices++; 1915 + fs_info->fs_devices->open_devices++; 1916 + if (device->can_discard) 1917 + fs_info->fs_devices->num_can_discard++; 1918 + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1919 + 1920 + *device_out = device; 1921 + return ret; 1922 + 1923 + error: 1924 + blkdev_put(bdev, FMODE_EXCL); 1925 + return ret; 1926 + } 1927 + 1928 + void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 1929 + struct btrfs_device *tgtdev) 1930 + { 1931 + WARN_ON(fs_info->fs_devices->rw_devices == 0); 1932 + tgtdev->io_width = fs_info->dev_root->sectorsize; 1933 + tgtdev->io_align = fs_info->dev_root->sectorsize; 1934 + tgtdev->sector_size = fs_info->dev_root->sectorsize; 1935 + tgtdev->dev_root = fs_info->dev_root; 1936 + tgtdev->in_fs_metadata = 1; 1937 + } 1938 + 2020 1939 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2021 1940 struct btrfs_device *device) 
2022 1941 { ··· 2165 1900 2166 1901 if (!device->writeable) 2167 1902 return -EACCES; 2168 - if (new_size <= device->total_bytes) 1903 + if (new_size <= device->total_bytes || 1904 + device->is_tgtdev_for_dev_replace) 2169 1905 return -EINVAL; 2170 1906 2171 1907 btrfs_set_super_total_bytes(super_copy, old_total + diff); ··· 2604 2338 return 1; 2605 2339 } 2606 2340 2607 - static u64 div_factor_fine(u64 num, int factor) 2608 - { 2609 - if (factor <= 0) 2610 - return 0; 2611 - if (factor >= 100) 2612 - return num; 2613 - 2614 - num *= factor; 2615 - do_div(num, 100); 2616 - return num; 2617 - } 2618 - 2619 2341 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2620 2342 struct btrfs_balance_args *bargs) 2621 2343 { ··· 2768 2514 return 1; 2769 2515 } 2770 2516 2771 - static u64 div_factor(u64 num, int factor) 2772 - { 2773 - if (factor == 10) 2774 - return num; 2775 - num *= factor; 2776 - do_div(num, 10); 2777 - return num; 2778 - } 2779 - 2780 2517 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2781 2518 { 2782 2519 struct btrfs_balance_control *bctl = fs_info->balance_ctl; ··· 2795 2550 size_to_free = div_factor(old_size, 1); 2796 2551 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2797 2552 if (!device->writeable || 2798 - device->total_bytes - device->bytes_used > size_to_free) 2553 + device->total_bytes - device->bytes_used > size_to_free || 2554 + device->is_tgtdev_for_dev_replace) 2799 2555 continue; 2800 2556 2801 2557 ret = btrfs_shrink_device(device, old_size - size_to_free); ··· 2974 2728 u64 allowed; 2975 2729 int mixed = 0; 2976 2730 int ret; 2731 + u64 num_devices; 2977 2732 2978 2733 if (btrfs_fs_closing(fs_info) || 2979 2734 atomic_read(&fs_info->balance_pause_req) || ··· 3003 2756 } 3004 2757 } 3005 2758 2759 + num_devices = fs_info->fs_devices->num_devices; 2760 + btrfs_dev_replace_lock(&fs_info->dev_replace); 2761 + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2762 + 
BUG_ON(num_devices < 1); 2763 + num_devices--; 2764 + } 2765 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 3006 2766 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3007 - if (fs_info->fs_devices->num_devices == 1) 2767 + if (num_devices == 1) 3008 2768 allowed |= BTRFS_BLOCK_GROUP_DUP; 3009 - else if (fs_info->fs_devices->num_devices < 4) 2769 + else if (num_devices < 4) 3010 2770 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3011 2771 else 3012 2772 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | ··· 3156 2902 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3157 2903 } 3158 2904 2905 + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3159 2906 mutex_unlock(&fs_info->balance_mutex); 3160 2907 mutex_unlock(&fs_info->volume_mutex); 3161 2908 ··· 3179 2924 return 0; 3180 2925 } 3181 2926 2927 + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); 3182 2928 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3183 2929 if (IS_ERR(tsk)) 3184 2930 return PTR_ERR(tsk); ··· 3336 3080 u64 old_size = device->total_bytes; 3337 3081 u64 diff = device->total_bytes - new_size; 3338 3082 3339 - if (new_size >= device->total_bytes) 3083 + if (device->is_tgtdev_for_dev_replace) 3340 3084 return -EINVAL; 3341 3085 3342 3086 path = btrfs_alloc_path(); ··· 3491 3235 return 0; 3492 3236 } 3493 3237 3238 + struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3239 + { 2, 1, 0, 4, 2, 2 /* raid10 */ }, 3240 + { 1, 1, 2, 2, 2, 2 /* raid1 */ }, 3241 + { 1, 2, 1, 1, 1, 2 /* dup */ }, 3242 + { 1, 1, 0, 2, 1, 1 /* raid0 */ }, 3243 + { 1, 1, 0, 1, 1, 1 /* single */ }, 3244 + }; 3245 + 3494 3246 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3495 3247 struct btrfs_root *extent_root, 3496 3248 struct map_lookup **map_ret, ··· 3528 3264 int ndevs; 3529 3265 int i; 3530 3266 int j; 3267 + int index; 3531 3268 3532 3269 BUG_ON(!alloc_profile_is_valid(type, 0)); 3533 3270 3534 3271 if 
(list_empty(&fs_devices->alloc_list)) 3535 3272 return -ENOSPC; 3536 3273 3537 - sub_stripes = 1; 3538 - dev_stripes = 1; 3539 - devs_increment = 1; 3540 - ncopies = 1; 3541 - devs_max = 0; /* 0 == as many as possible */ 3542 - devs_min = 1; 3274 + index = __get_raid_index(type); 3543 3275 3544 - /* 3545 - * define the properties of each RAID type. 3546 - * FIXME: move this to a global table and use it in all RAID 3547 - * calculation code 3548 - */ 3549 - if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3550 - dev_stripes = 2; 3551 - ncopies = 2; 3552 - devs_max = 1; 3553 - } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 3554 - devs_min = 2; 3555 - } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 3556 - devs_increment = 2; 3557 - ncopies = 2; 3558 - devs_max = 2; 3559 - devs_min = 2; 3560 - } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 3561 - sub_stripes = 2; 3562 - devs_increment = 2; 3563 - ncopies = 2; 3564 - devs_min = 4; 3565 - } else { 3566 - devs_max = 1; 3567 - } 3276 + sub_stripes = btrfs_raid_array[index].sub_stripes; 3277 + dev_stripes = btrfs_raid_array[index].dev_stripes; 3278 + devs_max = btrfs_raid_array[index].devs_max; 3279 + devs_min = btrfs_raid_array[index].devs_min; 3280 + devs_increment = btrfs_raid_array[index].devs_increment; 3281 + ncopies = btrfs_raid_array[index].ncopies; 3568 3282 3569 3283 if (type & BTRFS_BLOCK_GROUP_DATA) { 3570 3284 max_stripe_size = 1024 * 1024 * 1024; ··· 3589 3347 cur = cur->next; 3590 3348 3591 3349 if (!device->writeable) { 3592 - printk(KERN_ERR 3350 + WARN(1, KERN_ERR 3593 3351 "btrfs: read-only device in alloc_list\n"); 3594 - WARN_ON(1); 3595 3352 continue; 3596 3353 } 3597 3354 3598 - if (!device->in_fs_metadata) 3355 + if (!device->in_fs_metadata || 3356 + device->is_tgtdev_for_dev_replace) 3599 3357 continue; 3600 3358 3601 3359 if (device->total_bytes > device->bytes_used) ··· 3624 3382 devices_info[ndevs].total_avail = total_avail; 3625 3383 devices_info[ndevs].dev = device; 3626 3384 ++ndevs; 3385 + WARN_ON(ndevs 
> fs_devices->rw_devices); 3627 3386 } 3628 3387 3629 3388 /* ··· 3983 3740 } 3984 3741 } 3985 3742 3986 - int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3743 + int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 3987 3744 { 3745 + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 3988 3746 struct extent_map *em; 3989 3747 struct map_lookup *map; 3990 3748 struct extent_map_tree *em_tree = &map_tree->map_tree; ··· 4005 3761 else 4006 3762 ret = 1; 4007 3763 free_extent_map(em); 3764 + 3765 + btrfs_dev_replace_lock(&fs_info->dev_replace); 3766 + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 3767 + ret++; 3768 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 3769 + 4008 3770 return ret; 4009 3771 } 4010 3772 4011 - static int find_live_mirror(struct map_lookup *map, int first, int num, 4012 - int optimal) 3773 + static int find_live_mirror(struct btrfs_fs_info *fs_info, 3774 + struct map_lookup *map, int first, int num, 3775 + int optimal, int dev_replace_is_ongoing) 4013 3776 { 4014 3777 int i; 4015 - if (map->stripes[optimal].dev->bdev) 4016 - return optimal; 4017 - for (i = first; i < first + num; i++) { 4018 - if (map->stripes[i].dev->bdev) 4019 - return i; 3778 + int tolerance; 3779 + struct btrfs_device *srcdev; 3780 + 3781 + if (dev_replace_is_ongoing && 3782 + fs_info->dev_replace.cont_reading_from_srcdev_mode == 3783 + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 3784 + srcdev = fs_info->dev_replace.srcdev; 3785 + else 3786 + srcdev = NULL; 3787 + 3788 + /* 3789 + * try to avoid the drive that is the source drive for a 3790 + * dev-replace procedure, only choose it if no other non-missing 3791 + * mirror is available 3792 + */ 3793 + for (tolerance = 0; tolerance < 2; tolerance++) { 3794 + if (map->stripes[optimal].dev->bdev && 3795 + (tolerance || map->stripes[optimal].dev != srcdev)) 3796 + return optimal; 3797 + for (i = first; i < first + num; i++) { 3798 + if 
(map->stripes[i].dev->bdev && 3799 + (tolerance || map->stripes[i].dev != srcdev)) 3800 + return i; 3801 + } 4020 3802 } 3803 + 4021 3804 /* we couldn't find one that doesn't fail. Just return something 4022 3805 * and the io error handling code will clean up eventually 4023 3806 */ 4024 3807 return optimal; 4025 3808 } 4026 3809 4027 - static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3810 + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4028 3811 u64 logical, u64 *length, 4029 3812 struct btrfs_bio **bbio_ret, 4030 3813 int mirror_num) 4031 3814 { 4032 3815 struct extent_map *em; 4033 3816 struct map_lookup *map; 3817 + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4034 3818 struct extent_map_tree *em_tree = &map_tree->map_tree; 4035 3819 u64 offset; 4036 3820 u64 stripe_offset; ··· 4072 3800 int num_stripes; 4073 3801 int max_errors = 0; 4074 3802 struct btrfs_bio *bbio = NULL; 3803 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 3804 + int dev_replace_is_ongoing = 0; 3805 + int num_alloc_stripes; 3806 + int patch_the_first_stripe_for_dev_replace = 0; 3807 + u64 physical_to_patch_in_first_stripe = 0; 4075 3808 4076 3809 read_lock(&em_tree->lock); 4077 3810 em = lookup_extent_mapping(em_tree, logical, *length); ··· 4092 3815 BUG_ON(em->start > logical || em->start + em->len < logical); 4093 3816 map = (struct map_lookup *)em->bdev; 4094 3817 offset = logical - em->start; 4095 - 4096 - if (mirror_num > map->num_stripes) 4097 - mirror_num = 0; 4098 3818 4099 3819 stripe_nr = offset; 4100 3820 /* ··· 4119 3845 if (!bbio_ret) 4120 3846 goto out; 4121 3847 3848 + btrfs_dev_replace_lock(dev_replace); 3849 + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 3850 + if (!dev_replace_is_ongoing) 3851 + btrfs_dev_replace_unlock(dev_replace); 3852 + 3853 + if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 3854 + !(rw & (REQ_WRITE | REQ_DISCARD | 
REQ_GET_READ_MIRRORS)) && 3855 + dev_replace->tgtdev != NULL) { 3856 + /* 3857 + * in dev-replace case, for repair case (that's the only 3858 + * case where the mirror is selected explicitly when 3859 + * calling btrfs_map_block), blocks left of the left cursor 3860 + * can also be read from the target drive. 3861 + * For REQ_GET_READ_MIRRORS, the target drive is added as 3862 + * the last one to the array of stripes. For READ, it also 3863 + * needs to be supported using the same mirror number. 3864 + * If the requested block is not left of the left cursor, 3865 + * EIO is returned. This can happen because btrfs_num_copies() 3866 + * returns one more in the dev-replace case. 3867 + */ 3868 + u64 tmp_length = *length; 3869 + struct btrfs_bio *tmp_bbio = NULL; 3870 + int tmp_num_stripes; 3871 + u64 srcdev_devid = dev_replace->srcdev->devid; 3872 + int index_srcdev = 0; 3873 + int found = 0; 3874 + u64 physical_of_found = 0; 3875 + 3876 + ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 3877 + logical, &tmp_length, &tmp_bbio, 0); 3878 + if (ret) { 3879 + WARN_ON(tmp_bbio != NULL); 3880 + goto out; 3881 + } 3882 + 3883 + tmp_num_stripes = tmp_bbio->num_stripes; 3884 + if (mirror_num > tmp_num_stripes) { 3885 + /* 3886 + * REQ_GET_READ_MIRRORS does not contain this 3887 + * mirror, that means that the requested area 3888 + * is not left of the left cursor 3889 + */ 3890 + ret = -EIO; 3891 + kfree(tmp_bbio); 3892 + goto out; 3893 + } 3894 + 3895 + /* 3896 + * process the rest of the function using the mirror_num 3897 + * of the source drive. Therefore look it up first. 3898 + * At the end, patch the device pointer to the one of the 3899 + * target drive. 
3900 + */ 3901 + for (i = 0; i < tmp_num_stripes; i++) { 3902 + if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { 3903 + /* 3904 + * In case of DUP, in order to keep it 3905 + * simple, only add the mirror with the 3906 + * lowest physical address 3907 + */ 3908 + if (found && 3909 + physical_of_found <= 3910 + tmp_bbio->stripes[i].physical) 3911 + continue; 3912 + index_srcdev = i; 3913 + found = 1; 3914 + physical_of_found = 3915 + tmp_bbio->stripes[i].physical; 3916 + } 3917 + } 3918 + 3919 + if (found) { 3920 + mirror_num = index_srcdev + 1; 3921 + patch_the_first_stripe_for_dev_replace = 1; 3922 + physical_to_patch_in_first_stripe = physical_of_found; 3923 + } else { 3924 + WARN_ON(1); 3925 + ret = -EIO; 3926 + kfree(tmp_bbio); 3927 + goto out; 3928 + } 3929 + 3930 + kfree(tmp_bbio); 3931 + } else if (mirror_num > map->num_stripes) { 3932 + mirror_num = 0; 3933 + } 3934 + 4122 3935 num_stripes = 1; 4123 3936 stripe_index = 0; 4124 3937 stripe_nr_orig = stripe_nr; ··· 4220 3859 stripe_nr_end - stripe_nr_orig); 4221 3860 stripe_index = do_div(stripe_nr, map->num_stripes); 4222 3861 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4223 - if (rw & (REQ_WRITE | REQ_DISCARD)) 3862 + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 4224 3863 num_stripes = map->num_stripes; 4225 3864 else if (mirror_num) 4226 3865 stripe_index = mirror_num - 1; 4227 3866 else { 4228 - stripe_index = find_live_mirror(map, 0, 3867 + stripe_index = find_live_mirror(fs_info, map, 0, 4229 3868 map->num_stripes, 4230 - current->pid % map->num_stripes); 3869 + current->pid % map->num_stripes, 3870 + dev_replace_is_ongoing); 4231 3871 mirror_num = stripe_index + 1; 4232 3872 } 4233 3873 4234 3874 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4235 - if (rw & (REQ_WRITE | REQ_DISCARD)) { 3875 + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { 4236 3876 num_stripes = map->num_stripes; 4237 3877 } else if (mirror_num) { 4238 3878 stripe_index = mirror_num - 1; ··· 
4247 3885 stripe_index = do_div(stripe_nr, factor); 4248 3886 stripe_index *= map->sub_stripes; 4249 3887 4250 - if (rw & REQ_WRITE) 3888 + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 4251 3889 num_stripes = map->sub_stripes; 4252 3890 else if (rw & REQ_DISCARD) 4253 3891 num_stripes = min_t(u64, map->sub_stripes * ··· 4257 3895 stripe_index += mirror_num - 1; 4258 3896 else { 4259 3897 int old_stripe_index = stripe_index; 4260 - stripe_index = find_live_mirror(map, stripe_index, 3898 + stripe_index = find_live_mirror(fs_info, map, 3899 + stripe_index, 4261 3900 map->sub_stripes, stripe_index + 4262 - current->pid % map->sub_stripes); 3901 + current->pid % map->sub_stripes, 3902 + dev_replace_is_ongoing); 4263 3903 mirror_num = stripe_index - old_stripe_index + 1; 4264 3904 } 4265 3905 } else { ··· 4275 3911 } 4276 3912 BUG_ON(stripe_index >= map->num_stripes); 4277 3913 4278 - bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 3914 + num_alloc_stripes = num_stripes; 3915 + if (dev_replace_is_ongoing) { 3916 + if (rw & (REQ_WRITE | REQ_DISCARD)) 3917 + num_alloc_stripes <<= 1; 3918 + if (rw & REQ_GET_READ_MIRRORS) 3919 + num_alloc_stripes++; 3920 + } 3921 + bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); 4279 3922 if (!bbio) { 4280 3923 ret = -ENOMEM; 4281 3924 goto out; ··· 4369 3998 } 4370 3999 } 4371 4000 4372 - if (rw & REQ_WRITE) { 4001 + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4373 4002 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4374 4003 BTRFS_BLOCK_GROUP_RAID10 | 4375 4004 BTRFS_BLOCK_GROUP_DUP)) { ··· 4377 4006 } 4378 4007 } 4379 4008 4009 + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 4010 + dev_replace->tgtdev != NULL) { 4011 + int index_where_to_add; 4012 + u64 srcdev_devid = dev_replace->srcdev->devid; 4013 + 4014 + /* 4015 + * duplicate the write operations while the dev replace 4016 + * procedure is running. 
Since the copying of the old disk 4017 + * to the new disk takes place at run time while the 4018 + * filesystem is mounted writable, the regular write 4019 + * operations to the old disk have to be duplicated to go 4020 + * to the new disk as well. 4021 + * Note that device->missing is handled by the caller, and 4022 + * that the write to the old disk is already set up in the 4023 + * stripes array. 4024 + */ 4025 + index_where_to_add = num_stripes; 4026 + for (i = 0; i < num_stripes; i++) { 4027 + if (bbio->stripes[i].dev->devid == srcdev_devid) { 4028 + /* write to new disk, too */ 4029 + struct btrfs_bio_stripe *new = 4030 + bbio->stripes + index_where_to_add; 4031 + struct btrfs_bio_stripe *old = 4032 + bbio->stripes + i; 4033 + 4034 + new->physical = old->physical; 4035 + new->length = old->length; 4036 + new->dev = dev_replace->tgtdev; 4037 + index_where_to_add++; 4038 + max_errors++; 4039 + } 4040 + } 4041 + num_stripes = index_where_to_add; 4042 + } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && 4043 + dev_replace->tgtdev != NULL) { 4044 + u64 srcdev_devid = dev_replace->srcdev->devid; 4045 + int index_srcdev = 0; 4046 + int found = 0; 4047 + u64 physical_of_found = 0; 4048 + 4049 + /* 4050 + * During the dev-replace procedure, the target drive can 4051 + * also be used to read data in case it is needed to repair 4052 + * a corrupt block elsewhere. This is possible if the 4053 + * requested area is left of the left cursor. In this area, 4054 + * the target drive is a full copy of the source drive. 
4055 + */ 4056 + for (i = 0; i < num_stripes; i++) { 4057 + if (bbio->stripes[i].dev->devid == srcdev_devid) { 4058 + /* 4059 + * In case of DUP, in order to keep it 4060 + * simple, only add the mirror with the 4061 + * lowest physical address 4062 + */ 4063 + if (found && 4064 + physical_of_found <= 4065 + bbio->stripes[i].physical) 4066 + continue; 4067 + index_srcdev = i; 4068 + found = 1; 4069 + physical_of_found = bbio->stripes[i].physical; 4070 + } 4071 + } 4072 + if (found) { 4073 + u64 length = map->stripe_len; 4074 + 4075 + if (physical_of_found + length <= 4076 + dev_replace->cursor_left) { 4077 + struct btrfs_bio_stripe *tgtdev_stripe = 4078 + bbio->stripes + num_stripes; 4079 + 4080 + tgtdev_stripe->physical = physical_of_found; 4081 + tgtdev_stripe->length = 4082 + bbio->stripes[index_srcdev].length; 4083 + tgtdev_stripe->dev = dev_replace->tgtdev; 4084 + 4085 + num_stripes++; 4086 + } 4087 + } 4088 + } 4089 + 4380 4090 *bbio_ret = bbio; 4381 4091 bbio->num_stripes = num_stripes; 4382 4092 bbio->max_errors = max_errors; 4383 4093 bbio->mirror_num = mirror_num; 4094 + 4095 + /* 4096 + * this is the case that REQ_READ && dev_replace_is_ongoing && 4097 + * mirror_num == num_stripes + 1 && dev_replace target drive is 4098 + * available as a mirror 4099 + */ 4100 + if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 4101 + WARN_ON(num_stripes > 1); 4102 + bbio->stripes[0].dev = dev_replace->tgtdev; 4103 + bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4104 + bbio->mirror_num = map->num_stripes + 1; 4105 + } 4384 4106 out: 4107 + if (dev_replace_is_ongoing) 4108 + btrfs_dev_replace_unlock(dev_replace); 4385 4109 free_extent_map(em); 4386 4110 return ret; 4387 4111 } 4388 4112 4389 - int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4113 + int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4390 4114 u64 logical, u64 *length, 4391 4115 struct btrfs_bio **bbio_ret, int mirror_num) 4392 4116 { 4393 - return 
__btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 4117 + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4394 4118 mirror_num); 4395 4119 } 4396 4120 ··· 4704 4238 &device->work); 4705 4239 } 4706 4240 4241 + static int bio_size_ok(struct block_device *bdev, struct bio *bio, 4242 + sector_t sector) 4243 + { 4244 + struct bio_vec *prev; 4245 + struct request_queue *q = bdev_get_queue(bdev); 4246 + unsigned short max_sectors = queue_max_sectors(q); 4247 + struct bvec_merge_data bvm = { 4248 + .bi_bdev = bdev, 4249 + .bi_sector = sector, 4250 + .bi_rw = bio->bi_rw, 4251 + }; 4252 + 4253 + if (bio->bi_vcnt == 0) { 4254 + WARN_ON(1); 4255 + return 1; 4256 + } 4257 + 4258 + prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 4259 + if ((bio->bi_size >> 9) > max_sectors) 4260 + return 0; 4261 + 4262 + if (!q->merge_bvec_fn) 4263 + return 1; 4264 + 4265 + bvm.bi_size = bio->bi_size - prev->bv_len; 4266 + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) 4267 + return 0; 4268 + return 1; 4269 + } 4270 + 4271 + static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 4272 + struct bio *bio, u64 physical, int dev_nr, 4273 + int rw, int async) 4274 + { 4275 + struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 4276 + 4277 + bio->bi_private = bbio; 4278 + bio->bi_private = merge_stripe_index_into_bio_private( 4279 + bio->bi_private, (unsigned int)dev_nr); 4280 + bio->bi_end_io = btrfs_end_bio; 4281 + bio->bi_sector = physical >> 9; 4282 + #ifdef DEBUG 4283 + { 4284 + struct rcu_string *name; 4285 + 4286 + rcu_read_lock(); 4287 + name = rcu_dereference(dev->name); 4288 + pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 4289 + "(%s id %llu), size=%u\n", rw, 4290 + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 4291 + name->str, dev->devid, bio->bi_size); 4292 + rcu_read_unlock(); 4293 + } 4294 + #endif 4295 + bio->bi_bdev = dev->bdev; 4296 + if (async) 4297 + schedule_bio(root, dev, rw, bio); 4298 + else 4299 + 
btrfsic_submit_bio(rw, bio); 4300 + } 4301 + 4302 + static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 4303 + struct bio *first_bio, struct btrfs_device *dev, 4304 + int dev_nr, int rw, int async) 4305 + { 4306 + struct bio_vec *bvec = first_bio->bi_io_vec; 4307 + struct bio *bio; 4308 + int nr_vecs = bio_get_nr_vecs(dev->bdev); 4309 + u64 physical = bbio->stripes[dev_nr].physical; 4310 + 4311 + again: 4312 + bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); 4313 + if (!bio) 4314 + return -ENOMEM; 4315 + 4316 + while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { 4317 + if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, 4318 + bvec->bv_offset) < bvec->bv_len) { 4319 + u64 len = bio->bi_size; 4320 + 4321 + atomic_inc(&bbio->stripes_pending); 4322 + submit_stripe_bio(root, bbio, bio, physical, dev_nr, 4323 + rw, async); 4324 + physical += len; 4325 + goto again; 4326 + } 4327 + bvec++; 4328 + } 4329 + 4330 + submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); 4331 + return 0; 4332 + } 4333 + 4334 + static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 4335 + { 4336 + atomic_inc(&bbio->error); 4337 + if (atomic_dec_and_test(&bbio->stripes_pending)) { 4338 + bio->bi_private = bbio->private; 4339 + bio->bi_end_io = bbio->end_io; 4340 + bio->bi_bdev = (struct block_device *) 4341 + (unsigned long)bbio->mirror_num; 4342 + bio->bi_sector = logical >> 9; 4343 + kfree(bbio); 4344 + bio_endio(bio, -EIO); 4345 + } 4346 + } 4347 + 4707 4348 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4708 4349 int mirror_num, int async_submit) 4709 4350 { 4710 - struct btrfs_mapping_tree *map_tree; 4711 4351 struct btrfs_device *dev; 4712 4352 struct bio *first_bio = bio; 4713 4353 u64 logical = (u64)bio->bi_sector << 9; ··· 4825 4253 struct btrfs_bio *bbio = NULL; 4826 4254 4827 4255 length = bio->bi_size; 4828 - map_tree = &root->fs_info->mapping_tree; 4829 4256 map_length = 
length; 4830 4257 4831 - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4258 + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 4832 4259 mirror_num); 4833 - if (ret) /* -ENOMEM */ 4260 + if (ret) 4834 4261 return ret; 4835 4262 4836 4263 total_devs = bbio->num_stripes; ··· 4847 4276 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4848 4277 4849 4278 while (dev_nr < total_devs) { 4279 + dev = bbio->stripes[dev_nr].dev; 4280 + if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 4281 + bbio_error(bbio, first_bio, logical); 4282 + dev_nr++; 4283 + continue; 4284 + } 4285 + 4286 + /* 4287 + * Check and see if we're ok with this bio based on it's size 4288 + * and offset with the given device. 4289 + */ 4290 + if (!bio_size_ok(dev->bdev, first_bio, 4291 + bbio->stripes[dev_nr].physical >> 9)) { 4292 + ret = breakup_stripe_bio(root, bbio, first_bio, dev, 4293 + dev_nr, rw, async_submit); 4294 + BUG_ON(ret); 4295 + dev_nr++; 4296 + continue; 4297 + } 4298 + 4850 4299 if (dev_nr < total_devs - 1) { 4851 4300 bio = bio_clone(first_bio, GFP_NOFS); 4852 4301 BUG_ON(!bio); /* -ENOMEM */ 4853 4302 } else { 4854 4303 bio = first_bio; 4855 4304 } 4856 - bio->bi_private = bbio; 4857 - bio->bi_private = merge_stripe_index_into_bio_private( 4858 - bio->bi_private, (unsigned int)dev_nr); 4859 - bio->bi_end_io = btrfs_end_bio; 4860 - bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; 4861 - dev = bbio->stripes[dev_nr].dev; 4862 - if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 4863 - #ifdef DEBUG 4864 - struct rcu_string *name; 4865 4305 4866 - rcu_read_lock(); 4867 - name = rcu_dereference(dev->name); 4868 - pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 4869 - "(%s id %llu), size=%u\n", rw, 4870 - (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 4871 - name->str, dev->devid, bio->bi_size); 4872 - rcu_read_unlock(); 4873 - #endif 4874 - bio->bi_bdev = dev->bdev; 4875 - if (async_submit) 4876 - 
schedule_bio(root, dev, rw, bio); 4877 - else 4878 - btrfsic_submit_bio(rw, bio); 4879 - } else { 4880 - bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 4881 - bio->bi_sector = logical >> 9; 4882 - bio_endio(bio, -EIO); 4883 - } 4306 + submit_stripe_bio(root, bbio, bio, 4307 + bbio->stripes[dev_nr].physical, dev_nr, rw, 4308 + async_submit); 4884 4309 dev_nr++; 4885 4310 } 4886 4311 return 0; 4887 4312 } 4888 4313 4889 - struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4314 + struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 4890 4315 u8 *uuid, u8 *fsid) 4891 4316 { 4892 4317 struct btrfs_device *device; 4893 4318 struct btrfs_fs_devices *cur_devices; 4894 4319 4895 - cur_devices = root->fs_info->fs_devices; 4320 + cur_devices = fs_info->fs_devices; 4896 4321 while (cur_devices) { 4897 4322 if (!fsid || 4898 4323 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { ··· 4969 4402 em->bdev = (struct block_device *)map; 4970 4403 em->start = logical; 4971 4404 em->len = length; 4405 + em->orig_start = 0; 4972 4406 em->block_start = 0; 4973 4407 em->block_len = em->len; 4974 4408 ··· 4987 4419 read_extent_buffer(leaf, uuid, (unsigned long) 4988 4420 btrfs_stripe_dev_uuid_nr(chunk, i), 4989 4421 BTRFS_UUID_SIZE); 4990 - map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 4991 - NULL); 4422 + map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, 4423 + uuid, NULL); 4992 4424 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 4993 4425 kfree(map); 4994 4426 free_extent_map(em); ··· 5029 4461 device->io_align = btrfs_device_io_align(leaf, dev_item); 5030 4462 device->io_width = btrfs_device_io_width(leaf, dev_item); 5031 4463 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 4464 + WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 4465 + device->is_tgtdev_for_dev_replace = 0; 5032 4466 5033 4467 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5034 4468 
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ··· 5108 4538 return ret; 5109 4539 } 5110 4540 5111 - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 4541 + device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 5112 4542 if (!device || !device->bdev) { 5113 4543 if (!btrfs_test_opt(root, DEGRADED)) 5114 4544 return -EIO; ··· 5141 4571 fill_device_from_item(leaf, dev_item, device); 5142 4572 device->dev_root = root->fs_info->dev_root; 5143 4573 device->in_fs_metadata = 1; 5144 - if (device->writeable) { 4574 + if (device->writeable && !device->is_tgtdev_for_dev_replace) { 5145 4575 device->fs_devices->total_rw_bytes += device->total_bytes; 5146 4576 spin_lock(&root->fs_info->free_chunk_lock); 5147 4577 root->fs_info->free_chunk_space += device->total_bytes - ··· 5500 4930 int i; 5501 4931 5502 4932 mutex_lock(&fs_devices->device_list_mutex); 5503 - dev = btrfs_find_device(root, stats->devid, NULL, NULL); 4933 + dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL); 5504 4934 mutex_unlock(&fs_devices->device_list_mutex); 5505 4935 5506 4936 if (!dev) { ··· 5526 4956 } 5527 4957 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 5528 4958 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 4959 + return 0; 4960 + } 4961 + 4962 + int btrfs_scratch_superblock(struct btrfs_device *device) 4963 + { 4964 + struct buffer_head *bh; 4965 + struct btrfs_super_block *disk_super; 4966 + 4967 + bh = btrfs_read_dev_super(device->bdev); 4968 + if (!bh) 4969 + return -EINVAL; 4970 + disk_super = (struct btrfs_super_block *)bh->b_data; 4971 + 4972 + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 4973 + set_buffer_dirty(bh); 4974 + sync_dirty_buffer(bh); 4975 + brelse(bh); 4976 + 5529 4977 return 0; 5530 4978 }
+30 -5
fs/btrfs/volumes.h
··· 50 50 int in_fs_metadata; 51 51 int missing; 52 52 int can_discard; 53 + int is_tgtdev_for_dev_replace; 53 54 54 55 spinlock_t io_lock; 55 56 ··· 89 88 u8 uuid[BTRFS_UUID_SIZE]; 90 89 91 90 /* per-device scrub information */ 92 - struct scrub_dev *scrub_device; 91 + struct scrub_ctx *scrub_device; 93 92 94 93 struct btrfs_work work; 95 94 struct rcu_head rcu; ··· 180 179 u64 total_avail; 181 180 }; 182 181 182 + struct btrfs_raid_attr { 183 + int sub_stripes; /* sub_stripes info for map */ 184 + int dev_stripes; /* stripes per dev */ 185 + int devs_max; /* max devs to use */ 186 + int devs_min; /* min devs needed */ 187 + int devs_increment; /* ndevs has to be a multiple of this */ 188 + int ncopies; /* how many copies to data has */ 189 + }; 190 + 183 191 struct map_lookup { 184 192 u64 type; 185 193 int io_align; ··· 258 248 struct btrfs_device *device, 259 249 u64 chunk_tree, u64 chunk_objectid, 260 250 u64 chunk_offset, u64 start, u64 num_bytes); 261 - int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 251 + int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 262 252 u64 logical, u64 *length, 263 253 struct btrfs_bio **bbio_ret, int mirror_num); 264 254 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, ··· 277 267 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 278 268 struct btrfs_fs_devices **fs_devices_ret); 279 269 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 280 - void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); 270 + void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, 271 + struct btrfs_fs_devices *fs_devices, int step); 272 + int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 273 + char *device_path, 274 + struct btrfs_device **device); 275 + int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 276 + struct btrfs_device **device); 281 277 int btrfs_add_device(struct btrfs_trans_handle *trans, 282 278 struct 
btrfs_root *root, 283 279 struct btrfs_device *device); 284 280 int btrfs_rm_device(struct btrfs_root *root, char *device_path); 285 281 void btrfs_cleanup_fs_uuids(void); 286 - int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 282 + int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); 287 283 int btrfs_grow_device(struct btrfs_trans_handle *trans, 288 284 struct btrfs_device *device, u64 new_size); 289 - struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 285 + struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 290 286 u8 *uuid, u8 *fsid); 291 287 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 292 288 int btrfs_init_new_device(struct btrfs_root *root, char *path); 289 + int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 290 + struct btrfs_device **device_out); 293 291 int btrfs_balance(struct btrfs_balance_control *bctl, 294 292 struct btrfs_ioctl_balance_args *bargs); 295 293 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); ··· 314 296 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 315 297 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 316 298 struct btrfs_fs_info *fs_info); 299 + void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 300 + struct btrfs_device *srcdev); 301 + void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 302 + struct btrfs_device *tgtdev); 303 + void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 304 + struct btrfs_device *tgtdev); 305 + int btrfs_scratch_superblock(struct btrfs_device *device); 317 306 318 307 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 319 308 int index)
+12 -1
fs/btrfs/xattr.c
··· 122 122 */ 123 123 if (!value) 124 124 goto out; 125 + } else { 126 + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), 127 + name, name_len, 0); 128 + if (IS_ERR(di)) { 129 + ret = PTR_ERR(di); 130 + goto out; 131 + } 132 + if (!di && !value) 133 + goto out; 134 + btrfs_release_path(path); 125 135 } 126 136 127 137 again: ··· 208 198 209 199 inode_inc_iversion(inode); 210 200 inode->i_ctime = CURRENT_TIME; 201 + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 211 202 ret = btrfs_update_inode(trans, root, inode); 212 203 BUG_ON(ret); 213 204 out: ··· 276 265 277 266 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 278 267 if (verify_dir_item(root, leaf, di)) 279 - continue; 268 + goto next; 280 269 281 270 name_len = btrfs_dir_name_len(leaf, di); 282 271 total_size += name_len + 1;
+2 -1
include/trace/events/btrfs.h
··· 45 45 46 46 #define show_root_type(obj) \ 47 47 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ 48 - (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" 48 + (obj >= BTRFS_ROOT_TREE_OBJECTID && \ 49 + obj <= BTRFS_CSUM_TREE_OBJECTID)) ? __show_root_type(obj) : "-" 49 50 50 51 #define BTRFS_GROUP_FLAGS \ 51 52 { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \