Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs update from Chris Mason:
"A big set of fixes and features.

In terms of line count, most of the code comes from Stefan, who added
the ability to replace a single drive in place. This is different
from how btrfs normally replaces drives, and is much much much faster.

Josef is plowing through our synchronous write performance. This pull
request does not include the DIO_OWN_WAITING patch that was discussed
on the list, but it has a number of other improvements to cut down our
latencies and CPU time during fsync/O_DIRECT writes.

Miao Xie has a big series of fixes and is spreading out ordered
operations over more CPUs. This improves performance and reduces
contention.

I've put in fixes for error handling around hash collisions. These
are going back to individual stable kernels as I test against them.

Otherwise we have a lot of fixes and cleanups, thanks everyone!
raid5/6 is being rebased against the device replacement code. I'll
have it posted this Friday along with a nice series of benchmarks."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (115 commits)
Btrfs: fix a bug of per-file nocow
Btrfs: fix hash overflow handling
Btrfs: don't take inode delalloc mutex if we're a free space inode
Btrfs: fix autodefrag and umount lockup
Btrfs: fix permissions of empty files not affected by umask
Btrfs: put raid properties into global table
Btrfs: fix BUG() in scrub when first superblock reading gives EIO
Btrfs: do not call file_update_time in aio_write
Btrfs: only unlock and relock if we have to
Btrfs: use tokens where we can in the tree log
Btrfs: optimize leaf_space_used
Btrfs: don't memset new tokens
Btrfs: only clear dirty on the buffer if it is marked as dirty
Btrfs: move checks in set_page_dirty under DEBUG
Btrfs: log changed inodes based on the extent map tree
Btrfs: add path->really_keep_locks
Btrfs: do not mark ems as prealloc if we are writing to them
Btrfs: keep track of the extents original block length
Btrfs: inline csums if we're fsyncing
Btrfs: don't bother copying if we're only logging the inode
...

+5259 -1748
+1 -1
fs/btrfs/Makefile
··· 8 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 10 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 11 - reada.o backref.o ulist.o qgroup.o send.o 11 + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o 12 12 13 13 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 14 14 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+2
fs/btrfs/acl.c
··· 121 121 ret = posix_acl_equiv_mode(acl, &inode->i_mode); 122 122 if (ret < 0) 123 123 return ret; 124 + if (ret == 0) 125 + acl = NULL; 124 126 } 125 127 ret = 0; 126 128 break;
+12 -4
fs/btrfs/backref.c
··· 461 461 pos2 = n2, n2 = pos2->next) { 462 462 struct __prelim_ref *ref2; 463 463 struct __prelim_ref *xchg; 464 + struct extent_inode_elem *eie; 464 465 465 466 ref2 = list_entry(pos2, struct __prelim_ref, list); 466 467 ··· 473 472 ref1 = ref2; 474 473 ref2 = xchg; 475 474 } 476 - ref1->count += ref2->count; 477 475 } else { 478 476 if (ref1->parent != ref2->parent) 479 477 continue; 480 - ref1->count += ref2->count; 481 478 } 479 + 480 + eie = ref1->inode_list; 481 + while (eie && eie->next) 482 + eie = eie->next; 483 + if (eie) 484 + eie->next = ref2->inode_list; 485 + else 486 + ref1->inode_list = ref2->inode_list; 487 + ref1->count += ref2->count; 488 + 482 489 list_del(&ref2->list); 483 490 kfree(ref2); 484 491 } ··· 899 890 while (!list_empty(&prefs)) { 900 891 ref = list_first_entry(&prefs, struct __prelim_ref, list); 901 892 list_del(&ref->list); 902 - if (ref->count < 0) 903 - WARN_ON(1); 893 + WARN_ON(ref->count < 0); 904 894 if (ref->count && ref->root_id && ref->parent == 0) { 905 895 /* no parent == root of tree */ 906 896 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+4
fs/btrfs/btrfs_inode.h
··· 39 39 #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 40 40 #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 41 41 #define BTRFS_INODE_NEEDS_FULL_SYNC 7 42 + #define BTRFS_INODE_COPY_EVERYTHING 8 42 43 43 44 /* in memory btrfs inode */ 44 45 struct btrfs_inode { ··· 90 89 struct rb_node rb_node; 91 90 92 91 unsigned long runtime_flags; 92 + 93 + /* Keep track of who's O_SYNC/fsyncing currently */ 94 + atomic_t sync_writers; 93 95 94 96 /* full 64 bit generation number, struct vfs_inode doesn't have a big 95 97 * enough field for this.
+21 -10
fs/btrfs/check-integrity.c
··· 137 137 unsigned int never_written:1; /* block was added because it was 138 138 * referenced, not because it was 139 139 * written */ 140 - unsigned int mirror_num:2; /* large enough to hold 140 + unsigned int mirror_num; /* large enough to hold 141 141 * BTRFS_SUPER_MIRROR_MAX */ 142 142 struct btrfsic_dev_state *dev_state; 143 143 u64 dev_bytenr; /* key, physical byte num on disk */ ··· 723 723 } 724 724 725 725 num_copies = 726 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 726 + btrfs_num_copies(state->root->fs_info, 727 727 next_bytenr, state->metablock_size); 728 728 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 729 729 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 903 903 } 904 904 905 905 num_copies = 906 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 906 + btrfs_num_copies(state->root->fs_info, 907 907 next_bytenr, state->metablock_size); 908 908 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 909 909 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 1287 1287 *next_blockp = NULL; 1288 1288 if (0 == *num_copiesp) { 1289 1289 *num_copiesp = 1290 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 1290 + btrfs_num_copies(state->root->fs_info, 1291 1291 next_bytenr, state->metablock_size); 1292 1292 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1293 1293 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 1489 1489 chunk_len = num_bytes; 1490 1490 1491 1491 num_copies = 1492 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 1492 + btrfs_num_copies(state->root->fs_info, 1493 1493 next_bytenr, state->datablock_size); 1494 1494 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1495 1495 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 1582 1582 struct btrfs_device *device; 1583 1583 1584 1584 length = len; 1585 - ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, 1585 + ret = btrfs_map_block(state->root->fs_info, READ, 1586 1586 bytenr, &length, 
&multi, mirror_num); 1587 + 1588 + if (ret) { 1589 + block_ctx_out->start = 0; 1590 + block_ctx_out->dev_bytenr = 0; 1591 + block_ctx_out->len = 0; 1592 + block_ctx_out->dev = NULL; 1593 + block_ctx_out->datav = NULL; 1594 + block_ctx_out->pagev = NULL; 1595 + block_ctx_out->mem_to_free = NULL; 1596 + 1597 + return ret; 1598 + } 1587 1599 1588 1600 device = multi->stripes[0].dev; 1589 1601 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); ··· 1606 1594 block_ctx_out->pagev = NULL; 1607 1595 block_ctx_out->mem_to_free = NULL; 1608 1596 1609 - if (0 == ret) 1610 - kfree(multi); 1597 + kfree(multi); 1611 1598 if (NULL == block_ctx_out->dev) { 1612 1599 ret = -ENXIO; 1613 1600 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); ··· 2474 2463 } 2475 2464 2476 2465 num_copies = 2477 - btrfs_num_copies(&state->root->fs_info->mapping_tree, 2466 + btrfs_num_copies(state->root->fs_info, 2478 2467 next_bytenr, BTRFS_SUPER_INFO_SIZE); 2479 2468 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 2480 2469 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", ··· 2971 2960 struct btrfsic_block_data_ctx block_ctx; 2972 2961 int match = 0; 2973 2962 2974 - num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2963 + num_copies = btrfs_num_copies(state->root->fs_info, 2975 2964 bytenr, state->metablock_size); 2976 2965 2977 2966 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+4 -2
fs/btrfs/compression.c
··· 687 687 688 688 ret = btrfs_map_bio(root, READ, comp_bio, 689 689 mirror_num, 0); 690 - BUG_ON(ret); /* -ENOMEM */ 690 + if (ret) 691 + bio_endio(comp_bio, ret); 691 692 692 693 bio_put(comp_bio); 693 694 ··· 713 712 } 714 713 715 714 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 716 - BUG_ON(ret); /* -ENOMEM */ 715 + if (ret) 716 + bio_endio(comp_bio, ret); 717 717 718 718 bio_put(comp_bio); 719 719 return 0;
+189 -52
fs/btrfs/ctree.c
··· 38 38 struct extent_buffer *dst_buf, 39 39 struct extent_buffer *src_buf); 40 40 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 41 - struct btrfs_path *path, int level, int slot, 42 - int tree_mod_log); 41 + struct btrfs_path *path, int level, int slot); 43 42 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 44 43 struct extent_buffer *eb); 45 44 struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, ··· 775 776 776 777 static noinline void 777 778 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 778 - struct extent_buffer *eb, 779 - struct btrfs_disk_key *disk_key, int slot, int atomic) 779 + struct extent_buffer *eb, int slot, int atomic) 780 780 { 781 781 int ret; 782 782 ··· 1138 1140 switch (tm->op) { 1139 1141 case MOD_LOG_KEY_REMOVE_WHILE_FREEING: 1140 1142 BUG_ON(tm->slot < n); 1141 - case MOD_LOG_KEY_REMOVE_WHILE_MOVING: 1142 1143 case MOD_LOG_KEY_REMOVE: 1144 + n++; 1145 + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: 1143 1146 btrfs_set_node_key(eb, &tm->key, tm->slot); 1144 1147 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 1145 1148 btrfs_set_node_ptr_generation(eb, tm->slot, 1146 1149 tm->generation); 1147 - n++; 1148 1150 break; 1149 1151 case MOD_LOG_KEY_REPLACE: 1150 1152 BUG_ON(tm->slot >= n); ··· 1359 1361 u64 search_start; 1360 1362 int ret; 1361 1363 1362 - if (trans->transaction != root->fs_info->running_transaction) { 1363 - printk(KERN_CRIT "trans %llu running %llu\n", 1364 + if (trans->transaction != root->fs_info->running_transaction) 1365 + WARN(1, KERN_CRIT "trans %llu running %llu\n", 1364 1366 (unsigned long long)trans->transid, 1365 1367 (unsigned long long) 1366 1368 root->fs_info->running_transaction->transid); 1367 - WARN_ON(1); 1368 - } 1369 - if (trans->transid != root->fs_info->generation) { 1370 - printk(KERN_CRIT "trans %llu running %llu\n", 1369 + 1370 + if (trans->transid != root->fs_info->generation) 1371 + WARN(1, KERN_CRIT "trans %llu 
running %llu\n", 1371 1372 (unsigned long long)trans->transid, 1372 1373 (unsigned long long)root->fs_info->generation); 1373 - WARN_ON(1); 1374 - } 1375 1374 1376 1375 if (!should_cow_block(trans, root, buf)) { 1377 1376 *cow_ret = buf; ··· 1464 1469 if (cache_only && parent_level != 1) 1465 1470 return 0; 1466 1471 1467 - if (trans->transaction != root->fs_info->running_transaction) 1468 - WARN_ON(1); 1469 - if (trans->transid != root->fs_info->generation) 1470 - WARN_ON(1); 1472 + WARN_ON(trans->transaction != root->fs_info->running_transaction); 1473 + WARN_ON(trans->transid != root->fs_info->generation); 1471 1474 1472 1475 parent_nritems = btrfs_header_nritems(parent); 1473 1476 blocksize = btrfs_level_size(root, parent_level - 1); ··· 1820 1827 if (btrfs_header_nritems(right) == 0) { 1821 1828 clean_tree_block(trans, root, right); 1822 1829 btrfs_tree_unlock(right); 1823 - del_ptr(trans, root, path, level + 1, pslot + 1, 1); 1830 + del_ptr(trans, root, path, level + 1, pslot + 1); 1824 1831 root_sub_used(root, right->len); 1825 1832 btrfs_free_tree_block(trans, root, right, 0, 1); 1826 1833 free_extent_buffer_stale(right); ··· 1829 1836 struct btrfs_disk_key right_key; 1830 1837 btrfs_node_key(right, &right_key, 0); 1831 1838 tree_mod_log_set_node_key(root->fs_info, parent, 1832 - &right_key, pslot + 1, 0); 1839 + pslot + 1, 0); 1833 1840 btrfs_set_node_key(parent, &right_key, pslot + 1); 1834 1841 btrfs_mark_buffer_dirty(parent); 1835 1842 } ··· 1864 1871 if (btrfs_header_nritems(mid) == 0) { 1865 1872 clean_tree_block(trans, root, mid); 1866 1873 btrfs_tree_unlock(mid); 1867 - del_ptr(trans, root, path, level + 1, pslot, 1); 1874 + del_ptr(trans, root, path, level + 1, pslot); 1868 1875 root_sub_used(root, mid->len); 1869 1876 btrfs_free_tree_block(trans, root, mid, 0, 1); 1870 1877 free_extent_buffer_stale(mid); ··· 1873 1880 /* update the parent key to reflect our changes */ 1874 1881 struct btrfs_disk_key mid_key; 1875 1882 btrfs_node_key(mid, &mid_key, 
0); 1876 - tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, 1883 + tree_mod_log_set_node_key(root->fs_info, parent, 1877 1884 pslot, 0); 1878 1885 btrfs_set_node_key(parent, &mid_key, pslot); 1879 1886 btrfs_mark_buffer_dirty(parent); ··· 1973 1980 orig_slot += left_nr; 1974 1981 btrfs_node_key(mid, &disk_key, 0); 1975 1982 tree_mod_log_set_node_key(root->fs_info, parent, 1976 - &disk_key, pslot, 0); 1983 + pslot, 0); 1977 1984 btrfs_set_node_key(parent, &disk_key, pslot); 1978 1985 btrfs_mark_buffer_dirty(parent); 1979 1986 if (btrfs_header_nritems(left) > orig_slot) { ··· 2026 2033 2027 2034 btrfs_node_key(right, &disk_key, 0); 2028 2035 tree_mod_log_set_node_key(root->fs_info, parent, 2029 - &disk_key, pslot + 1, 0); 2036 + pslot + 1, 0); 2030 2037 btrfs_set_node_key(parent, &disk_key, pslot + 1); 2031 2038 btrfs_mark_buffer_dirty(parent); 2032 2039 ··· 2212 2219 int no_skips = 0; 2213 2220 struct extent_buffer *t; 2214 2221 2222 + if (path->really_keep_locks) 2223 + return; 2224 + 2215 2225 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2216 2226 if (!path->nodes[i]) 2217 2227 break; ··· 2262 2266 { 2263 2267 int i; 2264 2268 2265 - if (path->keep_locks) 2269 + if (path->keep_locks || path->really_keep_locks) 2266 2270 return; 2267 2271 2268 2272 for (i = level; i < BTRFS_MAX_LEVEL; i++) { ··· 2495 2499 if (!cow) 2496 2500 write_lock_level = -1; 2497 2501 2498 - if (cow && (p->keep_locks || p->lowest_level)) 2502 + if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) 2499 2503 write_lock_level = BTRFS_MAX_LEVEL; 2500 2504 2501 2505 min_write_lock_level = write_lock_level; ··· 2564 2568 * must have write locks on this node and the 2565 2569 * parent 2566 2570 */ 2567 - if (level + 1 > write_lock_level) { 2571 + if (level > write_lock_level || 2572 + (level + 1 > write_lock_level && 2573 + level + 1 < BTRFS_MAX_LEVEL && 2574 + p->nodes[level + 1])) { 2568 2575 write_lock_level = level + 1; 2569 2576 btrfs_release_path(p); 2570 2577 goto 
again; ··· 2916 2917 if (!path->nodes[i]) 2917 2918 break; 2918 2919 t = path->nodes[i]; 2919 - tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); 2920 + tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); 2920 2921 btrfs_set_node_key(t, key, tslot); 2921 2922 btrfs_mark_buffer_dirty(path->nodes[i]); 2922 2923 if (tslot != 0) ··· 3301 3302 */ 3302 3303 static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3303 3304 { 3305 + struct btrfs_item *start_item; 3306 + struct btrfs_item *end_item; 3307 + struct btrfs_map_token token; 3304 3308 int data_len; 3305 3309 int nritems = btrfs_header_nritems(l); 3306 3310 int end = min(nritems, start + nr) - 1; 3307 3311 3308 3312 if (!nr) 3309 3313 return 0; 3310 - data_len = btrfs_item_end_nr(l, start); 3311 - data_len = data_len - btrfs_item_offset_nr(l, end); 3314 + btrfs_init_map_token(&token); 3315 + start_item = btrfs_item_nr(l, start); 3316 + end_item = btrfs_item_nr(l, end); 3317 + data_len = btrfs_token_item_offset(l, start_item, &token) + 3318 + btrfs_token_item_size(l, start_item, &token); 3319 + data_len = data_len - btrfs_token_item_offset(l, end_item, &token); 3312 3320 data_len += sizeof(struct btrfs_item) * nr; 3313 3321 WARN_ON(data_len < 0); 3314 3322 return data_len; ··· 3409 3403 if (push_items == 0) 3410 3404 goto out_unlock; 3411 3405 3412 - if (!empty && push_items == left_nritems) 3413 - WARN_ON(1); 3406 + WARN_ON(!empty && push_items == left_nritems); 3414 3407 3415 3408 /* push left to right */ 3416 3409 right_nritems = btrfs_header_nritems(right); ··· 3647 3642 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3648 3643 3649 3644 /* fixup right node */ 3650 - if (push_items > right_nritems) { 3651 - printk(KERN_CRIT "push items %d nr %u\n", push_items, 3645 + if (push_items > right_nritems) 3646 + WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3652 3647 right_nritems); 3653 - WARN_ON(1); 3654 - } 3655 3648 3656 3649 if (push_items < right_nritems) { 
3657 3650 push_space = btrfs_item_offset_nr(right, push_items - 1) - ··· 4605 4602 * empty a node. 4606 4603 */ 4607 4604 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4608 - struct btrfs_path *path, int level, int slot, 4609 - int tree_mod_log) 4605 + struct btrfs_path *path, int level, int slot) 4610 4606 { 4611 4607 struct extent_buffer *parent = path->nodes[level]; 4612 4608 u32 nritems; 4613 4609 int ret; 4614 4610 4611 + if (level) { 4612 + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, 4613 + MOD_LOG_KEY_REMOVE); 4614 + BUG_ON(ret < 0); 4615 + } 4616 + 4615 4617 nritems = btrfs_header_nritems(parent); 4616 4618 if (slot != nritems - 1) { 4617 - if (tree_mod_log && level) 4619 + if (level) 4618 4620 tree_mod_log_eb_move(root->fs_info, parent, slot, 4619 4621 slot + 1, nritems - slot - 1); 4620 4622 memmove_extent_buffer(parent, ··· 4627 4619 btrfs_node_key_ptr_offset(slot + 1), 4628 4620 sizeof(struct btrfs_key_ptr) * 4629 4621 (nritems - slot - 1)); 4630 - } else if (tree_mod_log && level) { 4631 - ret = tree_mod_log_insert_key(root->fs_info, parent, slot, 4632 - MOD_LOG_KEY_REMOVE); 4633 - BUG_ON(ret < 0); 4634 4622 } 4635 4623 4636 4624 nritems--; ··· 4660 4656 struct extent_buffer *leaf) 4661 4657 { 4662 4658 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4663 - del_ptr(trans, root, path, 1, path->slots[1], 1); 4659 + del_ptr(trans, root, path, 1, path->slots[1]); 4664 4660 4665 4661 /* 4666 4662 * btrfs_free_extent is expensive, we want to make sure we ··· 5127 5123 right_path->search_commit_root = 1; 5128 5124 right_path->skip_locking = 1; 5129 5125 5130 - spin_lock(&left_root->root_times_lock); 5126 + spin_lock(&left_root->root_item_lock); 5131 5127 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5132 - spin_unlock(&left_root->root_times_lock); 5128 + spin_unlock(&left_root->root_item_lock); 5133 5129 5134 - spin_lock(&right_root->root_times_lock); 5130 + 
spin_lock(&right_root->root_item_lock); 5135 5131 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5136 - spin_unlock(&right_root->root_times_lock); 5132 + spin_unlock(&right_root->root_item_lock); 5137 5133 5138 5134 trans = btrfs_join_transaction(left_root); 5139 5135 if (IS_ERR(trans)) { ··· 5228 5224 goto out; 5229 5225 } 5230 5226 5231 - spin_lock(&left_root->root_times_lock); 5227 + spin_lock(&left_root->root_item_lock); 5232 5228 ctransid = btrfs_root_ctransid(&left_root->root_item); 5233 - spin_unlock(&left_root->root_times_lock); 5229 + spin_unlock(&left_root->root_item_lock); 5234 5230 if (ctransid != left_start_ctransid) 5235 5231 left_start_ctransid = 0; 5236 5232 5237 - spin_lock(&right_root->root_times_lock); 5233 + spin_lock(&right_root->root_item_lock); 5238 5234 ctransid = btrfs_root_ctransid(&right_root->root_item); 5239 - spin_unlock(&right_root->root_times_lock); 5235 + spin_unlock(&right_root->root_item_lock); 5240 5236 if (ctransid != right_start_ctransid) 5241 5237 right_start_ctransid = 0; 5242 5238 ··· 5498 5494 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 5499 5495 { 5500 5496 return btrfs_next_old_leaf(root, path, 0); 5497 + } 5498 + 5499 + /* Release the path up to but not including the given level */ 5500 + static void btrfs_release_level(struct btrfs_path *path, int level) 5501 + { 5502 + int i; 5503 + 5504 + for (i = 0; i < level; i++) { 5505 + path->slots[i] = 0; 5506 + if (!path->nodes[i]) 5507 + continue; 5508 + if (path->locks[i]) { 5509 + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 5510 + path->locks[i] = 0; 5511 + } 5512 + free_extent_buffer(path->nodes[i]); 5513 + path->nodes[i] = NULL; 5514 + } 5515 + } 5516 + 5517 + /* 5518 + * This function assumes 2 things 5519 + * 5520 + * 1) You are using path->keep_locks 5521 + * 2) You are not inserting items. 5522 + * 5523 + * If either of these are not true do not use this function. 
If you need a next 5524 + * leaf with either of these not being true then this function can be easily 5525 + * adapted to do that, but at the moment these are the limitations. 5526 + */ 5527 + int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, 5528 + struct btrfs_root *root, struct btrfs_path *path, 5529 + int del) 5530 + { 5531 + struct extent_buffer *b; 5532 + struct btrfs_key key; 5533 + u32 nritems; 5534 + int level = 1; 5535 + int slot; 5536 + int ret = 1; 5537 + int write_lock_level = BTRFS_MAX_LEVEL; 5538 + int ins_len = del ? -1 : 0; 5539 + 5540 + WARN_ON(!(path->keep_locks || path->really_keep_locks)); 5541 + 5542 + nritems = btrfs_header_nritems(path->nodes[0]); 5543 + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 5544 + 5545 + while (path->nodes[level]) { 5546 + nritems = btrfs_header_nritems(path->nodes[level]); 5547 + if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { 5548 + search: 5549 + btrfs_release_path(path); 5550 + ret = btrfs_search_slot(trans, root, &key, path, 5551 + ins_len, 1); 5552 + if (ret < 0) 5553 + goto out; 5554 + level = 1; 5555 + continue; 5556 + } 5557 + 5558 + if (path->slots[level] >= nritems - 1) { 5559 + level++; 5560 + continue; 5561 + } 5562 + 5563 + btrfs_release_level(path, level); 5564 + break; 5565 + } 5566 + 5567 + if (!path->nodes[level]) { 5568 + ret = 1; 5569 + goto out; 5570 + } 5571 + 5572 + path->slots[level]++; 5573 + b = path->nodes[level]; 5574 + 5575 + while (b) { 5576 + level = btrfs_header_level(b); 5577 + 5578 + if (!should_cow_block(trans, root, b)) 5579 + goto cow_done; 5580 + 5581 + btrfs_set_path_blocking(path); 5582 + ret = btrfs_cow_block(trans, root, b, 5583 + path->nodes[level + 1], 5584 + path->slots[level + 1], &b); 5585 + if (ret) 5586 + goto out; 5587 + cow_done: 5588 + path->nodes[level] = b; 5589 + btrfs_clear_path_blocking(path, NULL, 0); 5590 + if (level != 0) { 5591 + ret = setup_nodes_for_search(trans, root, path, b, 5592 + level, ins_len, 5593 + &write_lock_level); 5594 
+ if (ret == -EAGAIN) 5595 + goto search; 5596 + if (ret) 5597 + goto out; 5598 + 5599 + b = path->nodes[level]; 5600 + slot = path->slots[level]; 5601 + 5602 + ret = read_block_for_search(trans, root, path, 5603 + &b, level, slot, &key, 0); 5604 + if (ret == -EAGAIN) 5605 + goto search; 5606 + if (ret) 5607 + goto out; 5608 + level = btrfs_header_level(b); 5609 + if (!btrfs_try_tree_write_lock(b)) { 5610 + btrfs_set_path_blocking(path); 5611 + btrfs_tree_lock(b); 5612 + btrfs_clear_path_blocking(path, b, 5613 + BTRFS_WRITE_LOCK); 5614 + } 5615 + path->locks[level] = BTRFS_WRITE_LOCK; 5616 + path->nodes[level] = b; 5617 + path->slots[level] = 0; 5618 + } else { 5619 + path->slots[level] = 0; 5620 + ret = 0; 5621 + break; 5622 + } 5623 + } 5624 + 5625 + out: 5626 + if (ret) 5627 + btrfs_release_path(path); 5628 + 5629 + return ret; 5501 5630 } 5502 5631 5503 5632 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+164 -18
fs/btrfs/ctree.h
··· 48 48 49 49 #define BTRFS_MAGIC "_BHRfS_M" 50 50 51 - #define BTRFS_MAX_MIRRORS 2 51 + #define BTRFS_MAX_MIRRORS 3 52 52 53 53 #define BTRFS_MAX_LEVEL 8 54 54 ··· 142 142 143 143 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 144 144 145 + #define BTRFS_DEV_REPLACE_DEVID 0 146 + 145 147 /* 146 148 * the max metadata block size. This limit is somewhat artificial, 147 149 * but the memmove costs go through the roof for larger blocks. ··· 173 171 174 172 /* four bytes for CRC32 */ 175 173 #define BTRFS_EMPTY_DIR_SIZE 0 174 + 175 + /* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ 176 + #define REQ_GET_READ_MIRRORS (1 << 30) 176 177 177 178 #define BTRFS_FT_UNKNOWN 0 178 179 #define BTRFS_FT_REG_FILE 1 ··· 576 571 unsigned int skip_locking:1; 577 572 unsigned int leave_spinning:1; 578 573 unsigned int search_commit_root:1; 574 + unsigned int really_keep_locks:1; 579 575 }; 580 576 581 577 /* ··· 889 883 * the existing values unchanged 890 884 */ 891 885 __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; 886 + } __attribute__ ((__packed__)); 887 + 888 + #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 889 + #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 890 + #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 891 + #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 892 + #define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 893 + #define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 894 + #define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 895 + 896 + struct btrfs_dev_replace { 897 + u64 replace_state; /* see #define above */ 898 + u64 time_started; /* seconds since 1-Jan-1970 */ 899 + u64 time_stopped; /* seconds since 1-Jan-1970 */ 900 + atomic64_t num_write_errors; 901 + atomic64_t num_uncorrectable_read_errors; 902 + 903 + u64 cursor_left; 904 + u64 committed_cursor_left; 905 + u64 cursor_left_last_write_of_item; 906 + u64 cursor_right; 907 + 908 + u64 cont_reading_from_srcdev_mode; /* see #define above */ 909 + 910 + int 
is_valid; 911 + int item_needs_writeback; 912 + struct btrfs_device *srcdev; 913 + struct btrfs_device *tgtdev; 914 + 915 + pid_t lock_owner; 916 + atomic_t nesting_level; 917 + struct mutex lock_finishing_cancel_unmount; 918 + struct mutex lock_management_lock; 919 + struct mutex lock; 920 + 921 + struct btrfs_scrub_progress scrub_progress; 922 + }; 923 + 924 + struct btrfs_dev_replace_item { 925 + /* 926 + * grow this item struct at the end for future enhancements and keep 927 + * the existing values unchanged 928 + */ 929 + __le64 src_devid; 930 + __le64 cursor_left; 931 + __le64 cursor_right; 932 + __le64 cont_reading_from_srcdev_mode; 933 + 934 + __le64 replace_state; 935 + __le64 time_started; 936 + __le64 time_stopped; 937 + __le64 num_write_errors; 938 + __le64 num_uncorrectable_read_errors; 892 939 } __attribute__ ((__packed__)); 893 940 894 941 /* different types of block groups (and chunks) */ ··· 1392 1333 struct btrfs_workers generic_worker; 1393 1334 struct btrfs_workers workers; 1394 1335 struct btrfs_workers delalloc_workers; 1336 + struct btrfs_workers flush_workers; 1395 1337 struct btrfs_workers endio_workers; 1396 1338 struct btrfs_workers endio_meta_workers; 1397 1339 struct btrfs_workers endio_meta_write_workers; ··· 1489 1429 struct rw_semaphore scrub_super_lock; 1490 1430 int scrub_workers_refcnt; 1491 1431 struct btrfs_workers scrub_workers; 1432 + struct btrfs_workers scrub_wr_completion_workers; 1433 + struct btrfs_workers scrub_nocow_workers; 1492 1434 1493 1435 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1494 1436 u32 check_integrity_print_mask; ··· 1532 1470 int backup_root_index; 1533 1471 1534 1472 int num_tolerated_disk_barrier_failures; 1473 + 1474 + /* device replace state */ 1475 + struct btrfs_dev_replace dev_replace; 1476 + 1477 + atomic_t mutually_exclusive_operation_running; 1535 1478 }; 1536 1479 1537 1480 /* ··· 1646 1579 1647 1580 int force_cow; 1648 1581 1649 - spinlock_t root_times_lock; 1582 + spinlock_t root_item_lock; 1650 
1583 }; 1651 1584 1652 1585 struct btrfs_ioctl_defrag_range_args { ··· 1790 1723 #define BTRFS_DEV_STATS_KEY 249 1791 1724 1792 1725 /* 1726 + * Persistently stores the device replace state in the device tree. 1727 + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). 1728 + */ 1729 + #define BTRFS_DEV_REPLACE_KEY 250 1730 + 1731 + /* 1793 1732 * string items are for debugging. They just store a short string of 1794 1733 * data in the FS 1795 1734 */ ··· 1860 1787 1861 1788 static inline void btrfs_init_map_token (struct btrfs_map_token *token) 1862 1789 { 1863 - memset(token, 0, sizeof(*token)); 1790 + token->kaddr = NULL; 1864 1791 } 1865 1792 1866 1793 /* some macros to generate set/get funcs for the struct fields. This
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, 2781 + struct btrfs_dev_replace_item, src_devid, 64); 2782 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, 2783 + struct btrfs_dev_replace_item, 2784 + cont_reading_from_srcdev_mode, 64); 2785 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, 2786 + struct btrfs_dev_replace_item, replace_state, 64); 2787 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, 2788 + struct btrfs_dev_replace_item, time_started, 64); 2789 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, 2790 + struct btrfs_dev_replace_item, time_stopped, 64); 2791 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, 2792 + struct btrfs_dev_replace_item, num_write_errors, 64); 2793 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, 2794 + struct btrfs_dev_replace_item, 2795 + num_uncorrectable_read_errors, 64); 2796 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, 2797 + struct btrfs_dev_replace_item, cursor_left, 64); 2798 + BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, 2799 + struct btrfs_dev_replace_item, cursor_right, 64); 2800 + 2831 2801 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2832 2802 { 2833 2803 return sb->s_fs_info; ··· 3016 2900 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 3017 2901 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3018 2902 void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2903 + 2904 + enum btrfs_reserve_flush_enum { 2905 + /* If we are in the transaction, we can't flush anything.*/ 2906 + BTRFS_RESERVE_NO_FLUSH, 2907 + /* 2908 + * Flushing delalloc may cause deadlock somewhere, in this 2909 + * case, use FLUSH LIMIT 2910 + */ 2911 + BTRFS_RESERVE_FLUSH_LIMIT, 2912 + BTRFS_RESERVE_FLUSH_ALL, 2913 + }; 2914 + 3019 2915 int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 3020 2916 void btrfs_free_reserved_data_space(struct 
inode *inode, u64 bytes); 3021 2917 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, ··· 3047 2919 void btrfs_free_block_rsv(struct btrfs_root *root, 3048 2920 struct btrfs_block_rsv *rsv); 3049 2921 int btrfs_block_rsv_add(struct btrfs_root *root, 3050 - struct btrfs_block_rsv *block_rsv, 3051 - u64 num_bytes); 3052 - int btrfs_block_rsv_add_noflush(struct btrfs_root *root, 3053 - struct btrfs_block_rsv *block_rsv, 3054 - u64 num_bytes); 2922 + struct btrfs_block_rsv *block_rsv, u64 num_bytes, 2923 + enum btrfs_reserve_flush_enum flush); 3055 2924 int btrfs_block_rsv_check(struct btrfs_root *root, 3056 2925 struct btrfs_block_rsv *block_rsv, int min_factor); 3057 2926 int btrfs_block_rsv_refill(struct btrfs_root *root, 3058 - struct btrfs_block_rsv *block_rsv, 3059 - u64 min_reserved); 3060 - int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, 3061 - struct btrfs_block_rsv *block_rsv, 3062 - u64 min_reserved); 2927 + struct btrfs_block_rsv *block_rsv, u64 min_reserved, 2928 + enum btrfs_reserve_flush_enum flush); 3063 2929 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3064 2930 struct btrfs_block_rsv *dst_rsv, 3065 2931 u64 num_bytes); ··· 3077 2955 int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 3078 2956 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3079 2957 struct btrfs_fs_info *fs_info); 2958 + int __get_raid_index(u64 flags); 3080 2959 /* ctree.c */ 3081 2960 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3082 2961 int level, int *slot); ··· 3188 3065 } 3189 3066 3190 3067 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 3068 + int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, 3069 + struct btrfs_root *root, struct btrfs_path *path, 3070 + int del); 3191 3071 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 3192 3072 u64 time_seq); 3193 3073 static inline int btrfs_next_old_item(struct btrfs_root 
*root, ··· 3283 3157 struct btrfs_root *root); 3284 3158 3285 3159 /* dir-item.c */ 3160 + int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, 3161 + const char *name, int name_len); 3286 3162 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3287 3163 struct btrfs_root *root, const char *name, 3288 3164 int name_len, struct inode *dir, ··· 3384 3256 struct btrfs_root *root, 3385 3257 struct btrfs_path *path, u64 objectid, 3386 3258 u64 bytenr, int mod); 3259 + u64 btrfs_file_extent_length(struct btrfs_path *path); 3387 3260 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 3388 3261 struct btrfs_root *root, 3389 3262 struct btrfs_ordered_sum *sums); ··· 3400 3271 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 3401 3272 struct list_head *list, int search_commit); 3402 3273 /* inode.c */ 3274 + struct btrfs_delalloc_work { 3275 + struct inode *inode; 3276 + int wait; 3277 + int delay_iput; 3278 + struct completion completion; 3279 + struct list_head list; 3280 + struct btrfs_work work; 3281 + }; 3282 + 3283 + struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, 3284 + int wait, int delay_iput); 3285 + void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); 3286 + 3403 3287 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3404 3288 size_t pg_offset, u64 start, u64 len, 3405 3289 int create); ··· 3512 3370 struct btrfs_ioctl_space_info *space); 3513 3371 3514 3372 /* file.c */ 3373 + int btrfs_auto_defrag_init(void); 3374 + void btrfs_auto_defrag_exit(void); 3515 3375 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3516 3376 struct inode *inode); 3517 3377 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3378 + void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); 3518 3379 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3519 3380 void btrfs_drop_extent_cache(struct 
inode *inode, u64 start, u64 end, 3520 3381 int skip_pinned); ··· 3664 3519 struct btrfs_pending_snapshot *pending); 3665 3520 3666 3521 /* scrub.c */ 3667 - int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 3668 - struct btrfs_scrub_progress *progress, int readonly); 3522 + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 3523 + u64 end, struct btrfs_scrub_progress *progress, 3524 + int readonly, int is_dev_replace); 3669 3525 void btrfs_scrub_pause(struct btrfs_root *root); 3670 3526 void btrfs_scrub_pause_super(struct btrfs_root *root); 3671 3527 void btrfs_scrub_continue(struct btrfs_root *root); 3672 3528 void btrfs_scrub_continue_super(struct btrfs_root *root); 3673 - int __btrfs_scrub_cancel(struct btrfs_fs_info *info); 3674 - int btrfs_scrub_cancel(struct btrfs_root *root); 3675 - int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); 3529 + int btrfs_scrub_cancel(struct btrfs_fs_info *info); 3530 + int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, 3531 + struct btrfs_device *dev); 3676 3532 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); 3677 3533 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 3678 3534 struct btrfs_scrub_progress *progress);
+5 -6
fs/btrfs/delayed-inode.c
··· 651 651 */ 652 652 if (!src_rsv || (!trans->bytes_reserved && 653 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 654 - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, 655 + BTRFS_RESERVE_NO_FLUSH); 655 656 /* 656 657 * Since we're under a transaction reserve_metadata_bytes could 657 658 * try to commit the transaction which will make it return ··· 687 686 * reserve something strictly for us. If not be a pain and try 688 687 * to steal from the delalloc block rsv. 689 688 */ 690 - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 689 + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, 690 + BTRFS_RESERVE_NO_FLUSH); 691 691 if (!ret) 692 692 goto out; 693 693 ··· 1257 1255 struct btrfs_delayed_node *delayed_node = NULL; 1258 1256 struct btrfs_root *root; 1259 1257 struct btrfs_block_rsv *block_rsv; 1260 - unsigned long nr = 0; 1261 1258 int need_requeue = 0; 1262 1259 int ret; 1263 1260 ··· 1317 1316 delayed_node); 1318 1317 mutex_unlock(&delayed_node->mutex); 1319 1318 1320 - nr = trans->blocks_used; 1321 - 1322 1319 trans->block_rsv = block_rsv; 1323 1320 btrfs_end_transaction_dmeta(trans, root); 1324 - __btrfs_btree_balance_dirty(root, nr); 1321 + btrfs_btree_balance_dirty_nodelay(root); 1325 1322 free_path: 1326 1323 btrfs_free_path(path); 1327 1324 out:
+856
fs/btrfs/dev-replace.c
··· 1 + /* 2 + * Copyright (C) STRATO AG 2012. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public 6 + * License v2 as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public 14 + * License along with this program; if not, write to the 15 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 + * Boston, MA 021110-1307, USA. 17 + */ 18 + #include <linux/sched.h> 19 + #include <linux/bio.h> 20 + #include <linux/slab.h> 21 + #include <linux/buffer_head.h> 22 + #include <linux/blkdev.h> 23 + #include <linux/random.h> 24 + #include <linux/iocontext.h> 25 + #include <linux/capability.h> 26 + #include <linux/kthread.h> 27 + #include <linux/math64.h> 28 + #include <asm/div64.h> 29 + #include "compat.h" 30 + #include "ctree.h" 31 + #include "extent_map.h" 32 + #include "disk-io.h" 33 + #include "transaction.h" 34 + #include "print-tree.h" 35 + #include "volumes.h" 36 + #include "async-thread.h" 37 + #include "check-integrity.h" 38 + #include "rcu-string.h" 39 + #include "dev-replace.h" 40 + 41 + static u64 btrfs_get_seconds_since_1970(void); 42 + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 43 + int scrub_ret); 44 + static void btrfs_dev_replace_update_device_in_mapping_tree( 45 + struct btrfs_fs_info *fs_info, 46 + struct btrfs_device *srcdev, 47 + struct btrfs_device *tgtdev); 48 + static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, 49 + char *srcdev_name, 50 + struct btrfs_device **device); 51 + static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); 52 + static int 
btrfs_dev_replace_kthread(void *data); 53 + static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); 54 + 55 + 56 + int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) 57 + { 58 + struct btrfs_key key; 59 + struct btrfs_root *dev_root = fs_info->dev_root; 60 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 61 + struct extent_buffer *eb; 62 + int slot; 63 + int ret = 0; 64 + struct btrfs_path *path = NULL; 65 + int item_size; 66 + struct btrfs_dev_replace_item *ptr; 67 + u64 src_devid; 68 + 69 + path = btrfs_alloc_path(); 70 + if (!path) { 71 + ret = -ENOMEM; 72 + goto out; 73 + } 74 + 75 + key.objectid = 0; 76 + key.type = BTRFS_DEV_REPLACE_KEY; 77 + key.offset = 0; 78 + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 79 + if (ret) { 80 + no_valid_dev_replace_entry_found: 81 + ret = 0; 82 + dev_replace->replace_state = 83 + BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; 84 + dev_replace->cont_reading_from_srcdev_mode = 85 + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; 86 + dev_replace->replace_state = 0; 87 + dev_replace->time_started = 0; 88 + dev_replace->time_stopped = 0; 89 + atomic64_set(&dev_replace->num_write_errors, 0); 90 + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 91 + dev_replace->cursor_left = 0; 92 + dev_replace->committed_cursor_left = 0; 93 + dev_replace->cursor_left_last_write_of_item = 0; 94 + dev_replace->cursor_right = 0; 95 + dev_replace->srcdev = NULL; 96 + dev_replace->tgtdev = NULL; 97 + dev_replace->is_valid = 0; 98 + dev_replace->item_needs_writeback = 0; 99 + goto out; 100 + } 101 + slot = path->slots[0]; 102 + eb = path->nodes[0]; 103 + item_size = btrfs_item_size_nr(eb, slot); 104 + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); 105 + 106 + if (item_size != sizeof(struct btrfs_dev_replace_item)) { 107 + pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); 108 + goto no_valid_dev_replace_entry_found; 109 + } 
110 + 111 + src_devid = btrfs_dev_replace_src_devid(eb, ptr); 112 + dev_replace->cont_reading_from_srcdev_mode = 113 + btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); 114 + dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); 115 + dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); 116 + dev_replace->time_stopped = 117 + btrfs_dev_replace_time_stopped(eb, ptr); 118 + atomic64_set(&dev_replace->num_write_errors, 119 + btrfs_dev_replace_num_write_errors(eb, ptr)); 120 + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 121 + btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); 122 + dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); 123 + dev_replace->committed_cursor_left = dev_replace->cursor_left; 124 + dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; 125 + dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); 126 + dev_replace->is_valid = 1; 127 + 128 + dev_replace->item_needs_writeback = 0; 129 + switch (dev_replace->replace_state) { 130 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 131 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 132 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 133 + dev_replace->srcdev = NULL; 134 + dev_replace->tgtdev = NULL; 135 + break; 136 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 137 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 138 + dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, 139 + NULL, NULL); 140 + dev_replace->tgtdev = btrfs_find_device(fs_info, 141 + BTRFS_DEV_REPLACE_DEVID, 142 + NULL, NULL); 143 + /* 144 + * allow 'btrfs dev replace_cancel' if src/tgt device is 145 + * missing 146 + */ 147 + if (!dev_replace->srcdev && 148 + !btrfs_test_opt(dev_root, DEGRADED)) { 149 + ret = -EIO; 150 + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", 151 + (unsigned long long)src_devid); 152 + } 153 + if 
(!dev_replace->tgtdev && 154 + !btrfs_test_opt(dev_root, DEGRADED)) { 155 + ret = -EIO; 156 + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", 157 + (unsigned long long)BTRFS_DEV_REPLACE_DEVID); 158 + } 159 + if (dev_replace->tgtdev) { 160 + if (dev_replace->srcdev) { 161 + dev_replace->tgtdev->total_bytes = 162 + dev_replace->srcdev->total_bytes; 163 + dev_replace->tgtdev->disk_total_bytes = 164 + dev_replace->srcdev->disk_total_bytes; 165 + dev_replace->tgtdev->bytes_used = 166 + dev_replace->srcdev->bytes_used; 167 + } 168 + dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; 169 + btrfs_init_dev_replace_tgtdev_for_resume(fs_info, 170 + dev_replace->tgtdev); 171 + } 172 + break; 173 + } 174 + 175 + out: 176 + if (path) 177 + btrfs_free_path(path); 178 + return ret; 179 + } 180 + 181 + /* 182 + * called from commit_transaction. Writes changed device replace state to 183 + * disk. 184 + */ 185 + int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, 186 + struct btrfs_fs_info *fs_info) 187 + { 188 + int ret; 189 + struct btrfs_root *dev_root = fs_info->dev_root; 190 + struct btrfs_path *path; 191 + struct btrfs_key key; 192 + struct extent_buffer *eb; 193 + struct btrfs_dev_replace_item *ptr; 194 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 195 + 196 + btrfs_dev_replace_lock(dev_replace); 197 + if (!dev_replace->is_valid || 198 + !dev_replace->item_needs_writeback) { 199 + btrfs_dev_replace_unlock(dev_replace); 200 + return 0; 201 + } 202 + btrfs_dev_replace_unlock(dev_replace); 203 + 204 + key.objectid = 0; 205 + key.type = BTRFS_DEV_REPLACE_KEY; 206 + key.offset = 0; 207 + 208 + path = btrfs_alloc_path(); 209 + if (!path) { 210 + ret = -ENOMEM; 211 + goto out; 212 + } 213 + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 214 + if (ret < 0) { 215 + pr_warn("btrfs: error %d while searching for dev_replace item!\n", 216 + ret); 217 + 
goto out; 218 + } 219 + 220 + if (ret == 0 && 221 + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 222 + /* 223 + * need to delete old one and insert a new one. 224 + * Since no attempt is made to recover any old state, if the 225 + * dev_replace state is 'running', the data on the target 226 + * drive is lost. 227 + * It would be possible to recover the state: just make sure 228 + * that the beginning of the item is never changed and always 229 + * contains all the essential information. Then read this 230 + * minimal set of information and use it as a base for the 231 + * new state. 232 + */ 233 + ret = btrfs_del_item(trans, dev_root, path); 234 + if (ret != 0) { 235 + pr_warn("btrfs: delete too small dev_replace item failed %d!\n", 236 + ret); 237 + goto out; 238 + } 239 + ret = 1; 240 + } 241 + 242 + if (ret == 1) { 243 + /* need to insert a new item */ 244 + btrfs_release_path(path); 245 + ret = btrfs_insert_empty_item(trans, dev_root, path, 246 + &key, sizeof(*ptr)); 247 + if (ret < 0) { 248 + pr_warn("btrfs: insert dev_replace item failed %d!\n", 249 + ret); 250 + goto out; 251 + } 252 + } 253 + 254 + eb = path->nodes[0]; 255 + ptr = btrfs_item_ptr(eb, path->slots[0], 256 + struct btrfs_dev_replace_item); 257 + 258 + btrfs_dev_replace_lock(dev_replace); 259 + if (dev_replace->srcdev) 260 + btrfs_set_dev_replace_src_devid(eb, ptr, 261 + dev_replace->srcdev->devid); 262 + else 263 + btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); 264 + btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, 265 + dev_replace->cont_reading_from_srcdev_mode); 266 + btrfs_set_dev_replace_replace_state(eb, ptr, 267 + dev_replace->replace_state); 268 + btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); 269 + btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); 270 + btrfs_set_dev_replace_num_write_errors(eb, ptr, 271 + atomic64_read(&dev_replace->num_write_errors)); 272 + 
btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, 273 + atomic64_read(&dev_replace->num_uncorrectable_read_errors)); 274 + dev_replace->cursor_left_last_write_of_item = 275 + dev_replace->cursor_left; 276 + btrfs_set_dev_replace_cursor_left(eb, ptr, 277 + dev_replace->cursor_left_last_write_of_item); 278 + btrfs_set_dev_replace_cursor_right(eb, ptr, 279 + dev_replace->cursor_right); 280 + dev_replace->item_needs_writeback = 0; 281 + btrfs_dev_replace_unlock(dev_replace); 282 + 283 + btrfs_mark_buffer_dirty(eb); 284 + 285 + out: 286 + btrfs_free_path(path); 287 + 288 + return ret; 289 + } 290 + 291 + void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) 292 + { 293 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 294 + 295 + dev_replace->committed_cursor_left = 296 + dev_replace->cursor_left_last_write_of_item; 297 + } 298 + 299 + static u64 btrfs_get_seconds_since_1970(void) 300 + { 301 + struct timespec t = CURRENT_TIME_SEC; 302 + 303 + return t.tv_sec; 304 + } 305 + 306 + int btrfs_dev_replace_start(struct btrfs_root *root, 307 + struct btrfs_ioctl_dev_replace_args *args) 308 + { 309 + struct btrfs_trans_handle *trans; 310 + struct btrfs_fs_info *fs_info = root->fs_info; 311 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 312 + int ret; 313 + struct btrfs_device *tgt_device = NULL; 314 + struct btrfs_device *src_device = NULL; 315 + 316 + switch (args->start.cont_reading_from_srcdev_mode) { 317 + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: 318 + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: 319 + break; 320 + default: 321 + return -EINVAL; 322 + } 323 + 324 + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || 325 + args->start.tgtdev_name[0] == '\0') 326 + return -EINVAL; 327 + 328 + mutex_lock(&fs_info->volume_mutex); 329 + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, 330 + &tgt_device); 331 + if (ret) { 332 + 
pr_err("btrfs: target device %s is invalid!\n", 333 + args->start.tgtdev_name); 334 + mutex_unlock(&fs_info->volume_mutex); 335 + return -EINVAL; 336 + } 337 + 338 + ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, 339 + args->start.srcdev_name, 340 + &src_device); 341 + mutex_unlock(&fs_info->volume_mutex); 342 + if (ret) { 343 + ret = -EINVAL; 344 + goto leave_no_lock; 345 + } 346 + 347 + if (tgt_device->total_bytes < src_device->total_bytes) { 348 + pr_err("btrfs: target device is smaller than source device!\n"); 349 + ret = -EINVAL; 350 + goto leave_no_lock; 351 + } 352 + 353 + btrfs_dev_replace_lock(dev_replace); 354 + switch (dev_replace->replace_state) { 355 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 356 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 357 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 358 + break; 359 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 360 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 361 + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; 362 + goto leave; 363 + } 364 + 365 + dev_replace->cont_reading_from_srcdev_mode = 366 + args->start.cont_reading_from_srcdev_mode; 367 + WARN_ON(!src_device); 368 + dev_replace->srcdev = src_device; 369 + WARN_ON(!tgt_device); 370 + dev_replace->tgtdev = tgt_device; 371 + 372 + printk_in_rcu(KERN_INFO 373 + "btrfs: dev_replace from %s (devid %llu) to %s) started\n", 374 + src_device->missing ? "<missing disk>" : 375 + rcu_str_deref(src_device->name), 376 + src_device->devid, 377 + rcu_str_deref(tgt_device->name)); 378 + 379 + tgt_device->total_bytes = src_device->total_bytes; 380 + tgt_device->disk_total_bytes = src_device->disk_total_bytes; 381 + tgt_device->bytes_used = src_device->bytes_used; 382 + 383 + /* 384 + * from now on, the writes to the srcdev are all duplicated to 385 + * go to the tgtdev as well (refer to btrfs_map_block()). 
386 + */ 387 + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 388 + dev_replace->time_started = btrfs_get_seconds_since_1970(); 389 + dev_replace->cursor_left = 0; 390 + dev_replace->committed_cursor_left = 0; 391 + dev_replace->cursor_left_last_write_of_item = 0; 392 + dev_replace->cursor_right = 0; 393 + dev_replace->is_valid = 1; 394 + dev_replace->item_needs_writeback = 1; 395 + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 396 + btrfs_dev_replace_unlock(dev_replace); 397 + 398 + btrfs_wait_ordered_extents(root, 0); 399 + 400 + /* force writing the updated state information to disk */ 401 + trans = btrfs_start_transaction(root, 0); 402 + if (IS_ERR(trans)) { 403 + ret = PTR_ERR(trans); 404 + btrfs_dev_replace_lock(dev_replace); 405 + goto leave; 406 + } 407 + 408 + ret = btrfs_commit_transaction(trans, root); 409 + WARN_ON(ret); 410 + 411 + /* the disk copy procedure reuses the scrub code */ 412 + ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 413 + src_device->total_bytes, 414 + &dev_replace->scrub_progress, 0, 1); 415 + 416 + ret = btrfs_dev_replace_finishing(root->fs_info, ret); 417 + WARN_ON(ret); 418 + 419 + return 0; 420 + 421 + leave: 422 + dev_replace->srcdev = NULL; 423 + dev_replace->tgtdev = NULL; 424 + btrfs_dev_replace_unlock(dev_replace); 425 + leave_no_lock: 426 + if (tgt_device) 427 + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 428 + return ret; 429 + } 430 + 431 + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 432 + int scrub_ret) 433 + { 434 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 435 + struct btrfs_device *tgt_device; 436 + struct btrfs_device *src_device; 437 + struct btrfs_root *root = fs_info->tree_root; 438 + u8 uuid_tmp[BTRFS_UUID_SIZE]; 439 + struct btrfs_trans_handle *trans; 440 + int ret = 0; 441 + 442 + /* don't allow cancel or unmount to disturb the finishing procedure */ 443 + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 444 
+ 445 + btrfs_dev_replace_lock(dev_replace); 446 + /* was the operation canceled, or is it finished? */ 447 + if (dev_replace->replace_state != 448 + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { 449 + btrfs_dev_replace_unlock(dev_replace); 450 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 451 + return 0; 452 + } 453 + 454 + tgt_device = dev_replace->tgtdev; 455 + src_device = dev_replace->srcdev; 456 + btrfs_dev_replace_unlock(dev_replace); 457 + 458 + /* replace old device with new one in mapping tree */ 459 + if (!scrub_ret) 460 + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, 461 + src_device, 462 + tgt_device); 463 + 464 + /* 465 + * flush all outstanding I/O and inode extent mappings before the 466 + * copy operation is declared as being finished 467 + */ 468 + btrfs_start_delalloc_inodes(root, 0); 469 + btrfs_wait_ordered_extents(root, 0); 470 + 471 + trans = btrfs_start_transaction(root, 0); 472 + if (IS_ERR(trans)) { 473 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 474 + return PTR_ERR(trans); 475 + } 476 + ret = btrfs_commit_transaction(trans, root); 477 + WARN_ON(ret); 478 + 479 + /* keep away write_all_supers() during the finishing procedure */ 480 + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 481 + btrfs_dev_replace_lock(dev_replace); 482 + dev_replace->replace_state = 483 + scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 484 + : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; 485 + dev_replace->tgtdev = NULL; 486 + dev_replace->srcdev = NULL; 487 + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); 488 + dev_replace->item_needs_writeback = 1; 489 + 490 + if (scrub_ret) { 491 + printk_in_rcu(KERN_ERR 492 + "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 493 + src_device->missing ? 
"<missing disk>" : 494 + rcu_str_deref(src_device->name), 495 + src_device->devid, 496 + rcu_str_deref(tgt_device->name), scrub_ret); 497 + btrfs_dev_replace_unlock(dev_replace); 498 + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 499 + if (tgt_device) 500 + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 501 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 502 + 503 + return 0; 504 + } 505 + 506 + printk_in_rcu(KERN_INFO 507 + "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", 508 + src_device->missing ? "<missing disk>" : 509 + rcu_str_deref(src_device->name), 510 + src_device->devid, 511 + rcu_str_deref(tgt_device->name)); 512 + tgt_device->is_tgtdev_for_dev_replace = 0; 513 + tgt_device->devid = src_device->devid; 514 + src_device->devid = BTRFS_DEV_REPLACE_DEVID; 515 + tgt_device->bytes_used = src_device->bytes_used; 516 + memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); 517 + memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); 518 + memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); 519 + tgt_device->total_bytes = src_device->total_bytes; 520 + tgt_device->disk_total_bytes = src_device->disk_total_bytes; 521 + tgt_device->bytes_used = src_device->bytes_used; 522 + if (fs_info->sb->s_bdev == src_device->bdev) 523 + fs_info->sb->s_bdev = tgt_device->bdev; 524 + if (fs_info->fs_devices->latest_bdev == src_device->bdev) 525 + fs_info->fs_devices->latest_bdev = tgt_device->bdev; 526 + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 527 + 528 + btrfs_rm_dev_replace_srcdev(fs_info, src_device); 529 + if (src_device->bdev) { 530 + /* zero out the old super */ 531 + btrfs_scratch_superblock(src_device); 532 + } 533 + /* 534 + * this is again a consistent state where no dev_replace procedure 535 + * is running, the target device is part of the filesystem, the 536 + * source device is not part of the filesystem anymore and its 1st 537 + * superblock is scratched out so 
that it is no longer marked to 538 + * belong to this filesystem. 539 + */ 540 + btrfs_dev_replace_unlock(dev_replace); 541 + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 542 + 543 + /* write back the superblocks */ 544 + trans = btrfs_start_transaction(root, 0); 545 + if (!IS_ERR(trans)) 546 + btrfs_commit_transaction(trans, root); 547 + 548 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 549 + 550 + return 0; 551 + } 552 + 553 + static void btrfs_dev_replace_update_device_in_mapping_tree( 554 + struct btrfs_fs_info *fs_info, 555 + struct btrfs_device *srcdev, 556 + struct btrfs_device *tgtdev) 557 + { 558 + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 559 + struct extent_map *em; 560 + struct map_lookup *map; 561 + u64 start = 0; 562 + int i; 563 + 564 + write_lock(&em_tree->lock); 565 + do { 566 + em = lookup_extent_mapping(em_tree, start, (u64)-1); 567 + if (!em) 568 + break; 569 + map = (struct map_lookup *)em->bdev; 570 + for (i = 0; i < map->num_stripes; i++) 571 + if (srcdev == map->stripes[i].dev) 572 + map->stripes[i].dev = tgtdev; 573 + start = em->start + em->len; 574 + free_extent_map(em); 575 + } while (start); 576 + write_unlock(&em_tree->lock); 577 + } 578 + 579 + static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, 580 + char *srcdev_name, 581 + struct btrfs_device **device) 582 + { 583 + int ret; 584 + 585 + if (srcdevid) { 586 + ret = 0; 587 + *device = btrfs_find_device(root->fs_info, srcdevid, NULL, 588 + NULL); 589 + if (!*device) 590 + ret = -ENOENT; 591 + } else { 592 + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, 593 + device); 594 + } 595 + return ret; 596 + } 597 + 598 + void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, 599 + struct btrfs_ioctl_dev_replace_args *args) 600 + { 601 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 602 + 603 + btrfs_dev_replace_lock(dev_replace); 604 + /* even if !dev_replace_is_valid, the 
values are good enough for 605 + * the replace_status ioctl */ 606 + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 607 + args->status.replace_state = dev_replace->replace_state; 608 + args->status.time_started = dev_replace->time_started; 609 + args->status.time_stopped = dev_replace->time_stopped; 610 + args->status.num_write_errors = 611 + atomic64_read(&dev_replace->num_write_errors); 612 + args->status.num_uncorrectable_read_errors = 613 + atomic64_read(&dev_replace->num_uncorrectable_read_errors); 614 + switch (dev_replace->replace_state) { 615 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 616 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 617 + args->status.progress_1000 = 0; 618 + break; 619 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 620 + args->status.progress_1000 = 1000; 621 + break; 622 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 623 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 624 + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, 625 + div64_u64(dev_replace->srcdev->total_bytes, 1000)); 626 + break; 627 + } 628 + btrfs_dev_replace_unlock(dev_replace); 629 + } 630 + 631 + int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, 632 + struct btrfs_ioctl_dev_replace_args *args) 633 + { 634 + args->result = __btrfs_dev_replace_cancel(fs_info); 635 + return 0; 636 + } 637 + 638 + static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) 639 + { 640 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 641 + struct btrfs_device *tgt_device = NULL; 642 + struct btrfs_trans_handle *trans; 643 + struct btrfs_root *root = fs_info->tree_root; 644 + u64 result; 645 + int ret; 646 + 647 + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 648 + btrfs_dev_replace_lock(dev_replace); 649 + switch (dev_replace->replace_state) { 650 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 651 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 652 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 653 + result = 
BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 654 + btrfs_dev_replace_unlock(dev_replace); 655 + goto leave; 656 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 657 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 658 + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 659 + tgt_device = dev_replace->tgtdev; 660 + dev_replace->tgtdev = NULL; 661 + dev_replace->srcdev = NULL; 662 + break; 663 + } 664 + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; 665 + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); 666 + dev_replace->item_needs_writeback = 1; 667 + btrfs_dev_replace_unlock(dev_replace); 668 + btrfs_scrub_cancel(fs_info); 669 + 670 + trans = btrfs_start_transaction(root, 0); 671 + if (IS_ERR(trans)) { 672 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 673 + return PTR_ERR(trans); 674 + } 675 + ret = btrfs_commit_transaction(trans, root); 676 + WARN_ON(ret); 677 + if (tgt_device) 678 + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 679 + 680 + leave: 681 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 682 + return result; 683 + } 684 + 685 + void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) 686 + { 687 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 688 + 689 + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 690 + btrfs_dev_replace_lock(dev_replace); 691 + switch (dev_replace->replace_state) { 692 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 693 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 694 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 695 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 696 + break; 697 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 698 + dev_replace->replace_state = 699 + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 700 + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); 701 + dev_replace->item_needs_writeback = 1; 702 + pr_info("btrfs: suspending dev_replace for unmount\n"); 703 + break; 704 + } 705 + 706 + 
btrfs_dev_replace_unlock(dev_replace); 707 + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 708 + } 709 + 710 + /* resume dev_replace procedure that was interrupted by unmount */ 711 + int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) 712 + { 713 + struct task_struct *task; 714 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 715 + 716 + btrfs_dev_replace_lock(dev_replace); 717 + switch (dev_replace->replace_state) { 718 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 719 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 720 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 721 + btrfs_dev_replace_unlock(dev_replace); 722 + return 0; 723 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 724 + break; 725 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 726 + dev_replace->replace_state = 727 + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 728 + break; 729 + } 730 + if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { 731 + pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" 732 + "btrfs: you may cancel the operation after 'mount -o degraded'\n"); 733 + btrfs_dev_replace_unlock(dev_replace); 734 + return 0; 735 + } 736 + btrfs_dev_replace_unlock(dev_replace); 737 + 738 + WARN_ON(atomic_xchg( 739 + &fs_info->mutually_exclusive_operation_running, 1)); 740 + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); 741 + return PTR_RET(task); 742 + } 743 + 744 + static int btrfs_dev_replace_kthread(void *data) 745 + { 746 + struct btrfs_fs_info *fs_info = data; 747 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 748 + struct btrfs_ioctl_dev_replace_args *status_args; 749 + u64 progress; 750 + 751 + status_args = kzalloc(sizeof(*status_args), GFP_NOFS); 752 + if (status_args) { 753 + btrfs_dev_replace_status(fs_info, status_args); 754 + progress = status_args->status.progress_1000; 755 + kfree(status_args); 756 + do_div(progress, 10); 757 + printk_in_rcu(KERN_INFO 758 + "btrfs: continuing 
dev_replace from %s (devid %llu) to %s @%u%%\n", 759 + dev_replace->srcdev->missing ? "<missing disk>" : 760 + rcu_str_deref(dev_replace->srcdev->name), 761 + dev_replace->srcdev->devid, 762 + dev_replace->tgtdev ? 763 + rcu_str_deref(dev_replace->tgtdev->name) : 764 + "<missing target disk>", 765 + (unsigned int)progress); 766 + } 767 + btrfs_dev_replace_continue_on_mount(fs_info); 768 + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 769 + 770 + return 0; 771 + } 772 + 773 + static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) 774 + { 775 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 776 + int ret; 777 + 778 + ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 779 + dev_replace->committed_cursor_left, 780 + dev_replace->srcdev->total_bytes, 781 + &dev_replace->scrub_progress, 0, 1); 782 + ret = btrfs_dev_replace_finishing(fs_info, ret); 783 + WARN_ON(ret); 784 + return 0; 785 + } 786 + 787 + int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) 788 + { 789 + if (!dev_replace->is_valid) 790 + return 0; 791 + 792 + switch (dev_replace->replace_state) { 793 + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 794 + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 795 + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 796 + return 0; 797 + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 798 + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 799 + /* 800 + * return true even if tgtdev is missing (this is 801 + * something that can happen if the dev_replace 802 + * procedure is suspended by an umount and then 803 + * the tgtdev is missing (or "btrfs dev scan") was 804 + * not called and the the filesystem is remounted 805 + * in degraded state. This does not stop the 806 + * dev_replace procedure. It needs to be canceled 807 + * manually if the cancelation is wanted. 
808 + */ 809 + break; 810 + } 811 + return 1; 812 + } 813 + 814 + void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) 815 + { 816 + /* the beginning is just an optimization for the typical case */ 817 + if (atomic_read(&dev_replace->nesting_level) == 0) { 818 + acquire_lock: 819 + /* this is not a nested case where the same thread 820 + * is trying to acqurire the same lock twice */ 821 + mutex_lock(&dev_replace->lock); 822 + mutex_lock(&dev_replace->lock_management_lock); 823 + dev_replace->lock_owner = current->pid; 824 + atomic_inc(&dev_replace->nesting_level); 825 + mutex_unlock(&dev_replace->lock_management_lock); 826 + return; 827 + } 828 + 829 + mutex_lock(&dev_replace->lock_management_lock); 830 + if (atomic_read(&dev_replace->nesting_level) > 0 && 831 + dev_replace->lock_owner == current->pid) { 832 + WARN_ON(!mutex_is_locked(&dev_replace->lock)); 833 + atomic_inc(&dev_replace->nesting_level); 834 + mutex_unlock(&dev_replace->lock_management_lock); 835 + return; 836 + } 837 + 838 + mutex_unlock(&dev_replace->lock_management_lock); 839 + goto acquire_lock; 840 + } 841 + 842 + void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) 843 + { 844 + WARN_ON(!mutex_is_locked(&dev_replace->lock)); 845 + mutex_lock(&dev_replace->lock_management_lock); 846 + WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); 847 + WARN_ON(dev_replace->lock_owner != current->pid); 848 + atomic_dec(&dev_replace->nesting_level); 849 + if (atomic_read(&dev_replace->nesting_level) == 0) { 850 + dev_replace->lock_owner = 0; 851 + mutex_unlock(&dev_replace->lock_management_lock); 852 + mutex_unlock(&dev_replace->lock); 853 + } else { 854 + mutex_unlock(&dev_replace->lock_management_lock); 855 + } 856 + }
+44
fs/btrfs/dev-replace.h
··· 1 + /* 2 + * Copyright (C) STRATO AG 2012. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public 6 + * License v2 as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public 14 + * License along with this program; if not, write to the 15 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 + * Boston, MA 021110-1307, USA. 17 + */ 18 + 19 + #if !defined(__BTRFS_DEV_REPLACE__) 20 + #define __BTRFS_DEV_REPLACE__ 21 + 22 + struct btrfs_ioctl_dev_replace_args; 23 + 24 + int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); 25 + int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, 26 + struct btrfs_fs_info *fs_info); 27 + void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); 28 + int btrfs_dev_replace_start(struct btrfs_root *root, 29 + struct btrfs_ioctl_dev_replace_args *args); 30 + void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, 31 + struct btrfs_ioctl_dev_replace_args *args); 32 + int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, 33 + struct btrfs_ioctl_dev_replace_args *args); 34 + void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); 35 + int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); 36 + int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); 37 + void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); 38 + void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); 39 + 40 + static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) 41 + { 42 + atomic64_inc(stat_value); 43 + } 44 + #endif
+59
fs/btrfs/dir-item.c
··· 213 213 return btrfs_match_dir_item_name(root, path, name, name_len); 214 214 } 215 215 216 + int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, 217 + const char *name, int name_len) 218 + { 219 + int ret; 220 + struct btrfs_key key; 221 + struct btrfs_dir_item *di; 222 + int data_size; 223 + struct extent_buffer *leaf; 224 + int slot; 225 + struct btrfs_path *path; 226 + 227 + 228 + path = btrfs_alloc_path(); 229 + if (!path) 230 + return -ENOMEM; 231 + 232 + key.objectid = dir; 233 + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 234 + key.offset = btrfs_name_hash(name, name_len); 235 + 236 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 237 + 238 + /* return back any errors */ 239 + if (ret < 0) 240 + goto out; 241 + 242 + /* nothing found, we're safe */ 243 + if (ret > 0) { 244 + ret = 0; 245 + goto out; 246 + } 247 + 248 + /* we found an item, look for our name in the item */ 249 + di = btrfs_match_dir_item_name(root, path, name, name_len); 250 + if (di) { 251 + /* our exact name was found */ 252 + ret = -EEXIST; 253 + goto out; 254 + } 255 + 256 + /* 257 + * see if there is room in the item to insert this 258 + * name 259 + */ 260 + data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); 261 + leaf = path->nodes[0]; 262 + slot = path->slots[0]; 263 + if (data_size + btrfs_item_size_nr(leaf, slot) + 264 + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { 265 + ret = -EOVERFLOW; 266 + } else { 267 + /* plenty of insertion room */ 268 + ret = 0; 269 + } 270 + out: 271 + btrfs_free_path(path); 272 + return ret; 273 + } 274 + 216 275 /* 217 276 * lookup a directory item based on index. 'dir' is the objectid 218 277 * we're searching in, and 'mod' tells us if you plan on deleting the
+92 -50
fs/btrfs/disk-io.c
··· 45 45 #include "inode-map.h" 46 46 #include "check-integrity.h" 47 47 #include "rcu-string.h" 48 + #include "dev-replace.h" 48 49 49 50 #ifdef CONFIG_X86 50 51 #include <asm/cpufeature.h> ··· 388 387 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 389 388 break; 390 389 391 - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 390 + num_copies = btrfs_num_copies(root->fs_info, 392 391 eb->start, eb->len); 393 392 if (num_copies == 1) 394 393 break; ··· 853 852 int mirror_num, unsigned long bio_flags, 854 853 u64 bio_offset) 855 854 { 855 + int ret; 856 + 856 857 /* 857 858 * when we're called for a write, we're already in the async 858 859 * submission context. Just jump into btrfs_map_bio 859 860 */ 860 - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 861 + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 + if (ret) 863 + bio_endio(bio, ret); 864 + return ret; 861 865 } 862 866 863 867 static int check_async_write(struct inode *inode, unsigned long bio_flags) ··· 884 878 int ret; 885 879 886 880 if (!(rw & REQ_WRITE)) { 887 - 888 881 /* 889 882 * called for a read, do the setup so that checksum validation 890 883 * can happen in the async kernel threads ··· 891 886 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 892 887 bio, 1); 893 888 if (ret) 894 - return ret; 895 - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 896 - mirror_num, 0); 889 + goto out_w_error; 890 + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 891 + mirror_num, 0); 897 892 } else if (!async) { 898 893 ret = btree_csum_one_bio(bio); 899 894 if (ret) 900 - return ret; 901 - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 902 - mirror_num, 0); 895 + goto out_w_error; 896 + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 897 + mirror_num, 0); 898 + } else { 899 + /* 900 + * kthread helpers are used to submit writes so that 901 + * checksumming can happen in parallel across all CPUs 902 + */ 903 + ret = 
btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 904 + inode, rw, bio, mirror_num, 0, 905 + bio_offset, 906 + __btree_submit_bio_start, 907 + __btree_submit_bio_done); 903 908 } 904 909 905 - /* 906 - * kthread helpers are used to submit writes so that checksumming 907 - * can happen in parallel across all CPUs 908 - */ 909 - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 910 - inode, rw, bio, mirror_num, 0, 911 - bio_offset, 912 - __btree_submit_bio_start, 913 - __btree_submit_bio_done); 910 + if (ret) { 911 + out_w_error: 912 + bio_endio(bio, ret); 913 + } 914 + return ret; 914 915 } 915 916 916 917 #ifdef CONFIG_MIGRATION ··· 1001 990 1002 991 static int btree_set_page_dirty(struct page *page) 1003 992 { 993 + #ifdef DEBUG 1004 994 struct extent_buffer *eb; 1005 995 1006 996 BUG_ON(!PagePrivate(page)); ··· 1010 998 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1011 999 BUG_ON(!atomic_read(&eb->refs)); 1012 1000 btrfs_assert_tree_locked(eb); 1001 + #endif 1013 1002 return __set_page_dirty_nobuffers(page); 1014 1003 } 1015 1004 ··· 1142 1129 root->fs_info->dirty_metadata_bytes); 1143 1130 } 1144 1131 spin_unlock(&root->fs_info->delalloc_lock); 1145 - } 1146 1132 1147 - /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1148 - btrfs_set_lock_blocking(buf); 1149 - clear_extent_buffer_dirty(buf); 1133 + /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1134 + btrfs_set_lock_blocking(buf); 1135 + clear_extent_buffer_dirty(buf); 1136 + } 1150 1137 } 1151 1138 } 1152 1139 ··· 1206 1193 root->root_key.objectid = objectid; 1207 1194 root->anon_dev = 0; 1208 1195 1209 - spin_lock_init(&root->root_times_lock); 1196 + spin_lock_init(&root->root_item_lock); 1210 1197 } 1211 1198 1212 1199 static int __must_check find_and_setup_root(struct btrfs_root *tree_root, ··· 2144 2131 init_rwsem(&fs_info->extent_commit_sem); 2145 2132 init_rwsem(&fs_info->cleanup_work_sem); 2146 2133 init_rwsem(&fs_info->subvol_sem); 2134 + 
fs_info->dev_replace.lock_owner = 0; 2135 + atomic_set(&fs_info->dev_replace.nesting_level, 0); 2136 + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); 2137 + mutex_init(&fs_info->dev_replace.lock_management_lock); 2138 + mutex_init(&fs_info->dev_replace.lock); 2147 2139 2148 2140 spin_lock_init(&fs_info->qgroup_lock); 2149 2141 fs_info->qgroup_tree = RB_ROOT; ··· 2297 2279 fs_info->thread_pool_size, 2298 2280 &fs_info->generic_worker); 2299 2281 2282 + btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", 2283 + fs_info->thread_pool_size, 2284 + &fs_info->generic_worker); 2285 + 2300 2286 btrfs_init_workers(&fs_info->submit_workers, "submit", 2301 2287 min_t(u64, fs_devices->num_devices, 2302 2288 fs_info->thread_pool_size), ··· 2372 2350 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2373 2351 ret |= btrfs_start_workers(&fs_info->caching_workers); 2374 2352 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2353 + ret |= btrfs_start_workers(&fs_info->flush_workers); 2375 2354 if (ret) { 2376 2355 err = -ENOMEM; 2377 2356 goto fail_sb_buffer; ··· 2441 2418 goto fail_tree_roots; 2442 2419 } 2443 2420 2444 - btrfs_close_extra_devices(fs_devices); 2421 + /* 2422 + * keep the device that is marked to be the target device for the 2423 + * dev_replace procedure 2424 + */ 2425 + btrfs_close_extra_devices(fs_info, fs_devices, 0); 2445 2426 2446 2427 if (!fs_devices->latest_bdev) { 2447 2428 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", ··· 2517 2490 goto fail_block_groups; 2518 2491 } 2519 2492 2493 + ret = btrfs_init_dev_replace(fs_info); 2494 + if (ret) { 2495 + pr_err("btrfs: failed to init dev_replace: %d\n", ret); 2496 + goto fail_block_groups; 2497 + } 2498 + 2499 + btrfs_close_extra_devices(fs_info, fs_devices, 1); 2500 + 2520 2501 ret = btrfs_init_space_info(fs_info); 2521 2502 if (ret) { 2522 2503 printk(KERN_ERR "Failed to initial space info: %d\n", ret); ··· 2538 2503 } 2539 2504 
fs_info->num_tolerated_disk_barrier_failures = 2540 2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2506 + if (fs_info->fs_devices->missing_devices > 2507 + fs_info->num_tolerated_disk_barrier_failures && 2508 + !(sb->s_flags & MS_RDONLY)) { 2509 + printk(KERN_WARNING 2510 + "Btrfs: too many missing devices, writeable mount is not allowed\n"); 2511 + goto fail_block_groups; 2512 + } 2541 2513 2542 2514 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2543 2515 "btrfs-cleaner"); ··· 2673 2631 return ret; 2674 2632 } 2675 2633 2634 + ret = btrfs_resume_dev_replace_async(fs_info); 2635 + if (ret) { 2636 + pr_warn("btrfs: failed to resume dev_replace\n"); 2637 + close_ctree(tree_root); 2638 + return ret; 2639 + } 2640 + 2676 2641 return 0; 2677 2642 2678 2643 fail_qgroup: ··· 2716 2667 btrfs_stop_workers(&fs_info->submit_workers); 2717 2668 btrfs_stop_workers(&fs_info->delayed_workers); 2718 2669 btrfs_stop_workers(&fs_info->caching_workers); 2670 + btrfs_stop_workers(&fs_info->flush_workers); 2719 2671 fail_alloc: 2720 2672 fail_iput: 2721 2673 btrfs_mapping_tree_free(&fs_info->mapping_tree); ··· 3320 3270 smp_mb(); 3321 3271 3322 3272 /* pause restriper - we want to resume on mount */ 3323 - btrfs_pause_balance(root->fs_info); 3273 + btrfs_pause_balance(fs_info); 3324 3274 3325 - btrfs_scrub_cancel(root); 3275 + btrfs_dev_replace_suspend_for_unmount(fs_info); 3276 + 3277 + btrfs_scrub_cancel(fs_info); 3326 3278 3327 3279 /* wait for any defraggers to finish */ 3328 3280 wait_event(fs_info->transaction_wait, 3329 3281 (atomic_read(&fs_info->defrag_running) == 0)); 3330 3282 3331 3283 /* clear out the rbtree of defraggable inodes */ 3332 - btrfs_run_defrag_inodes(fs_info); 3284 + btrfs_cleanup_defrag_inodes(fs_info); 3333 3285 3334 3286 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3335 3287 ret = btrfs_commit_super(root); ··· 3391 3339 btrfs_stop_workers(&fs_info->delayed_workers); 3392 3340 btrfs_stop_workers(&fs_info->caching_workers); 
3393 3341 btrfs_stop_workers(&fs_info->readahead_workers); 3342 + btrfs_stop_workers(&fs_info->flush_workers); 3394 3343 3395 3344 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396 3345 if (btrfs_test_opt(root, CHECK_INTEGRITY)) ··· 3436 3383 int was_dirty; 3437 3384 3438 3385 btrfs_assert_tree_locked(buf); 3439 - if (transid != root->fs_info->generation) { 3440 - printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3386 + if (transid != root->fs_info->generation) 3387 + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " 3441 3388 "found %llu running %llu\n", 3442 3389 (unsigned long long)buf->start, 3443 3390 (unsigned long long)transid, 3444 3391 (unsigned long long)root->fs_info->generation); 3445 - WARN_ON(1); 3446 - } 3447 3392 was_dirty = set_extent_buffer_dirty(buf); 3448 3393 if (!was_dirty) { 3449 3394 spin_lock(&root->fs_info->delalloc_lock); ··· 3450 3399 } 3451 3400 } 3452 3401 3453 - void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3402 + static void __btrfs_btree_balance_dirty(struct btrfs_root *root, 3403 + int flush_delayed) 3454 3404 { 3455 3405 /* 3456 3406 * looks as though older kernels can get into trouble with ··· 3463 3411 if (current->flags & PF_MEMALLOC) 3464 3412 return; 3465 3413 3466 - btrfs_balance_delayed_items(root); 3414 + if (flush_delayed) 3415 + btrfs_balance_delayed_items(root); 3467 3416 3468 3417 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 3418 ··· 3475 3422 return; 3476 3423 } 3477 3424 3478 - void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3425 + void btrfs_btree_balance_dirty(struct btrfs_root *root) 3479 3426 { 3480 - /* 3481 - * looks as though older kernels can get into trouble with 3482 - * this code, they end up stuck in balance_dirty_pages forever 3483 - */ 3484 - u64 num_dirty; 3485 - unsigned long thresh = 32 * 1024 * 1024; 3427 + __btrfs_btree_balance_dirty(root, 1); 3428 + } 3486 3429 3487 - if (current->flags & PF_MEMALLOC) 3488 - return; 3489 - 
3490 - num_dirty = root->fs_info->dirty_metadata_bytes; 3491 - 3492 - if (num_dirty > thresh) { 3493 - balance_dirty_pages_ratelimited( 3494 - root->fs_info->btree_inode->i_mapping); 3495 - } 3496 - return; 3430 + void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) 3431 + { 3432 + __btrfs_btree_balance_dirty(root, 0); 3497 3433 } 3498 3434 3499 3435 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+2 -2
fs/btrfs/disk-io.h
··· 62 62 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 63 63 struct btrfs_key *location); 64 64 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 65 - void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66 - void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65 + void btrfs_btree_balance_dirty(struct btrfs_root *root); 66 + void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); 67 67 void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 68 68 void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 69 69 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+135 -92
fs/btrfs/extent-tree.c
··· 33 33 #include "volumes.h" 34 34 #include "locking.h" 35 35 #include "free-space-cache.h" 36 + #include "math.h" 36 37 37 38 #undef SCRAMBLE_DELAYED_REFS 38 39 ··· 648 647 list_for_each_entry_rcu(found, head, list) 649 648 found->full = 0; 650 649 rcu_read_unlock(); 651 - } 652 - 653 - static u64 div_factor(u64 num, int factor) 654 - { 655 - if (factor == 10) 656 - return num; 657 - num *= factor; 658 - do_div(num, 10); 659 - return num; 660 - } 661 - 662 - static u64 div_factor_fine(u64 num, int factor) 663 - { 664 - if (factor == 100) 665 - return num; 666 - num *= factor; 667 - do_div(num, 100); 668 - return num; 669 650 } 670 651 671 652 u64 btrfs_find_block_group(struct btrfs_root *root, ··· 1818 1835 1819 1836 1820 1837 /* Tell the block device(s) that the sectors can be discarded */ 1821 - ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1838 + ret = btrfs_map_block(root->fs_info, REQ_DISCARD, 1822 1839 bytenr, &num_bytes, &bbio, 0); 1823 1840 /* Error condition is -ENOMEM */ 1824 1841 if (!ret) { ··· 2297 2314 kfree(extent_op); 2298 2315 2299 2316 if (ret) { 2317 + list_del_init(&locked_ref->cluster); 2318 + mutex_unlock(&locked_ref->mutex); 2319 + 2300 2320 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2301 2321 spin_lock(&delayed_refs->lock); 2302 2322 return ret; ··· 2342 2356 count++; 2343 2357 2344 2358 if (ret) { 2359 + if (locked_ref) { 2360 + list_del_init(&locked_ref->cluster); 2361 + mutex_unlock(&locked_ref->mutex); 2362 + } 2345 2363 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2346 2364 spin_lock(&delayed_refs->lock); 2347 2365 return ret; ··· 3651 3661 3652 3662 static int can_overcommit(struct btrfs_root *root, 3653 3663 struct btrfs_space_info *space_info, u64 bytes, 3654 - int flush) 3664 + enum btrfs_reserve_flush_enum flush) 3655 3665 { 3656 3666 u64 profile = btrfs_get_alloc_profile(root, 0); 3657 3667 u64 avail; ··· 3675 3685 avail >>= 1; 3676 3686 3677 3687 /* 3678 - * 
If we aren't flushing don't let us overcommit too much, say 3679 - * 1/8th of the space. If we can flush, let it overcommit up to 3680 - * 1/2 of the space. 3688 + * If we aren't flushing all things, let us overcommit up to 3689 + * 1/2th of the space. If we can flush, don't let us overcommit 3690 + * too much, let it overcommit up to 1/8 of the space. 3681 3691 */ 3682 - if (flush) 3692 + if (flush == BTRFS_RESERVE_FLUSH_ALL) 3683 3693 avail >>= 3; 3684 3694 else 3685 3695 avail >>= 1; 3686 3696 3687 3697 if (used + bytes < space_info->total_bytes + avail) 3688 3698 return 1; 3699 + return 0; 3700 + } 3701 + 3702 + static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, 3703 + unsigned long nr_pages, 3704 + enum wb_reason reason) 3705 + { 3706 + if (!writeback_in_progress(sb->s_bdi) && 3707 + down_read_trylock(&sb->s_umount)) { 3708 + writeback_inodes_sb_nr(sb, nr_pages, reason); 3709 + up_read(&sb->s_umount); 3710 + return 1; 3711 + } 3712 + 3689 3713 return 0; 3690 3714 } 3691 3715 ··· 3717 3713 long time_left; 3718 3714 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3719 3715 int loops = 0; 3716 + enum btrfs_reserve_flush_enum flush; 3720 3717 3721 3718 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 3719 block_rsv = &root->fs_info->delalloc_block_rsv; ··· 3735 3730 while (delalloc_bytes && loops < 3) { 3736 3731 max_reclaim = min(delalloc_bytes, to_reclaim); 3737 3732 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3738 - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3739 - WB_REASON_FS_FREE_SPACE); 3733 + writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, 3734 + nr_pages, 3735 + WB_REASON_FS_FREE_SPACE); 3740 3736 3741 3737 /* 3742 3738 * We need to wait for the async pages to actually start before ··· 3746 3740 wait_event(root->fs_info->async_submit_wait, 3747 3741 !atomic_read(&root->fs_info->async_delalloc_pages)); 3748 3742 3743 + if (!trans) 3744 + flush = BTRFS_RESERVE_FLUSH_ALL; 3745 + else 
3746 + flush = BTRFS_RESERVE_NO_FLUSH; 3749 3747 spin_lock(&space_info->lock); 3750 - if (can_overcommit(root, space_info, orig, !trans)) { 3748 + if (can_overcommit(root, space_info, orig, flush)) { 3751 3749 spin_unlock(&space_info->lock); 3752 3750 break; 3753 3751 } ··· 3909 3899 */ 3910 3900 static int reserve_metadata_bytes(struct btrfs_root *root, 3911 3901 struct btrfs_block_rsv *block_rsv, 3912 - u64 orig_bytes, int flush) 3902 + u64 orig_bytes, 3903 + enum btrfs_reserve_flush_enum flush) 3913 3904 { 3914 3905 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 3906 u64 used; ··· 3923 3912 ret = 0; 3924 3913 spin_lock(&space_info->lock); 3925 3914 /* 3926 - * We only want to wait if somebody other than us is flushing and we are 3927 - * actually alloed to flush. 3915 + * We only want to wait if somebody other than us is flushing and we 3916 + * are actually allowed to flush all things. 3928 3917 */ 3929 - while (flush && !flushing && space_info->flush) { 3918 + while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 3919 + space_info->flush) { 3930 3920 spin_unlock(&space_info->lock); 3931 3921 /* 3932 3922 * If we have a trans handle we can't wait because the flusher ··· 3993 3981 * Couldn't make our reservation, save our place so while we're trying 3994 3982 * to reclaim space we can actually use it instead of somebody else 3995 3983 * stealing it from us. 3984 + * 3985 + * We make the other tasks wait for the flush only when we can flush 3986 + * all things. 
3996 3987 */ 3997 - if (ret && flush) { 3988 + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { 3998 3989 flushing = true; 3999 3990 space_info->flush = 1; 4000 3991 } 4001 3992 4002 3993 spin_unlock(&space_info->lock); 4003 3994 4004 - if (!ret || !flush) 3995 + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4005 3996 goto out; 4006 3997 4007 3998 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4008 3999 flush_state); 4009 4000 flush_state++; 4001 + 4002 + /* 4003 + * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock 4004 + * would happen. So skip delalloc flush. 4005 + */ 4006 + if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4007 + (flush_state == FLUSH_DELALLOC || 4008 + flush_state == FLUSH_DELALLOC_WAIT)) 4009 + flush_state = ALLOC_CHUNK; 4010 + 4010 4011 if (!ret) 4011 4012 goto again; 4012 - else if (flush_state <= COMMIT_TRANS) 4013 + else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4014 + flush_state < COMMIT_TRANS) 4015 + goto again; 4016 + else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4017 + flush_state <= COMMIT_TRANS) 4013 4018 goto again; 4014 4019 4015 4020 out: ··· 4177 4148 kfree(rsv); 4178 4149 } 4179 4150 4180 - static inline int __block_rsv_add(struct btrfs_root *root, 4181 - struct btrfs_block_rsv *block_rsv, 4182 - u64 num_bytes, int flush) 4151 + int btrfs_block_rsv_add(struct btrfs_root *root, 4152 + struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4153 + enum btrfs_reserve_flush_enum flush) 4183 4154 { 4184 4155 int ret; 4185 4156 ··· 4193 4164 } 4194 4165 4195 4166 return ret; 4196 - } 4197 - 4198 - int btrfs_block_rsv_add(struct btrfs_root *root, 4199 - struct btrfs_block_rsv *block_rsv, 4200 - u64 num_bytes) 4201 - { 4202 - return __block_rsv_add(root, block_rsv, num_bytes, 1); 4203 - } 4204 - 4205 - int btrfs_block_rsv_add_noflush(struct btrfs_root *root, 4206 - struct btrfs_block_rsv *block_rsv, 4207 - u64 num_bytes) 4208 - { 4209 - return __block_rsv_add(root, block_rsv, num_bytes, 0); 4210 4167 } 4211 4168 4212 4169 
int btrfs_block_rsv_check(struct btrfs_root *root, ··· 4213 4198 return ret; 4214 4199 } 4215 4200 4216 - static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4217 - struct btrfs_block_rsv *block_rsv, 4218 - u64 min_reserved, int flush) 4201 + int btrfs_block_rsv_refill(struct btrfs_root *root, 4202 + struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4203 + enum btrfs_reserve_flush_enum flush) 4219 4204 { 4220 4205 u64 num_bytes = 0; 4221 4206 int ret = -ENOSPC; ··· 4241 4226 } 4242 4227 4243 4228 return ret; 4244 - } 4245 - 4246 - int btrfs_block_rsv_refill(struct btrfs_root *root, 4247 - struct btrfs_block_rsv *block_rsv, 4248 - u64 min_reserved) 4249 - { 4250 - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); 4251 - } 4252 - 4253 - int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, 4254 - struct btrfs_block_rsv *block_rsv, 4255 - u64 min_reserved) 4256 - { 4257 - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); 4258 4229 } 4259 4230 4260 4231 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, ··· 4533 4532 u64 csum_bytes; 4534 4533 unsigned nr_extents = 0; 4535 4534 int extra_reserve = 0; 4536 - int flush = 1; 4535 + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4537 4536 int ret; 4537 + bool delalloc_lock = true; 4538 4538 4539 - /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 - if (btrfs_is_free_space_inode(inode)) 4541 - flush = 0; 4539 + /* If we are a free space inode we need to not flush since we will be in 4540 + * the middle of a transaction commit. We also don't need the delalloc 4541 + * mutex since we won't race with anybody. We need this mostly to make 4542 + * lockdep shut its filthy mouth. 
4543 + */ 4544 + if (btrfs_is_free_space_inode(inode)) { 4545 + flush = BTRFS_RESERVE_NO_FLUSH; 4546 + delalloc_lock = false; 4547 + } 4542 4548 4543 - if (flush && btrfs_transaction_in_commit(root->fs_info)) 4549 + if (flush != BTRFS_RESERVE_NO_FLUSH && 4550 + btrfs_transaction_in_commit(root->fs_info)) 4544 4551 schedule_timeout(1); 4545 4552 4546 - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4553 + if (delalloc_lock) 4554 + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4555 + 4547 4556 num_bytes = ALIGN(num_bytes, root->sectorsize); 4548 4557 4549 4558 spin_lock(&BTRFS_I(inode)->lock); ··· 4583 4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4584 4573 nr_extents * root->leafsize); 4585 4574 if (ret) { 4586 - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4575 + spin_lock(&BTRFS_I(inode)->lock); 4576 + calc_csum_metadata_size(inode, num_bytes, 0); 4577 + spin_unlock(&BTRFS_I(inode)->lock); 4578 + if (delalloc_lock) 4579 + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4587 4580 return ret; 4588 4581 } 4589 4582 } ··· 4622 4607 btrfs_ino(inode), 4623 4608 to_free, 0); 4624 4609 } 4625 - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4610 + if (root->fs_info->quota_enabled) { 4611 + btrfs_qgroup_free(root, num_bytes + 4612 + nr_extents * root->leafsize); 4613 + } 4614 + if (delalloc_lock) 4615 + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4626 4616 return ret; 4627 4617 } 4628 4618 ··· 4639 4619 } 4640 4620 BTRFS_I(inode)->reserved_extents += nr_extents; 4641 4621 spin_unlock(&BTRFS_I(inode)->lock); 4642 - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4622 + 4623 + if (delalloc_lock) 4624 + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4643 4625 4644 4626 if (to_reserve) 4645 4627 trace_btrfs_space_reservation(root->fs_info,"delalloc", ··· 4991 4969 { 4992 4970 struct btrfs_fs_info *fs_info = root->fs_info; 4993 4971 struct btrfs_block_group_cache *cache = NULL; 4972 + struct btrfs_space_info *space_info; 4973 + struct btrfs_block_rsv *global_rsv = 
&fs_info->global_block_rsv; 4994 4974 u64 len; 4975 + bool readonly; 4995 4976 4996 4977 while (start <= end) { 4978 + readonly = false; 4997 4979 if (!cache || 4998 4980 start >= cache->key.objectid + cache->key.offset) { 4999 4981 if (cache) ··· 5015 4989 } 5016 4990 5017 4991 start += len; 4992 + space_info = cache->space_info; 5018 4993 5019 - spin_lock(&cache->space_info->lock); 4994 + spin_lock(&space_info->lock); 5020 4995 spin_lock(&cache->lock); 5021 4996 cache->pinned -= len; 5022 - cache->space_info->bytes_pinned -= len; 5023 - if (cache->ro) 5024 - cache->space_info->bytes_readonly += len; 4997 + space_info->bytes_pinned -= len; 4998 + if (cache->ro) { 4999 + space_info->bytes_readonly += len; 5000 + readonly = true; 5001 + } 5025 5002 spin_unlock(&cache->lock); 5026 - spin_unlock(&cache->space_info->lock); 5003 + if (!readonly && global_rsv->space_info == space_info) { 5004 + spin_lock(&global_rsv->lock); 5005 + if (!global_rsv->full) { 5006 + len = min(len, global_rsv->size - 5007 + global_rsv->reserved); 5008 + global_rsv->reserved += len; 5009 + space_info->bytes_may_use += len; 5010 + if (global_rsv->reserved >= global_rsv->size) 5011 + global_rsv->full = 1; 5012 + } 5013 + spin_unlock(&global_rsv->lock); 5014 + } 5015 + spin_unlock(&space_info->lock); 5027 5016 } 5028 5017 5029 5018 if (cache) ··· 5507 5466 return 0; 5508 5467 } 5509 5468 5510 - static int __get_block_group_index(u64 flags) 5469 + int __get_raid_index(u64 flags) 5511 5470 { 5512 5471 int index; 5513 5472 ··· 5527 5486 5528 5487 static int get_block_group_index(struct btrfs_block_group_cache *cache) 5529 5488 { 5530 - return __get_block_group_index(cache->flags); 5489 + return __get_raid_index(cache->flags); 5531 5490 } 5532 5491 5533 5492 enum btrfs_loop_type { ··· 6310 6269 block_rsv = get_block_rsv(trans, root); 6311 6270 6312 6271 if (block_rsv->size == 0) { 6313 - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6272 + ret = reserve_metadata_bytes(root, block_rsv, 
blocksize, 6273 + BTRFS_RESERVE_NO_FLUSH); 6314 6274 /* 6315 6275 * If we couldn't reserve metadata bytes try and use some from 6316 6276 * the global reserve. ··· 6334 6292 static DEFINE_RATELIMIT_STATE(_rs, 6335 6293 DEFAULT_RATELIMIT_INTERVAL, 6336 6294 /*DEFAULT_RATELIMIT_BURST*/ 2); 6337 - if (__ratelimit(&_rs)) { 6338 - printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); 6339 - WARN_ON(1); 6340 - } 6341 - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6295 + if (__ratelimit(&_rs)) 6296 + WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", 6297 + ret); 6298 + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6299 + BTRFS_RESERVE_NO_FLUSH); 6342 6300 if (!ret) { 6343 6301 return block_rsv; 6344 6302 } else if (ret && block_rsv != global_rsv) { ··· 7469 7427 */ 7470 7428 target = get_restripe_target(root->fs_info, block_group->flags); 7471 7429 if (target) { 7472 - index = __get_block_group_index(extended_to_chunk(target)); 7430 + index = __get_raid_index(extended_to_chunk(target)); 7473 7431 } else { 7474 7432 /* 7475 7433 * this is just a balance, so if we were marked as full ··· 7503 7461 * check to make sure we can actually find a chunk with enough 7504 7462 * space to fit our block group in. 7505 7463 */ 7506 - if (device->total_bytes > device->bytes_used + min_free) { 7464 + if (device->total_bytes > device->bytes_used + min_free && 7465 + !device->is_tgtdev_for_dev_replace) { 7507 7466 ret = find_free_dev_extent(device, min_free, 7508 7467 &dev_offset, NULL); 7509 7468 if (!ret)
+14 -23
fs/btrfs/extent_io.c
··· 341 341 { 342 342 struct rb_node *node; 343 343 344 - if (end < start) { 345 - printk(KERN_ERR "btrfs end < start %llu %llu\n", 344 + if (end < start) 345 + WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", 346 346 (unsigned long long)end, 347 347 (unsigned long long)start); 348 - WARN_ON(1); 349 - } 350 348 state->start = start; 351 349 state->end = end; 352 350 ··· 1917 1919 * the standard behavior is to write all copies in a raid setup. here we only 1918 1920 * want to write the one bad copy. so we do the mapping for ourselves and issue 1919 1921 * submit_bio directly. 1920 - * to avoid any synchonization issues, wait for the data after writing, which 1922 + * to avoid any synchronization issues, wait for the data after writing, which 1921 1923 * actually prevents the read that triggered the error from finishing. 1922 1924 * currently, there can be no more than two copies of every data bit. thus, 1923 1925 * exactly one rewrite is required. 1924 1926 */ 1925 - int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1927 + int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 1926 1928 u64 length, u64 logical, struct page *page, 1927 1929 int mirror_num) 1928 1930 { ··· 1944 1946 bio->bi_size = 0; 1945 1947 map_length = length; 1946 1948 1947 - ret = btrfs_map_block(map_tree, WRITE, logical, 1949 + ret = btrfs_map_block(fs_info, WRITE, logical, 1948 1950 &map_length, &bbio, mirror_num); 1949 1951 if (ret) { 1950 1952 bio_put(bio); ··· 1982 1984 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1983 1985 int mirror_num) 1984 1986 { 1985 - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 1986 1987 u64 start = eb->start; 1987 1988 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1988 1989 int ret = 0; 1989 1990 1990 1991 for (i = 0; i < num_pages; i++) { 1991 1992 struct page *p = extent_buffer_page(eb, i); 1992 - ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1993 
+ ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, 1993 1994 start, p, mirror_num); 1994 1995 if (ret) 1995 1996 break; ··· 2007 2010 u64 private; 2008 2011 u64 private_failure; 2009 2012 struct io_failure_record *failrec; 2010 - struct btrfs_mapping_tree *map_tree; 2013 + struct btrfs_fs_info *fs_info; 2011 2014 struct extent_state *state; 2012 2015 int num_copies; 2013 2016 int did_repair = 0; ··· 2043 2046 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2044 2047 2045 2048 if (state && state->start == failrec->start) { 2046 - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2047 - num_copies = btrfs_num_copies(map_tree, failrec->logical, 2048 - failrec->len); 2049 + fs_info = BTRFS_I(inode)->root->fs_info; 2050 + num_copies = btrfs_num_copies(fs_info, failrec->logical, 2051 + failrec->len); 2049 2052 if (num_copies > 1) { 2050 - ret = repair_io_failure(map_tree, start, failrec->len, 2053 + ret = repair_io_failure(fs_info, start, failrec->len, 2051 2054 failrec->logical, page, 2052 2055 failrec->failed_mirror); 2053 2056 did_repair = !ret; ··· 2156 2159 * clean_io_failure() clean all those errors at once. 2157 2160 */ 2158 2161 } 2159 - num_copies = btrfs_num_copies( 2160 - &BTRFS_I(inode)->root->fs_info->mapping_tree, 2161 - failrec->logical, failrec->len); 2162 + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, 2163 + failrec->logical, failrec->len); 2162 2164 if (num_copies == 1) { 2163 2165 /* 2164 2166 * we only have a single copy of the data, so don't bother with ··· 2462 2466 return bio; 2463 2467 } 2464 2468 2465 - /* 2466 - * Since writes are async, they will only return -ENOMEM. 2467 - * Reads can return the full range of I/O error conditions. 
2468 - */ 2469 2469 static int __must_check submit_one_bio(int rw, struct bio *bio, 2470 2470 int mirror_num, unsigned long bio_flags) 2471 2471 { ··· 4713 4721 } 4714 4722 4715 4723 if (start + min_len > eb->len) { 4716 - printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4724 + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4717 4725 "wanted %lu %lu\n", (unsigned long long)eb->start, 4718 4726 eb->len, start, min_len); 4719 - WARN_ON(1); 4720 4727 return -EINVAL; 4721 4728 } 4722 4729
+2 -2
fs/btrfs/extent_io.h
··· 337 337 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 338 338 gfp_t gfp_flags); 339 339 340 - struct btrfs_mapping_tree; 340 + struct btrfs_fs_info; 341 341 342 - int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 342 + int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 343 343 u64 length, u64 logical, struct page *page, 344 344 int mirror_num); 345 345 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+10 -14
fs/btrfs/extent_map.c
··· 49 49 struct extent_map *alloc_extent_map(void) 50 50 { 51 51 struct extent_map *em; 52 - em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); 52 + em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); 53 53 if (!em) 54 54 return NULL; 55 55 em->in_tree = 0; ··· 198 198 merge = rb_entry(rb, struct extent_map, rb_node); 199 199 if (rb && mergable_maps(merge, em)) { 200 200 em->start = merge->start; 201 + em->orig_start = merge->orig_start; 201 202 em->len += merge->len; 202 203 em->block_len += merge->block_len; 203 204 em->block_start = merge->block_start; 204 205 merge->in_tree = 0; 205 - if (merge->generation > em->generation) { 206 - em->mod_start = em->start; 207 - em->mod_len = em->len; 208 - em->generation = merge->generation; 209 - list_move(&em->list, &tree->modified_extents); 210 - } 206 + em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; 207 + em->mod_start = merge->mod_start; 208 + em->generation = max(em->generation, merge->generation); 209 + list_move(&em->list, &tree->modified_extents); 211 210 212 211 list_del_init(&merge->list); 213 212 rb_erase(&merge->rb_node, &tree->map); ··· 222 223 em->block_len += merge->len; 223 224 rb_erase(&merge->rb_node, &tree->map); 224 225 merge->in_tree = 0; 225 - if (merge->generation > em->generation) { 226 - em->mod_len = em->len; 227 - em->generation = merge->generation; 228 - list_move(&em->list, &tree->modified_extents); 229 - } 226 + em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; 227 + em->generation = max(em->generation, merge->generation); 230 228 list_del_init(&merge->list); 231 229 free_extent_map(merge); 232 230 } ··· 261 265 em->mod_start = em->start; 262 266 em->mod_len = em->len; 263 267 264 - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 268 + if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { 265 269 prealloc = true; 266 - clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); 270 + clear_bit(EXTENT_FLAG_FILLING, &em->flags); 267 271 } 268 272 269 273 try_merge_map(tree, em);
+2
fs/btrfs/extent_map.h
··· 14 14 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 15 15 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 16 16 #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ 17 + #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ 17 18 18 19 struct extent_map { 19 20 struct rb_node rb_node; ··· 25 24 u64 mod_start; 26 25 u64 mod_len; 27 26 u64 orig_start; 27 + u64 orig_block_len; 28 28 u64 block_start; 29 29 u64 block_len; 30 30 u64 generation;
+20 -1
fs/btrfs/file-item.c
··· 133 133 return ERR_PTR(ret); 134 134 } 135 135 136 - 137 136 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 138 137 struct btrfs_root *root, 139 138 struct btrfs_path *path, u64 objectid, ··· 150 151 return ret; 151 152 } 152 153 154 + u64 btrfs_file_extent_length(struct btrfs_path *path) 155 + { 156 + int extent_type; 157 + struct btrfs_file_extent_item *fi; 158 + u64 len; 159 + 160 + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 161 + struct btrfs_file_extent_item); 162 + extent_type = btrfs_file_extent_type(path->nodes[0], fi); 163 + 164 + if (extent_type == BTRFS_FILE_EXTENT_REG || 165 + extent_type == BTRFS_FILE_EXTENT_PREALLOC) 166 + len = btrfs_file_extent_num_bytes(path->nodes[0], fi); 167 + else if (extent_type == BTRFS_FILE_EXTENT_INLINE) 168 + len = btrfs_file_extent_inline_len(path->nodes[0], fi); 169 + else 170 + BUG(); 171 + 172 + return len; 173 + } 153 174 154 175 static int __btrfs_lookup_bio_sums(struct btrfs_root *root, 155 176 struct inode *inode, struct bio *bio,
+265 -143
fs/btrfs/file.c
··· 41 41 #include "compat.h" 42 42 #include "volumes.h" 43 43 44 + static struct kmem_cache *btrfs_inode_defrag_cachep; 44 45 /* 45 46 * when auto defrag is enabled we 46 47 * queue up these defrag structs to remember which ··· 91 90 * If an existing record is found the defrag item you 92 91 * pass in is freed 93 92 */ 94 - static void __btrfs_add_inode_defrag(struct inode *inode, 93 + static int __btrfs_add_inode_defrag(struct inode *inode, 95 94 struct inode_defrag *defrag) 96 95 { 97 96 struct btrfs_root *root = BTRFS_I(inode)->root; ··· 119 118 entry->transid = defrag->transid; 120 119 if (defrag->last_offset > entry->last_offset) 121 120 entry->last_offset = defrag->last_offset; 122 - goto exists; 121 + return -EEXIST; 123 122 } 124 123 } 125 124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 126 125 rb_link_node(&defrag->rb_node, parent, p); 127 126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 128 - return; 127 + return 0; 128 + } 129 129 130 - exists: 131 - kfree(defrag); 132 - return; 130 + static inline int __need_auto_defrag(struct btrfs_root *root) 131 + { 132 + if (!btrfs_test_opt(root, AUTO_DEFRAG)) 133 + return 0; 133 134 135 + if (btrfs_fs_closing(root->fs_info)) 136 + return 0; 137 + 138 + return 1; 134 139 } 135 140 136 141 /* ··· 149 142 struct btrfs_root *root = BTRFS_I(inode)->root; 150 143 struct inode_defrag *defrag; 151 144 u64 transid; 145 + int ret; 152 146 153 - if (!btrfs_test_opt(root, AUTO_DEFRAG)) 154 - return 0; 155 - 156 - if (btrfs_fs_closing(root->fs_info)) 147 + if (!__need_auto_defrag(root)) 157 148 return 0; 158 149 159 150 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) ··· 162 157 else 163 158 transid = BTRFS_I(inode)->root->last_trans; 164 159 165 - defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 160 + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); 166 161 if (!defrag) 167 162 return -ENOMEM; 168 163 ··· 171 166 defrag->root = root->root_key.objectid; 
172 167 173 168 spin_lock(&root->fs_info->defrag_inodes_lock); 174 - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 175 - __btrfs_add_inode_defrag(inode, defrag); 176 - else 177 - kfree(defrag); 169 + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { 170 + /* 171 + * If we set IN_DEFRAG flag and evict the inode from memory, 172 + * and then re-read this inode, this new inode doesn't have 173 + * IN_DEFRAG flag. At the case, we may find the existed defrag. 174 + */ 175 + ret = __btrfs_add_inode_defrag(inode, defrag); 176 + if (ret) 177 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 178 + } else { 179 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 180 + } 178 181 spin_unlock(&root->fs_info->defrag_inodes_lock); 179 182 return 0; 180 183 } 181 184 182 185 /* 183 - * must be called with the defrag_inodes lock held 186 + * Requeue the defrag object. If there is a defrag object that points to 187 + * the same inode in the tree, we will merge them together (by 188 + * __btrfs_add_inode_defrag()) and free the one that we want to requeue. 184 189 */ 185 - struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 186 - u64 root, u64 ino, 187 - struct rb_node **next) 190 + void btrfs_requeue_inode_defrag(struct inode *inode, 191 + struct inode_defrag *defrag) 192 + { 193 + struct btrfs_root *root = BTRFS_I(inode)->root; 194 + int ret; 195 + 196 + if (!__need_auto_defrag(root)) 197 + goto out; 198 + 199 + /* 200 + * Here we don't check the IN_DEFRAG flag, because we need merge 201 + * them together. 202 + */ 203 + spin_lock(&root->fs_info->defrag_inodes_lock); 204 + ret = __btrfs_add_inode_defrag(inode, defrag); 205 + spin_unlock(&root->fs_info->defrag_inodes_lock); 206 + if (ret) 207 + goto out; 208 + return; 209 + out: 210 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 211 + } 212 + 213 + /* 214 + * pick the defragable inode that we want, if it doesn't exist, we will get 215 + * the next one. 
216 + */ 217 + static struct inode_defrag * 218 + btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) 188 219 { 189 220 struct inode_defrag *entry = NULL; 190 221 struct inode_defrag tmp; ··· 231 190 tmp.ino = ino; 232 191 tmp.root = root; 233 192 234 - p = info->defrag_inodes.rb_node; 193 + spin_lock(&fs_info->defrag_inodes_lock); 194 + p = fs_info->defrag_inodes.rb_node; 235 195 while (p) { 236 196 parent = p; 237 197 entry = rb_entry(parent, struct inode_defrag, rb_node); ··· 243 201 else if (ret > 0) 244 202 p = parent->rb_right; 245 203 else 246 - return entry; 204 + goto out; 247 205 } 248 206 249 - if (next) { 250 - while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 251 - parent = rb_next(parent); 207 + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { 208 + parent = rb_next(parent); 209 + if (parent) 252 210 entry = rb_entry(parent, struct inode_defrag, rb_node); 253 - } 254 - *next = parent; 211 + else 212 + entry = NULL; 255 213 } 256 - return NULL; 214 + out: 215 + if (entry) 216 + rb_erase(parent, &fs_info->defrag_inodes); 217 + spin_unlock(&fs_info->defrag_inodes_lock); 218 + return entry; 219 + } 220 + 221 + void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) 222 + { 223 + struct inode_defrag *defrag; 224 + struct rb_node *node; 225 + 226 + spin_lock(&fs_info->defrag_inodes_lock); 227 + node = rb_first(&fs_info->defrag_inodes); 228 + while (node) { 229 + rb_erase(node, &fs_info->defrag_inodes); 230 + defrag = rb_entry(node, struct inode_defrag, rb_node); 231 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 232 + 233 + if (need_resched()) { 234 + spin_unlock(&fs_info->defrag_inodes_lock); 235 + cond_resched(); 236 + spin_lock(&fs_info->defrag_inodes_lock); 237 + } 238 + 239 + node = rb_first(&fs_info->defrag_inodes); 240 + } 241 + spin_unlock(&fs_info->defrag_inodes_lock); 242 + } 243 + 244 + #define BTRFS_DEFRAG_BATCH 1024 245 + 246 + static int __btrfs_run_defrag_inode(struct btrfs_fs_info 
*fs_info, 247 + struct inode_defrag *defrag) 248 + { 249 + struct btrfs_root *inode_root; 250 + struct inode *inode; 251 + struct btrfs_key key; 252 + struct btrfs_ioctl_defrag_range_args range; 253 + int num_defrag; 254 + 255 + /* get the inode */ 256 + key.objectid = defrag->root; 257 + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 258 + key.offset = (u64)-1; 259 + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 260 + if (IS_ERR(inode_root)) { 261 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 262 + return PTR_ERR(inode_root); 263 + } 264 + 265 + key.objectid = defrag->ino; 266 + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 267 + key.offset = 0; 268 + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 269 + if (IS_ERR(inode)) { 270 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 271 + return PTR_ERR(inode); 272 + } 273 + 274 + /* do a chunk of defrag */ 275 + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 276 + memset(&range, 0, sizeof(range)); 277 + range.len = (u64)-1; 278 + range.start = defrag->last_offset; 279 + 280 + sb_start_write(fs_info->sb); 281 + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 282 + BTRFS_DEFRAG_BATCH); 283 + sb_end_write(fs_info->sb); 284 + /* 285 + * if we filled the whole defrag batch, there 286 + * must be more work to do. Queue this defrag 287 + * again 288 + */ 289 + if (num_defrag == BTRFS_DEFRAG_BATCH) { 290 + defrag->last_offset = range.start; 291 + btrfs_requeue_inode_defrag(inode, defrag); 292 + } else if (defrag->last_offset && !defrag->cycled) { 293 + /* 294 + * we didn't fill our defrag batch, but 295 + * we didn't start at zero. Make sure we loop 296 + * around to the start of the file. 
297 + */ 298 + defrag->last_offset = 0; 299 + defrag->cycled = 1; 300 + btrfs_requeue_inode_defrag(inode, defrag); 301 + } else { 302 + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 303 + } 304 + 305 + iput(inode); 306 + return 0; 257 307 } 258 308 259 309 /* ··· 355 221 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 356 222 { 357 223 struct inode_defrag *defrag; 358 - struct btrfs_root *inode_root; 359 - struct inode *inode; 360 - struct rb_node *n; 361 - struct btrfs_key key; 362 - struct btrfs_ioctl_defrag_range_args range; 363 224 u64 first_ino = 0; 364 225 u64 root_objectid = 0; 365 - int num_defrag; 366 - int defrag_batch = 1024; 367 - 368 - memset(&range, 0, sizeof(range)); 369 - range.len = (u64)-1; 370 226 371 227 atomic_inc(&fs_info->defrag_running); 372 - spin_lock(&fs_info->defrag_inodes_lock); 373 228 while(1) { 374 - n = NULL; 229 + if (!__need_auto_defrag(fs_info->tree_root)) 230 + break; 375 231 376 232 /* find an inode to defrag */ 377 - defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 378 - first_ino, &n); 233 + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, 234 + first_ino); 379 235 if (!defrag) { 380 - if (n) { 381 - defrag = rb_entry(n, struct inode_defrag, 382 - rb_node); 383 - } else if (root_objectid || first_ino) { 236 + if (root_objectid || first_ino) { 384 237 root_objectid = 0; 385 238 first_ino = 0; 386 239 continue; ··· 376 255 } 377 256 } 378 257 379 - /* remove it from the rbtree */ 380 258 first_ino = defrag->ino + 1; 381 259 root_objectid = defrag->root; 382 - rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 383 260 384 - if (btrfs_fs_closing(fs_info)) 385 - goto next_free; 386 - 387 - spin_unlock(&fs_info->defrag_inodes_lock); 388 - 389 - /* get the inode */ 390 - key.objectid = defrag->root; 391 - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 392 - key.offset = (u64)-1; 393 - inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 394 - if (IS_ERR(inode_root)) 395 - goto next; 396 - 397 - 
key.objectid = defrag->ino; 398 - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 399 - key.offset = 0; 400 - 401 - inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 402 - if (IS_ERR(inode)) 403 - goto next; 404 - 405 - /* do a chunk of defrag */ 406 - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 407 - range.start = defrag->last_offset; 408 - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 409 - defrag_batch); 410 - /* 411 - * if we filled the whole defrag batch, there 412 - * must be more work to do. Queue this defrag 413 - * again 414 - */ 415 - if (num_defrag == defrag_batch) { 416 - defrag->last_offset = range.start; 417 - __btrfs_add_inode_defrag(inode, defrag); 418 - /* 419 - * we don't want to kfree defrag, we added it back to 420 - * the rbtree 421 - */ 422 - defrag = NULL; 423 - } else if (defrag->last_offset && !defrag->cycled) { 424 - /* 425 - * we didn't fill our defrag batch, but 426 - * we didn't start at zero. Make sure we loop 427 - * around to the start of the file. 
428 - */ 429 - defrag->last_offset = 0; 430 - defrag->cycled = 1; 431 - __btrfs_add_inode_defrag(inode, defrag); 432 - defrag = NULL; 433 - } 434 - 435 - iput(inode); 436 - next: 437 - spin_lock(&fs_info->defrag_inodes_lock); 438 - next_free: 439 - kfree(defrag); 261 + __btrfs_run_defrag_inode(fs_info, defrag); 440 262 } 441 - spin_unlock(&fs_info->defrag_inodes_lock); 442 - 443 263 atomic_dec(&fs_info->defrag_running); 444 264 445 265 /* ··· 588 526 split->block_len = em->block_len; 589 527 else 590 528 split->block_len = split->len; 529 + split->orig_block_len = max(split->block_len, 530 + em->orig_block_len); 591 531 split->generation = gen; 592 532 split->bdev = em->bdev; 593 533 split->flags = flags; ··· 611 547 split->flags = flags; 612 548 split->compress_type = em->compress_type; 613 549 split->generation = gen; 550 + split->orig_block_len = max(em->block_len, 551 + em->orig_block_len); 614 552 615 553 if (compressed) { 616 554 split->block_len = em->block_len; ··· 621 555 } else { 622 556 split->block_len = split->len; 623 557 split->block_start = em->block_start + diff; 624 - split->orig_start = split->start; 558 + split->orig_start = em->orig_start; 625 559 } 626 560 627 561 ret = add_extent_mapping(em_tree, split); ··· 1414 1348 1415 1349 balance_dirty_pages_ratelimited(inode->i_mapping); 1416 1350 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1417 - btrfs_btree_balance_dirty(root, 1); 1351 + btrfs_btree_balance_dirty(root); 1418 1352 1419 1353 pos += copied; 1420 1354 num_written += copied; ··· 1463 1397 return written ? 
written : err; 1464 1398 } 1465 1399 1400 + static void update_time_for_write(struct inode *inode) 1401 + { 1402 + struct timespec now; 1403 + 1404 + if (IS_NOCMTIME(inode)) 1405 + return; 1406 + 1407 + now = current_fs_time(inode->i_sb); 1408 + if (!timespec_equal(&inode->i_mtime, &now)) 1409 + inode->i_mtime = now; 1410 + 1411 + if (!timespec_equal(&inode->i_ctime, &now)) 1412 + inode->i_ctime = now; 1413 + 1414 + if (IS_I_VERSION(inode)) 1415 + inode_inc_iversion(inode); 1416 + } 1417 + 1466 1418 static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1467 1419 const struct iovec *iov, 1468 1420 unsigned long nr_segs, loff_t pos) ··· 1493 1409 ssize_t num_written = 0; 1494 1410 ssize_t err = 0; 1495 1411 size_t count, ocount; 1412 + bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); 1496 1413 1497 1414 sb_start_write(inode->i_sb); 1498 1415 ··· 1536 1451 goto out; 1537 1452 } 1538 1453 1539 - err = file_update_time(file); 1540 - if (err) { 1541 - mutex_unlock(&inode->i_mutex); 1542 - goto out; 1543 - } 1454 + /* 1455 + * We reserve space for updating the inode when we reserve space for the 1456 + * extent we are going to write, so we will enospc out there. We don't 1457 + * need to start yet another transaction to update the inode as we will 1458 + * update the inode when we finish writing whatever data we write. 1459 + */ 1460 + update_time_for_write(inode); 1544 1461 1545 1462 start_pos = round_down(pos, root->sectorsize); 1546 1463 if (start_pos > i_size_read(inode)) { ··· 1552 1465 goto out; 1553 1466 } 1554 1467 } 1468 + 1469 + if (sync) 1470 + atomic_inc(&BTRFS_I(inode)->sync_writers); 1555 1471 1556 1472 if (unlikely(file->f_flags & O_DIRECT)) { 1557 1473 num_written = __btrfs_direct_write(iocb, iov, nr_segs, ··· 1582 1492 * this will either be one more than the running transaction 1583 1493 * or the generation used for the next transaction if there isn't 1584 1494 * one running right now. 
1495 + * 1496 + * We also have to set last_sub_trans to the current log transid, 1497 + * otherwise subsequent syncs to a file that's been synced in this 1498 + * transaction will appear to have already occured. 1585 1499 */ 1586 1500 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1501 + BTRFS_I(inode)->last_sub_trans = root->log_transid; 1587 1502 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1588 1503 err = generic_write_sync(file, pos, num_written); 1589 1504 if (err < 0 && num_written > 0) 1590 1505 num_written = err; 1591 1506 } 1592 1507 out: 1508 + if (sync) 1509 + atomic_dec(&BTRFS_I(inode)->sync_writers); 1593 1510 sb_end_write(inode->i_sb); 1594 1511 current->backing_dev_info = NULL; 1595 1512 return num_written ? num_written : err; ··· 1647 1550 * out of the ->i_mutex. If so, we can flush the dirty pages by 1648 1551 * multi-task, and make the performance up. 1649 1552 */ 1553 + atomic_inc(&BTRFS_I(inode)->sync_writers); 1650 1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1555 + atomic_dec(&BTRFS_I(inode)->sync_writers); 1651 1556 if (ret) 1652 1557 return ret; 1653 1558 ··· 1660 1561 * range being left. 
1661 1562 */ 1662 1563 atomic_inc(&root->log_batch); 1663 - btrfs_wait_ordered_range(inode, start, end); 1564 + btrfs_wait_ordered_range(inode, start, end - start + 1); 1664 1565 atomic_inc(&root->log_batch); 1665 1566 1666 1567 /* ··· 1866 1767 1867 1768 hole_em->block_start = EXTENT_MAP_HOLE; 1868 1769 hole_em->block_len = 0; 1770 + hole_em->orig_block_len = 0; 1869 1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1870 1772 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1871 1773 hole_em->generation = trans->transid; ··· 1896 1796 struct btrfs_path *path; 1897 1797 struct btrfs_block_rsv *rsv; 1898 1798 struct btrfs_trans_handle *trans; 1899 - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1900 - u64 lockstart = (offset + mask) & ~mask; 1901 - u64 lockend = ((offset + len) & ~mask) - 1; 1799 + u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 1800 + u64 lockend = round_down(offset + len, 1801 + BTRFS_I(inode)->root->sectorsize) - 1; 1902 1802 u64 cur_offset = lockstart; 1903 1803 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1904 1804 u64 drop_end; 1905 - unsigned long nr; 1906 1805 int ret = 0; 1907 1806 int err = 0; 1908 - bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1909 - ((offset + len) >> PAGE_CACHE_SHIFT); 1807 + bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 1808 + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 1910 1809 1911 1810 btrfs_wait_ordered_range(inode, offset, len); 1912 1811 1913 1812 mutex_lock(&inode->i_mutex); 1914 - if (offset >= inode->i_size) { 1915 - mutex_unlock(&inode->i_mutex); 1916 - return 0; 1917 - } 1918 - 1813 + /* 1814 + * We needn't truncate any page which is beyond the end of the file 1815 + * because we are sure there is no data there. 1816 + */ 1919 1817 /* 1920 1818 * Only do this if we are in the same page and we aren't doing the 1921 1819 * entire page. 
1922 1820 */ 1923 1821 if (same_page && len < PAGE_CACHE_SIZE) { 1924 - ret = btrfs_truncate_page(inode, offset, len, 0); 1822 + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) 1823 + ret = btrfs_truncate_page(inode, offset, len, 0); 1925 1824 mutex_unlock(&inode->i_mutex); 1926 1825 return ret; 1927 1826 } 1928 1827 1929 1828 /* zero back part of the first page */ 1930 - ret = btrfs_truncate_page(inode, offset, 0, 0); 1931 - if (ret) { 1932 - mutex_unlock(&inode->i_mutex); 1933 - return ret; 1829 + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 1830 + ret = btrfs_truncate_page(inode, offset, 0, 0); 1831 + if (ret) { 1832 + mutex_unlock(&inode->i_mutex); 1833 + return ret; 1834 + } 1934 1835 } 1935 1836 1936 1837 /* zero the front end of the last page */ 1937 - ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1938 - if (ret) { 1939 - mutex_unlock(&inode->i_mutex); 1940 - return ret; 1838 + if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 1839 + ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1840 + if (ret) { 1841 + mutex_unlock(&inode->i_mutex); 1842 + return ret; 1843 + } 1941 1844 } 1942 1845 1943 1846 if (lockend < lockstart) { ··· 2033 1930 break; 2034 1931 } 2035 1932 2036 - nr = trans->blocks_used; 2037 1933 btrfs_end_transaction(trans, root); 2038 - btrfs_btree_balance_dirty(root, nr); 1934 + btrfs_btree_balance_dirty(root); 2039 1935 2040 1936 trans = btrfs_start_transaction(root, 3); 2041 1937 if (IS_ERR(trans)) { ··· 2065 1963 if (!trans) 2066 1964 goto out_free; 2067 1965 1966 + inode_inc_iversion(inode); 1967 + inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1968 + 2068 1969 trans->block_rsv = &root->fs_info->trans_block_rsv; 2069 1970 ret = btrfs_update_inode(trans, root, inode); 2070 - nr = trans->blocks_used; 2071 1971 btrfs_end_transaction(trans, root); 2072 - btrfs_btree_balance_dirty(root, nr); 1972 + btrfs_btree_balance_dirty(root); 2073 1973 out_free: 2074 1974 btrfs_free_path(path); 2075 1975 
btrfs_free_block_rsv(root, rsv); ··· 2095 1991 u64 alloc_end; 2096 1992 u64 alloc_hint = 0; 2097 1993 u64 locked_end; 2098 - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 2099 1994 struct extent_map *em; 1995 + int blocksize = BTRFS_I(inode)->root->sectorsize; 2100 1996 int ret; 2101 1997 2102 - alloc_start = offset & ~mask; 2103 - alloc_end = (offset + len + mask) & ~mask; 1998 + alloc_start = round_down(offset, blocksize); 1999 + alloc_end = round_up(offset + len, blocksize); 2104 2000 2105 2001 /* Make sure we aren't being give some crap mode */ 2106 2002 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) ··· 2113 2009 * Make sure we have enough space before we do the 2114 2010 * allocation. 2115 2011 */ 2116 - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2012 + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); 2117 2013 if (ret) 2118 2014 return ret; 2119 2015 ··· 2181 2077 } 2182 2078 last_byte = min(extent_map_end(em), alloc_end); 2183 2079 actual_end = min_t(u64, extent_map_end(em), offset + len); 2184 - last_byte = (last_byte + mask) & ~mask; 2080 + last_byte = ALIGN(last_byte, blocksize); 2185 2081 2186 2082 if (em->block_start == EXTENT_MAP_HOLE || 2187 2083 (cur_offset >= inode->i_size && ··· 2220 2116 out: 2221 2117 mutex_unlock(&inode->i_mutex); 2222 2118 /* Let go of our reservation. 
*/ 2223 - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2119 + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); 2224 2120 return ret; 2225 2121 } 2226 2122 ··· 2396 2292 .compat_ioctl = btrfs_ioctl, 2397 2293 #endif 2398 2294 }; 2295 + 2296 + void btrfs_auto_defrag_exit(void) 2297 + { 2298 + if (btrfs_inode_defrag_cachep) 2299 + kmem_cache_destroy(btrfs_inode_defrag_cachep); 2300 + } 2301 + 2302 + int btrfs_auto_defrag_init(void) 2303 + { 2304 + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", 2305 + sizeof(struct inode_defrag), 0, 2306 + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 2307 + NULL); 2308 + if (!btrfs_inode_defrag_cachep) 2309 + return -ENOMEM; 2310 + 2311 + return 0; 2312 + }
+19 -32
fs/btrfs/free-space-cache.c
··· 307 307 308 308 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 309 309 { 310 - WARN_ON(io_ctl->cur); 311 310 BUG_ON(io_ctl->index >= io_ctl->num_pages); 312 311 io_ctl->page = io_ctl->pages[io_ctl->index++]; 313 312 io_ctl->cur = kmap(io_ctl->page); ··· 1249 1250 * if previous extent entry covers the offset, 1250 1251 * we should return it instead of the bitmap entry 1251 1252 */ 1252 - n = &entry->offset_index; 1253 - while (1) { 1254 - n = rb_prev(n); 1255 - if (!n) 1256 - break; 1253 + n = rb_prev(&entry->offset_index); 1254 + if (n) { 1257 1255 prev = rb_entry(n, struct btrfs_free_space, 1258 1256 offset_index); 1259 - if (!prev->bitmap) { 1260 - if (prev->offset + prev->bytes > offset) 1261 - entry = prev; 1262 - break; 1263 - } 1257 + if (!prev->bitmap && 1258 + prev->offset + prev->bytes > offset) 1259 + entry = prev; 1264 1260 } 1265 1261 } 1266 1262 return entry; ··· 1281 1287 } 1282 1288 1283 1289 if (entry->bitmap) { 1284 - n = &entry->offset_index; 1285 - while (1) { 1286 - n = rb_prev(n); 1287 - if (!n) 1288 - break; 1290 + n = rb_prev(&entry->offset_index); 1291 + if (n) { 1289 1292 prev = rb_entry(n, struct btrfs_free_space, 1290 1293 offset_index); 1291 - if (!prev->bitmap) { 1292 - if (prev->offset + prev->bytes > offset) 1293 - return prev; 1294 - break; 1295 - } 1294 + if (!prev->bitmap && 1295 + prev->offset + prev->bytes > offset) 1296 + return prev; 1296 1297 } 1297 1298 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1298 1299 return entry; ··· 1353 1364 u64 bitmap_bytes; 1354 1365 u64 extent_bytes; 1355 1366 u64 size = block_group->key.offset; 1356 - u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1367 + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1357 1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1358 1369 1359 1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); ··· 1639 1650 * some block groups are so tiny they can't be enveloped by a bitmap, so 1640 1651 * don't even 
bother to create a bitmap for this 1641 1652 */ 1642 - if (BITS_PER_BITMAP * block_group->sectorsize > 1643 - block_group->key.offset) 1653 + if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) 1644 1654 return false; 1645 1655 1646 1656 return true; ··· 2286 2298 unsigned long total_found = 0; 2287 2299 int ret; 2288 2300 2289 - i = offset_to_bit(entry->offset, block_group->sectorsize, 2301 + i = offset_to_bit(entry->offset, ctl->unit, 2290 2302 max_t(u64, offset, entry->offset)); 2291 - want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2292 - min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2303 + want_bits = bytes_to_bits(bytes, ctl->unit); 2304 + min_bits = bytes_to_bits(min_bytes, ctl->unit); 2293 2305 2294 2306 again: 2295 2307 found_bits = 0; ··· 2313 2325 2314 2326 total_found += found_bits; 2315 2327 2316 - if (cluster->max_size < found_bits * block_group->sectorsize) 2317 - cluster->max_size = found_bits * block_group->sectorsize; 2328 + if (cluster->max_size < found_bits * ctl->unit) 2329 + cluster->max_size = found_bits * ctl->unit; 2318 2330 2319 2331 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2320 2332 i = next_zero + 1; 2321 2333 goto again; 2322 2334 } 2323 2335 2324 - cluster->window_start = start * block_group->sectorsize + 2325 - entry->offset; 2336 + cluster->window_start = start * ctl->unit + entry->offset; 2326 2337 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2327 2338 ret = tree_insert_offset(&cluster->root, entry->offset, 2328 2339 &entry->offset_index, 1); 2329 2340 BUG_ON(ret); /* -EEXIST; Logic error */ 2330 2341 2331 2342 trace_btrfs_setup_cluster(block_group, cluster, 2332 - total_found * block_group->sectorsize, 1); 2343 + total_found * ctl->unit, 1); 2333 2344 return 0; 2334 2345 } 2335 2346
+3 -2
fs/btrfs/inode-map.c
··· 434 434 * 3 items for pre-allocation 435 435 */ 436 436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); 437 - ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, 438 - trans->bytes_reserved); 437 + ret = btrfs_block_rsv_add(root, trans->block_rsv, 438 + trans->bytes_reserved, 439 + BTRFS_RESERVE_NO_FLUSH); 439 440 if (ret) 440 441 goto out; 441 442 trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+294 -190
fs/btrfs/inode.c
··· 71 71 static struct extent_io_ops btrfs_extent_io_ops; 72 72 73 73 static struct kmem_cache *btrfs_inode_cachep; 74 + static struct kmem_cache *btrfs_delalloc_work_cachep; 74 75 struct kmem_cache *btrfs_trans_handle_cachep; 75 76 struct kmem_cache *btrfs_transaction_cachep; 76 77 struct kmem_cache *btrfs_path_cachep; ··· 95 94 struct page *locked_page, 96 95 u64 start, u64 end, int *page_started, 97 96 unsigned long *nr_written, int unlock); 97 + static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 98 + u64 len, u64 orig_start, 99 + u64 block_start, u64 block_len, 100 + u64 orig_block_len, int type); 98 101 99 102 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 100 103 struct inode *inode, struct inode *dir, ··· 703 698 704 699 em->block_start = ins.objectid; 705 700 em->block_len = ins.offset; 701 + em->orig_block_len = ins.offset; 706 702 em->bdev = root->fs_info->fs_devices->latest_bdev; 707 703 em->compress_type = async_extent->compress_type; 708 704 set_bit(EXTENT_FLAG_PINNED, &em->flags); 709 705 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 706 + em->generation = -1; 710 707 711 708 while (1) { 712 709 write_lock(&em_tree->lock); 713 710 ret = add_extent_mapping(em_tree, em); 711 + if (!ret) 712 + list_move(&em->list, 713 + &em_tree->modified_extents); 714 714 write_unlock(&em_tree->lock); 715 715 if (ret != -EEXIST) { 716 716 free_extent_map(em); ··· 813 803 * required to start IO on it. It may be clean and already done with 814 804 * IO when we return. 
815 805 */ 816 - static noinline int cow_file_range(struct inode *inode, 817 - struct page *locked_page, 818 - u64 start, u64 end, int *page_started, 819 - unsigned long *nr_written, 820 - int unlock) 806 + static noinline int __cow_file_range(struct btrfs_trans_handle *trans, 807 + struct inode *inode, 808 + struct btrfs_root *root, 809 + struct page *locked_page, 810 + u64 start, u64 end, int *page_started, 811 + unsigned long *nr_written, 812 + int unlock) 821 813 { 822 - struct btrfs_root *root = BTRFS_I(inode)->root; 823 - struct btrfs_trans_handle *trans; 824 814 u64 alloc_hint = 0; 825 815 u64 num_bytes; 826 816 unsigned long ram_size; ··· 833 823 int ret = 0; 834 824 835 825 BUG_ON(btrfs_is_free_space_inode(inode)); 836 - trans = btrfs_join_transaction(root); 837 - if (IS_ERR(trans)) { 838 - extent_clear_unlock_delalloc(inode, 839 - &BTRFS_I(inode)->io_tree, 840 - start, end, locked_page, 841 - EXTENT_CLEAR_UNLOCK_PAGE | 842 - EXTENT_CLEAR_UNLOCK | 843 - EXTENT_CLEAR_DELALLOC | 844 - EXTENT_CLEAR_DIRTY | 845 - EXTENT_SET_WRITEBACK | 846 - EXTENT_END_WRITEBACK); 847 - return PTR_ERR(trans); 848 - } 849 - trans->block_rsv = &root->fs_info->delalloc_block_rsv; 850 826 851 827 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 852 828 num_bytes = max(blocksize, num_bytes); 853 829 disk_num_bytes = num_bytes; 854 - ret = 0; 855 830 856 831 /* if this is a small write inside eof, kick off defrag */ 857 832 if (num_bytes < 64 * 1024 && ··· 895 900 896 901 em->block_start = ins.objectid; 897 902 em->block_len = ins.offset; 903 + em->orig_block_len = ins.offset; 898 904 em->bdev = root->fs_info->fs_devices->latest_bdev; 899 905 set_bit(EXTENT_FLAG_PINNED, &em->flags); 906 + em->generation = -1; 900 907 901 908 while (1) { 902 909 write_lock(&em_tree->lock); 903 910 ret = add_extent_mapping(em_tree, em); 911 + if (!ret) 912 + list_move(&em->list, 913 + &em_tree->modified_extents); 904 914 write_unlock(&em_tree->lock); 905 915 if (ret != -EEXIST) { 906 916 
free_extent_map(em); ··· 952 952 alloc_hint = ins.objectid + ins.offset; 953 953 start += cur_alloc_size; 954 954 } 955 - ret = 0; 956 955 out: 957 - btrfs_end_transaction(trans, root); 958 - 959 956 return ret; 957 + 960 958 out_unlock: 961 959 extent_clear_unlock_delalloc(inode, 962 960 &BTRFS_I(inode)->io_tree, ··· 967 969 EXTENT_END_WRITEBACK); 968 970 969 971 goto out; 972 + } 973 + 974 + static noinline int cow_file_range(struct inode *inode, 975 + struct page *locked_page, 976 + u64 start, u64 end, int *page_started, 977 + unsigned long *nr_written, 978 + int unlock) 979 + { 980 + struct btrfs_trans_handle *trans; 981 + struct btrfs_root *root = BTRFS_I(inode)->root; 982 + int ret; 983 + 984 + trans = btrfs_join_transaction(root); 985 + if (IS_ERR(trans)) { 986 + extent_clear_unlock_delalloc(inode, 987 + &BTRFS_I(inode)->io_tree, 988 + start, end, locked_page, 989 + EXTENT_CLEAR_UNLOCK_PAGE | 990 + EXTENT_CLEAR_UNLOCK | 991 + EXTENT_CLEAR_DELALLOC | 992 + EXTENT_CLEAR_DIRTY | 993 + EXTENT_SET_WRITEBACK | 994 + EXTENT_END_WRITEBACK); 995 + return PTR_ERR(trans); 996 + } 997 + trans->block_rsv = &root->fs_info->delalloc_block_rsv; 998 + 999 + ret = __cow_file_range(trans, inode, root, locked_page, start, end, 1000 + page_started, nr_written, unlock); 1001 + 1002 + btrfs_end_transaction(trans, root); 1003 + 1004 + return ret; 970 1005 } 971 1006 972 1007 /* ··· 1157 1126 u64 extent_offset; 1158 1127 u64 disk_bytenr; 1159 1128 u64 num_bytes; 1129 + u64 disk_num_bytes; 1160 1130 int extent_type; 1161 1131 int ret, err; 1162 1132 int type; ··· 1260 1228 extent_offset = btrfs_file_extent_offset(leaf, fi); 1261 1229 extent_end = found_key.offset + 1262 1230 btrfs_file_extent_num_bytes(leaf, fi); 1231 + disk_num_bytes = 1232 + btrfs_file_extent_disk_num_bytes(leaf, fi); 1263 1233 if (extent_end <= start) { 1264 1234 path->slots[0]++; 1265 1235 goto next_slot; ··· 1315 1281 1316 1282 btrfs_release_path(path); 1317 1283 if (cow_start != (u64)-1) { 1318 - ret = 
cow_file_range(inode, locked_page, cow_start, 1319 - found_key.offset - 1, page_started, 1320 - nr_written, 1); 1284 + ret = __cow_file_range(trans, inode, root, locked_page, 1285 + cow_start, found_key.offset - 1, 1286 + page_started, nr_written, 1); 1321 1287 if (ret) { 1322 1288 btrfs_abort_transaction(trans, root, ret); 1323 1289 goto error; ··· 1332 1298 em = alloc_extent_map(); 1333 1299 BUG_ON(!em); /* -ENOMEM */ 1334 1300 em->start = cur_offset; 1335 - em->orig_start = em->start; 1301 + em->orig_start = found_key.offset - extent_offset; 1336 1302 em->len = num_bytes; 1337 1303 em->block_len = num_bytes; 1338 1304 em->block_start = disk_bytenr; 1305 + em->orig_block_len = disk_num_bytes; 1339 1306 em->bdev = root->fs_info->fs_devices->latest_bdev; 1340 1307 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1341 - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 1308 + set_bit(EXTENT_FLAG_FILLING, &em->flags); 1309 + em->generation = -1; 1342 1310 while (1) { 1343 1311 write_lock(&em_tree->lock); 1344 1312 ret = add_extent_mapping(em_tree, em); 1313 + if (!ret) 1314 + list_move(&em->list, 1315 + &em_tree->modified_extents); 1345 1316 write_unlock(&em_tree->lock); 1346 1317 if (ret != -EEXIST) { 1347 1318 free_extent_map(em); ··· 1391 1352 } 1392 1353 1393 1354 if (cow_start != (u64)-1) { 1394 - ret = cow_file_range(inode, locked_page, cow_start, end, 1395 - page_started, nr_written, 1); 1355 + ret = __cow_file_range(trans, inode, root, locked_page, 1356 + cow_start, end, 1357 + page_started, nr_written, 1); 1396 1358 if (ret) { 1397 1359 btrfs_abort_transaction(trans, root, ret); 1398 1360 goto error; ··· 1571 1531 unsigned long bio_flags) 1572 1532 { 1573 1533 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1574 - struct btrfs_mapping_tree *map_tree; 1575 1534 u64 logical = (u64)bio->bi_sector << 9; 1576 1535 u64 length = 0; 1577 1536 u64 map_length; ··· 1580 1541 return 0; 1581 1542 1582 1543 length = bio->bi_size; 1583 - map_tree = 
&root->fs_info->mapping_tree; 1584 1544 map_length = length; 1585 - ret = btrfs_map_block(map_tree, READ, logical, 1545 + ret = btrfs_map_block(root->fs_info, READ, logical, 1586 1546 &map_length, NULL, 0); 1587 - /* Will always return 0 or 1 with map_multi == NULL */ 1547 + /* Will always return 0 with map_multi == NULL */ 1588 1548 BUG_ON(ret < 0); 1589 1549 if (map_length < length + size) 1590 1550 return 1; ··· 1624 1586 u64 bio_offset) 1625 1587 { 1626 1588 struct btrfs_root *root = BTRFS_I(inode)->root; 1627 - return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1589 + int ret; 1590 + 1591 + ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); 1592 + if (ret) 1593 + bio_endio(bio, ret); 1594 + return ret; 1628 1595 } 1629 1596 1630 1597 /* ··· 1644 1601 int ret = 0; 1645 1602 int skip_sum; 1646 1603 int metadata = 0; 1604 + int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 1647 1605 1648 1606 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1649 1607 ··· 1654 1610 if (!(rw & REQ_WRITE)) { 1655 1611 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1656 1612 if (ret) 1657 - return ret; 1613 + goto out; 1658 1614 1659 1615 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1660 - return btrfs_submit_compressed_read(inode, bio, 1661 - mirror_num, bio_flags); 1616 + ret = btrfs_submit_compressed_read(inode, bio, 1617 + mirror_num, 1618 + bio_flags); 1619 + goto out; 1662 1620 } else if (!skip_sum) { 1663 1621 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1664 1622 if (ret) 1665 - return ret; 1623 + goto out; 1666 1624 } 1667 1625 goto mapit; 1668 - } else if (!skip_sum) { 1626 + } else if (async && !skip_sum) { 1669 1627 /* csum items have already been cloned */ 1670 1628 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1671 1629 goto mapit; 1672 1630 /* we're doing a write, do the async checksumming */ 1673 - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1631 + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 
1674 1632 inode, rw, bio, mirror_num, 1675 1633 bio_flags, bio_offset, 1676 1634 __btrfs_submit_bio_start, 1677 1635 __btrfs_submit_bio_done); 1636 + goto out; 1637 + } else if (!skip_sum) { 1638 + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); 1639 + if (ret) 1640 + goto out; 1678 1641 } 1679 1642 1680 1643 mapit: 1681 - return btrfs_map_bio(root, rw, bio, mirror_num, 0); 1644 + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); 1645 + 1646 + out: 1647 + if (ret < 0) 1648 + bio_endio(bio, ret); 1649 + return ret; 1682 1650 } 1683 1651 1684 1652 /* ··· 1713 1657 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1714 1658 struct extent_state **cached_state) 1715 1659 { 1716 - if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1717 - WARN_ON(1); 1660 + WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); 1718 1661 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1719 1662 cached_state, GFP_NOFS); 1720 1663 } ··· 1922 1867 1923 1868 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1924 1869 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1925 - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1926 - if (!ret) { 1927 - if (nolock) 1928 - trans = btrfs_join_transaction_nolock(root); 1929 - else 1930 - trans = btrfs_join_transaction(root); 1931 - if (IS_ERR(trans)) { 1932 - ret = PTR_ERR(trans); 1933 - trans = NULL; 1934 - goto out; 1935 - } 1936 - trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1937 - ret = btrfs_update_inode_fallback(trans, root, inode); 1938 - if (ret) /* -ENOMEM or corruption */ 1939 - btrfs_abort_transaction(trans, root, ret); 1870 + btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1871 + if (nolock) 1872 + trans = btrfs_join_transaction_nolock(root); 1873 + else 1874 + trans = btrfs_join_transaction(root); 1875 + if (IS_ERR(trans)) { 1876 + ret = PTR_ERR(trans); 1877 + trans = NULL; 1878 + goto out; 1940 1879 } 1880 + trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1881 
+ ret = btrfs_update_inode_fallback(trans, root, inode); 1882 + if (ret) /* -ENOMEM or corruption */ 1883 + btrfs_abort_transaction(trans, root, ret); 1941 1884 goto out; 1942 1885 } 1943 1886 ··· 1984 1931 add_pending_csums(trans, inode, ordered_extent->file_offset, 1985 1932 &ordered_extent->list); 1986 1933 1987 - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1988 - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1989 - ret = btrfs_update_inode_fallback(trans, root, inode); 1990 - if (ret) { /* -ENOMEM or corruption */ 1991 - btrfs_abort_transaction(trans, root, ret); 1992 - goto out_unlock; 1993 - } 1994 - } else { 1995 - btrfs_set_inode_last_trans(trans, inode); 1934 + btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1935 + ret = btrfs_update_inode_fallback(trans, root, inode); 1936 + if (ret) { /* -ENOMEM or corruption */ 1937 + btrfs_abort_transaction(trans, root, ret); 1938 + goto out_unlock; 1996 1939 } 1997 1940 ret = 0; 1998 1941 out_unlock: ··· 3123 3074 struct btrfs_trans_handle *trans; 3124 3075 struct inode *inode = dentry->d_inode; 3125 3076 int ret; 3126 - unsigned long nr = 0; 3127 3077 3128 3078 trans = __unlink_start_trans(dir, dentry); 3129 3079 if (IS_ERR(trans)) ··· 3142 3094 } 3143 3095 3144 3096 out: 3145 - nr = trans->blocks_used; 3146 3097 __unlink_end_trans(trans, root); 3147 - btrfs_btree_balance_dirty(root, nr); 3098 + btrfs_btree_balance_dirty(root); 3148 3099 return ret; 3149 3100 } 3150 3101 ··· 3233 3186 int err = 0; 3234 3187 struct btrfs_root *root = BTRFS_I(dir)->root; 3235 3188 struct btrfs_trans_handle *trans; 3236 - unsigned long nr = 0; 3237 3189 3238 3190 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 3239 3191 return -ENOTEMPTY; ··· 3261 3215 if (!err) 3262 3216 btrfs_i_size_write(inode, 0); 3263 3217 out: 3264 - nr = trans->blocks_used; 3265 3218 __unlink_end_trans(trans, root); 3266 - btrfs_btree_balance_dirty(root, nr); 3219 + btrfs_btree_balance_dirty(root); 3267 3220 3268 3221 
return err; 3269 3222 } ··· 3542 3497 if (ret) 3543 3498 goto out; 3544 3499 3545 - ret = -ENOMEM; 3546 3500 again: 3547 3501 page = find_or_create_page(mapping, index, mask); 3548 3502 if (!page) { 3549 3503 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3504 + ret = -ENOMEM; 3550 3505 goto out; 3551 3506 } 3552 3507 ··· 3595 3550 goto out_unlock; 3596 3551 } 3597 3552 3598 - ret = 0; 3599 3553 if (offset != PAGE_CACHE_SIZE) { 3600 3554 if (!len) 3601 3555 len = PAGE_CACHE_SIZE - offset; ··· 3712 3668 3713 3669 hole_em->block_start = EXTENT_MAP_HOLE; 3714 3670 hole_em->block_len = 0; 3671 + hole_em->orig_block_len = 0; 3715 3672 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 3716 3673 hole_em->compress_type = BTRFS_COMPRESS_NONE; 3717 3674 hole_em->generation = trans->transid; ··· 3828 3783 struct btrfs_root *root = BTRFS_I(inode)->root; 3829 3784 struct btrfs_block_rsv *rsv, *global_rsv; 3830 3785 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3831 - unsigned long nr; 3832 3786 int ret; 3833 3787 3834 3788 trace_btrfs_inode_evict(inode); ··· 3873 3829 * inode item when doing the truncate. 
3874 3830 */ 3875 3831 while (1) { 3876 - ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3832 + ret = btrfs_block_rsv_refill(root, rsv, min_size, 3833 + BTRFS_RESERVE_FLUSH_LIMIT); 3877 3834 3878 3835 /* 3879 3836 * Try and steal from the global reserve since we will ··· 3892 3847 goto no_delete; 3893 3848 } 3894 3849 3895 - trans = btrfs_start_transaction_noflush(root, 1); 3850 + trans = btrfs_start_transaction_lflush(root, 1); 3896 3851 if (IS_ERR(trans)) { 3897 3852 btrfs_orphan_del(NULL, inode); 3898 3853 btrfs_free_block_rsv(root, rsv); ··· 3909 3864 ret = btrfs_update_inode(trans, root, inode); 3910 3865 BUG_ON(ret); 3911 3866 3912 - nr = trans->blocks_used; 3913 3867 btrfs_end_transaction(trans, root); 3914 3868 trans = NULL; 3915 - btrfs_btree_balance_dirty(root, nr); 3869 + btrfs_btree_balance_dirty(root); 3916 3870 } 3917 3871 3918 3872 btrfs_free_block_rsv(root, rsv); ··· 3927 3883 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3928 3884 btrfs_return_ino(root, btrfs_ino(inode)); 3929 3885 3930 - nr = trans->blocks_used; 3931 3886 btrfs_end_transaction(trans, root); 3932 - btrfs_btree_balance_dirty(root, nr); 3887 + btrfs_btree_balance_dirty(root); 3933 3888 no_delete: 3934 3889 clear_inode(inode); 3935 3890 return; ··· 4818 4775 if (S_ISREG(mode)) { 4819 4776 if (btrfs_test_opt(root, NODATASUM)) 4820 4777 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4821 - if (btrfs_test_opt(root, NODATACOW) || 4822 - (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) 4778 + if (btrfs_test_opt(root, NODATACOW)) 4823 4779 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4824 4780 } 4825 4781 ··· 4884 4842 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4885 4843 parent_inode, &key, 4886 4844 btrfs_inode_type(inode), index); 4887 - if (ret == -EEXIST) 4845 + if (ret == -EEXIST || ret == -EOVERFLOW) 4888 4846 goto fail_dir_item; 4889 4847 else if (ret) { 4890 4848 btrfs_abort_transaction(trans, root, ret); ··· 4939 4897 int err; 4940 4898 int 
drop_inode = 0; 4941 4899 u64 objectid; 4942 - unsigned long nr = 0; 4943 4900 u64 index = 0; 4944 4901 4945 4902 if (!new_valid_dev(rdev)) ··· 4971 4930 goto out_unlock; 4972 4931 } 4973 4932 4933 + err = btrfs_update_inode(trans, root, inode); 4934 + if (err) { 4935 + drop_inode = 1; 4936 + goto out_unlock; 4937 + } 4938 + 4974 4939 /* 4975 4940 * If the active LSM wants to access the inode during 4976 4941 * d_instantiate it needs these. Smack checks to see ··· 4994 4947 d_instantiate(dentry, inode); 4995 4948 } 4996 4949 out_unlock: 4997 - nr = trans->blocks_used; 4998 4950 btrfs_end_transaction(trans, root); 4999 - btrfs_btree_balance_dirty(root, nr); 4951 + btrfs_btree_balance_dirty(root); 5000 4952 if (drop_inode) { 5001 4953 inode_dec_link_count(inode); 5002 4954 iput(inode); ··· 5009 4963 struct btrfs_trans_handle *trans; 5010 4964 struct btrfs_root *root = BTRFS_I(dir)->root; 5011 4965 struct inode *inode = NULL; 5012 - int drop_inode = 0; 4966 + int drop_inode_on_err = 0; 5013 4967 int err; 5014 - unsigned long nr = 0; 5015 4968 u64 objectid; 5016 4969 u64 index = 0; 5017 4970 ··· 5034 4989 err = PTR_ERR(inode); 5035 4990 goto out_unlock; 5036 4991 } 4992 + drop_inode_on_err = 1; 5037 4993 5038 4994 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5039 - if (err) { 5040 - drop_inode = 1; 4995 + if (err) 5041 4996 goto out_unlock; 5042 - } 4997 + 4998 + err = btrfs_update_inode(trans, root, inode); 4999 + if (err) 5000 + goto out_unlock; 5043 5001 5044 5002 /* 5045 5003 * If the active LSM wants to access the inode during ··· 5055 5007 5056 5008 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5057 5009 if (err) 5058 - drop_inode = 1; 5059 - else { 5060 - inode->i_mapping->a_ops = &btrfs_aops; 5061 - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5062 - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5063 - d_instantiate(dentry, inode); 5064 - } 5010 + goto out_unlock; 5011 + 5012 + inode->i_mapping->a_ops = 
&btrfs_aops; 5013 + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5014 + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5015 + d_instantiate(dentry, inode); 5016 + 5065 5017 out_unlock: 5066 - nr = trans->blocks_used; 5067 5018 btrfs_end_transaction(trans, root); 5068 - if (drop_inode) { 5019 + if (err && drop_inode_on_err) { 5069 5020 inode_dec_link_count(inode); 5070 5021 iput(inode); 5071 5022 } 5072 - btrfs_btree_balance_dirty(root, nr); 5023 + btrfs_btree_balance_dirty(root); 5073 5024 return err; 5074 5025 } 5075 5026 ··· 5079 5032 struct btrfs_root *root = BTRFS_I(dir)->root; 5080 5033 struct inode *inode = old_dentry->d_inode; 5081 5034 u64 index; 5082 - unsigned long nr = 0; 5083 5035 int err; 5084 5036 int drop_inode = 0; 5085 5037 ··· 5108 5062 inode_inc_iversion(inode); 5109 5063 inode->i_ctime = CURRENT_TIME; 5110 5064 ihold(inode); 5065 + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 5111 5066 5112 5067 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5113 5068 ··· 5123 5076 btrfs_log_new_name(trans, inode, NULL, parent); 5124 5077 } 5125 5078 5126 - nr = trans->blocks_used; 5127 5079 btrfs_end_transaction(trans, root); 5128 5080 fail: 5129 5081 if (drop_inode) { 5130 5082 inode_dec_link_count(inode); 5131 5083 iput(inode); 5132 5084 } 5133 - btrfs_btree_balance_dirty(root, nr); 5085 + btrfs_btree_balance_dirty(root); 5134 5086 return err; 5135 5087 } 5136 5088 ··· 5142 5096 int drop_on_err = 0; 5143 5097 u64 objectid = 0; 5144 5098 u64 index = 0; 5145 - unsigned long nr = 1; 5146 5099 5147 5100 /* 5148 5101 * 2 items for inode and ref ··· 5187 5142 drop_on_err = 0; 5188 5143 5189 5144 out_fail: 5190 - nr = trans->blocks_used; 5191 5145 btrfs_end_transaction(trans, root); 5192 5146 if (drop_on_err) 5193 5147 iput(inode); 5194 - btrfs_btree_balance_dirty(root, nr); 5148 + btrfs_btree_balance_dirty(root); 5195 5149 return err; 5196 5150 } 5197 5151 ··· 5384 5340 if (start + len <= found_key.offset) 
5385 5341 goto not_found; 5386 5342 em->start = start; 5343 + em->orig_start = start; 5387 5344 em->len = found_key.offset - start; 5388 5345 goto not_found_em; 5389 5346 } ··· 5395 5350 em->len = extent_end - extent_start; 5396 5351 em->orig_start = extent_start - 5397 5352 btrfs_file_extent_offset(leaf, item); 5353 + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, 5354 + item); 5398 5355 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5399 5356 if (bytenr == 0) { 5400 5357 em->block_start = EXTENT_MAP_HOLE; ··· 5406 5359 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5407 5360 em->compress_type = compress_type; 5408 5361 em->block_start = bytenr; 5409 - em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5410 - item); 5362 + em->block_len = em->orig_block_len; 5411 5363 } else { 5412 5364 bytenr += btrfs_file_extent_offset(leaf, item); 5413 5365 em->block_start = bytenr; ··· 5436 5390 em->start = extent_start + extent_offset; 5437 5391 em->len = (copy_size + root->sectorsize - 1) & 5438 5392 ~((u64)root->sectorsize - 1); 5439 - em->orig_start = EXTENT_MAP_INLINE; 5393 + em->orig_block_len = em->len; 5394 + em->orig_start = em->start; 5440 5395 if (compress_type) { 5441 5396 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5442 5397 em->compress_type = compress_type; ··· 5486 5439 extent_map_end(em) - 1, NULL, GFP_NOFS); 5487 5440 goto insert; 5488 5441 } else { 5489 - printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5490 - WARN_ON(1); 5442 + WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); 5491 5443 } 5492 5444 not_found: 5493 5445 em->start = start; 5446 + em->orig_start = start; 5494 5447 em->len = len; 5495 5448 not_found_em: 5496 5449 em->block_start = EXTENT_MAP_HOLE; ··· 5692 5645 } 5693 5646 5694 5647 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5695 - struct extent_map *em, 5696 5648 u64 start, u64 len) 5697 5649 { 5698 5650 struct btrfs_root *root = BTRFS_I(inode)->root; 5699 5651 struct 
btrfs_trans_handle *trans; 5700 - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5652 + struct extent_map *em; 5701 5653 struct btrfs_key ins; 5702 5654 u64 alloc_hint; 5703 5655 int ret; 5704 - bool insert = false; 5705 - 5706 - /* 5707 - * Ok if the extent map we looked up is a hole and is for the exact 5708 - * range we want, there is no reason to allocate a new one, however if 5709 - * it is not right then we need to free this one and drop the cache for 5710 - * our range. 5711 - */ 5712 - if (em->block_start != EXTENT_MAP_HOLE || em->start != start || 5713 - em->len != len) { 5714 - free_extent_map(em); 5715 - em = NULL; 5716 - insert = true; 5717 - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5718 - } 5719 5656 5720 5657 trans = btrfs_join_transaction(root); 5721 5658 if (IS_ERR(trans)) 5722 5659 return ERR_CAST(trans); 5723 - 5724 - if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) 5725 - btrfs_add_inode_defrag(trans, inode); 5726 5660 5727 5661 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5728 5662 ··· 5715 5687 goto out; 5716 5688 } 5717 5689 5718 - if (!em) { 5719 - em = alloc_extent_map(); 5720 - if (!em) { 5721 - em = ERR_PTR(-ENOMEM); 5722 - goto out; 5723 - } 5724 - } 5725 - 5726 - em->start = start; 5727 - em->orig_start = em->start; 5728 - em->len = ins.offset; 5729 - 5730 - em->block_start = ins.objectid; 5731 - em->block_len = ins.offset; 5732 - em->bdev = root->fs_info->fs_devices->latest_bdev; 5733 - 5734 - /* 5735 - * We need to do this because if we're using the original em we searched 5736 - * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 
5737 - */ 5738 - em->flags = 0; 5739 - set_bit(EXTENT_FLAG_PINNED, &em->flags); 5740 - 5741 - while (insert) { 5742 - write_lock(&em_tree->lock); 5743 - ret = add_extent_mapping(em_tree, em); 5744 - write_unlock(&em_tree->lock); 5745 - if (ret != -EEXIST) 5746 - break; 5747 - btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); 5748 - } 5690 + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, 5691 + ins.offset, ins.offset, 0); 5692 + if (IS_ERR(em)) 5693 + goto out; 5749 5694 5750 5695 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5751 5696 ins.offset, ins.offset, 0); ··· 5895 5894 static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 5896 5895 u64 len, u64 orig_start, 5897 5896 u64 block_start, u64 block_len, 5898 - int type) 5897 + u64 orig_block_len, int type) 5899 5898 { 5900 5899 struct extent_map_tree *em_tree; 5901 5900 struct extent_map *em; ··· 5913 5912 em->block_len = block_len; 5914 5913 em->block_start = block_start; 5915 5914 em->bdev = root->fs_info->fs_devices->latest_bdev; 5915 + em->orig_block_len = orig_block_len; 5916 + em->generation = -1; 5916 5917 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5917 5918 if (type == BTRFS_ORDERED_PREALLOC) 5918 - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5919 + set_bit(EXTENT_FLAG_FILLING, &em->flags); 5919 5920 5920 5921 do { 5921 5922 btrfs_drop_extent_cache(inode, em->start, 5922 5923 em->start + em->len - 1, 0); 5923 5924 write_lock(&em_tree->lock); 5924 5925 ret = add_extent_mapping(em_tree, em); 5926 + if (!ret) 5927 + list_move(&em->list, 5928 + &em_tree->modified_extents); 5925 5929 write_unlock(&em_tree->lock); 5926 5930 } while (ret == -EEXIST); 5927 5931 ··· 6053 6047 goto must_cow; 6054 6048 6055 6049 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6056 - u64 orig_start = em->start; 6050 + u64 orig_start = em->orig_start; 6051 + u64 orig_block_len = em->orig_block_len; 6057 6052 6058 6053 if (type == BTRFS_ORDERED_PREALLOC) { 
6059 6054 free_extent_map(em); 6060 6055 em = create_pinned_em(inode, start, len, 6061 6056 orig_start, 6062 - block_start, len, type); 6057 + block_start, len, 6058 + orig_block_len, type); 6063 6059 if (IS_ERR(em)) { 6064 6060 btrfs_end_transaction(trans, root); 6065 6061 goto unlock_err; ··· 6085 6077 * it above 6086 6078 */ 6087 6079 len = bh_result->b_size; 6088 - em = btrfs_new_extent_direct(inode, em, start, len); 6080 + free_extent_map(em); 6081 + em = btrfs_new_extent_direct(inode, start, len); 6089 6082 if (IS_ERR(em)) { 6090 6083 ret = PTR_ERR(em); 6091 6084 goto unlock_err; ··· 6327 6318 struct btrfs_root *root = BTRFS_I(inode)->root; 6328 6319 int ret; 6329 6320 6321 + if (async_submit) 6322 + async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); 6323 + 6330 6324 bio_get(bio); 6331 6325 6332 6326 if (!write) { ··· 6374 6362 { 6375 6363 struct inode *inode = dip->inode; 6376 6364 struct btrfs_root *root = BTRFS_I(inode)->root; 6377 - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 6378 6365 struct bio *bio; 6379 6366 struct bio *orig_bio = dip->orig_bio; 6380 6367 struct bio_vec *bvec = orig_bio->bi_io_vec; ··· 6386 6375 int async_submit = 0; 6387 6376 6388 6377 map_length = orig_bio->bi_size; 6389 - ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6378 + ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, 6390 6379 &map_length, NULL, 0); 6391 6380 if (ret) { 6392 6381 bio_put(orig_bio); ··· 6440 6429 bio->bi_end_io = btrfs_end_dio_bio; 6441 6430 6442 6431 map_length = orig_bio->bi_size; 6443 - ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6432 + ret = btrfs_map_block(root->fs_info, READ, 6433 + start_sector << 9, 6444 6434 &map_length, NULL, 0); 6445 6435 if (ret) { 6446 6436 bio_put(bio); ··· 6594 6582 btrfs_submit_direct, 0); 6595 6583 } 6596 6584 6585 + #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 6586 + 6597 6587 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 
6598 6588 __u64 start, __u64 len) 6599 6589 { 6590 + int ret; 6591 + 6592 + ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); 6593 + if (ret) 6594 + return ret; 6595 + 6600 6596 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6601 6597 } 6602 6598 ··· 6875 6855 int ret; 6876 6856 int err = 0; 6877 6857 struct btrfs_trans_handle *trans; 6878 - unsigned long nr; 6879 6858 u64 mask = root->sectorsize - 1; 6880 6859 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6881 6860 ··· 6997 6978 break; 6998 6979 } 6999 6980 7000 - nr = trans->blocks_used; 7001 6981 btrfs_end_transaction(trans, root); 7002 - btrfs_btree_balance_dirty(root, nr); 6982 + btrfs_btree_balance_dirty(root); 7003 6983 7004 6984 trans = btrfs_start_transaction(root, 2); 7005 6985 if (IS_ERR(trans)) { ··· 7032 7014 if (ret && !err) 7033 7015 err = ret; 7034 7016 7035 - nr = trans->blocks_used; 7036 7017 ret = btrfs_end_transaction(trans, root); 7037 - btrfs_btree_balance_dirty(root, nr); 7018 + btrfs_btree_balance_dirty(root); 7038 7019 } 7039 7020 7040 7021 out: ··· 7110 7093 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7111 7094 ei->io_tree.track_uptodate = 1; 7112 7095 ei->io_failure_tree.track_uptodate = 1; 7096 + atomic_set(&ei->sync_writers, 0); 7113 7097 mutex_init(&ei->log_mutex); 7114 7098 mutex_init(&ei->delalloc_mutex); 7115 7099 btrfs_ordered_inode_tree_init(&ei->ordered_tree); ··· 7221 7203 kmem_cache_destroy(btrfs_path_cachep); 7222 7204 if (btrfs_free_space_cachep) 7223 7205 kmem_cache_destroy(btrfs_free_space_cachep); 7206 + if (btrfs_delalloc_work_cachep) 7207 + kmem_cache_destroy(btrfs_delalloc_work_cachep); 7224 7208 } 7225 7209 7226 7210 int btrfs_init_cachep(void) ··· 7255 7235 sizeof(struct btrfs_free_space), 0, 7256 7236 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7257 7237 if (!btrfs_free_space_cachep) 7238 + goto fail; 7239 + 7240 + btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", 7241 + sizeof(struct 
btrfs_delalloc_work), 0, 7242 + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 7243 + NULL); 7244 + if (!btrfs_delalloc_work_cachep) 7258 7245 goto fail; 7259 7246 7260 7247 return 0; ··· 7335 7308 if (S_ISDIR(old_inode->i_mode) && new_inode && 7336 7309 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7337 7310 return -ENOTEMPTY; 7311 + 7312 + 7313 + /* check for collisions, even if the name isn't there */ 7314 + ret = btrfs_check_dir_item_collision(root, new_dir->i_ino, 7315 + new_dentry->d_name.name, 7316 + new_dentry->d_name.len); 7317 + 7318 + if (ret) { 7319 + if (ret == -EEXIST) { 7320 + /* we shouldn't get 7321 + * eexist without a new_inode */ 7322 + if (!new_inode) { 7323 + WARN_ON(1); 7324 + return ret; 7325 + } 7326 + } else { 7327 + /* maybe -EOVERFLOW */ 7328 + return ret; 7329 + } 7330 + } 7331 + ret = 0; 7332 + 7338 7333 /* 7339 7334 * we're using rename to replace one file with another. 7340 7335 * and the replacement file is large. Start IO on it now so ··· 7496 7447 return ret; 7497 7448 } 7498 7449 7450 + static void btrfs_run_delalloc_work(struct btrfs_work *work) 7451 + { 7452 + struct btrfs_delalloc_work *delalloc_work; 7453 + 7454 + delalloc_work = container_of(work, struct btrfs_delalloc_work, 7455 + work); 7456 + if (delalloc_work->wait) 7457 + btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); 7458 + else 7459 + filemap_flush(delalloc_work->inode->i_mapping); 7460 + 7461 + if (delalloc_work->delay_iput) 7462 + btrfs_add_delayed_iput(delalloc_work->inode); 7463 + else 7464 + iput(delalloc_work->inode); 7465 + complete(&delalloc_work->completion); 7466 + } 7467 + 7468 + struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, 7469 + int wait, int delay_iput) 7470 + { 7471 + struct btrfs_delalloc_work *work; 7472 + 7473 + work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); 7474 + if (!work) 7475 + return NULL; 7476 + 7477 + init_completion(&work->completion); 7478 + INIT_LIST_HEAD(&work->list); 7479 + work->inode = 
inode; 7480 + work->wait = wait; 7481 + work->delay_iput = delay_iput; 7482 + work->work.func = btrfs_run_delalloc_work; 7483 + 7484 + return work; 7485 + } 7486 + 7487 + void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) 7488 + { 7489 + wait_for_completion(&work->completion); 7490 + kmem_cache_free(btrfs_delalloc_work_cachep, work); 7491 + } 7492 + 7499 7493 /* 7500 7494 * some fairly slow code that needs optimization. This walks the list 7501 7495 * of all the inodes with pending delalloc and forces them to disk. ··· 7548 7456 struct list_head *head = &root->fs_info->delalloc_inodes; 7549 7457 struct btrfs_inode *binode; 7550 7458 struct inode *inode; 7459 + struct btrfs_delalloc_work *work, *next; 7460 + struct list_head works; 7461 + int ret = 0; 7551 7462 7552 7463 if (root->fs_info->sb->s_flags & MS_RDONLY) 7553 7464 return -EROFS; 7465 + 7466 + INIT_LIST_HEAD(&works); 7554 7467 7555 7468 spin_lock(&root->fs_info->delalloc_lock); 7556 7469 while (!list_empty(head)) { ··· 7566 7469 list_del_init(&binode->delalloc_inodes); 7567 7470 spin_unlock(&root->fs_info->delalloc_lock); 7568 7471 if (inode) { 7569 - filemap_flush(inode->i_mapping); 7570 - if (delay_iput) 7571 - btrfs_add_delayed_iput(inode); 7572 - else 7573 - iput(inode); 7472 + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 7473 + if (!work) { 7474 + ret = -ENOMEM; 7475 + goto out; 7476 + } 7477 + list_add_tail(&work->list, &works); 7478 + btrfs_queue_worker(&root->fs_info->flush_workers, 7479 + &work->work); 7574 7480 } 7575 7481 cond_resched(); 7576 7482 spin_lock(&root->fs_info->delalloc_lock); ··· 7592 7492 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 7593 7493 } 7594 7494 atomic_dec(&root->fs_info->async_submit_draining); 7595 - return 0; 7495 + out: 7496 + list_for_each_entry_safe(work, next, &works, list) { 7497 + list_del_init(&work->list); 7498 + btrfs_wait_and_free_delalloc_work(work); 7499 + } 7500 + return ret; 7596 7501 } 7597 7502 7598 7503 
static int btrfs_symlink(struct inode *dir, struct dentry *dentry, ··· 7617 7512 unsigned long ptr; 7618 7513 struct btrfs_file_extent_item *ei; 7619 7514 struct extent_buffer *leaf; 7620 - unsigned long nr = 0; 7621 7515 7622 7516 name_len = strlen(symname) + 1; 7623 7517 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) ··· 7714 7610 out_unlock: 7715 7611 if (!err) 7716 7612 d_instantiate(dentry, inode); 7717 - nr = trans->blocks_used; 7718 7613 btrfs_end_transaction(trans, root); 7719 7614 if (drop_inode) { 7720 7615 inode_dec_link_count(inode); 7721 7616 iput(inode); 7722 7617 } 7723 - btrfs_btree_balance_dirty(root, nr); 7618 + btrfs_btree_balance_dirty(root); 7724 7619 return err; 7725 7620 } 7726 7621 ··· 7782 7679 em->len = ins.offset; 7783 7680 em->block_start = ins.objectid; 7784 7681 em->block_len = ins.offset; 7682 + em->orig_block_len = ins.offset; 7785 7683 em->bdev = root->fs_info->fs_devices->latest_bdev; 7786 7684 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7787 7685 em->generation = trans->transid;
+236 -81
fs/btrfs/ioctl.c
··· 55 55 #include "backref.h" 56 56 #include "rcu-string.h" 57 57 #include "send.h" 58 + #include "dev-replace.h" 58 59 59 60 /* Mask out flags that are inappropriate for the given type of inode. */ 60 61 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) ··· 141 140 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 142 141 } 143 142 144 - if (flags & BTRFS_INODE_NODATACOW) 143 + if (flags & BTRFS_INODE_NODATACOW) { 145 144 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 145 + if (S_ISREG(inode->i_mode)) 146 + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 147 + } 146 148 147 149 btrfs_update_iflags(inode); 148 150 } ··· 575 571 ret = btrfs_commit_transaction(trans, 576 572 root->fs_info->extent_root); 577 573 } 578 - if (ret) 574 + if (ret) { 575 + /* cleanup_transaction has freed this for us */ 576 + if (trans->aborted) 577 + pending_snapshot = NULL; 579 578 goto fail; 579 + } 580 580 581 581 ret = pending_snapshot->error; 582 582 if (ret) ··· 710 702 goto out_dput; 711 703 712 704 error = btrfs_may_create(dir, dentry); 705 + if (error) 706 + goto out_dput; 707 + 708 + /* 709 + * even if this name doesn't exist, we may get hash collisions. 
710 + * check for them now when we can safely fail 711 + */ 712 + error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, 713 + dir->i_ino, name, 714 + namelen); 713 715 if (error) 714 716 goto out_dput; 715 717 ··· 1311 1293 return ret; 1312 1294 } 1313 1295 1314 - static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 1296 + static noinline int btrfs_ioctl_resize(struct file *file, 1315 1297 void __user *arg) 1316 1298 { 1317 1299 u64 new_size; 1318 1300 u64 old_size; 1319 1301 u64 devid = 1; 1302 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 1320 1303 struct btrfs_ioctl_vol_args *vol_args; 1321 1304 struct btrfs_trans_handle *trans; 1322 1305 struct btrfs_device *device = NULL; ··· 1332 1313 if (!capable(CAP_SYS_ADMIN)) 1333 1314 return -EPERM; 1334 1315 1335 - mutex_lock(&root->fs_info->volume_mutex); 1336 - if (root->fs_info->balance_ctl) { 1337 - printk(KERN_INFO "btrfs: balance in progress\n"); 1338 - ret = -EINVAL; 1339 - goto out; 1316 + ret = mnt_want_write_file(file); 1317 + if (ret) 1318 + return ret; 1319 + 1320 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1321 + 1)) { 1322 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 1323 + return -EINPROGRESS; 1340 1324 } 1341 1325 1326 + mutex_lock(&root->fs_info->volume_mutex); 1342 1327 vol_args = memdup_user(arg, sizeof(*vol_args)); 1343 1328 if (IS_ERR(vol_args)) { 1344 1329 ret = PTR_ERR(vol_args); ··· 1362 1339 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1363 1340 (unsigned long long)devid); 1364 1341 } 1365 - device = btrfs_find_device(root, devid, NULL, NULL); 1342 + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); 1366 1343 if (!device) { 1367 1344 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1368 1345 (unsigned long long)devid); ··· 1392 1369 ret = -EINVAL; 1393 1370 goto out_free; 1394 1371 } 1372 + } 1373 + 1374 + if (device->is_tgtdev_for_dev_replace) { 1375 + ret = 
-EINVAL; 1376 + goto out_free; 1395 1377 } 1396 1378 1397 1379 old_size = device->total_bytes; ··· 1437 1409 btrfs_commit_transaction(trans, root); 1438 1410 } else if (new_size < old_size) { 1439 1411 ret = btrfs_shrink_device(device, new_size); 1440 - } 1412 + } /* equal, nothing need to do */ 1441 1413 1442 1414 out_free: 1443 1415 kfree(vol_args); 1444 1416 out: 1445 1417 mutex_unlock(&root->fs_info->volume_mutex); 1418 + mnt_drop_write_file(file); 1419 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 1446 1420 return ret; 1447 1421 } 1448 1422 ··· 2186 2156 if (btrfs_root_readonly(root)) 2187 2157 return -EROFS; 2188 2158 2159 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2160 + 1)) { 2161 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 2162 + return -EINPROGRESS; 2163 + } 2189 2164 ret = mnt_want_write_file(file); 2190 - if (ret) 2165 + if (ret) { 2166 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 2167 + 0); 2191 2168 return ret; 2169 + } 2192 2170 2193 2171 switch (inode->i_mode & S_IFMT) { 2194 2172 case S_IFDIR: ··· 2248 2210 } 2249 2211 out: 2250 2212 mnt_drop_write_file(file); 2213 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2251 2214 return ret; 2252 2215 } 2253 2216 ··· 2260 2221 if (!capable(CAP_SYS_ADMIN)) 2261 2222 return -EPERM; 2262 2223 2263 - mutex_lock(&root->fs_info->volume_mutex); 2264 - if (root->fs_info->balance_ctl) { 2265 - printk(KERN_INFO "btrfs: balance in progress\n"); 2266 - ret = -EINVAL; 2267 - goto out; 2224 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2225 + 1)) { 2226 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 2227 + return -EINPROGRESS; 2268 2228 } 2269 2229 2230 + mutex_lock(&root->fs_info->volume_mutex); 2270 2231 vol_args = memdup_user(arg, sizeof(*vol_args)); 2271 2232 if (IS_ERR(vol_args)) { 2272 2233 ret = PTR_ERR(vol_args); 
··· 2279 2240 kfree(vol_args); 2280 2241 out: 2281 2242 mutex_unlock(&root->fs_info->volume_mutex); 2243 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2282 2244 return ret; 2283 2245 } 2284 2246 2285 - static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2247 + static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 2286 2248 { 2249 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 2287 2250 struct btrfs_ioctl_vol_args *vol_args; 2288 2251 int ret; 2289 2252 2290 2253 if (!capable(CAP_SYS_ADMIN)) 2291 2254 return -EPERM; 2292 2255 2293 - if (root->fs_info->sb->s_flags & MS_RDONLY) 2294 - return -EROFS; 2256 + ret = mnt_want_write_file(file); 2257 + if (ret) 2258 + return ret; 2295 2259 2296 - mutex_lock(&root->fs_info->volume_mutex); 2297 - if (root->fs_info->balance_ctl) { 2298 - printk(KERN_INFO "btrfs: balance in progress\n"); 2299 - ret = -EINVAL; 2300 - goto out; 2260 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2261 + 1)) { 2262 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 2263 + mnt_drop_write_file(file); 2264 + return -EINPROGRESS; 2301 2265 } 2302 2266 2267 + mutex_lock(&root->fs_info->volume_mutex); 2303 2268 vol_args = memdup_user(arg, sizeof(*vol_args)); 2304 2269 if (IS_ERR(vol_args)) { 2305 2270 ret = PTR_ERR(vol_args); ··· 2316 2273 kfree(vol_args); 2317 2274 out: 2318 2275 mutex_unlock(&root->fs_info->volume_mutex); 2276 + mnt_drop_write_file(file); 2277 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2319 2278 return ret; 2320 2279 } 2321 2280 ··· 2373 2328 s_uuid = di_args->uuid; 2374 2329 2375 2330 mutex_lock(&fs_devices->device_list_mutex); 2376 - dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2331 + dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); 2377 2332 mutex_unlock(&fs_devices->device_list_mutex); 2378 2333 2379 2334 if (!dev) { ··· 2866 2821 
struct btrfs_disk_key disk_key; 2867 2822 u64 objectid = 0; 2868 2823 u64 dir_id; 2824 + int ret; 2869 2825 2870 2826 if (!capable(CAP_SYS_ADMIN)) 2871 2827 return -EPERM; 2872 2828 2873 - if (copy_from_user(&objectid, argp, sizeof(objectid))) 2874 - return -EFAULT; 2829 + ret = mnt_want_write_file(file); 2830 + if (ret) 2831 + return ret; 2832 + 2833 + if (copy_from_user(&objectid, argp, sizeof(objectid))) { 2834 + ret = -EFAULT; 2835 + goto out; 2836 + } 2875 2837 2876 2838 if (!objectid) 2877 2839 objectid = root->root_key.objectid; ··· 2888 2836 location.offset = (u64)-1; 2889 2837 2890 2838 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2891 - if (IS_ERR(new_root)) 2892 - return PTR_ERR(new_root); 2839 + if (IS_ERR(new_root)) { 2840 + ret = PTR_ERR(new_root); 2841 + goto out; 2842 + } 2893 2843 2894 - if (btrfs_root_refs(&new_root->root_item) == 0) 2895 - return -ENOENT; 2844 + if (btrfs_root_refs(&new_root->root_item) == 0) { 2845 + ret = -ENOENT; 2846 + goto out; 2847 + } 2896 2848 2897 2849 path = btrfs_alloc_path(); 2898 - if (!path) 2899 - return -ENOMEM; 2850 + if (!path) { 2851 + ret = -ENOMEM; 2852 + goto out; 2853 + } 2900 2854 path->leave_spinning = 1; 2901 2855 2902 2856 trans = btrfs_start_transaction(root, 1); 2903 2857 if (IS_ERR(trans)) { 2904 2858 btrfs_free_path(path); 2905 - return PTR_ERR(trans); 2859 + ret = PTR_ERR(trans); 2860 + goto out; 2906 2861 } 2907 2862 2908 2863 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); ··· 2920 2861 btrfs_end_transaction(trans, root); 2921 2862 printk(KERN_ERR "Umm, you don't have the default dir item, " 2922 2863 "this isn't going to work\n"); 2923 - return -ENOENT; 2864 + ret = -ENOENT; 2865 + goto out; 2924 2866 } 2925 2867 2926 2868 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); ··· 2931 2871 2932 2872 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2933 2873 btrfs_end_transaction(trans, root); 2934 - 2935 - return 0; 2874 + out: 2875 + 
mnt_drop_write_file(file); 2876 + return ret; 2936 2877 } 2937 2878 2938 2879 void btrfs_get_block_group_info(struct list_head *groups_list, ··· 3097 3036 return 0; 3098 3037 } 3099 3038 3100 - static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3039 + static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, 3040 + void __user *argp) 3101 3041 { 3102 - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; 3103 3042 struct btrfs_trans_handle *trans; 3104 3043 u64 transid; 3105 3044 int ret; 3106 3045 3107 - trans = btrfs_start_transaction(root, 0); 3108 - if (IS_ERR(trans)) 3109 - return PTR_ERR(trans); 3046 + trans = btrfs_attach_transaction(root); 3047 + if (IS_ERR(trans)) { 3048 + if (PTR_ERR(trans) != -ENOENT) 3049 + return PTR_ERR(trans); 3050 + 3051 + /* No running transaction, don't bother */ 3052 + transid = root->fs_info->last_trans_committed; 3053 + goto out; 3054 + } 3110 3055 transid = trans->transid; 3111 3056 ret = btrfs_commit_transaction_async(trans, root, 0); 3112 3057 if (ret) { 3113 3058 btrfs_end_transaction(trans, root); 3114 3059 return ret; 3115 3060 } 3116 - 3061 + out: 3117 3062 if (argp) 3118 3063 if (copy_to_user(argp, &transid, sizeof(transid))) 3119 3064 return -EFAULT; 3120 3065 return 0; 3121 3066 } 3122 3067 3123 - static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3068 + static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, 3069 + void __user *argp) 3124 3070 { 3125 - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; 3126 3071 u64 transid; 3127 3072 3128 3073 if (argp) { ··· 3140 3073 return btrfs_wait_for_commit(root, transid); 3141 3074 } 3142 3075 3143 - static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3076 + static long btrfs_ioctl_scrub(struct file *file, void __user *arg) 3144 3077 { 3145 - int ret; 3078 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3146 3079 struct 
btrfs_ioctl_scrub_args *sa; 3080 + int ret; 3147 3081 3148 3082 if (!capable(CAP_SYS_ADMIN)) 3149 3083 return -EPERM; ··· 3153 3085 if (IS_ERR(sa)) 3154 3086 return PTR_ERR(sa); 3155 3087 3156 - ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3157 - &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3088 + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { 3089 + ret = mnt_want_write_file(file); 3090 + if (ret) 3091 + goto out; 3092 + } 3093 + 3094 + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, 3095 + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 3096 + 0); 3158 3097 3159 3098 if (copy_to_user(arg, sa, sizeof(*sa))) 3160 3099 ret = -EFAULT; 3161 3100 3101 + if (!(sa->flags & BTRFS_SCRUB_READONLY)) 3102 + mnt_drop_write_file(file); 3103 + out: 3162 3104 kfree(sa); 3163 3105 return ret; 3164 3106 } ··· 3178 3100 if (!capable(CAP_SYS_ADMIN)) 3179 3101 return -EPERM; 3180 3102 3181 - return btrfs_scrub_cancel(root); 3103 + return btrfs_scrub_cancel(root->fs_info); 3182 3104 } 3183 3105 3184 3106 static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, ··· 3224 3146 ret = -EFAULT; 3225 3147 3226 3148 kfree(sa); 3149 + return ret; 3150 + } 3151 + 3152 + static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) 3153 + { 3154 + struct btrfs_ioctl_dev_replace_args *p; 3155 + int ret; 3156 + 3157 + if (!capable(CAP_SYS_ADMIN)) 3158 + return -EPERM; 3159 + 3160 + p = memdup_user(arg, sizeof(*p)); 3161 + if (IS_ERR(p)) 3162 + return PTR_ERR(p); 3163 + 3164 + switch (p->cmd) { 3165 + case BTRFS_IOCTL_DEV_REPLACE_CMD_START: 3166 + if (atomic_xchg( 3167 + &root->fs_info->mutually_exclusive_operation_running, 3168 + 1)) { 3169 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 3170 + ret = -EINPROGRESS; 3171 + } else { 3172 + ret = btrfs_dev_replace_start(root, p); 3173 + atomic_set( 3174 + &root->fs_info->mutually_exclusive_operation_running, 3175 + 0); 3176 + } 3177 + break; 3178 + case 
BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: 3179 + btrfs_dev_replace_status(root->fs_info, p); 3180 + ret = 0; 3181 + break; 3182 + case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: 3183 + ret = btrfs_dev_replace_cancel(root->fs_info, p); 3184 + break; 3185 + default: 3186 + ret = -EINVAL; 3187 + break; 3188 + } 3189 + 3190 + if (copy_to_user(arg, p, sizeof(*p))) 3191 + ret = -EFAULT; 3192 + 3193 + kfree(p); 3227 3194 return ret; 3228 3195 } 3229 3196 ··· 3438 3315 struct btrfs_ioctl_balance_args *bargs; 3439 3316 struct btrfs_balance_control *bctl; 3440 3317 int ret; 3318 + int need_to_clear_lock = 0; 3441 3319 3442 3320 if (!capable(CAP_SYS_ADMIN)) 3443 3321 return -EPERM; ··· 3474 3350 bargs = NULL; 3475 3351 } 3476 3352 3477 - if (fs_info->balance_ctl) { 3353 + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 3354 + 1)) { 3355 + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); 3478 3356 ret = -EINPROGRESS; 3479 3357 goto out_bargs; 3480 3358 } 3359 + need_to_clear_lock = 1; 3481 3360 3482 3361 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3483 3362 if (!bctl) { ··· 3514 3387 out_bargs: 3515 3388 kfree(bargs); 3516 3389 out: 3390 + if (need_to_clear_lock) 3391 + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 3392 + 0); 3517 3393 mutex_unlock(&fs_info->balance_mutex); 3518 3394 mutex_unlock(&fs_info->volume_mutex); 3519 3395 mnt_drop_write_file(file); ··· 3571 3441 return ret; 3572 3442 } 3573 3443 3574 - static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3444 + static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) 3575 3445 { 3446 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3576 3447 struct btrfs_ioctl_quota_ctl_args *sa; 3577 3448 struct btrfs_trans_handle *trans = NULL; 3578 3449 int ret; ··· 3582 3451 if (!capable(CAP_SYS_ADMIN)) 3583 3452 return -EPERM; 3584 3453 3585 - if (root->fs_info->sb->s_flags & MS_RDONLY) 3586 - return -EROFS; 3454 + ret 
= mnt_want_write_file(file); 3455 + if (ret) 3456 + return ret; 3587 3457 3588 3458 sa = memdup_user(arg, sizeof(*sa)); 3589 - if (IS_ERR(sa)) 3590 - return PTR_ERR(sa); 3459 + if (IS_ERR(sa)) { 3460 + ret = PTR_ERR(sa); 3461 + goto drop_write; 3462 + } 3591 3463 3592 3464 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3593 3465 trans = btrfs_start_transaction(root, 2); ··· 3623 3489 if (err && !ret) 3624 3490 ret = err; 3625 3491 } 3626 - 3627 3492 out: 3628 3493 kfree(sa); 3494 + drop_write: 3495 + mnt_drop_write_file(file); 3629 3496 return ret; 3630 3497 } 3631 3498 3632 - static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3499 + static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) 3633 3500 { 3501 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3634 3502 struct btrfs_ioctl_qgroup_assign_args *sa; 3635 3503 struct btrfs_trans_handle *trans; 3636 3504 int ret; ··· 3641 3505 if (!capable(CAP_SYS_ADMIN)) 3642 3506 return -EPERM; 3643 3507 3644 - if (root->fs_info->sb->s_flags & MS_RDONLY) 3645 - return -EROFS; 3508 + ret = mnt_want_write_file(file); 3509 + if (ret) 3510 + return ret; 3646 3511 3647 3512 sa = memdup_user(arg, sizeof(*sa)); 3648 - if (IS_ERR(sa)) 3649 - return PTR_ERR(sa); 3513 + if (IS_ERR(sa)) { 3514 + ret = PTR_ERR(sa); 3515 + goto drop_write; 3516 + } 3650 3517 3651 3518 trans = btrfs_join_transaction(root); 3652 3519 if (IS_ERR(trans)) { ··· 3672 3533 3673 3534 out: 3674 3535 kfree(sa); 3536 + drop_write: 3537 + mnt_drop_write_file(file); 3675 3538 return ret; 3676 3539 } 3677 3540 3678 - static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3541 + static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) 3679 3542 { 3543 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3680 3544 struct btrfs_ioctl_qgroup_create_args *sa; 3681 3545 struct btrfs_trans_handle *trans; 3682 3546 int ret; ··· 3688 3546 if 
(!capable(CAP_SYS_ADMIN)) 3689 3547 return -EPERM; 3690 3548 3691 - if (root->fs_info->sb->s_flags & MS_RDONLY) 3692 - return -EROFS; 3549 + ret = mnt_want_write_file(file); 3550 + if (ret) 3551 + return ret; 3693 3552 3694 3553 sa = memdup_user(arg, sizeof(*sa)); 3695 - if (IS_ERR(sa)) 3696 - return PTR_ERR(sa); 3554 + if (IS_ERR(sa)) { 3555 + ret = PTR_ERR(sa); 3556 + goto drop_write; 3557 + } 3697 3558 3698 3559 trans = btrfs_join_transaction(root); 3699 3560 if (IS_ERR(trans)) { ··· 3718 3573 3719 3574 out: 3720 3575 kfree(sa); 3576 + drop_write: 3577 + mnt_drop_write_file(file); 3721 3578 return ret; 3722 3579 } 3723 3580 3724 - static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3581 + static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) 3725 3582 { 3583 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3726 3584 struct btrfs_ioctl_qgroup_limit_args *sa; 3727 3585 struct btrfs_trans_handle *trans; 3728 3586 int ret; ··· 3735 3587 if (!capable(CAP_SYS_ADMIN)) 3736 3588 return -EPERM; 3737 3589 3738 - if (root->fs_info->sb->s_flags & MS_RDONLY) 3739 - return -EROFS; 3590 + ret = mnt_want_write_file(file); 3591 + if (ret) 3592 + return ret; 3740 3593 3741 3594 sa = memdup_user(arg, sizeof(*sa)); 3742 - if (IS_ERR(sa)) 3743 - return PTR_ERR(sa); 3595 + if (IS_ERR(sa)) { 3596 + ret = PTR_ERR(sa); 3597 + goto drop_write; 3598 + } 3744 3599 3745 3600 trans = btrfs_join_transaction(root); 3746 3601 if (IS_ERR(trans)) { ··· 3766 3615 3767 3616 out: 3768 3617 kfree(sa); 3618 + drop_write: 3619 + mnt_drop_write_file(file); 3769 3620 return ret; 3770 3621 } 3771 3622 ··· 3888 3735 case BTRFS_IOC_DEFRAG_RANGE: 3889 3736 return btrfs_ioctl_defrag(file, argp); 3890 3737 case BTRFS_IOC_RESIZE: 3891 - return btrfs_ioctl_resize(root, argp); 3738 + return btrfs_ioctl_resize(file, argp); 3892 3739 case BTRFS_IOC_ADD_DEV: 3893 3740 return btrfs_ioctl_add_dev(root, argp); 3894 3741 case BTRFS_IOC_RM_DEV: 3895 - 
return btrfs_ioctl_rm_dev(root, argp); 3742 + return btrfs_ioctl_rm_dev(file, argp); 3896 3743 case BTRFS_IOC_FS_INFO: 3897 3744 return btrfs_ioctl_fs_info(root, argp); 3898 3745 case BTRFS_IOC_DEV_INFO: ··· 3921 3768 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3922 3769 return 0; 3923 3770 case BTRFS_IOC_START_SYNC: 3924 - return btrfs_ioctl_start_sync(file, argp); 3771 + return btrfs_ioctl_start_sync(root, argp); 3925 3772 case BTRFS_IOC_WAIT_SYNC: 3926 - return btrfs_ioctl_wait_sync(file, argp); 3773 + return btrfs_ioctl_wait_sync(root, argp); 3927 3774 case BTRFS_IOC_SCRUB: 3928 - return btrfs_ioctl_scrub(root, argp); 3775 + return btrfs_ioctl_scrub(file, argp); 3929 3776 case BTRFS_IOC_SCRUB_CANCEL: 3930 3777 return btrfs_ioctl_scrub_cancel(root, argp); 3931 3778 case BTRFS_IOC_SCRUB_PROGRESS: ··· 3943 3790 case BTRFS_IOC_GET_DEV_STATS: 3944 3791 return btrfs_ioctl_get_dev_stats(root, argp); 3945 3792 case BTRFS_IOC_QUOTA_CTL: 3946 - return btrfs_ioctl_quota_ctl(root, argp); 3793 + return btrfs_ioctl_quota_ctl(file, argp); 3947 3794 case BTRFS_IOC_QGROUP_ASSIGN: 3948 - return btrfs_ioctl_qgroup_assign(root, argp); 3795 + return btrfs_ioctl_qgroup_assign(file, argp); 3949 3796 case BTRFS_IOC_QGROUP_CREATE: 3950 - return btrfs_ioctl_qgroup_create(root, argp); 3797 + return btrfs_ioctl_qgroup_create(file, argp); 3951 3798 case BTRFS_IOC_QGROUP_LIMIT: 3952 - return btrfs_ioctl_qgroup_limit(root, argp); 3799 + return btrfs_ioctl_qgroup_limit(file, argp); 3800 + case BTRFS_IOC_DEV_REPLACE: 3801 + return btrfs_ioctl_dev_replace(root, argp); 3953 3802 } 3954 3803 3955 3804 return -ENOTTY;
+47 -1
fs/btrfs/ioctl.h
··· 30 30 char name[BTRFS_PATH_NAME_MAX + 1]; 31 31 }; 32 32 33 + #define BTRFS_DEVICE_PATH_NAME_MAX 1024 34 + 33 35 #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 34 36 #define BTRFS_SUBVOL_RDONLY (1ULL << 1) 35 37 #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) ··· 125 123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 126 124 }; 127 125 128 - #define BTRFS_DEVICE_PATH_NAME_MAX 1024 126 + #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 127 + #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 128 + struct btrfs_ioctl_dev_replace_start_params { 129 + __u64 srcdevid; /* in, if 0, use srcdev_name instead */ 130 + __u64 cont_reading_from_srcdev_mode; /* in, see #define 131 + * above */ 132 + __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ 133 + __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ 134 + }; 135 + 136 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 137 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 138 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 139 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 140 + #define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 141 + struct btrfs_ioctl_dev_replace_status_params { 142 + __u64 replace_state; /* out, see #define above */ 143 + __u64 progress_1000; /* out, 0 <= x <= 1000 */ 144 + __u64 time_started; /* out, seconds since 1-Jan-1970 */ 145 + __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ 146 + __u64 num_write_errors; /* out */ 147 + __u64 num_uncorrectable_read_errors; /* out */ 148 + }; 149 + 150 + #define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 151 + #define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 152 + #define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 153 + #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 154 + #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 155 + #define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 156 + struct btrfs_ioctl_dev_replace_args { 157 + __u64 cmd; /* in */ 158 + __u64 result; /* out */ 
159 + 160 + union { 161 + struct btrfs_ioctl_dev_replace_start_params start; 162 + struct btrfs_ioctl_dev_replace_status_params status; 163 + }; /* in/out */ 164 + 165 + __u64 spare[64]; 166 + }; 167 + 129 168 struct btrfs_ioctl_dev_info_args { 130 169 __u64 devid; /* in/out */ 131 170 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ ··· 496 453 struct btrfs_ioctl_qgroup_limit_args) 497 454 #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 498 455 struct btrfs_ioctl_get_dev_stats) 456 + #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ 457 + struct btrfs_ioctl_dev_replace_args) 458 + 499 459 #endif
+44
fs/btrfs/math.h
··· 1 + 2 + /* 3 + * Copyright (C) 2012 Fujitsu. All rights reserved. 4 + * Written by Miao Xie <miaox@cn.fujitsu.com> 5 + * 6 + * This program is free software; you can redistribute it and/or 7 + * modify it under the terms of the GNU General Public 8 + * License v2 as published by the Free Software Foundation. 9 + * 10 + * This program is distributed in the hope that it will be useful, 11 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 + * General Public License for more details. 14 + * 15 + * You should have received a copy of the GNU General Public 16 + * License along with this program; if not, write to the 17 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 18 + * Boston, MA 021110-1307, USA. 19 + */ 20 + 21 + #ifndef __BTRFS_MATH_H 22 + #define __BTRFS_MATH_H 23 + 24 + #include <asm/div64.h> 25 + 26 + static inline u64 div_factor(u64 num, int factor) 27 + { 28 + if (factor == 10) 29 + return num; 30 + num *= factor; 31 + do_div(num, 10); 32 + return num; 33 + } 34 + 35 + static inline u64 div_factor_fine(u64 num, int factor) 36 + { 37 + if (factor == 100) 38 + return num; 39 + num *= factor; 40 + do_div(num, 100); 41 + return num; 42 + } 43 + 44 + #endif
+63 -27
fs/btrfs/ordered-data.c
··· 211 211 init_waitqueue_head(&entry->wait); 212 212 INIT_LIST_HEAD(&entry->list); 213 213 INIT_LIST_HEAD(&entry->root_extent_list); 214 + INIT_LIST_HEAD(&entry->work_list); 215 + init_completion(&entry->completion); 214 216 215 217 trace_btrfs_ordered_extent_add(inode, entry); 216 218 ··· 466 464 wake_up(&entry->wait); 467 465 } 468 466 467 + static void btrfs_run_ordered_extent_work(struct btrfs_work *work) 468 + { 469 + struct btrfs_ordered_extent *ordered; 470 + 471 + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); 472 + btrfs_start_ordered_extent(ordered->inode, ordered, 1); 473 + complete(&ordered->completion); 474 + } 475 + 469 476 /* 470 477 * wait for all the ordered extents in a root. This is done when balancing 471 478 * space between drives. 472 479 */ 473 480 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 474 481 { 475 - struct list_head splice; 482 + struct list_head splice, works; 476 483 struct list_head *cur; 477 - struct btrfs_ordered_extent *ordered; 484 + struct btrfs_ordered_extent *ordered, *next; 478 485 struct inode *inode; 479 486 480 487 INIT_LIST_HEAD(&splice); 488 + INIT_LIST_HEAD(&works); 481 489 482 490 spin_lock(&root->fs_info->ordered_extent_lock); 483 491 list_splice_init(&root->fs_info->ordered_extents, &splice); ··· 506 494 spin_unlock(&root->fs_info->ordered_extent_lock); 507 495 508 496 if (inode) { 509 - btrfs_start_ordered_extent(inode, ordered, 1); 510 - btrfs_put_ordered_extent(ordered); 511 - if (delay_iput) 512 - btrfs_add_delayed_iput(inode); 513 - else 514 - iput(inode); 497 + ordered->flush_work.func = btrfs_run_ordered_extent_work; 498 + list_add_tail(&ordered->work_list, &works); 499 + btrfs_queue_worker(&root->fs_info->flush_workers, 500 + &ordered->flush_work); 515 501 } else { 516 502 btrfs_put_ordered_extent(ordered); 517 503 } 518 504 505 + cond_resched(); 519 506 spin_lock(&root->fs_info->ordered_extent_lock); 520 507 } 521 508 
spin_unlock(&root->fs_info->ordered_extent_lock); 509 + 510 + list_for_each_entry_safe(ordered, next, &works, work_list) { 511 + list_del_init(&ordered->work_list); 512 + wait_for_completion(&ordered->completion); 513 + 514 + inode = ordered->inode; 515 + btrfs_put_ordered_extent(ordered); 516 + if (delay_iput) 517 + btrfs_add_delayed_iput(inode); 518 + else 519 + iput(inode); 520 + 521 + cond_resched(); 522 + } 522 523 } 523 524 524 525 /* ··· 544 519 * extra check to make sure the ordered operation list really is empty 545 520 * before we return 546 521 */ 547 - void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 522 + int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 548 523 { 549 524 struct btrfs_inode *btrfs_inode; 550 525 struct inode *inode; 551 526 struct list_head splice; 527 + struct list_head works; 528 + struct btrfs_delalloc_work *work, *next; 529 + int ret = 0; 552 530 553 531 INIT_LIST_HEAD(&splice); 532 + INIT_LIST_HEAD(&works); 554 533 555 534 mutex_lock(&root->fs_info->ordered_operations_mutex); 556 535 spin_lock(&root->fs_info->ordered_extent_lock); ··· 562 533 list_splice_init(&root->fs_info->ordered_operations, &splice); 563 534 564 535 while (!list_empty(&splice)) { 536 + 565 537 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 538 ordered_operations); 567 539 ··· 579 549 list_add_tail(&BTRFS_I(inode)->ordered_operations, 580 550 &root->fs_info->ordered_operations); 581 551 } 552 + 553 + if (!inode) 554 + continue; 582 555 spin_unlock(&root->fs_info->ordered_extent_lock); 583 556 584 - if (inode) { 585 - if (wait) 586 - btrfs_wait_ordered_range(inode, 0, (u64)-1); 587 - else 588 - filemap_flush(inode->i_mapping); 589 - btrfs_add_delayed_iput(inode); 557 + work = btrfs_alloc_delalloc_work(inode, wait, 1); 558 + if (!work) { 559 + if (list_empty(&BTRFS_I(inode)->ordered_operations)) 560 + list_add_tail(&btrfs_inode->ordered_operations, 561 + &splice); 562 + 
spin_lock(&root->fs_info->ordered_extent_lock); 563 + list_splice_tail(&splice, 564 + &root->fs_info->ordered_operations); 565 + spin_unlock(&root->fs_info->ordered_extent_lock); 566 + ret = -ENOMEM; 567 + goto out; 590 568 } 569 + list_add_tail(&work->list, &works); 570 + btrfs_queue_worker(&root->fs_info->flush_workers, 571 + &work->work); 591 572 592 573 cond_resched(); 593 574 spin_lock(&root->fs_info->ordered_extent_lock); ··· 607 566 goto again; 608 567 609 568 spin_unlock(&root->fs_info->ordered_extent_lock); 569 + out: 570 + list_for_each_entry_safe(work, next, &works, list) { 571 + list_del_init(&work->list); 572 + btrfs_wait_and_free_delalloc_work(work); 573 + } 610 574 mutex_unlock(&root->fs_info->ordered_operations_mutex); 575 + return ret; 611 576 } 612 577 613 578 /* ··· 653 606 u64 end; 654 607 u64 orig_end; 655 608 struct btrfs_ordered_extent *ordered; 656 - int found; 657 609 658 610 if (start + len < start) { 659 611 orig_end = INT_LIMIT(loff_t); ··· 688 642 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 689 643 690 644 end = orig_end; 691 - found = 0; 692 645 while (1) { 693 646 ordered = btrfs_lookup_first_ordered_extent(inode, end); 694 647 if (!ordered) ··· 700 655 btrfs_put_ordered_extent(ordered); 701 656 break; 702 657 } 703 - found++; 704 658 btrfs_start_ordered_extent(inode, ordered, 1); 705 659 end = ordered->file_offset; 706 660 btrfs_put_ordered_extent(ordered); ··· 978 934 if (last_mod < root->fs_info->last_trans_committed) 979 935 return; 980 936 981 - /* 982 - * the transaction is already committing. 
Just start the IO and 983 - * don't bother with all of this list nonsense 984 - */ 985 - if (trans && root->fs_info->running_transaction->blocked) { 986 - btrfs_wait_ordered_range(inode, 0, (u64)-1); 987 - return; 988 - } 989 - 990 937 spin_lock(&root->fs_info->ordered_extent_lock); 991 938 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 992 939 list_add_tail(&BTRFS_I(inode)->ordered_operations, ··· 994 959 NULL); 995 960 if (!btrfs_ordered_extent_cache) 996 961 return -ENOMEM; 962 + 997 963 return 0; 998 964 } 999 965
+5 -2
fs/btrfs/ordered-data.h
··· 128 128 struct list_head root_extent_list; 129 129 130 130 struct btrfs_work work; 131 - }; 132 131 132 + struct completion completion; 133 + struct btrfs_work flush_work; 134 + struct list_head work_list; 135 + }; 133 136 134 137 /* 135 138 * calculates the total size you need to allocate for an ordered sum ··· 189 186 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 190 187 struct btrfs_ordered_extent *ordered); 191 188 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 192 - void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 189 + int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 193 190 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 194 191 struct btrfs_root *root, 195 192 struct inode *inode);
+3
fs/btrfs/print-tree.c
··· 297 297 case BTRFS_DEV_STATS_KEY: 298 298 printk(KERN_INFO "\t\tdevice stats\n"); 299 299 break; 300 + case BTRFS_DEV_REPLACE_KEY: 301 + printk(KERN_INFO "\t\tdev replace\n"); 302 + break; 300 303 }; 301 304 } 302 305 }
+28 -3
fs/btrfs/reada.c
··· 27 27 #include "volumes.h" 28 28 #include "disk-io.h" 29 29 #include "transaction.h" 30 + #include "dev-replace.h" 30 31 31 32 #undef DEBUG 32 33 ··· 324 323 struct reada_extent *re = NULL; 325 324 struct reada_extent *re_exist = NULL; 326 325 struct btrfs_fs_info *fs_info = root->fs_info; 327 - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 328 326 struct btrfs_bio *bbio = NULL; 329 327 struct btrfs_device *dev; 330 328 struct btrfs_device *prev_dev; ··· 332 332 int nzones = 0; 333 333 int i; 334 334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 335 + int dev_replace_is_ongoing; 335 336 336 337 spin_lock(&fs_info->reada_lock); 337 338 re = radix_tree_lookup(&fs_info->reada_tree, index); ··· 359 358 * map block 360 359 */ 361 360 length = blocksize; 362 - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 361 + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, 362 + &bbio, 0); 363 363 if (ret || !bbio || length < blocksize) 364 364 goto error; 365 365 ··· 395 393 } 396 394 397 395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 396 + btrfs_dev_replace_lock(&fs_info->dev_replace); 398 397 spin_lock(&fs_info->reada_lock); 399 398 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 399 if (ret == -EEXIST) { ··· 403 400 BUG_ON(!re_exist); 404 401 re_exist->refcnt++; 405 402 spin_unlock(&fs_info->reada_lock); 403 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 406 404 goto error; 407 405 } 408 406 if (ret) { 409 407 spin_unlock(&fs_info->reada_lock); 408 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 410 409 goto error; 411 410 } 412 411 prev_dev = NULL; 412 + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( 413 + &fs_info->dev_replace); 413 414 for (i = 0; i < nzones; ++i) { 414 415 dev = bbio->stripes[i].dev; 415 416 if (dev == prev_dev) { ··· 426 419 */ 427 420 continue; 428 421 } 422 + if (!dev->bdev) { 423 + /* cannot read ahead on missing device */ 424 
+ continue; 425 + } 426 + if (dev_replace_is_ongoing && 427 + dev == fs_info->dev_replace.tgtdev) { 428 + /* 429 + * as this device is selected for reading only as 430 + * a last resort, skip it for read ahead. 431 + */ 432 + continue; 433 + } 429 434 prev_dev = dev; 430 435 ret = radix_tree_insert(&dev->reada_extents, index, re); 431 436 if (ret) { 432 437 while (--i >= 0) { 433 438 dev = bbio->stripes[i].dev; 434 439 BUG_ON(dev == NULL); 440 + /* ignore whether the entry was inserted */ 435 441 radix_tree_delete(&dev->reada_extents, index); 436 442 } 437 443 BUG_ON(fs_info == NULL); 438 444 radix_tree_delete(&fs_info->reada_tree, index); 439 445 spin_unlock(&fs_info->reada_lock); 446 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 440 447 goto error; 441 448 } 442 449 } 443 450 spin_unlock(&fs_info->reada_lock); 451 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 444 452 445 453 kfree(bbio); 446 454 return re; ··· 937 915 generation = btrfs_header_generation(node); 938 916 free_extent_buffer(node); 939 917 940 - reada_add_block(rc, start, &max_key, level, generation); 918 + if (reada_add_block(rc, start, &max_key, level, generation)) { 919 + kfree(rc); 920 + return ERR_PTR(-ENOMEM); 921 + } 941 922 942 923 reada_start_machine(root->fs_info); 943 924
+19 -21
fs/btrfs/relocation.c
··· 2025 2025 struct btrfs_root_item *root_item; 2026 2026 struct btrfs_path *path; 2027 2027 struct extent_buffer *leaf; 2028 - unsigned long nr; 2029 2028 int level; 2030 2029 int max_level; 2031 2030 int replaced = 0; ··· 2073 2074 BUG_ON(IS_ERR(trans)); 2074 2075 trans->block_rsv = rc->block_rsv; 2075 2076 2076 - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2077 + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, 2078 + BTRFS_RESERVE_FLUSH_ALL); 2077 2079 if (ret) { 2078 2080 BUG_ON(ret != -EAGAIN); 2079 2081 ret = btrfs_commit_transaction(trans, root); ··· 2125 2125 path->slots[level]); 2126 2126 root_item->drop_level = level; 2127 2127 2128 - nr = trans->blocks_used; 2129 2128 btrfs_end_transaction_throttle(trans, root); 2130 2129 2131 - btrfs_btree_balance_dirty(root, nr); 2130 + btrfs_btree_balance_dirty(root); 2132 2131 2133 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2134 2133 invalidate_extent_cache(root, &key, &next_key); ··· 2154 2155 btrfs_update_reloc_root(trans, root); 2155 2156 } 2156 2157 2157 - nr = trans->blocks_used; 2158 2158 btrfs_end_transaction_throttle(trans, root); 2159 2159 2160 - btrfs_btree_balance_dirty(root, nr); 2160 + btrfs_btree_balance_dirty(root); 2161 2161 2162 2162 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2163 2163 invalidate_extent_cache(root, &key, &next_key); ··· 2182 2184 again: 2183 2185 if (!err) { 2184 2186 num_bytes = rc->merging_rsv_size; 2185 - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2187 + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2188 + BTRFS_RESERVE_FLUSH_ALL); 2186 2189 if (ret) 2187 2190 err = ret; 2188 2191 } ··· 2458 2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2459 2460 2460 2461 trans->block_rsv = rc->block_rsv; 2461 - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2462 + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2463 + BTRFS_RESERVE_FLUSH_ALL); 2462 2464 if (ret) { 2463 2465 if (ret == 
-EAGAIN) 2464 2466 rc->commit_transaction = 1; ··· 3259 3259 struct btrfs_path *path; 3260 3260 struct btrfs_root *root = fs_info->tree_root; 3261 3261 struct btrfs_trans_handle *trans; 3262 - unsigned long nr; 3263 3262 int ret = 0; 3264 3263 3265 3264 if (inode) ··· 3292 3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3293 3294 3294 3295 btrfs_free_path(path); 3295 - nr = trans->blocks_used; 3296 3296 btrfs_end_transaction(trans, root); 3297 - btrfs_btree_balance_dirty(root, nr); 3297 + btrfs_btree_balance_dirty(root); 3298 3298 out: 3299 3299 iput(inode); 3300 3300 return ret; ··· 3683 3685 * is no reservation in transaction handle. 3684 3686 */ 3685 3687 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3686 - rc->extent_root->nodesize * 256); 3688 + rc->extent_root->nodesize * 256, 3689 + BTRFS_RESERVE_FLUSH_ALL); 3687 3690 if (ret) 3688 3691 return ret; 3689 3692 ··· 3710 3711 struct btrfs_trans_handle *trans = NULL; 3711 3712 struct btrfs_path *path; 3712 3713 struct btrfs_extent_item *ei; 3713 - unsigned long nr; 3714 3714 u64 flags; 3715 3715 u32 item_size; 3716 3716 int ret; ··· 3826 3828 ret = btrfs_commit_transaction(trans, rc->extent_root); 3827 3829 BUG_ON(ret); 3828 3830 } else { 3829 - nr = trans->blocks_used; 3830 3831 btrfs_end_transaction_throttle(trans, rc->extent_root); 3831 - btrfs_btree_balance_dirty(rc->extent_root, nr); 3832 + btrfs_btree_balance_dirty(rc->extent_root); 3832 3833 } 3833 3834 trans = NULL; 3834 3835 ··· 3857 3860 GFP_NOFS); 3858 3861 3859 3862 if (trans) { 3860 - nr = trans->blocks_used; 3861 3863 btrfs_end_transaction_throttle(trans, rc->extent_root); 3862 - btrfs_btree_balance_dirty(rc->extent_root, nr); 3864 + btrfs_btree_balance_dirty(rc->extent_root); 3863 3865 } 3864 3866 3865 3867 if (!err) { ··· 3937 3941 struct btrfs_trans_handle *trans; 3938 3942 struct btrfs_root *root; 3939 3943 struct btrfs_key key; 3940 - unsigned long nr; 3941 3944 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3942 
3945 int err = 0; 3943 3946 ··· 3964 3969 3965 3970 err = btrfs_orphan_add(trans, inode); 3966 3971 out: 3967 - nr = trans->blocks_used; 3968 3972 btrfs_end_transaction(trans, root); 3969 - btrfs_btree_balance_dirty(root, nr); 3973 + btrfs_btree_balance_dirty(root); 3970 3974 if (err) { 3971 3975 if (inode) 3972 3976 iput(inode); ··· 4051 4057 (unsigned long long)rc->block_group->key.objectid, 4052 4058 (unsigned long long)rc->block_group->flags); 4053 4059 4054 - btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4060 + ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4061 + if (ret < 0) { 4062 + err = ret; 4063 + goto out; 4064 + } 4055 4065 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4056 4066 4057 4067 while (1) {
+2 -2
fs/btrfs/root-tree.c
··· 548 548 struct btrfs_root_item *item = &root->root_item; 549 549 struct timespec ct = CURRENT_TIME; 550 550 551 - spin_lock(&root->root_times_lock); 551 + spin_lock(&root->root_item_lock); 552 552 item->ctransid = cpu_to_le64(trans->transid); 553 553 item->ctime.sec = cpu_to_le64(ct.tv_sec); 554 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 555 - spin_unlock(&root->root_times_lock); 555 + spin_unlock(&root->root_item_lock); 556 556 }
+1333 -503
fs/btrfs/scrub.c
··· 1 1 /* 2 - * Copyright (C) 2011 STRATO. All rights reserved. 2 + * Copyright (C) 2011, 2012 STRATO. All rights reserved. 3 3 * 4 4 * This program is free software; you can redistribute it and/or 5 5 * modify it under the terms of the GNU General Public ··· 25 25 #include "transaction.h" 26 26 #include "backref.h" 27 27 #include "extent_io.h" 28 + #include "dev-replace.h" 28 29 #include "check-integrity.h" 29 30 #include "rcu-string.h" 30 31 ··· 43 42 */ 44 43 45 44 struct scrub_block; 46 - struct scrub_dev; 45 + struct scrub_ctx; 47 46 48 - #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 49 - #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ 47 + /* 48 + * the following three values only influence the performance. 49 + * The last one configures the number of parallel and outstanding I/O 50 + * operations. The first two values configure an upper limit for the number 51 + * of (dynamically allocated) pages that are added to a bio. 52 + */ 53 + #define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ 54 + #define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ 55 + #define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ 56 + 57 + /* 58 + * the following value times PAGE_SIZE needs to be large enough to match the 59 + * largest node/leaf/sector size that shall be supported. 60 + * Values larger than BTRFS_STRIPE_LEN are not supported. 
61 + */ 50 62 #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 51 63 52 64 struct scrub_page { ··· 70 56 u64 generation; 71 57 u64 logical; 72 58 u64 physical; 59 + u64 physical_for_dev_replace; 60 + atomic_t ref_count; 73 61 struct { 74 62 unsigned int mirror_num:8; 75 63 unsigned int have_csum:1; ··· 82 66 83 67 struct scrub_bio { 84 68 int index; 85 - struct scrub_dev *sdev; 69 + struct scrub_ctx *sctx; 70 + struct btrfs_device *dev; 86 71 struct bio *bio; 87 72 int err; 88 73 u64 logical; 89 74 u64 physical; 90 - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 75 + #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO 76 + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; 77 + #else 78 + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; 79 + #endif 91 80 int page_count; 92 81 int next_free; 93 82 struct btrfs_work work; 94 83 }; 95 84 96 85 struct scrub_block { 97 - struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 86 + struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 98 87 int page_count; 99 88 atomic_t outstanding_pages; 100 89 atomic_t ref_count; /* free mem on transition to zero */ 101 - struct scrub_dev *sdev; 90 + struct scrub_ctx *sctx; 102 91 struct { 103 92 unsigned int header_error:1; 104 93 unsigned int checksum_error:1; ··· 112 91 }; 113 92 }; 114 93 115 - struct scrub_dev { 116 - struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; 117 - struct btrfs_device *dev; 94 + struct scrub_wr_ctx { 95 + struct scrub_bio *wr_curr_bio; 96 + struct btrfs_device *tgtdev; 97 + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ 98 + atomic_t flush_all_writes; 99 + struct mutex wr_lock; 100 + }; 101 + 102 + struct scrub_ctx { 103 + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; 104 + struct btrfs_root *dev_root; 118 105 int first_free; 119 106 int curr; 120 - atomic_t in_flight; 121 - atomic_t fixup_cnt; 107 + atomic_t bios_in_flight; 108 + atomic_t workers_pending; 122 109 spinlock_t list_lock; 123 110 wait_queue_head_t list_wait; 124 111 u16 
csum_size; 125 112 struct list_head csum_list; 126 113 atomic_t cancel_req; 127 114 int readonly; 128 - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ 115 + int pages_per_rd_bio; 129 116 u32 sectorsize; 130 117 u32 nodesize; 131 118 u32 leafsize; 119 + 120 + int is_dev_replace; 121 + struct scrub_wr_ctx wr_ctx; 122 + 132 123 /* 133 124 * statistics 134 125 */ ··· 149 116 }; 150 117 151 118 struct scrub_fixup_nodatasum { 152 - struct scrub_dev *sdev; 119 + struct scrub_ctx *sctx; 120 + struct btrfs_device *dev; 153 121 u64 logical; 154 122 struct btrfs_root *root; 155 123 struct btrfs_work work; 156 124 int mirror_num; 125 + }; 126 + 127 + struct scrub_copy_nocow_ctx { 128 + struct scrub_ctx *sctx; 129 + u64 logical; 130 + u64 len; 131 + int mirror_num; 132 + u64 physical_for_dev_replace; 133 + struct btrfs_work work; 157 134 }; 158 135 159 136 struct scrub_warning { ··· 180 137 }; 181 138 182 139 140 + static void scrub_pending_bio_inc(struct scrub_ctx *sctx); 141 + static void scrub_pending_bio_dec(struct scrub_ctx *sctx); 142 + static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); 143 + static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); 183 144 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 184 - static int scrub_setup_recheck_block(struct scrub_dev *sdev, 185 - struct btrfs_mapping_tree *map_tree, 145 + static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 146 + struct btrfs_fs_info *fs_info, 147 + struct scrub_block *original_sblock, 186 148 u64 length, u64 logical, 187 - struct scrub_block *sblock); 188 - static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 189 - struct scrub_block *sblock, int is_metadata, 190 - int have_csum, u8 *csum, u64 generation, 191 - u16 csum_size); 149 + struct scrub_block *sblocks_for_recheck); 150 + static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 151 + struct scrub_block *sblock, int is_metadata, 152 + int have_csum, u8 *csum, u64 
generation, 153 + u16 csum_size); 192 154 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 193 155 struct scrub_block *sblock, 194 156 int is_metadata, int have_csum, ··· 206 158 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 207 159 struct scrub_block *sblock_good, 208 160 int page_num, int force_write); 161 + static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); 162 + static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, 163 + int page_num); 209 164 static int scrub_checksum_data(struct scrub_block *sblock); 210 165 static int scrub_checksum_tree_block(struct scrub_block *sblock); 211 166 static int scrub_checksum_super(struct scrub_block *sblock); 212 167 static void scrub_block_get(struct scrub_block *sblock); 213 168 static void scrub_block_put(struct scrub_block *sblock); 214 - static int scrub_add_page_to_bio(struct scrub_dev *sdev, 215 - struct scrub_page *spage); 216 - static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 217 - u64 physical, u64 flags, u64 gen, int mirror_num, 218 - u8 *csum, int force); 169 + static void scrub_page_get(struct scrub_page *spage); 170 + static void scrub_page_put(struct scrub_page *spage); 171 + static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 172 + struct scrub_page *spage); 173 + static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 174 + u64 physical, struct btrfs_device *dev, u64 flags, 175 + u64 gen, int mirror_num, u8 *csum, int force, 176 + u64 physical_for_dev_replace); 219 177 static void scrub_bio_end_io(struct bio *bio, int err); 220 178 static void scrub_bio_end_io_worker(struct btrfs_work *work); 221 179 static void scrub_block_complete(struct scrub_block *sblock); 180 + static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 181 + u64 extent_logical, u64 extent_len, 182 + u64 *extent_physical, 183 + struct btrfs_device **extent_dev, 184 + int *extent_mirror_num); 185 + static 
int scrub_setup_wr_ctx(struct scrub_ctx *sctx, 186 + struct scrub_wr_ctx *wr_ctx, 187 + struct btrfs_fs_info *fs_info, 188 + struct btrfs_device *dev, 189 + int is_dev_replace); 190 + static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); 191 + static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 192 + struct scrub_page *spage); 193 + static void scrub_wr_submit(struct scrub_ctx *sctx); 194 + static void scrub_wr_bio_end_io(struct bio *bio, int err); 195 + static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); 196 + static int write_page_nocow(struct scrub_ctx *sctx, 197 + u64 physical_for_dev_replace, struct page *page); 198 + static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, 199 + void *ctx); 200 + static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 201 + int mirror_num, u64 physical_for_dev_replace); 202 + static void copy_nocow_pages_worker(struct btrfs_work *work); 222 203 223 204 224 - static void scrub_free_csums(struct scrub_dev *sdev) 205 + static void scrub_pending_bio_inc(struct scrub_ctx *sctx) 225 206 { 226 - while (!list_empty(&sdev->csum_list)) { 207 + atomic_inc(&sctx->bios_in_flight); 208 + } 209 + 210 + static void scrub_pending_bio_dec(struct scrub_ctx *sctx) 211 + { 212 + atomic_dec(&sctx->bios_in_flight); 213 + wake_up(&sctx->list_wait); 214 + } 215 + 216 + /* 217 + * used for workers that require transaction commits (i.e., for the 218 + * NOCOW case) 219 + */ 220 + static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) 221 + { 222 + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 223 + 224 + /* 225 + * increment scrubs_running to prevent cancel requests from 226 + * completing as long as a worker is running. we must also 227 + * increment scrubs_paused to prevent deadlocking on pause 228 + * requests used for transactions commits (as the worker uses a 229 + * transaction context). it is safe to regard the worker 230 + * as paused for all matters practical. 
effectively, we only 231 + * avoid cancellation requests from completing. 232 + */ 233 + mutex_lock(&fs_info->scrub_lock); 234 + atomic_inc(&fs_info->scrubs_running); 235 + atomic_inc(&fs_info->scrubs_paused); 236 + mutex_unlock(&fs_info->scrub_lock); 237 + atomic_inc(&sctx->workers_pending); 238 + } 239 + 240 + /* used for workers that require transaction commits */ 241 + static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) 242 + { 243 + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 244 + 245 + /* 246 + * see scrub_pending_trans_workers_inc() why we're pretending 247 + * to be paused in the scrub counters 248 + */ 249 + mutex_lock(&fs_info->scrub_lock); 250 + atomic_dec(&fs_info->scrubs_running); 251 + atomic_dec(&fs_info->scrubs_paused); 252 + mutex_unlock(&fs_info->scrub_lock); 253 + atomic_dec(&sctx->workers_pending); 254 + wake_up(&fs_info->scrub_pause_wait); 255 + wake_up(&sctx->list_wait); 256 + } 257 + 258 + static void scrub_free_csums(struct scrub_ctx *sctx) 259 + { 260 + while (!list_empty(&sctx->csum_list)) { 227 261 struct btrfs_ordered_sum *sum; 228 - sum = list_first_entry(&sdev->csum_list, 262 + sum = list_first_entry(&sctx->csum_list, 229 263 struct btrfs_ordered_sum, list); 230 264 list_del(&sum->list); 231 265 kfree(sum); 232 266 } 233 267 } 234 268 235 - static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 269 + static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) 236 270 { 237 271 int i; 238 272 239 - if (!sdev) 273 + if (!sctx) 240 274 return; 241 275 276 + scrub_free_wr_ctx(&sctx->wr_ctx); 277 + 242 278 /* this can happen when scrub is cancelled */ 243 - if (sdev->curr != -1) { 244 - struct scrub_bio *sbio = sdev->bios[sdev->curr]; 279 + if (sctx->curr != -1) { 280 + struct scrub_bio *sbio = sctx->bios[sctx->curr]; 245 281 246 282 for (i = 0; i < sbio->page_count; i++) { 247 - BUG_ON(!sbio->pagev[i]); 248 - BUG_ON(!sbio->pagev[i]->page); 283 + WARN_ON(!sbio->pagev[i]->page); 249 284 
scrub_block_put(sbio->pagev[i]->sblock); 250 285 } 251 286 bio_put(sbio->bio); 252 287 } 253 288 254 - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 255 - struct scrub_bio *sbio = sdev->bios[i]; 289 + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 290 + struct scrub_bio *sbio = sctx->bios[i]; 256 291 257 292 if (!sbio) 258 293 break; 259 294 kfree(sbio); 260 295 } 261 296 262 - scrub_free_csums(sdev); 263 - kfree(sdev); 297 + scrub_free_csums(sctx); 298 + kfree(sctx); 264 299 } 265 300 266 301 static noinline_for_stack 267 - struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) 302 + struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) 268 303 { 269 - struct scrub_dev *sdev; 304 + struct scrub_ctx *sctx; 270 305 int i; 271 306 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 272 - int pages_per_bio; 307 + int pages_per_rd_bio; 308 + int ret; 273 309 274 - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, 275 - bio_get_nr_vecs(dev->bdev)); 276 - sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 277 - if (!sdev) 310 + /* 311 + * the setting of pages_per_rd_bio is correct for scrub but might 312 + * be wrong for the dev_replace code where we might read from 313 + * different devices in the initial huge bios. However, that 314 + * code is able to correctly handle the case when adding a page 315 + * to a bio fails. 
316 + */ 317 + if (dev->bdev) 318 + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, 319 + bio_get_nr_vecs(dev->bdev)); 320 + else 321 + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; 322 + sctx = kzalloc(sizeof(*sctx), GFP_NOFS); 323 + if (!sctx) 278 324 goto nomem; 279 - sdev->dev = dev; 280 - sdev->pages_per_bio = pages_per_bio; 281 - sdev->curr = -1; 282 - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 325 + sctx->is_dev_replace = is_dev_replace; 326 + sctx->pages_per_rd_bio = pages_per_rd_bio; 327 + sctx->curr = -1; 328 + sctx->dev_root = dev->dev_root; 329 + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 283 330 struct scrub_bio *sbio; 284 331 285 332 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 286 333 if (!sbio) 287 334 goto nomem; 288 - sdev->bios[i] = sbio; 335 + sctx->bios[i] = sbio; 289 336 290 337 sbio->index = i; 291 - sbio->sdev = sdev; 338 + sbio->sctx = sctx; 292 339 sbio->page_count = 0; 293 340 sbio->work.func = scrub_bio_end_io_worker; 294 341 295 - if (i != SCRUB_BIOS_PER_DEV-1) 296 - sdev->bios[i]->next_free = i + 1; 342 + if (i != SCRUB_BIOS_PER_SCTX - 1) 343 + sctx->bios[i]->next_free = i + 1; 297 344 else 298 - sdev->bios[i]->next_free = -1; 345 + sctx->bios[i]->next_free = -1; 299 346 } 300 - sdev->first_free = 0; 301 - sdev->nodesize = dev->dev_root->nodesize; 302 - sdev->leafsize = dev->dev_root->leafsize; 303 - sdev->sectorsize = dev->dev_root->sectorsize; 304 - atomic_set(&sdev->in_flight, 0); 305 - atomic_set(&sdev->fixup_cnt, 0); 306 - atomic_set(&sdev->cancel_req, 0); 307 - sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); 308 - INIT_LIST_HEAD(&sdev->csum_list); 347 + sctx->first_free = 0; 348 + sctx->nodesize = dev->dev_root->nodesize; 349 + sctx->leafsize = dev->dev_root->leafsize; 350 + sctx->sectorsize = dev->dev_root->sectorsize; 351 + atomic_set(&sctx->bios_in_flight, 0); 352 + atomic_set(&sctx->workers_pending, 0); 353 + atomic_set(&sctx->cancel_req, 0); 354 + sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); 
355 + INIT_LIST_HEAD(&sctx->csum_list); 309 356 310 - spin_lock_init(&sdev->list_lock); 311 - spin_lock_init(&sdev->stat_lock); 312 - init_waitqueue_head(&sdev->list_wait); 313 - return sdev; 357 + spin_lock_init(&sctx->list_lock); 358 + spin_lock_init(&sctx->stat_lock); 359 + init_waitqueue_head(&sctx->list_wait); 360 + 361 + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, 362 + fs_info->dev_replace.tgtdev, is_dev_replace); 363 + if (ret) { 364 + scrub_free_ctx(sctx); 365 + return ERR_PTR(ret); 366 + } 367 + return sctx; 314 368 315 369 nomem: 316 - scrub_free_dev(sdev); 370 + scrub_free_ctx(sctx); 317 371 return ERR_PTR(-ENOMEM); 318 372 } 319 373 320 - static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) 374 + static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, 375 + void *warn_ctx) 321 376 { 322 377 u64 isize; 323 378 u32 nlink; ··· 428 277 int i; 429 278 struct extent_buffer *eb; 430 279 struct btrfs_inode_item *inode_item; 431 - struct scrub_warning *swarn = ctx; 280 + struct scrub_warning *swarn = warn_ctx; 432 281 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; 433 282 struct inode_fs_paths *ipath = NULL; 434 283 struct btrfs_root *local_root; ··· 496 345 497 346 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 498 347 { 499 - struct btrfs_device *dev = sblock->sdev->dev; 500 - struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 348 + struct btrfs_device *dev; 349 + struct btrfs_fs_info *fs_info; 501 350 struct btrfs_path *path; 502 351 struct btrfs_key found_key; 503 352 struct extent_buffer *eb; ··· 512 361 const int bufsize = 4096; 513 362 int ret; 514 363 364 + WARN_ON(sblock->page_count < 1); 365 + dev = sblock->pagev[0]->dev; 366 + fs_info = sblock->sctx->dev_root->fs_info; 367 + 515 368 path = btrfs_alloc_path(); 516 369 517 370 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 518 371 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 519 - 
BUG_ON(sblock->page_count < 1); 520 - swarn.sector = (sblock->pagev[0].physical) >> 9; 521 - swarn.logical = sblock->pagev[0].logical; 372 + swarn.sector = (sblock->pagev[0]->physical) >> 9; 373 + swarn.logical = sblock->pagev[0]->logical; 522 374 swarn.errstr = errstr; 523 - swarn.dev = dev; 375 + swarn.dev = NULL; 524 376 swarn.msg_bufsize = bufsize; 525 377 swarn.scratch_bufsize = bufsize; 526 378 ··· 559 405 } while (ret != 1); 560 406 } else { 561 407 swarn.path = path; 408 + swarn.dev = dev; 562 409 iterate_extent_inodes(fs_info, found_key.objectid, 563 410 extent_item_pos, 1, 564 411 scrub_print_warning_inode, &swarn); ··· 571 416 kfree(swarn.msg_buf); 572 417 } 573 418 574 - static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) 419 + static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) 575 420 { 576 421 struct page *page = NULL; 577 422 unsigned long index; 578 - struct scrub_fixup_nodatasum *fixup = ctx; 423 + struct scrub_fixup_nodatasum *fixup = fixup_ctx; 579 424 int ret; 580 425 int corrected = 0; 581 426 struct btrfs_key key; ··· 606 451 } 607 452 608 453 if (PageUptodate(page)) { 609 - struct btrfs_mapping_tree *map_tree; 454 + struct btrfs_fs_info *fs_info; 610 455 if (PageDirty(page)) { 611 456 /* 612 457 * we need to write the data to the defect sector. 
the ··· 627 472 ret = -EIO; 628 473 goto out; 629 474 } 630 - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 631 - ret = repair_io_failure(map_tree, offset, PAGE_SIZE, 475 + fs_info = BTRFS_I(inode)->root->fs_info; 476 + ret = repair_io_failure(fs_info, offset, PAGE_SIZE, 632 477 fixup->logical, page, 633 478 fixup->mirror_num); 634 479 unlock_page(page); ··· 685 530 { 686 531 int ret; 687 532 struct scrub_fixup_nodatasum *fixup; 688 - struct scrub_dev *sdev; 533 + struct scrub_ctx *sctx; 689 534 struct btrfs_trans_handle *trans = NULL; 690 535 struct btrfs_fs_info *fs_info; 691 536 struct btrfs_path *path; 692 537 int uncorrectable = 0; 693 538 694 539 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 695 - sdev = fixup->sdev; 540 + sctx = fixup->sctx; 696 541 fs_info = fixup->root->fs_info; 697 542 698 543 path = btrfs_alloc_path(); 699 544 if (!path) { 700 - spin_lock(&sdev->stat_lock); 701 - ++sdev->stat.malloc_errors; 702 - spin_unlock(&sdev->stat_lock); 545 + spin_lock(&sctx->stat_lock); 546 + ++sctx->stat.malloc_errors; 547 + spin_unlock(&sctx->stat_lock); 703 548 uncorrectable = 1; 704 549 goto out; 705 550 } ··· 728 573 } 729 574 WARN_ON(ret != 1); 730 575 731 - spin_lock(&sdev->stat_lock); 732 - ++sdev->stat.corrected_errors; 733 - spin_unlock(&sdev->stat_lock); 576 + spin_lock(&sctx->stat_lock); 577 + ++sctx->stat.corrected_errors; 578 + spin_unlock(&sctx->stat_lock); 734 579 735 580 out: 736 581 if (trans && !IS_ERR(trans)) 737 582 btrfs_end_transaction(trans, fixup->root); 738 583 if (uncorrectable) { 739 - spin_lock(&sdev->stat_lock); 740 - ++sdev->stat.uncorrectable_errors; 741 - spin_unlock(&sdev->stat_lock); 742 - 584 + spin_lock(&sctx->stat_lock); 585 + ++sctx->stat.uncorrectable_errors; 586 + spin_unlock(&sctx->stat_lock); 587 + btrfs_dev_replace_stats_inc( 588 + &sctx->dev_root->fs_info->dev_replace. 
589 + num_uncorrectable_read_errors); 743 590 printk_ratelimited_in_rcu(KERN_ERR 744 591 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", 745 592 (unsigned long long)fixup->logical, 746 - rcu_str_deref(sdev->dev->name)); 593 + rcu_str_deref(fixup->dev->name)); 747 594 } 748 595 749 596 btrfs_free_path(path); 750 597 kfree(fixup); 751 598 752 - /* see caller why we're pretending to be paused in the scrub counters */ 753 - mutex_lock(&fs_info->scrub_lock); 754 - atomic_dec(&fs_info->scrubs_running); 755 - atomic_dec(&fs_info->scrubs_paused); 756 - mutex_unlock(&fs_info->scrub_lock); 757 - atomic_dec(&sdev->fixup_cnt); 758 - wake_up(&fs_info->scrub_pause_wait); 759 - wake_up(&sdev->list_wait); 599 + scrub_pending_trans_workers_dec(sctx); 760 600 } 761 601 762 602 /* ··· 764 614 */ 765 615 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) 766 616 { 767 - struct scrub_dev *sdev = sblock_to_check->sdev; 617 + struct scrub_ctx *sctx = sblock_to_check->sctx; 618 + struct btrfs_device *dev; 768 619 struct btrfs_fs_info *fs_info; 769 620 u64 length; 770 621 u64 logical; ··· 784 633 DEFAULT_RATELIMIT_BURST); 785 634 786 635 BUG_ON(sblock_to_check->page_count < 1); 787 - fs_info = sdev->dev->dev_root->fs_info; 636 + fs_info = sctx->dev_root->fs_info; 637 + if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { 638 + /* 639 + * if we find an error in a super block, we just report it. 
640 + * They will get written with the next transaction commit 641 + * anyway 642 + */ 643 + spin_lock(&sctx->stat_lock); 644 + ++sctx->stat.super_errors; 645 + spin_unlock(&sctx->stat_lock); 646 + return 0; 647 + } 788 648 length = sblock_to_check->page_count * PAGE_SIZE; 789 - logical = sblock_to_check->pagev[0].logical; 790 - generation = sblock_to_check->pagev[0].generation; 791 - BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); 792 - failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; 793 - is_metadata = !(sblock_to_check->pagev[0].flags & 649 + logical = sblock_to_check->pagev[0]->logical; 650 + generation = sblock_to_check->pagev[0]->generation; 651 + BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); 652 + failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; 653 + is_metadata = !(sblock_to_check->pagev[0]->flags & 794 654 BTRFS_EXTENT_FLAG_DATA); 795 - have_csum = sblock_to_check->pagev[0].have_csum; 796 - csum = sblock_to_check->pagev[0].csum; 655 + have_csum = sblock_to_check->pagev[0]->have_csum; 656 + csum = sblock_to_check->pagev[0]->csum; 657 + dev = sblock_to_check->pagev[0]->dev; 658 + 659 + if (sctx->is_dev_replace && !is_metadata && !have_csum) { 660 + sblocks_for_recheck = NULL; 661 + goto nodatasum_case; 662 + } 797 663 798 664 /* 799 665 * read all mirrors one after the other. 
This includes to ··· 845 677 sizeof(*sblocks_for_recheck), 846 678 GFP_NOFS); 847 679 if (!sblocks_for_recheck) { 848 - spin_lock(&sdev->stat_lock); 849 - sdev->stat.malloc_errors++; 850 - sdev->stat.read_errors++; 851 - sdev->stat.uncorrectable_errors++; 852 - spin_unlock(&sdev->stat_lock); 853 - btrfs_dev_stat_inc_and_print(sdev->dev, 854 - BTRFS_DEV_STAT_READ_ERRS); 680 + spin_lock(&sctx->stat_lock); 681 + sctx->stat.malloc_errors++; 682 + sctx->stat.read_errors++; 683 + sctx->stat.uncorrectable_errors++; 684 + spin_unlock(&sctx->stat_lock); 685 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 855 686 goto out; 856 687 } 857 688 858 689 /* setup the context, map the logical blocks and alloc the pages */ 859 - ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, 690 + ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, 860 691 logical, sblocks_for_recheck); 861 692 if (ret) { 862 - spin_lock(&sdev->stat_lock); 863 - sdev->stat.read_errors++; 864 - sdev->stat.uncorrectable_errors++; 865 - spin_unlock(&sdev->stat_lock); 866 - btrfs_dev_stat_inc_and_print(sdev->dev, 867 - BTRFS_DEV_STAT_READ_ERRS); 693 + spin_lock(&sctx->stat_lock); 694 + sctx->stat.read_errors++; 695 + sctx->stat.uncorrectable_errors++; 696 + spin_unlock(&sctx->stat_lock); 697 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 868 698 goto out; 869 699 } 870 700 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 871 701 sblock_bad = sblocks_for_recheck + failed_mirror_index; 872 702 873 703 /* build and submit the bios for the failed mirror, check checksums */ 874 - ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 875 - csum, generation, sdev->csum_size); 876 - if (ret) { 877 - spin_lock(&sdev->stat_lock); 878 - sdev->stat.read_errors++; 879 - sdev->stat.uncorrectable_errors++; 880 - spin_unlock(&sdev->stat_lock); 881 - btrfs_dev_stat_inc_and_print(sdev->dev, 882 - BTRFS_DEV_STAT_READ_ERRS); 883 - goto out; 884 - } 
704 + scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 705 + csum, generation, sctx->csum_size); 885 706 886 707 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 887 708 sblock_bad->no_io_error_seen) { ··· 882 725 * different bio (usually one of the two latter cases is 883 726 * the cause) 884 727 */ 885 - spin_lock(&sdev->stat_lock); 886 - sdev->stat.unverified_errors++; 887 - spin_unlock(&sdev->stat_lock); 728 + spin_lock(&sctx->stat_lock); 729 + sctx->stat.unverified_errors++; 730 + spin_unlock(&sctx->stat_lock); 888 731 732 + if (sctx->is_dev_replace) 733 + scrub_write_block_to_dev_replace(sblock_bad); 889 734 goto out; 890 735 } 891 736 892 737 if (!sblock_bad->no_io_error_seen) { 893 - spin_lock(&sdev->stat_lock); 894 - sdev->stat.read_errors++; 895 - spin_unlock(&sdev->stat_lock); 738 + spin_lock(&sctx->stat_lock); 739 + sctx->stat.read_errors++; 740 + spin_unlock(&sctx->stat_lock); 896 741 if (__ratelimit(&_rs)) 897 742 scrub_print_warning("i/o error", sblock_to_check); 898 - btrfs_dev_stat_inc_and_print(sdev->dev, 899 - BTRFS_DEV_STAT_READ_ERRS); 743 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 900 744 } else if (sblock_bad->checksum_error) { 901 - spin_lock(&sdev->stat_lock); 902 - sdev->stat.csum_errors++; 903 - spin_unlock(&sdev->stat_lock); 745 + spin_lock(&sctx->stat_lock); 746 + sctx->stat.csum_errors++; 747 + spin_unlock(&sctx->stat_lock); 904 748 if (__ratelimit(&_rs)) 905 749 scrub_print_warning("checksum error", sblock_to_check); 906 - btrfs_dev_stat_inc_and_print(sdev->dev, 750 + btrfs_dev_stat_inc_and_print(dev, 907 751 BTRFS_DEV_STAT_CORRUPTION_ERRS); 908 752 } else if (sblock_bad->header_error) { 909 - spin_lock(&sdev->stat_lock); 910 - sdev->stat.verify_errors++; 911 - spin_unlock(&sdev->stat_lock); 753 + spin_lock(&sctx->stat_lock); 754 + sctx->stat.verify_errors++; 755 + spin_unlock(&sctx->stat_lock); 912 756 if (__ratelimit(&_rs)) 913 757 scrub_print_warning("checksum/header error", 914 758 
sblock_to_check); 915 759 if (sblock_bad->generation_error) 916 - btrfs_dev_stat_inc_and_print(sdev->dev, 760 + btrfs_dev_stat_inc_and_print(dev, 917 761 BTRFS_DEV_STAT_GENERATION_ERRS); 918 762 else 919 - btrfs_dev_stat_inc_and_print(sdev->dev, 763 + btrfs_dev_stat_inc_and_print(dev, 920 764 BTRFS_DEV_STAT_CORRUPTION_ERRS); 921 765 } 922 766 923 - if (sdev->readonly) 767 + if (sctx->readonly && !sctx->is_dev_replace) 924 768 goto did_not_correct_error; 925 769 926 770 if (!is_metadata && !have_csum) { 927 771 struct scrub_fixup_nodatasum *fixup_nodatasum; 772 + 773 + nodatasum_case: 774 + WARN_ON(sctx->is_dev_replace); 928 775 929 776 /* 930 777 * !is_metadata and !have_csum, this means that the data ··· 940 779 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); 941 780 if (!fixup_nodatasum) 942 781 goto did_not_correct_error; 943 - fixup_nodatasum->sdev = sdev; 782 + fixup_nodatasum->sctx = sctx; 783 + fixup_nodatasum->dev = dev; 944 784 fixup_nodatasum->logical = logical; 945 785 fixup_nodatasum->root = fs_info->extent_root; 946 786 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 947 - /* 948 - * increment scrubs_running to prevent cancel requests from 949 - * completing as long as a fixup worker is running. we must also 950 - * increment scrubs_paused to prevent deadlocking on pause 951 - * requests used for transactions commits (as the worker uses a 952 - * transaction context). it is safe to regard the fixup worker 953 - * as paused for all matters practical. effectively, we only 954 - * avoid cancellation requests from completing. 
955 - */ 956 - mutex_lock(&fs_info->scrub_lock); 957 - atomic_inc(&fs_info->scrubs_running); 958 - atomic_inc(&fs_info->scrubs_paused); 959 - mutex_unlock(&fs_info->scrub_lock); 960 - atomic_inc(&sdev->fixup_cnt); 787 + scrub_pending_trans_workers_inc(sctx); 961 788 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 962 789 btrfs_queue_worker(&fs_info->scrub_workers, 963 790 &fixup_nodatasum->work); ··· 954 805 955 806 /* 956 807 * now build and submit the bios for the other mirrors, check 957 - * checksums 958 - */ 959 - for (mirror_index = 0; 960 - mirror_index < BTRFS_MAX_MIRRORS && 961 - sblocks_for_recheck[mirror_index].page_count > 0; 962 - mirror_index++) { 963 - if (mirror_index == failed_mirror_index) 964 - continue; 965 - 966 - /* build and submit the bios, check checksums */ 967 - ret = scrub_recheck_block(fs_info, 968 - sblocks_for_recheck + mirror_index, 969 - is_metadata, have_csum, csum, 970 - generation, sdev->csum_size); 971 - if (ret) 972 - goto did_not_correct_error; 973 - } 974 - 975 - /* 976 - * first try to pick the mirror which is completely without I/O 808 + * checksums. 809 + * First try to pick the mirror which is completely without I/O 977 810 * errors and also does not have a checksum error. 978 811 * If one is found, and if a checksum is present, the full block 979 812 * that is known to contain an error is rewritten. 
Afterwards ··· 971 840 mirror_index < BTRFS_MAX_MIRRORS && 972 841 sblocks_for_recheck[mirror_index].page_count > 0; 973 842 mirror_index++) { 974 - struct scrub_block *sblock_other = sblocks_for_recheck + 975 - mirror_index; 843 + struct scrub_block *sblock_other; 844 + 845 + if (mirror_index == failed_mirror_index) 846 + continue; 847 + sblock_other = sblocks_for_recheck + mirror_index; 848 + 849 + /* build and submit the bios, check checksums */ 850 + scrub_recheck_block(fs_info, sblock_other, is_metadata, 851 + have_csum, csum, generation, 852 + sctx->csum_size); 976 853 977 854 if (!sblock_other->header_error && 978 855 !sblock_other->checksum_error && 979 856 sblock_other->no_io_error_seen) { 980 - int force_write = is_metadata || have_csum; 857 + if (sctx->is_dev_replace) { 858 + scrub_write_block_to_dev_replace(sblock_other); 859 + } else { 860 + int force_write = is_metadata || have_csum; 981 861 982 - ret = scrub_repair_block_from_good_copy(sblock_bad, 983 - sblock_other, 984 - force_write); 862 + ret = scrub_repair_block_from_good_copy( 863 + sblock_bad, sblock_other, 864 + force_write); 865 + } 985 866 if (0 == ret) 986 867 goto corrected_error; 987 868 } 988 869 } 989 870 990 871 /* 991 - * in case of I/O errors in the area that is supposed to be 872 + * for dev_replace, pick good pages and write to the target device. 
873 + */ 874 + if (sctx->is_dev_replace) { 875 + success = 1; 876 + for (page_num = 0; page_num < sblock_bad->page_count; 877 + page_num++) { 878 + int sub_success; 879 + 880 + sub_success = 0; 881 + for (mirror_index = 0; 882 + mirror_index < BTRFS_MAX_MIRRORS && 883 + sblocks_for_recheck[mirror_index].page_count > 0; 884 + mirror_index++) { 885 + struct scrub_block *sblock_other = 886 + sblocks_for_recheck + mirror_index; 887 + struct scrub_page *page_other = 888 + sblock_other->pagev[page_num]; 889 + 890 + if (!page_other->io_error) { 891 + ret = scrub_write_page_to_dev_replace( 892 + sblock_other, page_num); 893 + if (ret == 0) { 894 + /* succeeded for this page */ 895 + sub_success = 1; 896 + break; 897 + } else { 898 + btrfs_dev_replace_stats_inc( 899 + &sctx->dev_root-> 900 + fs_info->dev_replace. 901 + num_write_errors); 902 + } 903 + } 904 + } 905 + 906 + if (!sub_success) { 907 + /* 908 + * did not find a mirror to fetch the page 909 + * from. scrub_write_page_to_dev_replace() 910 + * handles this case (page->io_error), by 911 + * filling the block with zeros before 912 + * submitting the write request 913 + */ 914 + success = 0; 915 + ret = scrub_write_page_to_dev_replace( 916 + sblock_bad, page_num); 917 + if (ret) 918 + btrfs_dev_replace_stats_inc( 919 + &sctx->dev_root->fs_info-> 920 + dev_replace.num_write_errors); 921 + } 922 + } 923 + 924 + goto out; 925 + } 926 + 927 + /* 928 + * for regular scrub, repair those pages that are errored. 929 + * In case of I/O errors in the area that is supposed to be 992 930 * repaired, continue by picking good copies of those pages. 993 931 * Select the good pages from mirrors to rewrite bad pages from 994 932 * the area to fix. 
Afterwards verify the checksum of the block ··· 1087 887 1088 888 success = 1; 1089 889 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1090 - struct scrub_page *page_bad = sblock_bad->pagev + page_num; 890 + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; 1091 891 1092 892 if (!page_bad->io_error) 1093 893 continue; ··· 1098 898 mirror_index++) { 1099 899 struct scrub_block *sblock_other = sblocks_for_recheck + 1100 900 mirror_index; 1101 - struct scrub_page *page_other = sblock_other->pagev + 1102 - page_num; 901 + struct scrub_page *page_other = sblock_other->pagev[ 902 + page_num]; 1103 903 1104 904 if (!page_other->io_error) { 1105 905 ret = scrub_repair_page_from_good_copy( ··· 1128 928 * is verified, but most likely the data comes out 1129 929 * of the page cache. 1130 930 */ 1131 - ret = scrub_recheck_block(fs_info, sblock_bad, 1132 - is_metadata, have_csum, csum, 1133 - generation, sdev->csum_size); 1134 - if (!ret && !sblock_bad->header_error && 931 + scrub_recheck_block(fs_info, sblock_bad, 932 + is_metadata, have_csum, csum, 933 + generation, sctx->csum_size); 934 + if (!sblock_bad->header_error && 1135 935 !sblock_bad->checksum_error && 1136 936 sblock_bad->no_io_error_seen) 1137 937 goto corrected_error; ··· 1139 939 goto did_not_correct_error; 1140 940 } else { 1141 941 corrected_error: 1142 - spin_lock(&sdev->stat_lock); 1143 - sdev->stat.corrected_errors++; 1144 - spin_unlock(&sdev->stat_lock); 942 + spin_lock(&sctx->stat_lock); 943 + sctx->stat.corrected_errors++; 944 + spin_unlock(&sctx->stat_lock); 1145 945 printk_ratelimited_in_rcu(KERN_ERR 1146 946 "btrfs: fixed up error at logical %llu on dev %s\n", 1147 947 (unsigned long long)logical, 1148 - rcu_str_deref(sdev->dev->name)); 948 + rcu_str_deref(dev->name)); 1149 949 } 1150 950 } else { 1151 951 did_not_correct_error: 1152 - spin_lock(&sdev->stat_lock); 1153 - sdev->stat.uncorrectable_errors++; 1154 - spin_unlock(&sdev->stat_lock); 952 + 
spin_lock(&sctx->stat_lock); 953 + sctx->stat.uncorrectable_errors++; 954 + spin_unlock(&sctx->stat_lock); 1155 955 printk_ratelimited_in_rcu(KERN_ERR 1156 956 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1157 957 (unsigned long long)logical, 1158 - rcu_str_deref(sdev->dev->name)); 958 + rcu_str_deref(dev->name)); 1159 959 } 1160 960 1161 961 out: ··· 1166 966 mirror_index; 1167 967 int page_index; 1168 968 1169 - for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; 1170 - page_index++) 1171 - if (sblock->pagev[page_index].page) 1172 - __free_page( 1173 - sblock->pagev[page_index].page); 969 + for (page_index = 0; page_index < sblock->page_count; 970 + page_index++) { 971 + sblock->pagev[page_index]->sblock = NULL; 972 + scrub_page_put(sblock->pagev[page_index]); 973 + } 1174 974 } 1175 975 kfree(sblocks_for_recheck); 1176 976 } ··· 1178 978 return 0; 1179 979 } 1180 980 1181 - static int scrub_setup_recheck_block(struct scrub_dev *sdev, 1182 - struct btrfs_mapping_tree *map_tree, 981 + static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 982 + struct btrfs_fs_info *fs_info, 983 + struct scrub_block *original_sblock, 1183 984 u64 length, u64 logical, 1184 985 struct scrub_block *sblocks_for_recheck) 1185 986 { ··· 1189 988 int ret; 1190 989 1191 990 /* 1192 - * note: the three members sdev, ref_count and outstanding_pages 991 + * note: the two members ref_count and outstanding_pages 1193 992 * are not used (and not set) in the blocks that are used for 1194 993 * the recheck procedure 1195 994 */ ··· 1204 1003 * with a length of PAGE_SIZE, each returned stripe 1205 1004 * represents one mirror 1206 1005 */ 1207 - ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, 1208 - &bbio, 0); 1006 + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, 1007 + &mapped_length, &bbio, 0); 1209 1008 if (ret || !bbio || mapped_length < sublen) { 1210 1009 kfree(bbio); 1211 1010 return -EIO; 1212 1011 } 1213 1012 1214 - 
BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); 1013 + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1215 1014 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1216 1015 mirror_index++) { 1217 1016 struct scrub_block *sblock; ··· 1221 1020 continue; 1222 1021 1223 1022 sblock = sblocks_for_recheck + mirror_index; 1224 - page = sblock->pagev + page_index; 1225 - page->logical = logical; 1226 - page->physical = bbio->stripes[mirror_index].physical; 1227 - /* for missing devices, dev->bdev is NULL */ 1228 - page->dev = bbio->stripes[mirror_index].dev; 1229 - page->mirror_num = mirror_index + 1; 1230 - page->page = alloc_page(GFP_NOFS); 1231 - if (!page->page) { 1232 - spin_lock(&sdev->stat_lock); 1233 - sdev->stat.malloc_errors++; 1234 - spin_unlock(&sdev->stat_lock); 1023 + sblock->sctx = sctx; 1024 + page = kzalloc(sizeof(*page), GFP_NOFS); 1025 + if (!page) { 1026 + leave_nomem: 1027 + spin_lock(&sctx->stat_lock); 1028 + sctx->stat.malloc_errors++; 1029 + spin_unlock(&sctx->stat_lock); 1235 1030 kfree(bbio); 1236 1031 return -ENOMEM; 1237 1032 } 1033 + scrub_page_get(page); 1034 + sblock->pagev[page_index] = page; 1035 + page->logical = logical; 1036 + page->physical = bbio->stripes[mirror_index].physical; 1037 + BUG_ON(page_index >= original_sblock->page_count); 1038 + page->physical_for_dev_replace = 1039 + original_sblock->pagev[page_index]-> 1040 + physical_for_dev_replace; 1041 + /* for missing devices, dev->bdev is NULL */ 1042 + page->dev = bbio->stripes[mirror_index].dev; 1043 + page->mirror_num = mirror_index + 1; 1238 1044 sblock->page_count++; 1045 + page->page = alloc_page(GFP_NOFS); 1046 + if (!page->page) 1047 + goto leave_nomem; 1239 1048 } 1240 1049 kfree(bbio); 1241 1050 length -= sublen; ··· 1263 1052 * to take those pages that are not errored from all the mirrors so that 1264 1053 * the pages that are errored in the just handled mirror can be repaired. 
1265 1054 */ 1266 - static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1267 - struct scrub_block *sblock, int is_metadata, 1268 - int have_csum, u8 *csum, u64 generation, 1269 - u16 csum_size) 1055 + static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1056 + struct scrub_block *sblock, int is_metadata, 1057 + int have_csum, u8 *csum, u64 generation, 1058 + u16 csum_size) 1270 1059 { 1271 1060 int page_num; 1272 1061 ··· 1276 1065 1277 1066 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1278 1067 struct bio *bio; 1279 - int ret; 1280 - struct scrub_page *page = sblock->pagev + page_num; 1068 + struct scrub_page *page = sblock->pagev[page_num]; 1281 1069 DECLARE_COMPLETION_ONSTACK(complete); 1282 1070 1283 1071 if (page->dev->bdev == NULL) { ··· 1285 1075 continue; 1286 1076 } 1287 1077 1288 - BUG_ON(!page->page); 1078 + WARN_ON(!page->page); 1289 1079 bio = bio_alloc(GFP_NOFS, 1); 1290 - if (!bio) 1291 - return -EIO; 1080 + if (!bio) { 1081 + page->io_error = 1; 1082 + sblock->no_io_error_seen = 0; 1083 + continue; 1084 + } 1292 1085 bio->bi_bdev = page->dev->bdev; 1293 1086 bio->bi_sector = page->physical >> 9; 1294 1087 bio->bi_end_io = scrub_complete_bio_end_io; 1295 1088 bio->bi_private = &complete; 1296 1089 1297 - ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1298 - if (PAGE_SIZE != ret) { 1299 - bio_put(bio); 1300 - return -EIO; 1301 - } 1090 + bio_add_page(bio, page->page, PAGE_SIZE, 0); 1302 1091 btrfsic_submit_bio(READ, bio); 1303 1092 1304 1093 /* this will also unplug the queue */ ··· 1314 1105 have_csum, csum, generation, 1315 1106 csum_size); 1316 1107 1317 - return 0; 1108 + return; 1318 1109 } 1319 1110 1320 1111 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, ··· 1329 1120 struct btrfs_root *root = fs_info->extent_root; 1330 1121 void *mapped_buffer; 1331 1122 1332 - BUG_ON(!sblock->pagev[0].page); 1123 + WARN_ON(!sblock->pagev[0]->page); 1333 1124 if (is_metadata) { 1334 1125 
struct btrfs_header *h; 1335 1126 1336 - mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1127 + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); 1337 1128 h = (struct btrfs_header *)mapped_buffer; 1338 1129 1339 - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1130 + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) || 1340 1131 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1341 1132 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1342 1133 BTRFS_UUID_SIZE)) { ··· 1350 1141 if (!have_csum) 1351 1142 return; 1352 1143 1353 - mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1144 + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); 1354 1145 } 1355 1146 1356 1147 for (page_num = 0;;) { ··· 1366 1157 page_num++; 1367 1158 if (page_num >= sblock->page_count) 1368 1159 break; 1369 - BUG_ON(!sblock->pagev[page_num].page); 1160 + WARN_ON(!sblock->pagev[page_num]->page); 1370 1161 1371 - mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1162 + mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page); 1372 1163 } 1373 1164 1374 1165 btrfs_csum_final(crc, calculated_csum); ··· 1406 1197 struct scrub_block *sblock_good, 1407 1198 int page_num, int force_write) 1408 1199 { 1409 - struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1410 - struct scrub_page *page_good = sblock_good->pagev + page_num; 1200 + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; 1201 + struct scrub_page *page_good = sblock_good->pagev[page_num]; 1411 1202 1412 - BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1413 - BUG_ON(sblock_good->pagev[page_num].page == NULL); 1203 + BUG_ON(page_bad->page == NULL); 1204 + BUG_ON(page_good->page == NULL); 1414 1205 if (force_write || sblock_bad->header_error || 1415 1206 sblock_bad->checksum_error || page_bad->io_error) { 1416 1207 struct bio *bio; 1417 1208 int ret; 1418 1209 DECLARE_COMPLETION_ONSTACK(complete); 1210 + 1211 + if (!page_bad->dev->bdev) { 1212 + printk_ratelimited(KERN_WARNING 
1213 + "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); 1214 + return -EIO; 1215 + } 1419 1216 1420 1217 bio = bio_alloc(GFP_NOFS, 1); 1421 1218 if (!bio) ··· 1443 1228 if (!bio_flagged(bio, BIO_UPTODATE)) { 1444 1229 btrfs_dev_stat_inc_and_print(page_bad->dev, 1445 1230 BTRFS_DEV_STAT_WRITE_ERRS); 1231 + btrfs_dev_replace_stats_inc( 1232 + &sblock_bad->sctx->dev_root->fs_info-> 1233 + dev_replace.num_write_errors); 1446 1234 bio_put(bio); 1447 1235 return -EIO; 1448 1236 } ··· 1455 1237 return 0; 1456 1238 } 1457 1239 1458 - static void scrub_checksum(struct scrub_block *sblock) 1240 + static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) 1241 + { 1242 + int page_num; 1243 + 1244 + for (page_num = 0; page_num < sblock->page_count; page_num++) { 1245 + int ret; 1246 + 1247 + ret = scrub_write_page_to_dev_replace(sblock, page_num); 1248 + if (ret) 1249 + btrfs_dev_replace_stats_inc( 1250 + &sblock->sctx->dev_root->fs_info->dev_replace. 1251 + num_write_errors); 1252 + } 1253 + } 1254 + 1255 + static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, 1256 + int page_num) 1257 + { 1258 + struct scrub_page *spage = sblock->pagev[page_num]; 1259 + 1260 + BUG_ON(spage->page == NULL); 1261 + if (spage->io_error) { 1262 + void *mapped_buffer = kmap_atomic(spage->page); 1263 + 1264 + memset(mapped_buffer, 0, PAGE_CACHE_SIZE); 1265 + flush_dcache_page(spage->page); 1266 + kunmap_atomic(mapped_buffer); 1267 + } 1268 + return scrub_add_page_to_wr_bio(sblock->sctx, spage); 1269 + } 1270 + 1271 + static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 1272 + struct scrub_page *spage) 1273 + { 1274 + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; 1275 + struct scrub_bio *sbio; 1276 + int ret; 1277 + 1278 + mutex_lock(&wr_ctx->wr_lock); 1279 + again: 1280 + if (!wr_ctx->wr_curr_bio) { 1281 + wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), 1282 + GFP_NOFS); 1283 + if (!wr_ctx->wr_curr_bio) { 1284 + 
mutex_unlock(&wr_ctx->wr_lock); 1285 + return -ENOMEM; 1286 + } 1287 + wr_ctx->wr_curr_bio->sctx = sctx; 1288 + wr_ctx->wr_curr_bio->page_count = 0; 1289 + } 1290 + sbio = wr_ctx->wr_curr_bio; 1291 + if (sbio->page_count == 0) { 1292 + struct bio *bio; 1293 + 1294 + sbio->physical = spage->physical_for_dev_replace; 1295 + sbio->logical = spage->logical; 1296 + sbio->dev = wr_ctx->tgtdev; 1297 + bio = sbio->bio; 1298 + if (!bio) { 1299 + bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); 1300 + if (!bio) { 1301 + mutex_unlock(&wr_ctx->wr_lock); 1302 + return -ENOMEM; 1303 + } 1304 + sbio->bio = bio; 1305 + } 1306 + 1307 + bio->bi_private = sbio; 1308 + bio->bi_end_io = scrub_wr_bio_end_io; 1309 + bio->bi_bdev = sbio->dev->bdev; 1310 + bio->bi_sector = sbio->physical >> 9; 1311 + sbio->err = 0; 1312 + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1313 + spage->physical_for_dev_replace || 1314 + sbio->logical + sbio->page_count * PAGE_SIZE != 1315 + spage->logical) { 1316 + scrub_wr_submit(sctx); 1317 + goto again; 1318 + } 1319 + 1320 + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); 1321 + if (ret != PAGE_SIZE) { 1322 + if (sbio->page_count < 1) { 1323 + bio_put(sbio->bio); 1324 + sbio->bio = NULL; 1325 + mutex_unlock(&wr_ctx->wr_lock); 1326 + return -EIO; 1327 + } 1328 + scrub_wr_submit(sctx); 1329 + goto again; 1330 + } 1331 + 1332 + sbio->pagev[sbio->page_count] = spage; 1333 + scrub_page_get(spage); 1334 + sbio->page_count++; 1335 + if (sbio->page_count == wr_ctx->pages_per_wr_bio) 1336 + scrub_wr_submit(sctx); 1337 + mutex_unlock(&wr_ctx->wr_lock); 1338 + 1339 + return 0; 1340 + } 1341 + 1342 + static void scrub_wr_submit(struct scrub_ctx *sctx) 1343 + { 1344 + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; 1345 + struct scrub_bio *sbio; 1346 + 1347 + if (!wr_ctx->wr_curr_bio) 1348 + return; 1349 + 1350 + sbio = wr_ctx->wr_curr_bio; 1351 + wr_ctx->wr_curr_bio = NULL; 1352 + WARN_ON(!sbio->bio->bi_bdev); 1353 + 
scrub_pending_bio_inc(sctx); 1354 + /* process all writes in a single worker thread. Then the block layer 1355 + * orders the requests before sending them to the driver which 1356 + * doubled the write performance on spinning disks when measured 1357 + * with Linux 3.5 */ 1358 + btrfsic_submit_bio(WRITE, sbio->bio); 1359 + } 1360 + 1361 + static void scrub_wr_bio_end_io(struct bio *bio, int err) 1362 + { 1363 + struct scrub_bio *sbio = bio->bi_private; 1364 + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; 1365 + 1366 + sbio->err = err; 1367 + sbio->bio = bio; 1368 + 1369 + sbio->work.func = scrub_wr_bio_end_io_worker; 1370 + btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); 1371 + } 1372 + 1373 + static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1374 + { 1375 + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 1376 + struct scrub_ctx *sctx = sbio->sctx; 1377 + int i; 1378 + 1379 + WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); 1380 + if (sbio->err) { 1381 + struct btrfs_dev_replace *dev_replace = 1382 + &sbio->sctx->dev_root->fs_info->dev_replace; 1383 + 1384 + for (i = 0; i < sbio->page_count; i++) { 1385 + struct scrub_page *spage = sbio->pagev[i]; 1386 + 1387 + spage->io_error = 1; 1388 + btrfs_dev_replace_stats_inc(&dev_replace-> 1389 + num_write_errors); 1390 + } 1391 + } 1392 + 1393 + for (i = 0; i < sbio->page_count; i++) 1394 + scrub_page_put(sbio->pagev[i]); 1395 + 1396 + bio_put(sbio->bio); 1397 + kfree(sbio); 1398 + scrub_pending_bio_dec(sctx); 1399 + } 1400 + 1401 + static int scrub_checksum(struct scrub_block *sblock) 1459 1402 { 1460 1403 u64 flags; 1461 1404 int ret; 1462 1405 1463 - BUG_ON(sblock->page_count < 1); 1464 - flags = sblock->pagev[0].flags; 1406 + WARN_ON(sblock->page_count < 1); 1407 + flags = sblock->pagev[0]->flags; 1465 1408 ret = 0; 1466 1409 if (flags & BTRFS_EXTENT_FLAG_DATA) 1467 1410 ret = scrub_checksum_data(sblock); ··· 1634 1255 WARN_ON(1); 1635 1256 
if (ret) 1636 1257 scrub_handle_errored_block(sblock); 1258 + 1259 + return ret; 1637 1260 } 1638 1261 1639 1262 static int scrub_checksum_data(struct scrub_block *sblock) 1640 1263 { 1641 - struct scrub_dev *sdev = sblock->sdev; 1264 + struct scrub_ctx *sctx = sblock->sctx; 1642 1265 u8 csum[BTRFS_CSUM_SIZE]; 1643 1266 u8 *on_disk_csum; 1644 1267 struct page *page; 1645 1268 void *buffer; 1646 1269 u32 crc = ~(u32)0; 1647 1270 int fail = 0; 1648 - struct btrfs_root *root = sdev->dev->dev_root; 1271 + struct btrfs_root *root = sctx->dev_root; 1649 1272 u64 len; 1650 1273 int index; 1651 1274 1652 1275 BUG_ON(sblock->page_count < 1); 1653 - if (!sblock->pagev[0].have_csum) 1276 + if (!sblock->pagev[0]->have_csum) 1654 1277 return 0; 1655 1278 1656 - on_disk_csum = sblock->pagev[0].csum; 1657 - page = sblock->pagev[0].page; 1279 + on_disk_csum = sblock->pagev[0]->csum; 1280 + page = sblock->pagev[0]->page; 1658 1281 buffer = kmap_atomic(page); 1659 1282 1660 - len = sdev->sectorsize; 1283 + len = sctx->sectorsize; 1661 1284 index = 0; 1662 1285 for (;;) { 1663 1286 u64 l = min_t(u64, len, PAGE_SIZE); ··· 1671 1290 break; 1672 1291 index++; 1673 1292 BUG_ON(index >= sblock->page_count); 1674 - BUG_ON(!sblock->pagev[index].page); 1675 - page = sblock->pagev[index].page; 1293 + BUG_ON(!sblock->pagev[index]->page); 1294 + page = sblock->pagev[index]->page; 1676 1295 buffer = kmap_atomic(page); 1677 1296 } 1678 1297 1679 1298 btrfs_csum_final(crc, csum); 1680 - if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1299 + if (memcmp(csum, on_disk_csum, sctx->csum_size)) 1681 1300 fail = 1; 1682 1301 1683 1302 return fail; ··· 1685 1304 1686 1305 static int scrub_checksum_tree_block(struct scrub_block *sblock) 1687 1306 { 1688 - struct scrub_dev *sdev = sblock->sdev; 1307 + struct scrub_ctx *sctx = sblock->sctx; 1689 1308 struct btrfs_header *h; 1690 - struct btrfs_root *root = sdev->dev->dev_root; 1309 + struct btrfs_root *root = sctx->dev_root; 1691 1310 struct btrfs_fs_info 
*fs_info = root->fs_info; 1692 1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1693 1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; ··· 1702 1321 int index; 1703 1322 1704 1323 BUG_ON(sblock->page_count < 1); 1705 - page = sblock->pagev[0].page; 1324 + page = sblock->pagev[0]->page; 1706 1325 mapped_buffer = kmap_atomic(page); 1707 1326 h = (struct btrfs_header *)mapped_buffer; 1708 - memcpy(on_disk_csum, h->csum, sdev->csum_size); 1327 + memcpy(on_disk_csum, h->csum, sctx->csum_size); 1709 1328 1710 1329 /* 1711 1330 * we don't use the getter functions here, as we ··· 1713 1332 * b) the page is already kmapped 1714 1333 */ 1715 1334 1716 - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1335 + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr)) 1717 1336 ++fail; 1718 1337 1719 - if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1338 + if (sblock->pagev[0]->generation != le64_to_cpu(h->generation)) 1720 1339 ++fail; 1721 1340 1722 1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) ··· 1726 1345 BTRFS_UUID_SIZE)) 1727 1346 ++fail; 1728 1347 1729 - BUG_ON(sdev->nodesize != sdev->leafsize); 1730 - len = sdev->nodesize - BTRFS_CSUM_SIZE; 1348 + WARN_ON(sctx->nodesize != sctx->leafsize); 1349 + len = sctx->nodesize - BTRFS_CSUM_SIZE; 1731 1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1732 1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1733 1352 index = 0; ··· 1741 1360 break; 1742 1361 index++; 1743 1362 BUG_ON(index >= sblock->page_count); 1744 - BUG_ON(!sblock->pagev[index].page); 1745 - page = sblock->pagev[index].page; 1363 + BUG_ON(!sblock->pagev[index]->page); 1364 + page = sblock->pagev[index]->page; 1746 1365 mapped_buffer = kmap_atomic(page); 1747 1366 mapped_size = PAGE_SIZE; 1748 1367 p = mapped_buffer; 1749 1368 } 1750 1369 1751 1370 btrfs_csum_final(crc, calculated_csum); 1752 - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1371 + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1753 1372 ++crc_fail; 1754 1373 
1755 1374 return fail || crc_fail; ··· 1758 1377 static int scrub_checksum_super(struct scrub_block *sblock) 1759 1378 { 1760 1379 struct btrfs_super_block *s; 1761 - struct scrub_dev *sdev = sblock->sdev; 1762 - struct btrfs_root *root = sdev->dev->dev_root; 1380 + struct scrub_ctx *sctx = sblock->sctx; 1381 + struct btrfs_root *root = sctx->dev_root; 1763 1382 struct btrfs_fs_info *fs_info = root->fs_info; 1764 1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1765 1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; ··· 1774 1393 int index; 1775 1394 1776 1395 BUG_ON(sblock->page_count < 1); 1777 - page = sblock->pagev[0].page; 1396 + page = sblock->pagev[0]->page; 1778 1397 mapped_buffer = kmap_atomic(page); 1779 1398 s = (struct btrfs_super_block *)mapped_buffer; 1780 - memcpy(on_disk_csum, s->csum, sdev->csum_size); 1399 + memcpy(on_disk_csum, s->csum, sctx->csum_size); 1781 1400 1782 - if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1401 + if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr)) 1783 1402 ++fail_cor; 1784 1403 1785 - if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1404 + if (sblock->pagev[0]->generation != le64_to_cpu(s->generation)) 1786 1405 ++fail_gen; 1787 1406 1788 1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) ··· 1802 1421 break; 1803 1422 index++; 1804 1423 BUG_ON(index >= sblock->page_count); 1805 - BUG_ON(!sblock->pagev[index].page); 1806 - page = sblock->pagev[index].page; 1424 + BUG_ON(!sblock->pagev[index]->page); 1425 + page = sblock->pagev[index]->page; 1807 1426 mapped_buffer = kmap_atomic(page); 1808 1427 mapped_size = PAGE_SIZE; 1809 1428 p = mapped_buffer; 1810 1429 } 1811 1430 1812 1431 btrfs_csum_final(crc, calculated_csum); 1813 - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1432 + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1814 1433 ++fail_cor; 1815 1434 1816 1435 if (fail_cor + fail_gen) { ··· 1819 1438 * They will get written with the next transaction commit 1820 1439 * 
anyway 1821 1440 */ 1822 - spin_lock(&sdev->stat_lock); 1823 - ++sdev->stat.super_errors; 1824 - spin_unlock(&sdev->stat_lock); 1441 + spin_lock(&sctx->stat_lock); 1442 + ++sctx->stat.super_errors; 1443 + spin_unlock(&sctx->stat_lock); 1825 1444 if (fail_cor) 1826 - btrfs_dev_stat_inc_and_print(sdev->dev, 1445 + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, 1827 1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1828 1447 else 1829 - btrfs_dev_stat_inc_and_print(sdev->dev, 1448 + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, 1830 1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1831 1450 } 1832 1451 ··· 1844 1463 int i; 1845 1464 1846 1465 for (i = 0; i < sblock->page_count; i++) 1847 - if (sblock->pagev[i].page) 1848 - __free_page(sblock->pagev[i].page); 1466 + scrub_page_put(sblock->pagev[i]); 1849 1467 kfree(sblock); 1850 1468 } 1851 1469 } 1852 1470 1853 - static void scrub_submit(struct scrub_dev *sdev) 1471 + static void scrub_page_get(struct scrub_page *spage) 1472 + { 1473 + atomic_inc(&spage->ref_count); 1474 + } 1475 + 1476 + static void scrub_page_put(struct scrub_page *spage) 1477 + { 1478 + if (atomic_dec_and_test(&spage->ref_count)) { 1479 + if (spage->page) 1480 + __free_page(spage->page); 1481 + kfree(spage); 1482 + } 1483 + } 1484 + 1485 + static void scrub_submit(struct scrub_ctx *sctx) 1854 1486 { 1855 1487 struct scrub_bio *sbio; 1856 1488 1857 - if (sdev->curr == -1) 1489 + if (sctx->curr == -1) 1858 1490 return; 1859 1491 1860 - sbio = sdev->bios[sdev->curr]; 1861 - sdev->curr = -1; 1862 - atomic_inc(&sdev->in_flight); 1492 + sbio = sctx->bios[sctx->curr]; 1493 + sctx->curr = -1; 1494 + scrub_pending_bio_inc(sctx); 1863 1495 1864 - btrfsic_submit_bio(READ, sbio->bio); 1496 + if (!sbio->bio->bi_bdev) { 1497 + /* 1498 + * this case should not happen. If btrfs_map_block() is 1499 + * wrong, it could happen for dev-replace operations on 1500 + * missing devices when no mirrors are available, but in 1501 + * this case it should already fail the mount. 
1502 + * This case is handled correctly (but _very_ slowly). 1503 + */ 1504 + printk_ratelimited(KERN_WARNING 1505 + "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); 1506 + bio_endio(sbio->bio, -EIO); 1507 + } else { 1508 + btrfsic_submit_bio(READ, sbio->bio); 1509 + } 1865 1510 } 1866 1511 1867 - static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1868 - struct scrub_page *spage) 1512 + static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 1513 + struct scrub_page *spage) 1869 1514 { 1870 1515 struct scrub_block *sblock = spage->sblock; 1871 1516 struct scrub_bio *sbio; ··· 1901 1494 /* 1902 1495 * grab a fresh bio or wait for one to become available 1903 1496 */ 1904 - while (sdev->curr == -1) { 1905 - spin_lock(&sdev->list_lock); 1906 - sdev->curr = sdev->first_free; 1907 - if (sdev->curr != -1) { 1908 - sdev->first_free = sdev->bios[sdev->curr]->next_free; 1909 - sdev->bios[sdev->curr]->next_free = -1; 1910 - sdev->bios[sdev->curr]->page_count = 0; 1911 - spin_unlock(&sdev->list_lock); 1497 + while (sctx->curr == -1) { 1498 + spin_lock(&sctx->list_lock); 1499 + sctx->curr = sctx->first_free; 1500 + if (sctx->curr != -1) { 1501 + sctx->first_free = sctx->bios[sctx->curr]->next_free; 1502 + sctx->bios[sctx->curr]->next_free = -1; 1503 + sctx->bios[sctx->curr]->page_count = 0; 1504 + spin_unlock(&sctx->list_lock); 1912 1505 } else { 1913 - spin_unlock(&sdev->list_lock); 1914 - wait_event(sdev->list_wait, sdev->first_free != -1); 1506 + spin_unlock(&sctx->list_lock); 1507 + wait_event(sctx->list_wait, sctx->first_free != -1); 1915 1508 } 1916 1509 } 1917 - sbio = sdev->bios[sdev->curr]; 1510 + sbio = sctx->bios[sctx->curr]; 1918 1511 if (sbio->page_count == 0) { 1919 1512 struct bio *bio; 1920 1513 1921 1514 sbio->physical = spage->physical; 1922 1515 sbio->logical = spage->logical; 1516 + sbio->dev = spage->dev; 1923 1517 bio = sbio->bio; 1924 1518 if (!bio) { 1925 - bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1519 + bio = 
bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); 1926 1520 if (!bio) 1927 1521 return -ENOMEM; 1928 1522 sbio->bio = bio; ··· 1931 1523 1932 1524 bio->bi_private = sbio; 1933 1525 bio->bi_end_io = scrub_bio_end_io; 1934 - bio->bi_bdev = sdev->dev->bdev; 1935 - bio->bi_sector = spage->physical >> 9; 1526 + bio->bi_bdev = sbio->dev->bdev; 1527 + bio->bi_sector = sbio->physical >> 9; 1936 1528 sbio->err = 0; 1937 1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1938 1530 spage->physical || 1939 1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1940 - spage->logical) { 1941 - scrub_submit(sdev); 1532 + spage->logical || 1533 + sbio->dev != spage->dev) { 1534 + scrub_submit(sctx); 1942 1535 goto again; 1943 1536 } 1944 1537 ··· 1951 1542 sbio->bio = NULL; 1952 1543 return -EIO; 1953 1544 } 1954 - scrub_submit(sdev); 1545 + scrub_submit(sctx); 1955 1546 goto again; 1956 1547 } 1957 1548 1958 - scrub_block_get(sblock); /* one for the added page */ 1549 + scrub_block_get(sblock); /* one for the page added to the bio */ 1959 1550 atomic_inc(&sblock->outstanding_pages); 1960 1551 sbio->page_count++; 1961 - if (sbio->page_count == sdev->pages_per_bio) 1962 - scrub_submit(sdev); 1552 + if (sbio->page_count == sctx->pages_per_rd_bio) 1553 + scrub_submit(sctx); 1963 1554 1964 1555 return 0; 1965 1556 } 1966 1557 1967 - static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1968 - u64 physical, u64 flags, u64 gen, int mirror_num, 1969 - u8 *csum, int force) 1558 + static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 1559 + u64 physical, struct btrfs_device *dev, u64 flags, 1560 + u64 gen, int mirror_num, u8 *csum, int force, 1561 + u64 physical_for_dev_replace) 1970 1562 { 1971 1563 struct scrub_block *sblock; 1972 1564 int index; 1973 1565 1974 1566 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1975 1567 if (!sblock) { 1976 - spin_lock(&sdev->stat_lock); 1977 - sdev->stat.malloc_errors++; 1978 - spin_unlock(&sdev->stat_lock); 1568 + 
spin_lock(&sctx->stat_lock); 1569 + sctx->stat.malloc_errors++; 1570 + spin_unlock(&sctx->stat_lock); 1979 1571 return -ENOMEM; 1980 1572 } 1981 1573 1982 - /* one ref inside this function, plus one for each page later on */ 1574 + /* one ref inside this function, plus one for each page added to 1575 + * a bio later on */ 1983 1576 atomic_set(&sblock->ref_count, 1); 1984 - sblock->sdev = sdev; 1577 + sblock->sctx = sctx; 1985 1578 sblock->no_io_error_seen = 1; 1986 1579 1987 1580 for (index = 0; len > 0; index++) { 1988 - struct scrub_page *spage = sblock->pagev + index; 1581 + struct scrub_page *spage; 1989 1582 u64 l = min_t(u64, len, PAGE_SIZE); 1990 1583 1991 - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1992 - spage->page = alloc_page(GFP_NOFS); 1993 - if (!spage->page) { 1994 - spin_lock(&sdev->stat_lock); 1995 - sdev->stat.malloc_errors++; 1996 - spin_unlock(&sdev->stat_lock); 1997 - while (index > 0) { 1998 - index--; 1999 - __free_page(sblock->pagev[index].page); 2000 - } 2001 - kfree(sblock); 1584 + spage = kzalloc(sizeof(*spage), GFP_NOFS); 1585 + if (!spage) { 1586 + leave_nomem: 1587 + spin_lock(&sctx->stat_lock); 1588 + sctx->stat.malloc_errors++; 1589 + spin_unlock(&sctx->stat_lock); 1590 + scrub_block_put(sblock); 2002 1591 return -ENOMEM; 2003 1592 } 1593 + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1594 + scrub_page_get(spage); 1595 + sblock->pagev[index] = spage; 2004 1596 spage->sblock = sblock; 2005 - spage->dev = sdev->dev; 1597 + spage->dev = dev; 2006 1598 spage->flags = flags; 2007 1599 spage->generation = gen; 2008 1600 spage->logical = logical; 2009 1601 spage->physical = physical; 1602 + spage->physical_for_dev_replace = physical_for_dev_replace; 2010 1603 spage->mirror_num = mirror_num; 2011 1604 if (csum) { 2012 1605 spage->have_csum = 1; 2013 - memcpy(spage->csum, csum, sdev->csum_size); 1606 + memcpy(spage->csum, csum, sctx->csum_size); 2014 1607 } else { 2015 1608 spage->have_csum = 0; 2016 1609 } 2017 1610 sblock->page_count++; 
1611 + spage->page = alloc_page(GFP_NOFS); 1612 + if (!spage->page) 1613 + goto leave_nomem; 2018 1614 len -= l; 2019 1615 logical += l; 2020 1616 physical += l; 1617 + physical_for_dev_replace += l; 2021 1618 } 2022 1619 2023 - BUG_ON(sblock->page_count == 0); 1620 + WARN_ON(sblock->page_count == 0); 2024 1621 for (index = 0; index < sblock->page_count; index++) { 2025 - struct scrub_page *spage = sblock->pagev + index; 1622 + struct scrub_page *spage = sblock->pagev[index]; 2026 1623 int ret; 2027 1624 2028 - ret = scrub_add_page_to_bio(sdev, spage); 1625 + ret = scrub_add_page_to_rd_bio(sctx, spage); 2029 1626 if (ret) { 2030 1627 scrub_block_put(sblock); 2031 1628 return ret; ··· 2039 1624 } 2040 1625 2041 1626 if (force) 2042 - scrub_submit(sdev); 1627 + scrub_submit(sctx); 2043 1628 2044 1629 /* last one frees, either here or in bio completion for last page */ 2045 1630 scrub_block_put(sblock); ··· 2049 1634 static void scrub_bio_end_io(struct bio *bio, int err) 2050 1635 { 2051 1636 struct scrub_bio *sbio = bio->bi_private; 2052 - struct scrub_dev *sdev = sbio->sdev; 2053 - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 1637 + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; 2054 1638 2055 1639 sbio->err = err; 2056 1640 sbio->bio = bio; ··· 2060 1646 static void scrub_bio_end_io_worker(struct btrfs_work *work) 2061 1647 { 2062 1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2063 - struct scrub_dev *sdev = sbio->sdev; 1649 + struct scrub_ctx *sctx = sbio->sctx; 2064 1650 int i; 2065 1651 2066 - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 1652 + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); 2067 1653 if (sbio->err) { 2068 1654 for (i = 0; i < sbio->page_count; i++) { 2069 1655 struct scrub_page *spage = sbio->pagev[i]; ··· 2085 1671 2086 1672 bio_put(sbio->bio); 2087 1673 sbio->bio = NULL; 2088 - spin_lock(&sdev->list_lock); 2089 - sbio->next_free = sdev->first_free; 2090 - sdev->first_free = 
sbio->index; 2091 - spin_unlock(&sdev->list_lock); 2092 - atomic_dec(&sdev->in_flight); 2093 - wake_up(&sdev->list_wait); 1674 + spin_lock(&sctx->list_lock); 1675 + sbio->next_free = sctx->first_free; 1676 + sctx->first_free = sbio->index; 1677 + spin_unlock(&sctx->list_lock); 1678 + 1679 + if (sctx->is_dev_replace && 1680 + atomic_read(&sctx->wr_ctx.flush_all_writes)) { 1681 + mutex_lock(&sctx->wr_ctx.wr_lock); 1682 + scrub_wr_submit(sctx); 1683 + mutex_unlock(&sctx->wr_ctx.wr_lock); 1684 + } 1685 + 1686 + scrub_pending_bio_dec(sctx); 2094 1687 } 2095 1688 2096 1689 static void scrub_block_complete(struct scrub_block *sblock) 2097 1690 { 2098 - if (!sblock->no_io_error_seen) 1691 + if (!sblock->no_io_error_seen) { 2099 1692 scrub_handle_errored_block(sblock); 2100 - else 2101 - scrub_checksum(sblock); 1693 + } else { 1694 + /* 1695 + * if has checksum error, write via repair mechanism in 1696 + * dev replace case, otherwise write here in dev replace 1697 + * case. 1698 + */ 1699 + if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) 1700 + scrub_write_block_to_dev_replace(sblock); 1701 + } 2102 1702 } 2103 1703 2104 - static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 1704 + static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 2105 1705 u8 *csum) 2106 1706 { 2107 1707 struct btrfs_ordered_sum *sum = NULL; ··· 2123 1695 unsigned long i; 2124 1696 unsigned long num_sectors; 2125 1697 2126 - while (!list_empty(&sdev->csum_list)) { 2127 - sum = list_first_entry(&sdev->csum_list, 1698 + while (!list_empty(&sctx->csum_list)) { 1699 + sum = list_first_entry(&sctx->csum_list, 2128 1700 struct btrfs_ordered_sum, list); 2129 1701 if (sum->bytenr > logical) 2130 1702 return 0; 2131 1703 if (sum->bytenr + sum->len > logical) 2132 1704 break; 2133 1705 2134 - ++sdev->stat.csum_discards; 1706 + ++sctx->stat.csum_discards; 2135 1707 list_del(&sum->list); 2136 1708 kfree(sum); 2137 1709 sum = NULL; ··· 2139 1711 if (!sum) 2140 
1712 return 0; 2141 1713 2142 - num_sectors = sum->len / sdev->sectorsize; 1714 + num_sectors = sum->len / sctx->sectorsize; 2143 1715 for (i = 0; i < num_sectors; ++i) { 2144 1716 if (sum->sums[i].bytenr == logical) { 2145 - memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 1717 + memcpy(csum, &sum->sums[i].sum, sctx->csum_size); 2146 1718 ret = 1; 2147 1719 break; 2148 1720 } ··· 2155 1727 } 2156 1728 2157 1729 /* scrub extent tries to collect up to 64 kB for each bio */ 2158 - static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2159 - u64 physical, u64 flags, u64 gen, int mirror_num) 1730 + static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, 1731 + u64 physical, struct btrfs_device *dev, u64 flags, 1732 + u64 gen, int mirror_num, u64 physical_for_dev_replace) 2160 1733 { 2161 1734 int ret; 2162 1735 u8 csum[BTRFS_CSUM_SIZE]; 2163 1736 u32 blocksize; 2164 1737 2165 1738 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2166 - blocksize = sdev->sectorsize; 2167 - spin_lock(&sdev->stat_lock); 2168 - sdev->stat.data_extents_scrubbed++; 2169 - sdev->stat.data_bytes_scrubbed += len; 2170 - spin_unlock(&sdev->stat_lock); 1739 + blocksize = sctx->sectorsize; 1740 + spin_lock(&sctx->stat_lock); 1741 + sctx->stat.data_extents_scrubbed++; 1742 + sctx->stat.data_bytes_scrubbed += len; 1743 + spin_unlock(&sctx->stat_lock); 2171 1744 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2172 - BUG_ON(sdev->nodesize != sdev->leafsize); 2173 - blocksize = sdev->nodesize; 2174 - spin_lock(&sdev->stat_lock); 2175 - sdev->stat.tree_extents_scrubbed++; 2176 - sdev->stat.tree_bytes_scrubbed += len; 2177 - spin_unlock(&sdev->stat_lock); 1745 + WARN_ON(sctx->nodesize != sctx->leafsize); 1746 + blocksize = sctx->nodesize; 1747 + spin_lock(&sctx->stat_lock); 1748 + sctx->stat.tree_extents_scrubbed++; 1749 + sctx->stat.tree_bytes_scrubbed += len; 1750 + spin_unlock(&sctx->stat_lock); 2178 1751 } else { 2179 - blocksize = sdev->sectorsize; 2180 - BUG_ON(1); 1752 + 
blocksize = sctx->sectorsize; 1753 + WARN_ON(1); 2181 1754 } 2182 1755 2183 1756 while (len) { ··· 2187 1758 2188 1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2189 1760 /* push csums to sbio */ 2190 - have_csum = scrub_find_csum(sdev, logical, l, csum); 1761 + have_csum = scrub_find_csum(sctx, logical, l, csum); 2191 1762 if (have_csum == 0) 2192 - ++sdev->stat.no_csum; 1763 + ++sctx->stat.no_csum; 1764 + if (sctx->is_dev_replace && !have_csum) { 1765 + ret = copy_nocow_pages(sctx, logical, l, 1766 + mirror_num, 1767 + physical_for_dev_replace); 1768 + goto behind_scrub_pages; 1769 + } 2193 1770 } 2194 - ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2195 - mirror_num, have_csum ? csum : NULL, 0); 1771 + ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, 1772 + mirror_num, have_csum ? csum : NULL, 0, 1773 + physical_for_dev_replace); 1774 + behind_scrub_pages: 2196 1775 if (ret) 2197 1776 return ret; 2198 1777 len -= l; 2199 1778 logical += l; 2200 1779 physical += l; 1780 + physical_for_dev_replace += l; 2201 1781 } 2202 1782 return 0; 2203 1783 } 2204 1784 2205 - static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2206 - struct map_lookup *map, int num, u64 base, u64 length) 1785 + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 1786 + struct map_lookup *map, 1787 + struct btrfs_device *scrub_dev, 1788 + int num, u64 base, u64 length, 1789 + int is_dev_replace) 2207 1790 { 2208 1791 struct btrfs_path *path; 2209 - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 1792 + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2210 1793 struct btrfs_root *root = fs_info->extent_root; 2211 1794 struct btrfs_root *csum_root = fs_info->csum_root; 2212 1795 struct btrfs_extent_item *extent; ··· 2238 1797 struct reada_control *reada2; 2239 1798 struct btrfs_key key_start; 2240 1799 struct btrfs_key key_end; 2241 - 2242 1800 u64 increment = map->stripe_len; 2243 1801 u64 offset; 1802 + u64 
extent_logical; 1803 + u64 extent_physical; 1804 + u64 extent_len; 1805 + struct btrfs_device *extent_dev; 1806 + int extent_mirror_num; 2244 1807 2245 1808 nstripes = length; 2246 1809 offset = 0; ··· 2288 1843 */ 2289 1844 logical = base + offset; 2290 1845 2291 - wait_event(sdev->list_wait, 2292 - atomic_read(&sdev->in_flight) == 0); 1846 + wait_event(sctx->list_wait, 1847 + atomic_read(&sctx->bios_in_flight) == 0); 2293 1848 atomic_inc(&fs_info->scrubs_paused); 2294 1849 wake_up(&fs_info->scrub_pause_wait); 2295 1850 ··· 2343 1898 * canceled? 2344 1899 */ 2345 1900 if (atomic_read(&fs_info->scrub_cancel_req) || 2346 - atomic_read(&sdev->cancel_req)) { 1901 + atomic_read(&sctx->cancel_req)) { 2347 1902 ret = -ECANCELED; 2348 1903 goto out; 2349 1904 } ··· 2352 1907 */ 2353 1908 if (atomic_read(&fs_info->scrub_pause_req)) { 2354 1909 /* push queued extents */ 2355 - scrub_submit(sdev); 2356 - wait_event(sdev->list_wait, 2357 - atomic_read(&sdev->in_flight) == 0); 1910 + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 1911 + scrub_submit(sctx); 1912 + mutex_lock(&sctx->wr_ctx.wr_lock); 1913 + scrub_wr_submit(sctx); 1914 + mutex_unlock(&sctx->wr_ctx.wr_lock); 1915 + wait_event(sctx->list_wait, 1916 + atomic_read(&sctx->bios_in_flight) == 0); 1917 + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2358 1918 atomic_inc(&fs_info->scrubs_paused); 2359 1919 wake_up(&fs_info->scrub_pause_wait); 2360 1920 mutex_lock(&fs_info->scrub_lock); ··· 2376 1926 2377 1927 ret = btrfs_lookup_csums_range(csum_root, logical, 2378 1928 logical + map->stripe_len - 1, 2379 - &sdev->csum_list, 1); 1929 + &sctx->csum_list, 1); 2380 1930 if (ret) 2381 1931 goto out; 2382 1932 ··· 2454 2004 key.objectid; 2455 2005 } 2456 2006 2457 - ret = scrub_extent(sdev, key.objectid, key.offset, 2458 - key.objectid - logical + physical, 2459 - flags, generation, mirror_num); 2007 + extent_logical = key.objectid; 2008 + extent_physical = key.objectid - logical + physical; 2009 + extent_len = key.offset; 2010 
+ extent_dev = scrub_dev; 2011 + extent_mirror_num = mirror_num; 2012 + if (is_dev_replace) 2013 + scrub_remap_extent(fs_info, extent_logical, 2014 + extent_len, &extent_physical, 2015 + &extent_dev, 2016 + &extent_mirror_num); 2017 + ret = scrub_extent(sctx, extent_logical, extent_len, 2018 + extent_physical, extent_dev, flags, 2019 + generation, extent_mirror_num, 2020 + key.objectid - logical + physical); 2460 2021 if (ret) 2461 2022 goto out; 2462 2023 ··· 2477 2016 btrfs_release_path(path); 2478 2017 logical += increment; 2479 2018 physical += map->stripe_len; 2480 - spin_lock(&sdev->stat_lock); 2481 - sdev->stat.last_physical = physical; 2482 - spin_unlock(&sdev->stat_lock); 2019 + spin_lock(&sctx->stat_lock); 2020 + sctx->stat.last_physical = physical; 2021 + spin_unlock(&sctx->stat_lock); 2483 2022 } 2484 - /* push queued extents */ 2485 - scrub_submit(sdev); 2486 - 2487 2023 out: 2024 + /* push queued extents */ 2025 + scrub_submit(sctx); 2026 + mutex_lock(&sctx->wr_ctx.wr_lock); 2027 + scrub_wr_submit(sctx); 2028 + mutex_unlock(&sctx->wr_ctx.wr_lock); 2029 + 2488 2030 blk_finish_plug(&plug); 2489 2031 btrfs_free_path(path); 2490 2032 return ret < 0 ? 
ret : 0; 2491 2033 } 2492 2034 2493 - static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2494 - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2495 - u64 dev_offset) 2035 + static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 2036 + struct btrfs_device *scrub_dev, 2037 + u64 chunk_tree, u64 chunk_objectid, 2038 + u64 chunk_offset, u64 length, 2039 + u64 dev_offset, int is_dev_replace) 2496 2040 { 2497 2041 struct btrfs_mapping_tree *map_tree = 2498 - &sdev->dev->dev_root->fs_info->mapping_tree; 2042 + &sctx->dev_root->fs_info->mapping_tree; 2499 2043 struct map_lookup *map; 2500 2044 struct extent_map *em; 2501 2045 int i; 2502 - int ret = -EINVAL; 2046 + int ret = 0; 2503 2047 2504 2048 read_lock(&map_tree->map_tree.lock); 2505 2049 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); ··· 2521 2055 goto out; 2522 2056 2523 2057 for (i = 0; i < map->num_stripes; ++i) { 2524 - if (map->stripes[i].dev == sdev->dev && 2058 + if (map->stripes[i].dev->bdev == scrub_dev->bdev && 2525 2059 map->stripes[i].physical == dev_offset) { 2526 - ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2060 + ret = scrub_stripe(sctx, map, scrub_dev, i, 2061 + chunk_offset, length, 2062 + is_dev_replace); 2527 2063 if (ret) 2528 2064 goto out; 2529 2065 } ··· 2537 2069 } 2538 2070 2539 2071 static noinline_for_stack 2540 - int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2072 + int scrub_enumerate_chunks(struct scrub_ctx *sctx, 2073 + struct btrfs_device *scrub_dev, u64 start, u64 end, 2074 + int is_dev_replace) 2541 2075 { 2542 2076 struct btrfs_dev_extent *dev_extent = NULL; 2543 2077 struct btrfs_path *path; 2544 - struct btrfs_root *root = sdev->dev->dev_root; 2078 + struct btrfs_root *root = sctx->dev_root; 2545 2079 struct btrfs_fs_info *fs_info = root->fs_info; 2546 2080 u64 length; 2547 2081 u64 chunk_tree; ··· 2555 2085 struct btrfs_key key; 2556 2086 struct btrfs_key found_key; 2557 2087 
struct btrfs_block_group_cache *cache; 2088 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 2558 2089 2559 2090 path = btrfs_alloc_path(); 2560 2091 if (!path) ··· 2565 2094 path->search_commit_root = 1; 2566 2095 path->skip_locking = 1; 2567 2096 2568 - key.objectid = sdev->dev->devid; 2097 + key.objectid = scrub_dev->devid; 2569 2098 key.offset = 0ull; 2570 2099 key.type = BTRFS_DEV_EXTENT_KEY; 2571 - 2572 2100 2573 2101 while (1) { 2574 2102 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ··· 2587 2117 2588 2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2589 2119 2590 - if (found_key.objectid != sdev->dev->devid) 2120 + if (found_key.objectid != scrub_dev->devid) 2591 2121 break; 2592 2122 2593 2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) ··· 2621 2151 ret = -ENOENT; 2622 2152 break; 2623 2153 } 2624 - ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2625 - chunk_offset, length, found_key.offset); 2154 + dev_replace->cursor_right = found_key.offset + length; 2155 + dev_replace->cursor_left = found_key.offset; 2156 + dev_replace->item_needs_writeback = 1; 2157 + ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, 2158 + chunk_offset, length, found_key.offset, 2159 + is_dev_replace); 2160 + 2161 + /* 2162 + * flush, submit all pending read and write bios, afterwards 2163 + * wait for them. 2164 + * Note that in the dev replace case, a read request causes 2165 + * write requests that are submitted in the read completion 2166 + * worker. Therefore in the current situation, it is required 2167 + * that all write requests are flushed, so that all read and 2168 + * write requests are really completed when bios_in_flight 2169 + * changes to 0. 
2170 + */ 2171 + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 2172 + scrub_submit(sctx); 2173 + mutex_lock(&sctx->wr_ctx.wr_lock); 2174 + scrub_wr_submit(sctx); 2175 + mutex_unlock(&sctx->wr_ctx.wr_lock); 2176 + 2177 + wait_event(sctx->list_wait, 2178 + atomic_read(&sctx->bios_in_flight) == 0); 2179 + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2180 + atomic_inc(&fs_info->scrubs_paused); 2181 + wake_up(&fs_info->scrub_pause_wait); 2182 + wait_event(sctx->list_wait, 2183 + atomic_read(&sctx->workers_pending) == 0); 2184 + 2185 + mutex_lock(&fs_info->scrub_lock); 2186 + while (atomic_read(&fs_info->scrub_pause_req)) { 2187 + mutex_unlock(&fs_info->scrub_lock); 2188 + wait_event(fs_info->scrub_pause_wait, 2189 + atomic_read(&fs_info->scrub_pause_req) == 0); 2190 + mutex_lock(&fs_info->scrub_lock); 2191 + } 2192 + atomic_dec(&fs_info->scrubs_paused); 2193 + mutex_unlock(&fs_info->scrub_lock); 2194 + wake_up(&fs_info->scrub_pause_wait); 2195 + 2196 + dev_replace->cursor_left = dev_replace->cursor_right; 2197 + dev_replace->item_needs_writeback = 1; 2626 2198 btrfs_put_block_group(cache); 2627 2199 if (ret) 2628 2200 break; 2201 + if (is_dev_replace && 2202 + atomic64_read(&dev_replace->num_write_errors) > 0) { 2203 + ret = -EIO; 2204 + break; 2205 + } 2206 + if (sctx->stat.malloc_errors > 0) { 2207 + ret = -ENOMEM; 2208 + break; 2209 + } 2629 2210 2630 2211 key.offset = found_key.offset + length; 2631 2212 btrfs_release_path(path); ··· 2691 2170 return ret < 0 ? 
ret : 0; 2692 2171 } 2693 2172 2694 - static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2173 + static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 2174 + struct btrfs_device *scrub_dev) 2695 2175 { 2696 2176 int i; 2697 2177 u64 bytenr; 2698 2178 u64 gen; 2699 2179 int ret; 2700 - struct btrfs_device *device = sdev->dev; 2701 - struct btrfs_root *root = device->dev_root; 2180 + struct btrfs_root *root = sctx->dev_root; 2702 2181 2703 2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2704 2183 return -EIO; ··· 2707 2186 2708 2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2709 2188 bytenr = btrfs_sb_offset(i); 2710 - if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2189 + if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) 2711 2190 break; 2712 2191 2713 - ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2714 - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2192 + ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2193 + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, 2194 + NULL, 1, bytenr); 2715 2195 if (ret) 2716 2196 return ret; 2717 2197 } 2718 - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2198 + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 2719 2199 2720 2200 return 0; 2721 2201 } ··· 2724 2202 /* 2725 2203 * get a reference count on fs_info->scrub_workers. 
start worker if necessary 2726 2204 */ 2727 - static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2205 + static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, 2206 + int is_dev_replace) 2728 2207 { 2729 - struct btrfs_fs_info *fs_info = root->fs_info; 2730 2208 int ret = 0; 2731 2209 2732 2210 mutex_lock(&fs_info->scrub_lock); 2733 2211 if (fs_info->scrub_workers_refcnt == 0) { 2734 - btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2735 - fs_info->thread_pool_size, &fs_info->generic_worker); 2212 + if (is_dev_replace) 2213 + btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, 2214 + &fs_info->generic_worker); 2215 + else 2216 + btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2217 + fs_info->thread_pool_size, 2218 + &fs_info->generic_worker); 2736 2219 fs_info->scrub_workers.idle_thresh = 4; 2737 2220 ret = btrfs_start_workers(&fs_info->scrub_workers); 2221 + if (ret) 2222 + goto out; 2223 + btrfs_init_workers(&fs_info->scrub_wr_completion_workers, 2224 + "scrubwrc", 2225 + fs_info->thread_pool_size, 2226 + &fs_info->generic_worker); 2227 + fs_info->scrub_wr_completion_workers.idle_thresh = 2; 2228 + ret = btrfs_start_workers( 2229 + &fs_info->scrub_wr_completion_workers); 2230 + if (ret) 2231 + goto out; 2232 + btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, 2233 + &fs_info->generic_worker); 2234 + ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); 2738 2235 if (ret) 2739 2236 goto out; 2740 2237 } ··· 2764 2223 return ret; 2765 2224 } 2766 2225 2767 - static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2226 + static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 2768 2227 { 2769 - struct btrfs_fs_info *fs_info = root->fs_info; 2770 - 2771 2228 mutex_lock(&fs_info->scrub_lock); 2772 - if (--fs_info->scrub_workers_refcnt == 0) 2229 + if (--fs_info->scrub_workers_refcnt == 0) { 2773 2230 btrfs_stop_workers(&fs_info->scrub_workers); 2231 + 
btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); 2232 + btrfs_stop_workers(&fs_info->scrub_nocow_workers); 2233 + } 2774 2234 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2775 2235 mutex_unlock(&fs_info->scrub_lock); 2776 2236 } 2777 2237 2778 - 2779 - int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2780 - struct btrfs_scrub_progress *progress, int readonly) 2238 + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 2239 + u64 end, struct btrfs_scrub_progress *progress, 2240 + int readonly, int is_dev_replace) 2781 2241 { 2782 - struct scrub_dev *sdev; 2783 - struct btrfs_fs_info *fs_info = root->fs_info; 2242 + struct scrub_ctx *sctx; 2784 2243 int ret; 2785 2244 struct btrfs_device *dev; 2786 2245 2787 - if (btrfs_fs_closing(root->fs_info)) 2246 + if (btrfs_fs_closing(fs_info)) 2788 2247 return -EINVAL; 2789 2248 2790 2249 /* 2791 2250 * check some assumptions 2792 2251 */ 2793 - if (root->nodesize != root->leafsize) { 2252 + if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { 2794 2253 printk(KERN_ERR 2795 2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2796 - root->nodesize, root->leafsize); 2255 + fs_info->chunk_root->nodesize, 2256 + fs_info->chunk_root->leafsize); 2797 2257 return -EINVAL; 2798 2258 } 2799 2259 2800 - if (root->nodesize > BTRFS_STRIPE_LEN) { 2260 + if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { 2801 2261 /* 2802 2262 * in this case scrub is unable to calculate the checksum 2803 2263 * the way scrub is implemented. 
Do not handle this ··· 2806 2264 */ 2807 2265 printk(KERN_ERR 2808 2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2809 - root->nodesize, BTRFS_STRIPE_LEN); 2267 + fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); 2810 2268 return -EINVAL; 2811 2269 } 2812 2270 2813 - if (root->sectorsize != PAGE_SIZE) { 2271 + if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { 2814 2272 /* not supported for data w/o checksums */ 2815 2273 printk(KERN_ERR 2816 2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2817 - root->sectorsize, (unsigned long long)PAGE_SIZE); 2275 + fs_info->chunk_root->sectorsize, 2276 + (unsigned long long)PAGE_SIZE); 2818 2277 return -EINVAL; 2819 2278 } 2820 2279 2821 - ret = scrub_workers_get(root); 2280 + if (fs_info->chunk_root->nodesize > 2281 + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || 2282 + fs_info->chunk_root->sectorsize > 2283 + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { 2284 + /* 2285 + * would exhaust the array bounds of pagev member in 2286 + * struct scrub_block 2287 + */ 2288 + pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", 2289 + fs_info->chunk_root->nodesize, 2290 + SCRUB_MAX_PAGES_PER_BLOCK, 2291 + fs_info->chunk_root->sectorsize, 2292 + SCRUB_MAX_PAGES_PER_BLOCK); 2293 + return -EINVAL; 2294 + } 2295 + 2296 + ret = scrub_workers_get(fs_info, is_dev_replace); 2822 2297 if (ret) 2823 2298 return ret; 2824 2299 2825 - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2826 - dev = btrfs_find_device(root, devid, NULL, NULL); 2827 - if (!dev || dev->missing) { 2828 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2829 - scrub_workers_put(root); 2300 + mutex_lock(&fs_info->fs_devices->device_list_mutex); 2301 + dev = btrfs_find_device(fs_info, devid, NULL, NULL); 2302 + if (!dev || (dev->missing && !is_dev_replace)) { 2303 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 
2304 + scrub_workers_put(fs_info); 2830 2305 return -ENODEV; 2831 2306 } 2832 2307 mutex_lock(&fs_info->scrub_lock); 2833 2308 2834 - if (!dev->in_fs_metadata) { 2309 + if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { 2835 2310 mutex_unlock(&fs_info->scrub_lock); 2836 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2837 - scrub_workers_put(root); 2838 - return -ENODEV; 2311 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2312 + scrub_workers_put(fs_info); 2313 + return -EIO; 2839 2314 } 2840 2315 2841 - if (dev->scrub_device) { 2316 + btrfs_dev_replace_lock(&fs_info->dev_replace); 2317 + if (dev->scrub_device || 2318 + (!is_dev_replace && 2319 + btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 2320 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 2842 2321 mutex_unlock(&fs_info->scrub_lock); 2843 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2844 - scrub_workers_put(root); 2322 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2323 + scrub_workers_put(fs_info); 2845 2324 return -EINPROGRESS; 2846 2325 } 2847 - sdev = scrub_setup_dev(dev); 2848 - if (IS_ERR(sdev)) { 2326 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 2327 + sctx = scrub_setup_ctx(dev, is_dev_replace); 2328 + if (IS_ERR(sctx)) { 2849 2329 mutex_unlock(&fs_info->scrub_lock); 2850 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2851 - scrub_workers_put(root); 2852 - return PTR_ERR(sdev); 2330 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2331 + scrub_workers_put(fs_info); 2332 + return PTR_ERR(sctx); 2853 2333 } 2854 - sdev->readonly = readonly; 2855 - dev->scrub_device = sdev; 2334 + sctx->readonly = readonly; 2335 + dev->scrub_device = sctx; 2856 2336 2857 2337 atomic_inc(&fs_info->scrubs_running); 2858 2338 mutex_unlock(&fs_info->scrub_lock); 2859 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2339 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2860 2340 2861 - 
down_read(&fs_info->scrub_super_lock); 2862 - ret = scrub_supers(sdev); 2863 - up_read(&fs_info->scrub_super_lock); 2341 + if (!is_dev_replace) { 2342 + down_read(&fs_info->scrub_super_lock); 2343 + ret = scrub_supers(sctx, dev); 2344 + up_read(&fs_info->scrub_super_lock); 2345 + } 2864 2346 2865 2347 if (!ret) 2866 - ret = scrub_enumerate_chunks(sdev, start, end); 2348 + ret = scrub_enumerate_chunks(sctx, dev, start, end, 2349 + is_dev_replace); 2867 2350 2868 - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2351 + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 2869 2352 atomic_dec(&fs_info->scrubs_running); 2870 2353 wake_up(&fs_info->scrub_pause_wait); 2871 2354 2872 - wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2355 + wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); 2873 2356 2874 2357 if (progress) 2875 - memcpy(progress, &sdev->stat, sizeof(*progress)); 2358 + memcpy(progress, &sctx->stat, sizeof(*progress)); 2876 2359 2877 2360 mutex_lock(&fs_info->scrub_lock); 2878 2361 dev->scrub_device = NULL; 2879 2362 mutex_unlock(&fs_info->scrub_lock); 2880 2363 2881 - scrub_free_dev(sdev); 2882 - scrub_workers_put(root); 2364 + scrub_free_ctx(sctx); 2365 + scrub_workers_put(fs_info); 2883 2366 2884 2367 return ret; 2885 2368 } ··· 2944 2377 up_write(&root->fs_info->scrub_super_lock); 2945 2378 } 2946 2379 2947 - int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2380 + int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2948 2381 { 2949 - 2950 2382 mutex_lock(&fs_info->scrub_lock); 2951 2383 if (!atomic_read(&fs_info->scrubs_running)) { 2952 2384 mutex_unlock(&fs_info->scrub_lock); ··· 2965 2399 return 0; 2966 2400 } 2967 2401 2968 - int btrfs_scrub_cancel(struct btrfs_root *root) 2402 + int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, 2403 + struct btrfs_device *dev) 2969 2404 { 2970 - return __btrfs_scrub_cancel(root->fs_info); 2971 - } 2972 - 2973 - int 
btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) 2974 - { 2975 - struct btrfs_fs_info *fs_info = root->fs_info; 2976 - struct scrub_dev *sdev; 2405 + struct scrub_ctx *sctx; 2977 2406 2978 2407 mutex_lock(&fs_info->scrub_lock); 2979 - sdev = dev->scrub_device; 2980 - if (!sdev) { 2408 + sctx = dev->scrub_device; 2409 + if (!sctx) { 2981 2410 mutex_unlock(&fs_info->scrub_lock); 2982 2411 return -ENOTCONN; 2983 2412 } 2984 - atomic_inc(&sdev->cancel_req); 2413 + atomic_inc(&sctx->cancel_req); 2985 2414 while (dev->scrub_device) { 2986 2415 mutex_unlock(&fs_info->scrub_lock); 2987 2416 wait_event(fs_info->scrub_pause_wait, ··· 2999 2438 * does not go away in cancel_dev. FIXME: find a better solution 3000 2439 */ 3001 2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3002 - dev = btrfs_find_device(root, devid, NULL, NULL); 2441 + dev = btrfs_find_device(fs_info, devid, NULL, NULL); 3003 2442 if (!dev) { 3004 2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3005 2444 return -ENODEV; 3006 2445 } 3007 - ret = btrfs_scrub_cancel_dev(root, dev); 2446 + ret = btrfs_scrub_cancel_dev(fs_info, dev); 3008 2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3009 2448 3010 2449 return ret; ··· 3014 2453 struct btrfs_scrub_progress *progress) 3015 2454 { 3016 2455 struct btrfs_device *dev; 3017 - struct scrub_dev *sdev = NULL; 2456 + struct scrub_ctx *sctx = NULL; 3018 2457 3019 2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3020 - dev = btrfs_find_device(root, devid, NULL, NULL); 2459 + dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); 3021 2460 if (dev) 3022 - sdev = dev->scrub_device; 3023 - if (sdev) 3024 - memcpy(progress, &sdev->stat, sizeof(*progress)); 2461 + sctx = dev->scrub_device; 2462 + if (sctx) 2463 + memcpy(progress, &sctx->stat, sizeof(*progress)); 3025 2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3026 2465 3027 - return dev ? (sdev ? 
0 : -ENOTCONN) : -ENODEV; 2466 + return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; 2467 + } 2468 + 2469 + static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 2470 + u64 extent_logical, u64 extent_len, 2471 + u64 *extent_physical, 2472 + struct btrfs_device **extent_dev, 2473 + int *extent_mirror_num) 2474 + { 2475 + u64 mapped_length; 2476 + struct btrfs_bio *bbio = NULL; 2477 + int ret; 2478 + 2479 + mapped_length = extent_len; 2480 + ret = btrfs_map_block(fs_info, READ, extent_logical, 2481 + &mapped_length, &bbio, 0); 2482 + if (ret || !bbio || mapped_length < extent_len || 2483 + !bbio->stripes[0].dev->bdev) { 2484 + kfree(bbio); 2485 + return; 2486 + } 2487 + 2488 + *extent_physical = bbio->stripes[0].physical; 2489 + *extent_mirror_num = bbio->mirror_num; 2490 + *extent_dev = bbio->stripes[0].dev; 2491 + kfree(bbio); 2492 + } 2493 + 2494 + static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, 2495 + struct scrub_wr_ctx *wr_ctx, 2496 + struct btrfs_fs_info *fs_info, 2497 + struct btrfs_device *dev, 2498 + int is_dev_replace) 2499 + { 2500 + WARN_ON(wr_ctx->wr_curr_bio != NULL); 2501 + 2502 + mutex_init(&wr_ctx->wr_lock); 2503 + wr_ctx->wr_curr_bio = NULL; 2504 + if (!is_dev_replace) 2505 + return 0; 2506 + 2507 + WARN_ON(!dev->bdev); 2508 + wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, 2509 + bio_get_nr_vecs(dev->bdev)); 2510 + wr_ctx->tgtdev = dev; 2511 + atomic_set(&wr_ctx->flush_all_writes, 0); 2512 + return 0; 2513 + } 2514 + 2515 + static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) 2516 + { 2517 + mutex_lock(&wr_ctx->wr_lock); 2518 + kfree(wr_ctx->wr_curr_bio); 2519 + wr_ctx->wr_curr_bio = NULL; 2520 + mutex_unlock(&wr_ctx->wr_lock); 2521 + } 2522 + 2523 + static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 2524 + int mirror_num, u64 physical_for_dev_replace) 2525 + { 2526 + struct scrub_copy_nocow_ctx *nocow_ctx; 2527 + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2528 + 2529 + nocow_ctx = 
kzalloc(sizeof(*nocow_ctx), GFP_NOFS); 2530 + if (!nocow_ctx) { 2531 + spin_lock(&sctx->stat_lock); 2532 + sctx->stat.malloc_errors++; 2533 + spin_unlock(&sctx->stat_lock); 2534 + return -ENOMEM; 2535 + } 2536 + 2537 + scrub_pending_trans_workers_inc(sctx); 2538 + 2539 + nocow_ctx->sctx = sctx; 2540 + nocow_ctx->logical = logical; 2541 + nocow_ctx->len = len; 2542 + nocow_ctx->mirror_num = mirror_num; 2543 + nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 2544 + nocow_ctx->work.func = copy_nocow_pages_worker; 2545 + btrfs_queue_worker(&fs_info->scrub_nocow_workers, 2546 + &nocow_ctx->work); 2547 + 2548 + return 0; 2549 + } 2550 + 2551 + static void copy_nocow_pages_worker(struct btrfs_work *work) 2552 + { 2553 + struct scrub_copy_nocow_ctx *nocow_ctx = 2554 + container_of(work, struct scrub_copy_nocow_ctx, work); 2555 + struct scrub_ctx *sctx = nocow_ctx->sctx; 2556 + u64 logical = nocow_ctx->logical; 2557 + u64 len = nocow_ctx->len; 2558 + int mirror_num = nocow_ctx->mirror_num; 2559 + u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 2560 + int ret; 2561 + struct btrfs_trans_handle *trans = NULL; 2562 + struct btrfs_fs_info *fs_info; 2563 + struct btrfs_path *path; 2564 + struct btrfs_root *root; 2565 + int not_written = 0; 2566 + 2567 + fs_info = sctx->dev_root->fs_info; 2568 + root = fs_info->extent_root; 2569 + 2570 + path = btrfs_alloc_path(); 2571 + if (!path) { 2572 + spin_lock(&sctx->stat_lock); 2573 + sctx->stat.malloc_errors++; 2574 + spin_unlock(&sctx->stat_lock); 2575 + not_written = 1; 2576 + goto out; 2577 + } 2578 + 2579 + trans = btrfs_join_transaction(root); 2580 + if (IS_ERR(trans)) { 2581 + not_written = 1; 2582 + goto out; 2583 + } 2584 + 2585 + ret = iterate_inodes_from_logical(logical, fs_info, path, 2586 + copy_nocow_pages_for_inode, 2587 + nocow_ctx); 2588 + if (ret != 0 && ret != -ENOENT) { 2589 + pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", 2590 + 
(unsigned long long)logical, 2591 + (unsigned long long)physical_for_dev_replace, 2592 + (unsigned long long)len, 2593 + (unsigned long long)mirror_num, ret); 2594 + not_written = 1; 2595 + goto out; 2596 + } 2597 + 2598 + out: 2599 + if (trans && !IS_ERR(trans)) 2600 + btrfs_end_transaction(trans, root); 2601 + if (not_written) 2602 + btrfs_dev_replace_stats_inc(&fs_info->dev_replace. 2603 + num_uncorrectable_read_errors); 2604 + 2605 + btrfs_free_path(path); 2606 + kfree(nocow_ctx); 2607 + 2608 + scrub_pending_trans_workers_dec(sctx); 2609 + } 2610 + 2611 + static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) 2612 + { 2613 + unsigned long index; 2614 + struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 2615 + int ret = 0; 2616 + struct btrfs_key key; 2617 + struct inode *inode = NULL; 2618 + struct btrfs_root *local_root; 2619 + u64 physical_for_dev_replace; 2620 + u64 len; 2621 + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 2622 + 2623 + key.objectid = root; 2624 + key.type = BTRFS_ROOT_ITEM_KEY; 2625 + key.offset = (u64)-1; 2626 + local_root = btrfs_read_fs_root_no_name(fs_info, &key); 2627 + if (IS_ERR(local_root)) 2628 + return PTR_ERR(local_root); 2629 + 2630 + key.type = BTRFS_INODE_ITEM_KEY; 2631 + key.objectid = inum; 2632 + key.offset = 0; 2633 + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); 2634 + if (IS_ERR(inode)) 2635 + return PTR_ERR(inode); 2636 + 2637 + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 2638 + len = nocow_ctx->len; 2639 + while (len >= PAGE_CACHE_SIZE) { 2640 + struct page *page = NULL; 2641 + int ret_sub; 2642 + 2643 + index = offset >> PAGE_CACHE_SHIFT; 2644 + 2645 + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 2646 + if (!page) { 2647 + pr_err("find_or_create_page() failed\n"); 2648 + ret = -ENOMEM; 2649 + goto next_page; 2650 + } 2651 + 2652 + if (PageUptodate(page)) { 2653 + if (PageDirty(page)) 2654 + goto next_page; 2655 + } else { 2656 
+ ClearPageError(page); 2657 + ret_sub = extent_read_full_page(&BTRFS_I(inode)-> 2658 + io_tree, 2659 + page, btrfs_get_extent, 2660 + nocow_ctx->mirror_num); 2661 + if (ret_sub) { 2662 + ret = ret_sub; 2663 + goto next_page; 2664 + } 2665 + wait_on_page_locked(page); 2666 + if (!PageUptodate(page)) { 2667 + ret = -EIO; 2668 + goto next_page; 2669 + } 2670 + } 2671 + ret_sub = write_page_nocow(nocow_ctx->sctx, 2672 + physical_for_dev_replace, page); 2673 + if (ret_sub) { 2674 + ret = ret_sub; 2675 + goto next_page; 2676 + } 2677 + 2678 + next_page: 2679 + if (page) { 2680 + unlock_page(page); 2681 + put_page(page); 2682 + } 2683 + offset += PAGE_CACHE_SIZE; 2684 + physical_for_dev_replace += PAGE_CACHE_SIZE; 2685 + len -= PAGE_CACHE_SIZE; 2686 + } 2687 + 2688 + if (inode) 2689 + iput(inode); 2690 + return ret; 2691 + } 2692 + 2693 + static int write_page_nocow(struct scrub_ctx *sctx, 2694 + u64 physical_for_dev_replace, struct page *page) 2695 + { 2696 + struct bio *bio; 2697 + struct btrfs_device *dev; 2698 + int ret; 2699 + DECLARE_COMPLETION_ONSTACK(compl); 2700 + 2701 + dev = sctx->wr_ctx.tgtdev; 2702 + if (!dev) 2703 + return -EIO; 2704 + if (!dev->bdev) { 2705 + printk_ratelimited(KERN_WARNING 2706 + "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); 2707 + return -EIO; 2708 + } 2709 + bio = bio_alloc(GFP_NOFS, 1); 2710 + if (!bio) { 2711 + spin_lock(&sctx->stat_lock); 2712 + sctx->stat.malloc_errors++; 2713 + spin_unlock(&sctx->stat_lock); 2714 + return -ENOMEM; 2715 + } 2716 + bio->bi_private = &compl; 2717 + bio->bi_end_io = scrub_complete_bio_end_io; 2718 + bio->bi_size = 0; 2719 + bio->bi_sector = physical_for_dev_replace >> 9; 2720 + bio->bi_bdev = dev->bdev; 2721 + ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 2722 + if (ret != PAGE_CACHE_SIZE) { 2723 + leave_with_eio: 2724 + bio_put(bio); 2725 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2726 + return -EIO; 2727 + } 2728 + btrfsic_submit_bio(WRITE_SYNC, bio); 2729 
+ wait_for_completion(&compl); 2730 + 2731 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 2732 + goto leave_with_eio; 2733 + 2734 + bio_put(bio); 2735 + return 0; 3028 2736 }
+4 -4
fs/btrfs/send.c
··· 4397 4397 if (!path) 4398 4398 return -ENOMEM; 4399 4399 4400 - spin_lock(&send_root->root_times_lock); 4400 + spin_lock(&send_root->root_item_lock); 4401 4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4402 - spin_unlock(&send_root->root_times_lock); 4402 + spin_unlock(&send_root->root_item_lock); 4403 4403 4404 4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4405 4405 key.type = BTRFS_INODE_ITEM_KEY; ··· 4422 4422 * Make sure the tree has not changed after re-joining. We detect this 4423 4423 * by comparing start_ctransid and ctransid. They should always match. 4424 4424 */ 4425 - spin_lock(&send_root->root_times_lock); 4425 + spin_lock(&send_root->root_item_lock); 4426 4426 ctransid = btrfs_root_ctransid(&send_root->root_item); 4427 - spin_unlock(&send_root->root_times_lock); 4427 + spin_unlock(&send_root->root_item_lock); 4428 4428 4429 4429 if (ctransid != start_ctransid) { 4430 4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
+44 -4
fs/btrfs/super.c
··· 55 55 #include "export.h" 56 56 #include "compression.h" 57 57 #include "rcu-string.h" 58 + #include "dev-replace.h" 58 59 59 60 #define CREATE_TRACE_POINTS 60 61 #include <trace/events/btrfs.h> ··· 117 116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 118 117 sb->s_flags |= MS_RDONLY; 119 118 printk(KERN_INFO "btrfs is forced readonly\n"); 120 - __btrfs_scrub_cancel(fs_info); 119 + /* 120 + * Note that a running device replace operation is not 121 + * canceled here although there is no way to update 122 + * the progress. It would add the risk of a deadlock, 123 + * therefore the canceling is ommited. The only penalty 124 + * is that some I/O remains active until the procedure 125 + * completes. The next time when the filesystem is 126 + * mounted writeable again, the device replace 127 + * operation continues. 128 + */ 121 129 // WARN_ON(1); 122 130 } 123 131 } ··· 1196 1186 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1197 1187 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1198 1188 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1199 - btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1189 + btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, 1190 + new_pool_size); 1200 1191 } 1201 1192 1202 1193 static int btrfs_remount(struct super_block *sb, int *flags, char *data) ··· 1226 1215 return 0; 1227 1216 1228 1217 if (*flags & MS_RDONLY) { 1218 + /* 1219 + * this also happens on 'umount -rf' or on shutdown, when 1220 + * the filesystem is busy. 
1221 + */ 1229 1222 sb->s_flags |= MS_RDONLY; 1223 + 1224 + btrfs_dev_replace_suspend_for_unmount(fs_info); 1225 + btrfs_scrub_cancel(fs_info); 1230 1226 1231 1227 ret = btrfs_commit_super(root); 1232 1228 if (ret) 1233 1229 goto restore; 1234 1230 } else { 1235 1231 if (fs_info->fs_devices->rw_devices == 0) { 1232 + ret = -EACCES; 1233 + goto restore; 1234 + } 1235 + 1236 + if (fs_info->fs_devices->missing_devices > 1237 + fs_info->num_tolerated_disk_barrier_failures && 1238 + !(*flags & MS_RDONLY)) { 1239 + printk(KERN_WARNING 1240 + "Btrfs: too many missing devices, writeable remount is not allowed\n"); 1236 1241 ret = -EACCES; 1237 1242 goto restore; 1238 1243 } ··· 1271 1244 if (ret) 1272 1245 goto restore; 1273 1246 1247 + ret = btrfs_resume_dev_replace_async(fs_info); 1248 + if (ret) { 1249 + pr_warn("btrfs: failed to resume dev_replace\n"); 1250 + goto restore; 1251 + } 1274 1252 sb->s_flags &= ~MS_RDONLY; 1275 1253 } 1276 1254 ··· 1368 1336 min_stripe_size = BTRFS_STRIPE_LEN; 1369 1337 1370 1338 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1371 - if (!device->in_fs_metadata || !device->bdev) 1339 + if (!device->in_fs_metadata || !device->bdev || 1340 + device->is_tgtdev_for_dev_replace) 1372 1341 continue; 1373 1342 1374 1343 avail_space = device->total_bytes - device->bytes_used; ··· 1680 1647 if (err) 1681 1648 goto free_ordered_data; 1682 1649 1683 - err = btrfs_interface_init(); 1650 + err = btrfs_auto_defrag_init(); 1684 1651 if (err) 1685 1652 goto free_delayed_inode; 1653 + 1654 + err = btrfs_interface_init(); 1655 + if (err) 1656 + goto free_auto_defrag; 1686 1657 1687 1658 err = register_filesystem(&btrfs_fs_type); 1688 1659 if (err) ··· 1699 1662 1700 1663 unregister_ioctl: 1701 1664 btrfs_interface_exit(); 1665 + free_auto_defrag: 1666 + btrfs_auto_defrag_exit(); 1702 1667 free_delayed_inode: 1703 1668 btrfs_delayed_inode_exit(); 1704 1669 free_ordered_data: ··· 1720 1681 static void __exit exit_btrfs_fs(void) 1721 1682 { 1722 
1683 btrfs_destroy_cachep(); 1684 + btrfs_auto_defrag_exit(); 1723 1685 btrfs_delayed_inode_exit(); 1724 1686 ordered_data_exit(); 1725 1687 extent_map_exit();
+100 -72
fs/btrfs/transaction.c
··· 30 30 #include "tree-log.h" 31 31 #include "inode-map.h" 32 32 #include "volumes.h" 33 + #include "dev-replace.h" 33 34 34 35 #define BTRFS_ROOT_TRANS_TAG 0 35 36 ··· 146 145 * the log must never go across transaction boundaries. 147 146 */ 148 147 smp_mb(); 149 - if (!list_empty(&fs_info->tree_mod_seq_list)) { 150 - printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 148 + if (!list_empty(&fs_info->tree_mod_seq_list)) 149 + WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " 151 150 "creating a fresh transaction\n"); 152 - WARN_ON(1); 153 - } 154 - if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { 155 - printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " 151 + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) 152 + WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " 156 153 "creating a fresh transaction\n"); 157 - WARN_ON(1); 158 - } 159 154 atomic_set(&fs_info->tree_mod_seq, 0); 160 155 161 156 spin_lock_init(&cur_trans->commit_lock); ··· 292 295 return 0; 293 296 } 294 297 295 - static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 296 - u64 num_items, int type, 297 - int noflush) 298 + static struct btrfs_trans_handle * 299 + start_transaction(struct btrfs_root *root, u64 num_items, int type, 300 + enum btrfs_reserve_flush_enum flush) 298 301 { 299 302 struct btrfs_trans_handle *h; 300 303 struct btrfs_transaction *cur_trans; ··· 309 312 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 310 313 h = current->journal_info; 311 314 h->use_count++; 315 + WARN_ON(h->use_count > 2); 312 316 h->orig_rsv = h->block_rsv; 313 317 h->block_rsv = NULL; 314 318 goto got_it; ··· 329 331 } 330 332 331 333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 332 - if (noflush) 333 - ret = btrfs_block_rsv_add_noflush(root, 334 - &root->fs_info->trans_block_rsv, 335 - num_bytes); 336 - else 337 - ret = btrfs_block_rsv_add(root, 338 - &root->fs_info->trans_block_rsv, 339 - num_bytes); 334 + ret = 
btrfs_block_rsv_add(root, 335 + &root->fs_info->trans_block_rsv, 336 + num_bytes, flush); 340 337 if (ret) 341 338 return ERR_PTR(ret); 342 339 } ··· 415 422 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 416 423 int num_items) 417 424 { 418 - return start_transaction(root, num_items, TRANS_START, 0); 425 + return start_transaction(root, num_items, TRANS_START, 426 + BTRFS_RESERVE_FLUSH_ALL); 419 427 } 420 428 421 - struct btrfs_trans_handle *btrfs_start_transaction_noflush( 429 + struct btrfs_trans_handle *btrfs_start_transaction_lflush( 422 430 struct btrfs_root *root, int num_items) 423 431 { 424 - return start_transaction(root, num_items, TRANS_START, 1); 432 + return start_transaction(root, num_items, TRANS_START, 433 + BTRFS_RESERVE_FLUSH_LIMIT); 425 434 } 426 435 427 436 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) ··· 456 461 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 457 462 { 458 463 struct btrfs_transaction *cur_trans = NULL, *t; 459 - int ret; 464 + int ret = 0; 460 465 461 - ret = 0; 462 466 if (transid) { 463 467 if (transid <= root->fs_info->last_trans_committed) 464 468 goto out; 465 469 470 + ret = -EINVAL; 466 471 /* find specified transaction */ 467 472 spin_lock(&root->fs_info->trans_lock); 468 473 list_for_each_entry(t, &root->fs_info->trans_list, list) { 469 474 if (t->transid == transid) { 470 475 cur_trans = t; 471 476 atomic_inc(&cur_trans->use_count); 477 + ret = 0; 472 478 break; 473 479 } 474 - if (t->transid > transid) 480 + if (t->transid > transid) { 481 + ret = 0; 475 482 break; 483 + } 476 484 } 477 485 spin_unlock(&root->fs_info->trans_lock); 478 - ret = -EINVAL; 486 + /* The specified transaction doesn't exist */ 479 487 if (!cur_trans) 480 - goto out; /* bad transid */ 488 + goto out; 481 489 } else { 482 490 /* find newest transaction that is committing | committed */ 483 491 spin_lock(&root->fs_info->trans_lock); ··· 500 502 } 501 503 502 504 
wait_for_commit(root, cur_trans); 503 - 504 505 put_transaction(cur_trans); 505 - ret = 0; 506 506 out: 507 507 return ret; 508 508 } ··· 847 851 return ret; 848 852 849 853 ret = btrfs_run_dev_stats(trans, root->fs_info); 850 - BUG_ON(ret); 854 + WARN_ON(ret); 855 + ret = btrfs_run_dev_replace(trans, root->fs_info); 856 + WARN_ON(ret); 851 857 852 858 ret = btrfs_run_qgroups(trans, root->fs_info); 853 859 BUG_ON(ret); ··· 871 873 down_write(&fs_info->extent_commit_sem); 872 874 switch_commit_root(fs_info->extent_root); 873 875 up_write(&fs_info->extent_commit_sem); 876 + 877 + btrfs_after_dev_replace_commit(fs_info); 874 878 875 879 return 0; 876 880 } ··· 958 958 struct btrfs_fs_info *info = root->fs_info; 959 959 struct btrfs_trans_handle *trans; 960 960 int ret; 961 - unsigned long nr; 962 961 963 962 if (xchg(&root->defrag_running, 1)) 964 963 return 0; ··· 969 970 970 971 ret = btrfs_defrag_leaves(trans, root, cacheonly); 971 972 972 - nr = trans->blocks_used; 973 973 btrfs_end_transaction(trans, root); 974 - btrfs_btree_balance_dirty(info->tree_root, nr); 974 + btrfs_btree_balance_dirty(info->tree_root); 975 975 cond_resched(); 976 976 977 977 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) ··· 1030 1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1031 1033 1032 1034 if (to_reserve > 0) { 1033 - ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1034 - to_reserve); 1035 + ret = btrfs_block_rsv_add(root, &pending->block_rsv, 1036 + to_reserve, 1037 + BTRFS_RESERVE_NO_FLUSH); 1035 1038 if (ret) { 1036 1039 pending->error = ret; 1037 1040 goto no_free_objectid; ··· 1190 1191 parent_inode, &key, 1191 1192 BTRFS_FT_DIR, index); 1192 1193 /* We have check then name at the beginning, so it is impossible. 
*/ 1193 - BUG_ON(ret == -EEXIST); 1194 + BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); 1194 1195 if (ret) { 1195 1196 btrfs_abort_transaction(trans, root, ret); 1196 1197 goto fail; ··· 1308 1309 * We've got freeze protection passed with the transaction. 1309 1310 * Tell lockdep about it. 1310 1311 */ 1311 - rwsem_acquire_read( 1312 - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1313 - 0, 1, _THIS_IP_); 1312 + if (ac->newtrans->type < TRANS_JOIN_NOLOCK) 1313 + rwsem_acquire_read( 1314 + &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1315 + 0, 1, _THIS_IP_); 1314 1316 1315 1317 current->journal_info = ac->newtrans; 1316 1318 ··· 1349 1349 * Tell lockdep we've released the freeze rwsem, since the 1350 1350 * async commit thread will be the one to unlock it. 1351 1351 */ 1352 - rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1353 - 1, _THIS_IP_); 1352 + if (trans->type < TRANS_JOIN_NOLOCK) 1353 + rwsem_release( 1354 + &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1355 + 1, _THIS_IP_); 1354 1356 1355 1357 schedule_delayed_work(&ac->work, 0); 1356 1358 ··· 1402 1400 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1403 1401 } 1404 1402 1403 + static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, 1404 + struct btrfs_root *root) 1405 + { 1406 + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); 1407 + int snap_pending = 0; 1408 + int ret; 1409 + 1410 + if (!flush_on_commit) { 1411 + spin_lock(&root->fs_info->trans_lock); 1412 + if (!list_empty(&trans->transaction->pending_snapshots)) 1413 + snap_pending = 1; 1414 + spin_unlock(&root->fs_info->trans_lock); 1415 + } 1416 + 1417 + if (flush_on_commit || snap_pending) { 1418 + btrfs_start_delalloc_inodes(root, 1); 1419 + btrfs_wait_ordered_extents(root, 1); 1420 + } 1421 + 1422 + ret = btrfs_run_delayed_items(trans, root); 1423 + if (ret) 1424 + return ret; 1425 + 1426 + /* 1427 + * running the delayed items may have added new refs. 
account 1428 + * them now so that they hinder processing of more delayed refs 1429 + * as little as possible. 1430 + */ 1431 + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 1432 + 1433 + /* 1434 + * rename don't use btrfs_join_transaction, so, once we 1435 + * set the transaction to blocked above, we aren't going 1436 + * to get any new ordered operations. We can safely run 1437 + * it here and no for sure that nothing new will be added 1438 + * to the list 1439 + */ 1440 + btrfs_run_ordered_operations(root, 1); 1441 + 1442 + return 0; 1443 + } 1444 + 1405 1445 /* 1406 1446 * btrfs_transaction state sequence: 1407 1447 * in_commit = 0, blocked = 0 (initial) ··· 1458 1414 struct btrfs_transaction *cur_trans = trans->transaction; 1459 1415 struct btrfs_transaction *prev_trans = NULL; 1460 1416 DEFINE_WAIT(wait); 1461 - int ret = -EIO; 1417 + int ret; 1462 1418 int should_grow = 0; 1463 1419 unsigned long now = get_seconds(); 1464 - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); 1465 1420 1466 - btrfs_run_ordered_operations(root, 0); 1467 - 1468 - if (cur_trans->aborted) 1421 + ret = btrfs_run_ordered_operations(root, 0); 1422 + if (ret) { 1423 + btrfs_abort_transaction(trans, root, ret); 1469 1424 goto cleanup_transaction; 1425 + } 1426 + 1427 + if (cur_trans->aborted) { 1428 + ret = cur_trans->aborted; 1429 + goto cleanup_transaction; 1430 + } 1470 1431 1471 1432 /* make a pass through all the delayed refs we have so far 1472 1433 * any runnings procs may add more while we are here ··· 1539 1490 should_grow = 1; 1540 1491 1541 1492 do { 1542 - int snap_pending = 0; 1543 - 1544 1493 joined = cur_trans->num_joined; 1545 - if (!list_empty(&trans->transaction->pending_snapshots)) 1546 - snap_pending = 1; 1547 1494 1548 1495 WARN_ON(cur_trans != trans->transaction); 1549 1496 1550 - if (flush_on_commit || snap_pending) { 1551 - btrfs_start_delalloc_inodes(root, 1); 1552 - btrfs_wait_ordered_extents(root, 1); 1553 - } 1554 - 1555 - ret = 
btrfs_run_delayed_items(trans, root); 1497 + ret = btrfs_flush_all_pending_stuffs(trans, root); 1556 1498 if (ret) 1557 1499 goto cleanup_transaction; 1558 - 1559 - /* 1560 - * running the delayed items may have added new refs. account 1561 - * them now so that they hinder processing of more delayed refs 1562 - * as little as possible. 1563 - */ 1564 - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 1565 - 1566 - /* 1567 - * rename don't use btrfs_join_transaction, so, once we 1568 - * set the transaction to blocked above, we aren't going 1569 - * to get any new ordered operations. We can safely run 1570 - * it here and no for sure that nothing new will be added 1571 - * to the list 1572 - */ 1573 - btrfs_run_ordered_operations(root, 1); 1574 1500 1575 1501 prepare_to_wait(&cur_trans->writer_wait, &wait, 1576 1502 TASK_UNINTERRUPTIBLE); ··· 1558 1534 finish_wait(&cur_trans->writer_wait, &wait); 1559 1535 } while (atomic_read(&cur_trans->num_writers) > 1 || 1560 1536 (should_grow && cur_trans->num_joined != joined)); 1537 + 1538 + ret = btrfs_flush_all_pending_stuffs(trans, root); 1539 + if (ret) 1540 + goto cleanup_transaction; 1561 1541 1562 1542 /* 1563 1543 * Ok now we need to make sure to block out any other joins while we
+1 -1
fs/btrfs/transaction.h
··· 105 105 struct btrfs_root *root); 106 106 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 107 107 int num_items); 108 - struct btrfs_trans_handle *btrfs_start_transaction_noflush( 108 + struct btrfs_trans_handle *btrfs_start_transaction_lflush( 109 109 struct btrfs_root *root, int num_items); 110 110 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 111 111 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+291 -188
fs/btrfs/tree-log.c
··· 2952 2952 struct btrfs_inode_item *item, 2953 2953 struct inode *inode, int log_inode_only) 2954 2954 { 2955 - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2956 - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2957 - btrfs_set_inode_mode(leaf, item, inode->i_mode); 2958 - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2955 + struct btrfs_map_token token; 2959 2956 2960 - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2961 - inode->i_atime.tv_sec); 2962 - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2963 - inode->i_atime.tv_nsec); 2964 - 2965 - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2966 - inode->i_mtime.tv_sec); 2967 - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2968 - inode->i_mtime.tv_nsec); 2969 - 2970 - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2971 - inode->i_ctime.tv_sec); 2972 - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2973 - inode->i_ctime.tv_nsec); 2974 - 2975 - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2976 - 2977 - btrfs_set_inode_sequence(leaf, item, inode->i_version); 2978 - btrfs_set_inode_transid(leaf, item, trans->transid); 2979 - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2980 - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2981 - btrfs_set_inode_block_group(leaf, item, 0); 2957 + btrfs_init_map_token(&token); 2982 2958 2983 2959 if (log_inode_only) { 2984 2960 /* set the generation to zero so the recover code ··· 2962 2986 * just to say 'this inode exists' and a logging 2963 2987 * to say 'update this inode with these values' 2964 2988 */ 2965 - btrfs_set_inode_generation(leaf, item, 0); 2966 - btrfs_set_inode_size(leaf, item, 0); 2989 + btrfs_set_token_inode_generation(leaf, item, 0, &token); 2990 + btrfs_set_token_inode_size(leaf, item, 0, &token); 2967 2991 } else { 2968 - btrfs_set_inode_generation(leaf, item, 2969 - BTRFS_I(inode)->generation); 2970 - btrfs_set_inode_size(leaf, item, inode->i_size); 2992 + 
btrfs_set_token_inode_generation(leaf, item, 2993 + BTRFS_I(inode)->generation, 2994 + &token); 2995 + btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 2971 2996 } 2972 2997 2998 + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 2999 + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3000 + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3001 + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3002 + 3003 + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3004 + inode->i_atime.tv_sec, &token); 3005 + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3006 + inode->i_atime.tv_nsec, &token); 3007 + 3008 + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3009 + inode->i_mtime.tv_sec, &token); 3010 + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3011 + inode->i_mtime.tv_nsec, &token); 3012 + 3013 + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3014 + inode->i_ctime.tv_sec, &token); 3015 + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3016 + inode->i_ctime.tv_nsec, &token); 3017 + 3018 + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3019 + &token); 3020 + 3021 + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3022 + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3023 + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3024 + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3025 + btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3026 + } 3027 + 3028 + static int log_inode_item(struct btrfs_trans_handle *trans, 3029 + struct btrfs_root *log, struct btrfs_path *path, 3030 + struct inode *inode) 3031 + { 3032 + struct btrfs_inode_item *inode_item; 3033 + struct btrfs_key key; 3034 + int ret; 3035 + 3036 + memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); 3037 + ret = btrfs_insert_empty_item(trans, log, path, &key, 
3038 + sizeof(*inode_item)); 3039 + if (ret && ret != -EEXIST) 3040 + return ret; 3041 + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3042 + struct btrfs_inode_item); 3043 + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); 3044 + btrfs_release_path(path); 3045 + return 0; 2973 3046 } 2974 3047 2975 3048 static noinline int copy_items(struct btrfs_trans_handle *trans, ··· 3155 3130 return 0; 3156 3131 } 3157 3132 3158 - struct log_args { 3159 - struct extent_buffer *src; 3160 - u64 next_offset; 3161 - int start_slot; 3162 - int nr; 3163 - }; 3133 + static int drop_adjacent_extents(struct btrfs_trans_handle *trans, 3134 + struct btrfs_root *root, struct inode *inode, 3135 + struct extent_map *em, 3136 + struct btrfs_path *path) 3137 + { 3138 + struct btrfs_file_extent_item *fi; 3139 + struct extent_buffer *leaf; 3140 + struct btrfs_key key, new_key; 3141 + struct btrfs_map_token token; 3142 + u64 extent_end; 3143 + u64 extent_offset = 0; 3144 + int extent_type; 3145 + int del_slot = 0; 3146 + int del_nr = 0; 3147 + int ret = 0; 3148 + 3149 + while (1) { 3150 + btrfs_init_map_token(&token); 3151 + leaf = path->nodes[0]; 3152 + path->slots[0]++; 3153 + if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3154 + if (del_nr) { 3155 + ret = btrfs_del_items(trans, root, path, 3156 + del_slot, del_nr); 3157 + if (ret) 3158 + return ret; 3159 + del_nr = 0; 3160 + } 3161 + 3162 + ret = btrfs_next_leaf_write(trans, root, path, 1); 3163 + if (ret < 0) 3164 + return ret; 3165 + if (ret > 0) 3166 + return 0; 3167 + leaf = path->nodes[0]; 3168 + } 3169 + 3170 + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3171 + if (key.objectid != btrfs_ino(inode) || 3172 + key.type != BTRFS_EXTENT_DATA_KEY || 3173 + key.offset >= em->start + em->len) 3174 + break; 3175 + 3176 + fi = btrfs_item_ptr(leaf, path->slots[0], 3177 + struct btrfs_file_extent_item); 3178 + extent_type = btrfs_token_file_extent_type(leaf, fi, &token); 3179 + if (extent_type == 
BTRFS_FILE_EXTENT_REG || 3180 + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 3181 + extent_offset = btrfs_token_file_extent_offset(leaf, 3182 + fi, &token); 3183 + extent_end = key.offset + 3184 + btrfs_token_file_extent_num_bytes(leaf, fi, 3185 + &token); 3186 + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3187 + extent_end = key.offset + 3188 + btrfs_file_extent_inline_len(leaf, fi); 3189 + } else { 3190 + BUG(); 3191 + } 3192 + 3193 + if (extent_end <= em->len + em->start) { 3194 + if (!del_nr) { 3195 + del_slot = path->slots[0]; 3196 + } 3197 + del_nr++; 3198 + continue; 3199 + } 3200 + 3201 + /* 3202 + * Ok so we'll ignore previous items if we log a new extent, 3203 + * which can lead to overlapping extents, so if we have an 3204 + * existing extent we want to adjust we _have_ to check the next 3205 + * guy to make sure we even need this extent anymore, this keeps 3206 + * us from panicing in set_item_key_safe. 3207 + */ 3208 + if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { 3209 + struct btrfs_key tmp_key; 3210 + 3211 + btrfs_item_key_to_cpu(leaf, &tmp_key, 3212 + path->slots[0] + 1); 3213 + if (tmp_key.objectid == btrfs_ino(inode) && 3214 + tmp_key.type == BTRFS_EXTENT_DATA_KEY && 3215 + tmp_key.offset <= em->start + em->len) { 3216 + if (!del_nr) 3217 + del_slot = path->slots[0]; 3218 + del_nr++; 3219 + continue; 3220 + } 3221 + } 3222 + 3223 + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 3224 + memcpy(&new_key, &key, sizeof(new_key)); 3225 + new_key.offset = em->start + em->len; 3226 + btrfs_set_item_key_safe(trans, root, path, &new_key); 3227 + extent_offset += em->start + em->len - key.offset; 3228 + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, 3229 + &token); 3230 + btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - 3231 + (em->start + em->len), 3232 + &token); 3233 + btrfs_mark_buffer_dirty(leaf); 3234 + } 3235 + 3236 + if (del_nr) 3237 + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 3238 + 
3239 + return ret; 3240 + } 3164 3241 3165 3242 static int log_one_extent(struct btrfs_trans_handle *trans, 3166 3243 struct inode *inode, struct btrfs_root *root, 3167 - struct extent_map *em, struct btrfs_path *path, 3168 - struct btrfs_path *dst_path, struct log_args *args) 3244 + struct extent_map *em, struct btrfs_path *path) 3169 3245 { 3170 3246 struct btrfs_root *log = root->log_root; 3171 3247 struct btrfs_file_extent_item *fi; 3248 + struct extent_buffer *leaf; 3249 + struct list_head ordered_sums; 3250 + struct btrfs_map_token token; 3172 3251 struct btrfs_key key; 3173 - u64 start = em->mod_start; 3174 - u64 search_start = start; 3175 - u64 len = em->mod_len; 3176 - u64 num_bytes; 3177 - int nritems; 3252 + u64 csum_offset = em->mod_start - em->start; 3253 + u64 csum_len = em->mod_len; 3254 + u64 extent_offset = em->start - em->orig_start; 3255 + u64 block_len; 3178 3256 int ret; 3257 + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3179 3258 3180 - if (BTRFS_I(inode)->logged_trans == trans->transid) { 3181 - ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, 3182 - start + len, NULL, 0); 3183 - if (ret) 3184 - return ret; 3259 + INIT_LIST_HEAD(&ordered_sums); 3260 + btrfs_init_map_token(&token); 3261 + key.objectid = btrfs_ino(inode); 3262 + key.type = BTRFS_EXTENT_DATA_KEY; 3263 + key.offset = em->start; 3264 + path->really_keep_locks = 1; 3265 + 3266 + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); 3267 + if (ret && ret != -EEXIST) { 3268 + path->really_keep_locks = 0; 3269 + return ret; 3270 + } 3271 + leaf = path->nodes[0]; 3272 + fi = btrfs_item_ptr(leaf, path->slots[0], 3273 + struct btrfs_file_extent_item); 3274 + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3275 + &token); 3276 + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3277 + skip_csum = true; 3278 + btrfs_set_token_file_extent_type(leaf, fi, 3279 + BTRFS_FILE_EXTENT_PREALLOC, 3280 + &token); 3281 + } else { 3282 + 
btrfs_set_token_file_extent_type(leaf, fi, 3283 + BTRFS_FILE_EXTENT_REG, 3284 + &token); 3285 + if (em->block_start == 0) 3286 + skip_csum = true; 3185 3287 } 3186 3288 3187 - while (len) { 3188 - if (args->nr) 3189 - goto next_slot; 3190 - again: 3191 - key.objectid = btrfs_ino(inode); 3192 - key.type = BTRFS_EXTENT_DATA_KEY; 3193 - key.offset = search_start; 3194 - 3195 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3196 - if (ret < 0) 3197 - return ret; 3198 - 3199 - if (ret) { 3200 - /* 3201 - * A rare case were we can have an em for a section of a 3202 - * larger extent so we need to make sure that this em 3203 - * falls within the extent we've found. If not we just 3204 - * bail and go back to ye-olde way of doing things but 3205 - * it happens often enough in testing that we need to do 3206 - * this dance to make sure. 3207 - */ 3208 - do { 3209 - if (path->slots[0] == 0) { 3210 - btrfs_release_path(path); 3211 - if (search_start == 0) 3212 - return -ENOENT; 3213 - search_start--; 3214 - goto again; 3215 - } 3216 - 3217 - path->slots[0]--; 3218 - btrfs_item_key_to_cpu(path->nodes[0], &key, 3219 - path->slots[0]); 3220 - if (key.objectid != btrfs_ino(inode) || 3221 - key.type != BTRFS_EXTENT_DATA_KEY) { 3222 - btrfs_release_path(path); 3223 - return -ENOENT; 3224 - } 3225 - } while (key.offset > start); 3226 - 3227 - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3228 - struct btrfs_file_extent_item); 3229 - num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], 3230 - fi); 3231 - if (key.offset + num_bytes <= start) { 3232 - btrfs_release_path(path); 3233 - return -ENOENT; 3234 - } 3235 - } 3236 - args->src = path->nodes[0]; 3237 - next_slot: 3238 - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3239 - fi = btrfs_item_ptr(args->src, path->slots[0], 3240 - struct btrfs_file_extent_item); 3241 - if (args->nr && 3242 - args->start_slot + args->nr == path->slots[0]) { 3243 - args->nr++; 3244 - } else if (args->nr) { 3245 - ret = 
copy_items(trans, inode, dst_path, args->src, 3246 - args->start_slot, args->nr, 3247 - LOG_INODE_ALL); 3248 - if (ret) 3249 - return ret; 3250 - args->nr = 1; 3251 - args->start_slot = path->slots[0]; 3252 - } else if (!args->nr) { 3253 - args->nr = 1; 3254 - args->start_slot = path->slots[0]; 3255 - } 3256 - nritems = btrfs_header_nritems(path->nodes[0]); 3257 - path->slots[0]++; 3258 - num_bytes = btrfs_file_extent_num_bytes(args->src, fi); 3259 - if (len < num_bytes) { 3260 - /* I _think_ this is ok, envision we write to a 3261 - * preallocated space that is adjacent to a previously 3262 - * written preallocated space that gets merged when we 3263 - * mark this preallocated space written. If we do not 3264 - * have the adjacent extent in cache then when we copy 3265 - * this extent it could end up being larger than our EM 3266 - * thinks it is, which is a-ok, so just set len to 0. 3267 - */ 3268 - len = 0; 3269 - } else { 3270 - len -= num_bytes; 3271 - } 3272 - start = key.offset + num_bytes; 3273 - args->next_offset = start; 3274 - search_start = start; 3275 - 3276 - if (path->slots[0] < nritems) { 3277 - if (len) 3278 - goto next_slot; 3279 - break; 3280 - } 3281 - 3282 - if (args->nr) { 3283 - ret = copy_items(trans, inode, dst_path, args->src, 3284 - args->start_slot, args->nr, 3285 - LOG_INODE_ALL); 3286 - if (ret) 3287 - return ret; 3288 - args->nr = 0; 3289 - btrfs_release_path(path); 3290 - } 3289 + block_len = max(em->block_len, em->orig_block_len); 3290 + if (em->compress_type != BTRFS_COMPRESS_NONE) { 3291 + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3292 + em->block_start, 3293 + &token); 3294 + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3295 + &token); 3296 + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3297 + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3298 + em->block_start - 3299 + extent_offset, &token); 3300 + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3301 + &token); 3302 + } 
else { 3303 + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 3304 + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 3305 + &token); 3291 3306 } 3292 3307 3293 - return 0; 3308 + btrfs_set_token_file_extent_offset(leaf, fi, 3309 + em->start - em->orig_start, 3310 + &token); 3311 + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 3312 + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); 3313 + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 3314 + &token); 3315 + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 3316 + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 3317 + btrfs_mark_buffer_dirty(leaf); 3318 + 3319 + /* 3320 + * Have to check the extent to the right of us to make sure it doesn't 3321 + * fall in our current range. We're ok if the previous extent is in our 3322 + * range since the recovery stuff will run us in key order and thus just 3323 + * drop the part we overwrote. 3324 + */ 3325 + ret = drop_adjacent_extents(trans, log, inode, em, path); 3326 + btrfs_release_path(path); 3327 + path->really_keep_locks = 0; 3328 + if (ret) { 3329 + return ret; 3330 + } 3331 + 3332 + if (skip_csum) 3333 + return 0; 3334 + 3335 + /* block start is already adjusted for the file extent offset. 
*/ 3336 + ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3337 + em->block_start + csum_offset, 3338 + em->block_start + csum_offset + 3339 + csum_len - 1, &ordered_sums, 0); 3340 + if (ret) 3341 + return ret; 3342 + 3343 + while (!list_empty(&ordered_sums)) { 3344 + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3345 + struct btrfs_ordered_sum, 3346 + list); 3347 + if (!ret) 3348 + ret = btrfs_csum_file_blocks(trans, log, sums); 3349 + list_del(&sums->list); 3350 + kfree(sums); 3351 + } 3352 + 3353 + return ret; 3294 3354 } 3295 3355 3296 3356 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3297 3357 struct btrfs_root *root, 3298 3358 struct inode *inode, 3299 - struct btrfs_path *path, 3300 - struct btrfs_path *dst_path) 3359 + struct btrfs_path *path) 3301 3360 { 3302 - struct log_args args; 3303 3361 struct extent_map *em, *n; 3304 3362 struct list_head extents; 3305 3363 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; ··· 3390 3282 int ret = 0; 3391 3283 3392 3284 INIT_LIST_HEAD(&extents); 3393 - 3394 - memset(&args, 0, sizeof(args)); 3395 3285 3396 3286 write_lock(&tree->lock); 3397 3287 test_gen = root->fs_info->last_trans_committed; ··· 3423 3317 3424 3318 write_unlock(&tree->lock); 3425 3319 3426 - /* 3427 - * If the previous EM and the last extent we left off on aren't 3428 - * sequential then we need to copy the items we have and redo 3429 - * our search 3430 - */ 3431 - if (args.nr && em->mod_start != args.next_offset) { 3432 - ret = copy_items(trans, inode, dst_path, args.src, 3433 - args.start_slot, args.nr, 3434 - LOG_INODE_ALL); 3435 - if (ret) { 3436 - free_extent_map(em); 3437 - write_lock(&tree->lock); 3438 - continue; 3439 - } 3440 - btrfs_release_path(path); 3441 - args.nr = 0; 3442 - } 3443 - 3444 - ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); 3320 + ret = log_one_extent(trans, inode, root, em, path); 3445 3321 free_extent_map(em); 3446 3322 
write_lock(&tree->lock); 3447 3323 } 3448 3324 WARN_ON(!list_empty(&extents)); 3449 3325 write_unlock(&tree->lock); 3450 3326 3451 - if (!ret && args.nr) 3452 - ret = copy_items(trans, inode, dst_path, args.src, 3453 - args.start_slot, args.nr, LOG_INODE_ALL); 3454 3327 btrfs_release_path(path); 3455 3328 return ret; 3456 3329 } ··· 3485 3400 3486 3401 3487 3402 /* today the code can only do partial logging of directories */ 3488 - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3403 + if (S_ISDIR(inode->i_mode) || 3404 + (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3405 + &BTRFS_I(inode)->runtime_flags) && 3406 + inode_only == LOG_INODE_EXISTS)) 3489 3407 max_key.type = BTRFS_XATTR_ITEM_KEY; 3490 3408 else 3491 3409 max_key.type = (u8)-1; ··· 3520 3432 } else { 3521 3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3522 3434 &BTRFS_I(inode)->runtime_flags)) { 3435 + clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3436 + &BTRFS_I(inode)->runtime_flags); 3523 3437 ret = btrfs_truncate_inode_items(trans, log, 3524 3438 inode, 0, 0); 3525 - } else { 3526 - fast_search = true; 3439 + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3440 + &BTRFS_I(inode)->runtime_flags)) { 3441 + if (inode_only == LOG_INODE_ALL) 3442 + fast_search = true; 3527 3443 max_key.type = BTRFS_XATTR_ITEM_KEY; 3528 3444 ret = drop_objectid_items(trans, log, path, ino, 3529 - BTRFS_XATTR_ITEM_KEY); 3445 + max_key.type); 3446 + } else { 3447 + if (inode_only == LOG_INODE_ALL) 3448 + fast_search = true; 3449 + ret = log_inode_item(trans, log, dst_path, inode); 3450 + if (ret) { 3451 + err = ret; 3452 + goto out_unlock; 3453 + } 3454 + goto log_extents; 3530 3455 } 3456 + 3531 3457 } 3532 3458 if (ret) { 3533 3459 err = ret; ··· 3620 3518 ins_nr = 0; 3621 3519 } 3622 3520 3521 + log_extents: 3623 3522 if (fast_search) { 3624 - btrfs_release_path(path); 3625 3523 btrfs_release_path(dst_path); 3626 - ret = btrfs_log_changed_extents(trans, root, inode, path, 3627 - dst_path); 3524 + 
ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 3628 3525 if (ret) { 3629 3526 err = ret; 3630 3527 goto out_unlock; ··· 3632 3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3633 3532 struct extent_map *em, *n; 3634 3533 3534 + write_lock(&tree->lock); 3635 3535 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3636 3536 list_del_init(&em->list); 3537 + write_unlock(&tree->lock); 3637 3538 } 3638 3539 3639 3540 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+776 -188
fs/btrfs/volumes.c
··· 25 25 #include <linux/capability.h> 26 26 #include <linux/ratelimit.h> 27 27 #include <linux/kthread.h> 28 - #include <asm/div64.h> 29 28 #include "compat.h" 30 29 #include "ctree.h" 31 30 #include "extent_map.h" ··· 35 36 #include "async-thread.h" 36 37 #include "check-integrity.h" 37 38 #include "rcu-string.h" 39 + #include "math.h" 40 + #include "dev-replace.h" 38 41 39 42 static int init_first_rw_device(struct btrfs_trans_handle *trans, 40 43 struct btrfs_root *root, ··· 70 69 kfree(device); 71 70 } 72 71 kfree(fs_devices); 72 + } 73 + 74 + static void btrfs_kobject_uevent(struct block_device *bdev, 75 + enum kobject_action action) 76 + { 77 + int ret; 78 + 79 + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 80 + if (ret) 81 + pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", 82 + action, 83 + kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 84 + &disk_to_dev(bdev->bd_disk)->kobj); 73 85 } 74 86 75 87 void btrfs_cleanup_fs_uuids(void) ··· 120 106 return fs_devices; 121 107 } 122 108 return NULL; 109 + } 110 + 111 + static int 112 + btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 113 + int flush, struct block_device **bdev, 114 + struct buffer_head **bh) 115 + { 116 + int ret; 117 + 118 + *bdev = blkdev_get_by_path(device_path, flags, holder); 119 + 120 + if (IS_ERR(*bdev)) { 121 + ret = PTR_ERR(*bdev); 122 + printk(KERN_INFO "btrfs: open %s failed\n", device_path); 123 + goto error; 124 + } 125 + 126 + if (flush) 127 + filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 128 + ret = set_blocksize(*bdev, 4096); 129 + if (ret) { 130 + blkdev_put(*bdev, flags); 131 + goto error; 132 + } 133 + invalidate_bdev(*bdev); 134 + *bh = btrfs_read_dev_super(*bdev); 135 + if (!*bh) { 136 + ret = -EINVAL; 137 + blkdev_put(*bdev, flags); 138 + goto error; 139 + } 140 + 141 + return 0; 142 + 143 + error: 144 + *bdev = NULL; 145 + *bh = NULL; 146 + return ret; 123 147 } 124 148 125 149 static void 
requeue_list(struct btrfs_pending_bios *pending_bios, ··· 519 467 return ERR_PTR(-ENOMEM); 520 468 } 521 469 522 - void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 470 + void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, 471 + struct btrfs_fs_devices *fs_devices, int step) 523 472 { 524 473 struct btrfs_device *device, *next; 525 474 ··· 533 480 /* This is the initialized path, it is safe to release the devices. */ 534 481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 535 482 if (device->in_fs_metadata) { 536 - if (!latest_transid || 537 - device->generation > latest_transid) { 483 + if (!device->is_tgtdev_for_dev_replace && 484 + (!latest_transid || 485 + device->generation > latest_transid)) { 538 486 latest_devid = device->devid; 539 487 latest_transid = device->generation; 540 488 latest_bdev = device->bdev; ··· 543 489 continue; 544 490 } 545 491 492 + if (device->devid == BTRFS_DEV_REPLACE_DEVID) { 493 + /* 494 + * In the first step, keep the device which has 495 + * the correct fsid and the devid that is used 496 + * for the dev_replace procedure. 497 + * In the second step, the dev_replace state is 498 + * read from the device tree and it is known 499 + * whether the procedure is really active or 500 + * not, which means whether this device is 501 + * used or whether it should be removed. 
502 + */ 503 + if (step == 0 || device->is_tgtdev_for_dev_replace) { 504 + continue; 505 + } 506 + } 546 507 if (device->bdev) { 547 508 blkdev_put(device->bdev, device->mode); 548 509 device->bdev = NULL; ··· 566 497 if (device->writeable) { 567 498 list_del_init(&device->dev_alloc_list); 568 499 device->writeable = 0; 569 - fs_devices->rw_devices--; 500 + if (!device->is_tgtdev_for_dev_replace) 501 + fs_devices->rw_devices--; 570 502 } 571 503 list_del_init(&device->dev_list); 572 504 fs_devices->num_devices--; ··· 625 555 if (device->bdev) 626 556 fs_devices->open_devices--; 627 557 628 - if (device->writeable) { 558 + if (device->writeable && !device->is_tgtdev_for_dev_replace) { 629 559 list_del_init(&device->dev_alloc_list); 630 560 fs_devices->rw_devices--; 631 561 } ··· 707 637 if (!device->name) 708 638 continue; 709 639 710 - bdev = blkdev_get_by_path(device->name->str, flags, holder); 711 - if (IS_ERR(bdev)) { 712 - printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); 713 - goto error; 714 - } 715 - filemap_write_and_wait(bdev->bd_inode->i_mapping); 716 - invalidate_bdev(bdev); 717 - set_blocksize(bdev, 4096); 718 - 719 - bh = btrfs_read_dev_super(bdev); 720 - if (!bh) 721 - goto error_close; 640 + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 641 + &bdev, &bh); 642 + if (ret) 643 + continue; 722 644 723 645 disk_super = (struct btrfs_super_block *)bh->b_data; 724 646 devid = btrfs_stack_device_id(&disk_super->dev_item); ··· 749 687 fs_devices->rotating = 1; 750 688 751 689 fs_devices->open_devices++; 752 - if (device->writeable) { 690 + if (device->writeable && !device->is_tgtdev_for_dev_replace) { 753 691 fs_devices->rw_devices++; 754 692 list_add(&device->dev_alloc_list, 755 693 &fs_devices->alloc_list); ··· 759 697 760 698 error_brelse: 761 699 brelse(bh); 762 - error_close: 763 700 blkdev_put(bdev, flags); 764 - error: 765 701 continue; 766 702 } 767 703 if (fs_devices->open_devices == 0) { ··· 804 744 u64 
total_devices; 805 745 806 746 flags |= FMODE_EXCL; 807 - bdev = blkdev_get_by_path(path, flags, holder); 808 - 809 - if (IS_ERR(bdev)) { 810 - ret = PTR_ERR(bdev); 811 - goto error; 812 - } 813 - 814 747 mutex_lock(&uuid_mutex); 815 - ret = set_blocksize(bdev, 4096); 748 + ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); 816 749 if (ret) 817 - goto error_close; 818 - bh = btrfs_read_dev_super(bdev); 819 - if (!bh) { 820 - ret = -EINVAL; 821 - goto error_close; 822 - } 750 + goto error; 823 751 disk_super = (struct btrfs_super_block *)bh->b_data; 824 752 devid = btrfs_stack_device_id(&disk_super->dev_item); 825 753 transid = btrfs_super_generation(disk_super); 826 754 total_devices = btrfs_super_num_devices(disk_super); 827 - if (disk_super->label[0]) 755 + if (disk_super->label[0]) { 756 + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 757 + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 828 758 printk(KERN_INFO "device label %s ", disk_super->label); 829 - else 759 + } else { 830 760 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 761 + } 831 762 printk(KERN_CONT "devid %llu transid %llu %s\n", 832 763 (unsigned long long)devid, (unsigned long long)transid, path); 833 764 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 834 765 if (!ret && fs_devices_ret) 835 766 (*fs_devices_ret)->total_devices = total_devices; 836 767 brelse(bh); 837 - error_close: 838 - mutex_unlock(&uuid_mutex); 839 768 blkdev_put(bdev, flags); 840 769 error: 770 + mutex_unlock(&uuid_mutex); 841 771 return ret; 842 772 } 843 773 ··· 846 796 847 797 *length = 0; 848 798 849 - if (start >= device->total_bytes) 799 + if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) 850 800 return 0; 851 801 852 802 path = btrfs_alloc_path(); ··· 963 913 max_hole_size = 0; 964 914 hole_size = 0; 965 915 966 - if (search_start >= search_end) { 916 + if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 967 917 ret = -ENOSPC; 968 918 goto 
error; 969 919 } ··· 1146 1096 struct btrfs_key key; 1147 1097 1148 1098 WARN_ON(!device->in_fs_metadata); 1099 + WARN_ON(device->is_tgtdev_for_dev_replace); 1149 1100 path = btrfs_alloc_path(); 1150 1101 if (!path) 1151 1102 return -ENOMEM; ··· 1381 1330 root->fs_info->avail_system_alloc_bits | 1382 1331 root->fs_info->avail_metadata_alloc_bits; 1383 1332 1384 - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1385 - root->fs_info->fs_devices->num_devices <= 4) { 1333 + num_devices = root->fs_info->fs_devices->num_devices; 1334 + btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1335 + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1336 + WARN_ON(num_devices < 1); 1337 + num_devices--; 1338 + } 1339 + btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1340 + 1341 + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1386 1342 printk(KERN_ERR "btrfs: unable to go below four devices " 1387 1343 "on raid10\n"); 1388 1344 ret = -EINVAL; 1389 1345 goto out; 1390 1346 } 1391 1347 1392 - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1393 - root->fs_info->fs_devices->num_devices <= 2) { 1348 + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1394 1349 printk(KERN_ERR "btrfs: unable to go below two " 1395 1350 "devices on raid1\n"); 1396 1351 ret = -EINVAL; ··· 1414 1357 * is held. 
1415 1358 */ 1416 1359 list_for_each_entry(tmp, devices, dev_list) { 1417 - if (tmp->in_fs_metadata && !tmp->bdev) { 1360 + if (tmp->in_fs_metadata && 1361 + !tmp->is_tgtdev_for_dev_replace && 1362 + !tmp->bdev) { 1418 1363 device = tmp; 1419 1364 break; 1420 1365 } ··· 1430 1371 goto out; 1431 1372 } 1432 1373 } else { 1433 - bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1434 - root->fs_info->bdev_holder); 1435 - if (IS_ERR(bdev)) { 1436 - ret = PTR_ERR(bdev); 1374 + ret = btrfs_get_bdev_and_sb(device_path, 1375 + FMODE_READ | FMODE_EXCL, 1376 + root->fs_info->bdev_holder, 0, 1377 + &bdev, &bh); 1378 + if (ret) 1437 1379 goto out; 1438 - } 1439 - 1440 - set_blocksize(bdev, 4096); 1441 - invalidate_bdev(bdev); 1442 - bh = btrfs_read_dev_super(bdev); 1443 - if (!bh) { 1444 - ret = -EINVAL; 1445 - goto error_close; 1446 - } 1447 1380 disk_super = (struct btrfs_super_block *)bh->b_data; 1448 1381 devid = btrfs_stack_device_id(&disk_super->dev_item); 1449 1382 dev_uuid = disk_super->dev_item.uuid; 1450 - device = btrfs_find_device(root, devid, dev_uuid, 1383 + device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1451 1384 disk_super->fsid); 1452 1385 if (!device) { 1453 1386 ret = -ENOENT; 1454 1387 goto error_brelse; 1455 1388 } 1389 + } 1390 + 1391 + if (device->is_tgtdev_for_dev_replace) { 1392 + pr_err("btrfs: unable to remove the dev_replace target dev\n"); 1393 + ret = -EINVAL; 1394 + goto error_brelse; 1456 1395 } 1457 1396 1458 1397 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { ··· 1472 1415 if (ret) 1473 1416 goto error_undo; 1474 1417 1418 + /* 1419 + * TODO: the superblock still includes this device in its num_devices 1420 + * counter although write_all_supers() is not locked out. This 1421 + * could give a filesystem state which requires a degraded mount. 
1422 + */ 1475 1423 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1476 1424 if (ret) 1477 1425 goto error_undo; ··· 1487 1425 spin_unlock(&root->fs_info->free_chunk_lock); 1488 1426 1489 1427 device->in_fs_metadata = 0; 1490 - btrfs_scrub_cancel_dev(root, device); 1428 + btrfs_scrub_cancel_dev(root->fs_info, device); 1491 1429 1492 1430 /* 1493 1431 * the device list mutex makes sure that we don't change ··· 1544 1482 * at this point, the device is zero sized. We want to 1545 1483 * remove it from the devices list and zero out the old super 1546 1484 */ 1547 - if (clear_super) { 1485 + if (clear_super && disk_super) { 1548 1486 /* make sure this device isn't detected as part of 1549 1487 * the FS anymore 1550 1488 */ ··· 1555 1493 1556 1494 ret = 0; 1557 1495 1496 + /* Notify udev that device has changed */ 1497 + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 1498 + 1558 1499 error_brelse: 1559 1500 brelse(bh); 1560 - error_close: 1561 1501 if (bdev) 1562 1502 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1563 1503 out: ··· 1574 1510 root->fs_info->fs_devices->rw_devices++; 1575 1511 } 1576 1512 goto error_brelse; 1513 + } 1514 + 1515 + void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1516 + struct btrfs_device *srcdev) 1517 + { 1518 + WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 1519 + list_del_rcu(&srcdev->dev_list); 1520 + list_del_rcu(&srcdev->dev_alloc_list); 1521 + fs_info->fs_devices->num_devices--; 1522 + if (srcdev->missing) { 1523 + fs_info->fs_devices->missing_devices--; 1524 + fs_info->fs_devices->rw_devices++; 1525 + } 1526 + if (srcdev->can_discard) 1527 + fs_info->fs_devices->num_can_discard--; 1528 + if (srcdev->bdev) 1529 + fs_info->fs_devices->open_devices--; 1530 + 1531 + call_rcu(&srcdev->rcu, free_device); 1532 + } 1533 + 1534 + void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 1535 + struct btrfs_device *tgtdev) 1536 + { 1537 + struct btrfs_device *next_device; 1538 + 1539 + 
WARN_ON(!tgtdev); 1540 + mutex_lock(&fs_info->fs_devices->device_list_mutex); 1541 + if (tgtdev->bdev) { 1542 + btrfs_scratch_superblock(tgtdev); 1543 + fs_info->fs_devices->open_devices--; 1544 + } 1545 + fs_info->fs_devices->num_devices--; 1546 + if (tgtdev->can_discard) 1547 + fs_info->fs_devices->num_can_discard++; 1548 + 1549 + next_device = list_entry(fs_info->fs_devices->devices.next, 1550 + struct btrfs_device, dev_list); 1551 + if (tgtdev->bdev == fs_info->sb->s_bdev) 1552 + fs_info->sb->s_bdev = next_device->bdev; 1553 + if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) 1554 + fs_info->fs_devices->latest_bdev = next_device->bdev; 1555 + list_del_rcu(&tgtdev->dev_list); 1556 + 1557 + call_rcu(&tgtdev->rcu, free_device); 1558 + 1559 + mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1560 + } 1561 + 1562 + int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1563 + struct btrfs_device **device) 1564 + { 1565 + int ret = 0; 1566 + struct btrfs_super_block *disk_super; 1567 + u64 devid; 1568 + u8 *dev_uuid; 1569 + struct block_device *bdev; 1570 + struct buffer_head *bh; 1571 + 1572 + *device = NULL; 1573 + ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 1574 + root->fs_info->bdev_holder, 0, &bdev, &bh); 1575 + if (ret) 1576 + return ret; 1577 + disk_super = (struct btrfs_super_block *)bh->b_data; 1578 + devid = btrfs_stack_device_id(&disk_super->dev_item); 1579 + dev_uuid = disk_super->dev_item.uuid; 1580 + *device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1581 + disk_super->fsid); 1582 + brelse(bh); 1583 + if (!*device) 1584 + ret = -ENOENT; 1585 + blkdev_put(bdev, FMODE_READ); 1586 + return ret; 1587 + } 1588 + 1589 + int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 1590 + char *device_path, 1591 + struct btrfs_device **device) 1592 + { 1593 + *device = NULL; 1594 + if (strcmp(device_path, "missing") == 0) { 1595 + struct list_head *devices; 1596 + struct btrfs_device *tmp; 1597 + 1598 + devices 
= &root->fs_info->fs_devices->devices; 1599 + /* 1600 + * It is safe to read the devices since the volume_mutex 1601 + * is held by the caller. 1602 + */ 1603 + list_for_each_entry(tmp, devices, dev_list) { 1604 + if (tmp->in_fs_metadata && !tmp->bdev) { 1605 + *device = tmp; 1606 + break; 1607 + } 1608 + } 1609 + 1610 + if (!*device) { 1611 + pr_err("btrfs: no missing device found\n"); 1612 + return -ENOENT; 1613 + } 1614 + 1615 + return 0; 1616 + } else { 1617 + return btrfs_find_device_by_path(root, device_path, device); 1618 + } 1577 1619 } 1578 1620 1579 1621 /* ··· 1800 1630 read_extent_buffer(leaf, fs_uuid, 1801 1631 (unsigned long)btrfs_device_fsid(dev_item), 1802 1632 BTRFS_UUID_SIZE); 1803 - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1633 + device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1634 + fs_uuid); 1804 1635 BUG_ON(!device); /* Logic error */ 1805 1636 1806 1637 if (device->fs_devices->seeding) { ··· 1849 1678 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1850 1679 1851 1680 devices = &root->fs_info->fs_devices->devices; 1852 - /* 1853 - * we have the volume lock, so we don't need the extra 1854 - * device list mutex while reading the list here. 
1855 - */ 1681 + 1682 + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1856 1683 list_for_each_entry(device, devices, dev_list) { 1857 1684 if (device->bdev == bdev) { 1858 1685 ret = -EEXIST; 1686 + mutex_unlock( 1687 + &root->fs_info->fs_devices->device_list_mutex); 1859 1688 goto error; 1860 1689 } 1861 1690 } 1691 + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1862 1692 1863 1693 device = kzalloc(sizeof(*device), GFP_NOFS); 1864 1694 if (!device) { ··· 1909 1737 device->dev_root = root->fs_info->dev_root; 1910 1738 device->bdev = bdev; 1911 1739 device->in_fs_metadata = 1; 1740 + device->is_tgtdev_for_dev_replace = 0; 1912 1741 device->mode = FMODE_EXCL; 1913 1742 set_blocksize(device->bdev, 4096); 1914 1743 ··· 2017 1844 return ret; 2018 1845 } 2019 1846 1847 + int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 1848 + struct btrfs_device **device_out) 1849 + { 1850 + struct request_queue *q; 1851 + struct btrfs_device *device; 1852 + struct block_device *bdev; 1853 + struct btrfs_fs_info *fs_info = root->fs_info; 1854 + struct list_head *devices; 1855 + struct rcu_string *name; 1856 + int ret = 0; 1857 + 1858 + *device_out = NULL; 1859 + if (fs_info->fs_devices->seeding) 1860 + return -EINVAL; 1861 + 1862 + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1863 + fs_info->bdev_holder); 1864 + if (IS_ERR(bdev)) 1865 + return PTR_ERR(bdev); 1866 + 1867 + filemap_write_and_wait(bdev->bd_inode->i_mapping); 1868 + 1869 + devices = &fs_info->fs_devices->devices; 1870 + list_for_each_entry(device, devices, dev_list) { 1871 + if (device->bdev == bdev) { 1872 + ret = -EEXIST; 1873 + goto error; 1874 + } 1875 + } 1876 + 1877 + device = kzalloc(sizeof(*device), GFP_NOFS); 1878 + if (!device) { 1879 + ret = -ENOMEM; 1880 + goto error; 1881 + } 1882 + 1883 + name = rcu_string_strdup(device_path, GFP_NOFS); 1884 + if (!name) { 1885 + kfree(device); 1886 + ret = -ENOMEM; 1887 + goto error; 1888 + } 1889 + 
rcu_assign_pointer(device->name, name); 1890 + 1891 + q = bdev_get_queue(bdev); 1892 + if (blk_queue_discard(q)) 1893 + device->can_discard = 1; 1894 + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1895 + device->writeable = 1; 1896 + device->work.func = pending_bios_fn; 1897 + generate_random_uuid(device->uuid); 1898 + device->devid = BTRFS_DEV_REPLACE_DEVID; 1899 + spin_lock_init(&device->io_lock); 1900 + device->generation = 0; 1901 + device->io_width = root->sectorsize; 1902 + device->io_align = root->sectorsize; 1903 + device->sector_size = root->sectorsize; 1904 + device->total_bytes = i_size_read(bdev->bd_inode); 1905 + device->disk_total_bytes = device->total_bytes; 1906 + device->dev_root = fs_info->dev_root; 1907 + device->bdev = bdev; 1908 + device->in_fs_metadata = 1; 1909 + device->is_tgtdev_for_dev_replace = 1; 1910 + device->mode = FMODE_EXCL; 1911 + set_blocksize(device->bdev, 4096); 1912 + device->fs_devices = fs_info->fs_devices; 1913 + list_add(&device->dev_list, &fs_info->fs_devices->devices); 1914 + fs_info->fs_devices->num_devices++; 1915 + fs_info->fs_devices->open_devices++; 1916 + if (device->can_discard) 1917 + fs_info->fs_devices->num_can_discard++; 1918 + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1919 + 1920 + *device_out = device; 1921 + return ret; 1922 + 1923 + error: 1924 + blkdev_put(bdev, FMODE_EXCL); 1925 + return ret; 1926 + } 1927 + 1928 + void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 1929 + struct btrfs_device *tgtdev) 1930 + { 1931 + WARN_ON(fs_info->fs_devices->rw_devices == 0); 1932 + tgtdev->io_width = fs_info->dev_root->sectorsize; 1933 + tgtdev->io_align = fs_info->dev_root->sectorsize; 1934 + tgtdev->sector_size = fs_info->dev_root->sectorsize; 1935 + tgtdev->dev_root = fs_info->dev_root; 1936 + tgtdev->in_fs_metadata = 1; 1937 + } 1938 + 2020 1939 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2021 1940 struct btrfs_device *device) 
2022 1941 { ··· 2165 1900 2166 1901 if (!device->writeable) 2167 1902 return -EACCES; 2168 - if (new_size <= device->total_bytes) 1903 + if (new_size <= device->total_bytes || 1904 + device->is_tgtdev_for_dev_replace) 2169 1905 return -EINVAL; 2170 1906 2171 1907 btrfs_set_super_total_bytes(super_copy, old_total + diff); ··· 2604 2338 return 1; 2605 2339 } 2606 2340 2607 - static u64 div_factor_fine(u64 num, int factor) 2608 - { 2609 - if (factor <= 0) 2610 - return 0; 2611 - if (factor >= 100) 2612 - return num; 2613 - 2614 - num *= factor; 2615 - do_div(num, 100); 2616 - return num; 2617 - } 2618 - 2619 2341 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2620 2342 struct btrfs_balance_args *bargs) 2621 2343 { ··· 2768 2514 return 1; 2769 2515 } 2770 2516 2771 - static u64 div_factor(u64 num, int factor) 2772 - { 2773 - if (factor == 10) 2774 - return num; 2775 - num *= factor; 2776 - do_div(num, 10); 2777 - return num; 2778 - } 2779 - 2780 2517 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2781 2518 { 2782 2519 struct btrfs_balance_control *bctl = fs_info->balance_ctl; ··· 2795 2550 size_to_free = div_factor(old_size, 1); 2796 2551 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2797 2552 if (!device->writeable || 2798 - device->total_bytes - device->bytes_used > size_to_free) 2553 + device->total_bytes - device->bytes_used > size_to_free || 2554 + device->is_tgtdev_for_dev_replace) 2799 2555 continue; 2800 2556 2801 2557 ret = btrfs_shrink_device(device, old_size - size_to_free); ··· 2974 2728 u64 allowed; 2975 2729 int mixed = 0; 2976 2730 int ret; 2731 + u64 num_devices; 2977 2732 2978 2733 if (btrfs_fs_closing(fs_info) || 2979 2734 atomic_read(&fs_info->balance_pause_req) || ··· 3003 2756 } 3004 2757 } 3005 2758 2759 + num_devices = fs_info->fs_devices->num_devices; 2760 + btrfs_dev_replace_lock(&fs_info->dev_replace); 2761 + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2762 + 
BUG_ON(num_devices < 1); 2763 + num_devices--; 2764 + } 2765 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 3006 2766 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3007 - if (fs_info->fs_devices->num_devices == 1) 2767 + if (num_devices == 1) 3008 2768 allowed |= BTRFS_BLOCK_GROUP_DUP; 3009 - else if (fs_info->fs_devices->num_devices < 4) 2769 + else if (num_devices < 4) 3010 2770 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3011 2771 else 3012 2772 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | ··· 3156 2902 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3157 2903 } 3158 2904 2905 + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3159 2906 mutex_unlock(&fs_info->balance_mutex); 3160 2907 mutex_unlock(&fs_info->volume_mutex); 3161 2908 ··· 3179 2924 return 0; 3180 2925 } 3181 2926 2927 + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); 3182 2928 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3183 2929 if (IS_ERR(tsk)) 3184 2930 return PTR_ERR(tsk); ··· 3336 3080 u64 old_size = device->total_bytes; 3337 3081 u64 diff = device->total_bytes - new_size; 3338 3082 3339 - if (new_size >= device->total_bytes) 3083 + if (device->is_tgtdev_for_dev_replace) 3340 3084 return -EINVAL; 3341 3085 3342 3086 path = btrfs_alloc_path(); ··· 3491 3235 return 0; 3492 3236 } 3493 3237 3238 + struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3239 + { 2, 1, 0, 4, 2, 2 /* raid10 */ }, 3240 + { 1, 1, 2, 2, 2, 2 /* raid1 */ }, 3241 + { 1, 2, 1, 1, 1, 2 /* dup */ }, 3242 + { 1, 1, 0, 2, 1, 1 /* raid0 */ }, 3243 + { 1, 1, 0, 1, 1, 1 /* single */ }, 3244 + }; 3245 + 3494 3246 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3495 3247 struct btrfs_root *extent_root, 3496 3248 struct map_lookup **map_ret, ··· 3528 3264 int ndevs; 3529 3265 int i; 3530 3266 int j; 3267 + int index; 3531 3268 3532 3269 BUG_ON(!alloc_profile_is_valid(type, 0)); 3533 3270 3534 3271 if 
(list_empty(&fs_devices->alloc_list)) 3535 3272 return -ENOSPC; 3536 3273 3537 - sub_stripes = 1; 3538 - dev_stripes = 1; 3539 - devs_increment = 1; 3540 - ncopies = 1; 3541 - devs_max = 0; /* 0 == as many as possible */ 3542 - devs_min = 1; 3274 + index = __get_raid_index(type); 3543 3275 3544 - /* 3545 - * define the properties of each RAID type. 3546 - * FIXME: move this to a global table and use it in all RAID 3547 - * calculation code 3548 - */ 3549 - if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3550 - dev_stripes = 2; 3551 - ncopies = 2; 3552 - devs_max = 1; 3553 - } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 3554 - devs_min = 2; 3555 - } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 3556 - devs_increment = 2; 3557 - ncopies = 2; 3558 - devs_max = 2; 3559 - devs_min = 2; 3560 - } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 3561 - sub_stripes = 2; 3562 - devs_increment = 2; 3563 - ncopies = 2; 3564 - devs_min = 4; 3565 - } else { 3566 - devs_max = 1; 3567 - } 3276 + sub_stripes = btrfs_raid_array[index].sub_stripes; 3277 + dev_stripes = btrfs_raid_array[index].dev_stripes; 3278 + devs_max = btrfs_raid_array[index].devs_max; 3279 + devs_min = btrfs_raid_array[index].devs_min; 3280 + devs_increment = btrfs_raid_array[index].devs_increment; 3281 + ncopies = btrfs_raid_array[index].ncopies; 3568 3282 3569 3283 if (type & BTRFS_BLOCK_GROUP_DATA) { 3570 3284 max_stripe_size = 1024 * 1024 * 1024; ··· 3589 3347 cur = cur->next; 3590 3348 3591 3349 if (!device->writeable) { 3592 - printk(KERN_ERR 3350 + WARN(1, KERN_ERR 3593 3351 "btrfs: read-only device in alloc_list\n"); 3594 - WARN_ON(1); 3595 3352 continue; 3596 3353 } 3597 3354 3598 - if (!device->in_fs_metadata) 3355 + if (!device->in_fs_metadata || 3356 + device->is_tgtdev_for_dev_replace) 3599 3357 continue; 3600 3358 3601 3359 if (device->total_bytes > device->bytes_used) ··· 3624 3382 devices_info[ndevs].total_avail = total_avail; 3625 3383 devices_info[ndevs].dev = device; 3626 3384 ++ndevs; 3385 + WARN_ON(ndevs 
> fs_devices->rw_devices); 3627 3386 } 3628 3387 3629 3388 /* ··· 3983 3740 } 3984 3741 } 3985 3742 3986 - int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3743 + int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 3987 3744 { 3745 + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 3988 3746 struct extent_map *em; 3989 3747 struct map_lookup *map; 3990 3748 struct extent_map_tree *em_tree = &map_tree->map_tree; ··· 4005 3761 else 4006 3762 ret = 1; 4007 3763 free_extent_map(em); 3764 + 3765 + btrfs_dev_replace_lock(&fs_info->dev_replace); 3766 + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 3767 + ret++; 3768 + btrfs_dev_replace_unlock(&fs_info->dev_replace); 3769 + 4008 3770 return ret; 4009 3771 } 4010 3772 4011 - static int find_live_mirror(struct map_lookup *map, int first, int num, 4012 - int optimal) 3773 + static int find_live_mirror(struct btrfs_fs_info *fs_info, 3774 + struct map_lookup *map, int first, int num, 3775 + int optimal, int dev_replace_is_ongoing) 4013 3776 { 4014 3777 int i; 4015 - if (map->stripes[optimal].dev->bdev) 4016 - return optimal; 4017 - for (i = first; i < first + num; i++) { 4018 - if (map->stripes[i].dev->bdev) 4019 - return i; 3778 + int tolerance; 3779 + struct btrfs_device *srcdev; 3780 + 3781 + if (dev_replace_is_ongoing && 3782 + fs_info->dev_replace.cont_reading_from_srcdev_mode == 3783 + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 3784 + srcdev = fs_info->dev_replace.srcdev; 3785 + else 3786 + srcdev = NULL; 3787 + 3788 + /* 3789 + * try to avoid the drive that is the source drive for a 3790 + * dev-replace procedure, only choose it if no other non-missing 3791 + * mirror is available 3792 + */ 3793 + for (tolerance = 0; tolerance < 2; tolerance++) { 3794 + if (map->stripes[optimal].dev->bdev && 3795 + (tolerance || map->stripes[optimal].dev != srcdev)) 3796 + return optimal; 3797 + for (i = first; i < first + num; i++) { 3798 + if 
(map->stripes[i].dev->bdev && 3799 + (tolerance || map->stripes[i].dev != srcdev)) 3800 + return i; 3801 + } 4020 3802 } 3803 + 4021 3804 /* we couldn't find one that doesn't fail. Just return something 4022 3805 * and the io error handling code will clean up eventually 4023 3806 */ 4024 3807 return optimal; 4025 3808 } 4026 3809 4027 - static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3810 + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4028 3811 u64 logical, u64 *length, 4029 3812 struct btrfs_bio **bbio_ret, 4030 3813 int mirror_num) 4031 3814 { 4032 3815 struct extent_map *em; 4033 3816 struct map_lookup *map; 3817 + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4034 3818 struct extent_map_tree *em_tree = &map_tree->map_tree; 4035 3819 u64 offset; 4036 3820 u64 stripe_offset; ··· 4072 3800 int num_stripes; 4073 3801 int max_errors = 0; 4074 3802 struct btrfs_bio *bbio = NULL; 3803 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 3804 + int dev_replace_is_ongoing = 0; 3805 + int num_alloc_stripes; 3806 + int patch_the_first_stripe_for_dev_replace = 0; 3807 + u64 physical_to_patch_in_first_stripe = 0; 4075 3808 4076 3809 read_lock(&em_tree->lock); 4077 3810 em = lookup_extent_mapping(em_tree, logical, *length); ··· 4092 3815 BUG_ON(em->start > logical || em->start + em->len < logical); 4093 3816 map = (struct map_lookup *)em->bdev; 4094 3817 offset = logical - em->start; 4095 - 4096 - if (mirror_num > map->num_stripes) 4097 - mirror_num = 0; 4098 3818 4099 3819 stripe_nr = offset; 4100 3820 /* ··· 4119 3845 if (!bbio_ret) 4120 3846 goto out; 4121 3847 3848 + btrfs_dev_replace_lock(dev_replace); 3849 + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 3850 + if (!dev_replace_is_ongoing) 3851 + btrfs_dev_replace_unlock(dev_replace); 3852 + 3853 + if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 3854 + !(rw & (REQ_WRITE | REQ_DISCARD | 
REQ_GET_READ_MIRRORS)) && 3855 + dev_replace->tgtdev != NULL) { 3856 + /* 3857 + * in dev-replace case, for repair case (that's the only 3858 + * case where the mirror is selected explicitly when 3859 + * calling btrfs_map_block), blocks left of the left cursor 3860 + * can also be read from the target drive. 3861 + * For REQ_GET_READ_MIRRORS, the target drive is added as 3862 + * the last one to the array of stripes. For READ, it also 3863 + * needs to be supported using the same mirror number. 3864 + * If the requested block is not left of the left cursor, 3865 + * EIO is returned. This can happen because btrfs_num_copies() 3866 + * returns one more in the dev-replace case. 3867 + */ 3868 + u64 tmp_length = *length; 3869 + struct btrfs_bio *tmp_bbio = NULL; 3870 + int tmp_num_stripes; 3871 + u64 srcdev_devid = dev_replace->srcdev->devid; 3872 + int index_srcdev = 0; 3873 + int found = 0; 3874 + u64 physical_of_found = 0; 3875 + 3876 + ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 3877 + logical, &tmp_length, &tmp_bbio, 0); 3878 + if (ret) { 3879 + WARN_ON(tmp_bbio != NULL); 3880 + goto out; 3881 + } 3882 + 3883 + tmp_num_stripes = tmp_bbio->num_stripes; 3884 + if (mirror_num > tmp_num_stripes) { 3885 + /* 3886 + * REQ_GET_READ_MIRRORS does not contain this 3887 + * mirror, that means that the requested area 3888 + * is not left of the left cursor 3889 + */ 3890 + ret = -EIO; 3891 + kfree(tmp_bbio); 3892 + goto out; 3893 + } 3894 + 3895 + /* 3896 + * process the rest of the function using the mirror_num 3897 + * of the source drive. Therefore look it up first. 3898 + * At the end, patch the device pointer to the one of the 3899 + * target drive. 
3900 + */ 3901 + for (i = 0; i < tmp_num_stripes; i++) { 3902 + if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { 3903 + /* 3904 + * In case of DUP, in order to keep it 3905 + * simple, only add the mirror with the 3906 + * lowest physical address 3907 + */ 3908 + if (found && 3909 + physical_of_found <= 3910 + tmp_bbio->stripes[i].physical) 3911 + continue; 3912 + index_srcdev = i; 3913 + found = 1; 3914 + physical_of_found = 3915 + tmp_bbio->stripes[i].physical; 3916 + } 3917 + } 3918 + 3919 + if (found) { 3920 + mirror_num = index_srcdev + 1; 3921 + patch_the_first_stripe_for_dev_replace = 1; 3922 + physical_to_patch_in_first_stripe = physical_of_found; 3923 + } else { 3924 + WARN_ON(1); 3925 + ret = -EIO; 3926 + kfree(tmp_bbio); 3927 + goto out; 3928 + } 3929 + 3930 + kfree(tmp_bbio); 3931 + } else if (mirror_num > map->num_stripes) { 3932 + mirror_num = 0; 3933 + } 3934 + 4122 3935 num_stripes = 1; 4123 3936 stripe_index = 0; 4124 3937 stripe_nr_orig = stripe_nr; ··· 4220 3859 stripe_nr_end - stripe_nr_orig); 4221 3860 stripe_index = do_div(stripe_nr, map->num_stripes); 4222 3861 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4223 - if (rw & (REQ_WRITE | REQ_DISCARD)) 3862 + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 4224 3863 num_stripes = map->num_stripes; 4225 3864 else if (mirror_num) 4226 3865 stripe_index = mirror_num - 1; 4227 3866 else { 4228 - stripe_index = find_live_mirror(map, 0, 3867 + stripe_index = find_live_mirror(fs_info, map, 0, 4229 3868 map->num_stripes, 4230 - current->pid % map->num_stripes); 3869 + current->pid % map->num_stripes, 3870 + dev_replace_is_ongoing); 4231 3871 mirror_num = stripe_index + 1; 4232 3872 } 4233 3873 4234 3874 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4235 - if (rw & (REQ_WRITE | REQ_DISCARD)) { 3875 + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { 4236 3876 num_stripes = map->num_stripes; 4237 3877 } else if (mirror_num) { 4238 3878 stripe_index = mirror_num - 1; ··· 
4247 3885 stripe_index = do_div(stripe_nr, factor); 4248 3886 stripe_index *= map->sub_stripes; 4249 3887 4250 - if (rw & REQ_WRITE) 3888 + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 4251 3889 num_stripes = map->sub_stripes; 4252 3890 else if (rw & REQ_DISCARD) 4253 3891 num_stripes = min_t(u64, map->sub_stripes * ··· 4257 3895 stripe_index += mirror_num - 1; 4258 3896 else { 4259 3897 int old_stripe_index = stripe_index; 4260 - stripe_index = find_live_mirror(map, stripe_index, 3898 + stripe_index = find_live_mirror(fs_info, map, 3899 + stripe_index, 4261 3900 map->sub_stripes, stripe_index + 4262 - current->pid % map->sub_stripes); 3901 + current->pid % map->sub_stripes, 3902 + dev_replace_is_ongoing); 4263 3903 mirror_num = stripe_index - old_stripe_index + 1; 4264 3904 } 4265 3905 } else { ··· 4275 3911 } 4276 3912 BUG_ON(stripe_index >= map->num_stripes); 4277 3913 4278 - bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 3914 + num_alloc_stripes = num_stripes; 3915 + if (dev_replace_is_ongoing) { 3916 + if (rw & (REQ_WRITE | REQ_DISCARD)) 3917 + num_alloc_stripes <<= 1; 3918 + if (rw & REQ_GET_READ_MIRRORS) 3919 + num_alloc_stripes++; 3920 + } 3921 + bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); 4279 3922 if (!bbio) { 4280 3923 ret = -ENOMEM; 4281 3924 goto out; ··· 4369 3998 } 4370 3999 } 4371 4000 4372 - if (rw & REQ_WRITE) { 4001 + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4373 4002 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4374 4003 BTRFS_BLOCK_GROUP_RAID10 | 4375 4004 BTRFS_BLOCK_GROUP_DUP)) { ··· 4377 4006 } 4378 4007 } 4379 4008 4009 + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 4010 + dev_replace->tgtdev != NULL) { 4011 + int index_where_to_add; 4012 + u64 srcdev_devid = dev_replace->srcdev->devid; 4013 + 4014 + /* 4015 + * duplicate the write operations while the dev replace 4016 + * procedure is running. 
Since the copying of the old disk 4017 + * to the new disk takes place at run time while the 4018 + * filesystem is mounted writable, the regular write 4019 + * operations to the old disk have to be duplicated to go 4020 + * to the new disk as well. 4021 + * Note that device->missing is handled by the caller, and 4022 + * that the write to the old disk is already set up in the 4023 + * stripes array. 4024 + */ 4025 + index_where_to_add = num_stripes; 4026 + for (i = 0; i < num_stripes; i++) { 4027 + if (bbio->stripes[i].dev->devid == srcdev_devid) { 4028 + /* write to new disk, too */ 4029 + struct btrfs_bio_stripe *new = 4030 + bbio->stripes + index_where_to_add; 4031 + struct btrfs_bio_stripe *old = 4032 + bbio->stripes + i; 4033 + 4034 + new->physical = old->physical; 4035 + new->length = old->length; 4036 + new->dev = dev_replace->tgtdev; 4037 + index_where_to_add++; 4038 + max_errors++; 4039 + } 4040 + } 4041 + num_stripes = index_where_to_add; 4042 + } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && 4043 + dev_replace->tgtdev != NULL) { 4044 + u64 srcdev_devid = dev_replace->srcdev->devid; 4045 + int index_srcdev = 0; 4046 + int found = 0; 4047 + u64 physical_of_found = 0; 4048 + 4049 + /* 4050 + * During the dev-replace procedure, the target drive can 4051 + * also be used to read data in case it is needed to repair 4052 + * a corrupt block elsewhere. This is possible if the 4053 + * requested area is left of the left cursor. In this area, 4054 + * the target drive is a full copy of the source drive. 
4055 + */ 4056 + for (i = 0; i < num_stripes; i++) { 4057 + if (bbio->stripes[i].dev->devid == srcdev_devid) { 4058 + /* 4059 + * In case of DUP, in order to keep it 4060 + * simple, only add the mirror with the 4061 + * lowest physical address 4062 + */ 4063 + if (found && 4064 + physical_of_found <= 4065 + bbio->stripes[i].physical) 4066 + continue; 4067 + index_srcdev = i; 4068 + found = 1; 4069 + physical_of_found = bbio->stripes[i].physical; 4070 + } 4071 + } 4072 + if (found) { 4073 + u64 length = map->stripe_len; 4074 + 4075 + if (physical_of_found + length <= 4076 + dev_replace->cursor_left) { 4077 + struct btrfs_bio_stripe *tgtdev_stripe = 4078 + bbio->stripes + num_stripes; 4079 + 4080 + tgtdev_stripe->physical = physical_of_found; 4081 + tgtdev_stripe->length = 4082 + bbio->stripes[index_srcdev].length; 4083 + tgtdev_stripe->dev = dev_replace->tgtdev; 4084 + 4085 + num_stripes++; 4086 + } 4087 + } 4088 + } 4089 + 4380 4090 *bbio_ret = bbio; 4381 4091 bbio->num_stripes = num_stripes; 4382 4092 bbio->max_errors = max_errors; 4383 4093 bbio->mirror_num = mirror_num; 4094 + 4095 + /* 4096 + * this is the case that REQ_READ && dev_replace_is_ongoing && 4097 + * mirror_num == num_stripes + 1 && dev_replace target drive is 4098 + * available as a mirror 4099 + */ 4100 + if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 4101 + WARN_ON(num_stripes > 1); 4102 + bbio->stripes[0].dev = dev_replace->tgtdev; 4103 + bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4104 + bbio->mirror_num = map->num_stripes + 1; 4105 + } 4384 4106 out: 4107 + if (dev_replace_is_ongoing) 4108 + btrfs_dev_replace_unlock(dev_replace); 4385 4109 free_extent_map(em); 4386 4110 return ret; 4387 4111 } 4388 4112 4389 - int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4113 + int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4390 4114 u64 logical, u64 *length, 4391 4115 struct btrfs_bio **bbio_ret, int mirror_num) 4392 4116 { 4393 - return 
__btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 4117 + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4394 4118 mirror_num); 4395 4119 } 4396 4120 ··· 4704 4238 &device->work); 4705 4239 } 4706 4240 4241 + static int bio_size_ok(struct block_device *bdev, struct bio *bio, 4242 + sector_t sector) 4243 + { 4244 + struct bio_vec *prev; 4245 + struct request_queue *q = bdev_get_queue(bdev); 4246 + unsigned short max_sectors = queue_max_sectors(q); 4247 + struct bvec_merge_data bvm = { 4248 + .bi_bdev = bdev, 4249 + .bi_sector = sector, 4250 + .bi_rw = bio->bi_rw, 4251 + }; 4252 + 4253 + if (bio->bi_vcnt == 0) { 4254 + WARN_ON(1); 4255 + return 1; 4256 + } 4257 + 4258 + prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 4259 + if ((bio->bi_size >> 9) > max_sectors) 4260 + return 0; 4261 + 4262 + if (!q->merge_bvec_fn) 4263 + return 1; 4264 + 4265 + bvm.bi_size = bio->bi_size - prev->bv_len; 4266 + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) 4267 + return 0; 4268 + return 1; 4269 + } 4270 + 4271 + static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 4272 + struct bio *bio, u64 physical, int dev_nr, 4273 + int rw, int async) 4274 + { 4275 + struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 4276 + 4277 + bio->bi_private = bbio; 4278 + bio->bi_private = merge_stripe_index_into_bio_private( 4279 + bio->bi_private, (unsigned int)dev_nr); 4280 + bio->bi_end_io = btrfs_end_bio; 4281 + bio->bi_sector = physical >> 9; 4282 + #ifdef DEBUG 4283 + { 4284 + struct rcu_string *name; 4285 + 4286 + rcu_read_lock(); 4287 + name = rcu_dereference(dev->name); 4288 + pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 4289 + "(%s id %llu), size=%u\n", rw, 4290 + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 4291 + name->str, dev->devid, bio->bi_size); 4292 + rcu_read_unlock(); 4293 + } 4294 + #endif 4295 + bio->bi_bdev = dev->bdev; 4296 + if (async) 4297 + schedule_bio(root, dev, rw, bio); 4298 + else 4299 + 
btrfsic_submit_bio(rw, bio); 4300 + } 4301 + 4302 + static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 4303 + struct bio *first_bio, struct btrfs_device *dev, 4304 + int dev_nr, int rw, int async) 4305 + { 4306 + struct bio_vec *bvec = first_bio->bi_io_vec; 4307 + struct bio *bio; 4308 + int nr_vecs = bio_get_nr_vecs(dev->bdev); 4309 + u64 physical = bbio->stripes[dev_nr].physical; 4310 + 4311 + again: 4312 + bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); 4313 + if (!bio) 4314 + return -ENOMEM; 4315 + 4316 + while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { 4317 + if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, 4318 + bvec->bv_offset) < bvec->bv_len) { 4319 + u64 len = bio->bi_size; 4320 + 4321 + atomic_inc(&bbio->stripes_pending); 4322 + submit_stripe_bio(root, bbio, bio, physical, dev_nr, 4323 + rw, async); 4324 + physical += len; 4325 + goto again; 4326 + } 4327 + bvec++; 4328 + } 4329 + 4330 + submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); 4331 + return 0; 4332 + } 4333 + 4334 + static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 4335 + { 4336 + atomic_inc(&bbio->error); 4337 + if (atomic_dec_and_test(&bbio->stripes_pending)) { 4338 + bio->bi_private = bbio->private; 4339 + bio->bi_end_io = bbio->end_io; 4340 + bio->bi_bdev = (struct block_device *) 4341 + (unsigned long)bbio->mirror_num; 4342 + bio->bi_sector = logical >> 9; 4343 + kfree(bbio); 4344 + bio_endio(bio, -EIO); 4345 + } 4346 + } 4347 + 4707 4348 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4708 4349 int mirror_num, int async_submit) 4709 4350 { 4710 - struct btrfs_mapping_tree *map_tree; 4711 4351 struct btrfs_device *dev; 4712 4352 struct bio *first_bio = bio; 4713 4353 u64 logical = (u64)bio->bi_sector << 9; ··· 4825 4253 struct btrfs_bio *bbio = NULL; 4826 4254 4827 4255 length = bio->bi_size; 4828 - map_tree = &root->fs_info->mapping_tree; 4829 4256 map_length = 
length; 4830 4257 4831 - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4258 + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 4832 4259 mirror_num); 4833 - if (ret) /* -ENOMEM */ 4260 + if (ret) 4834 4261 return ret; 4835 4262 4836 4263 total_devs = bbio->num_stripes; ··· 4847 4276 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4848 4277 4849 4278 while (dev_nr < total_devs) { 4279 + dev = bbio->stripes[dev_nr].dev; 4280 + if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 4281 + bbio_error(bbio, first_bio, logical); 4282 + dev_nr++; 4283 + continue; 4284 + } 4285 + 4286 + /* 4287 + * Check and see if we're ok with this bio based on it's size 4288 + * and offset with the given device. 4289 + */ 4290 + if (!bio_size_ok(dev->bdev, first_bio, 4291 + bbio->stripes[dev_nr].physical >> 9)) { 4292 + ret = breakup_stripe_bio(root, bbio, first_bio, dev, 4293 + dev_nr, rw, async_submit); 4294 + BUG_ON(ret); 4295 + dev_nr++; 4296 + continue; 4297 + } 4298 + 4850 4299 if (dev_nr < total_devs - 1) { 4851 4300 bio = bio_clone(first_bio, GFP_NOFS); 4852 4301 BUG_ON(!bio); /* -ENOMEM */ 4853 4302 } else { 4854 4303 bio = first_bio; 4855 4304 } 4856 - bio->bi_private = bbio; 4857 - bio->bi_private = merge_stripe_index_into_bio_private( 4858 - bio->bi_private, (unsigned int)dev_nr); 4859 - bio->bi_end_io = btrfs_end_bio; 4860 - bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; 4861 - dev = bbio->stripes[dev_nr].dev; 4862 - if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 4863 - #ifdef DEBUG 4864 - struct rcu_string *name; 4865 4305 4866 - rcu_read_lock(); 4867 - name = rcu_dereference(dev->name); 4868 - pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 4869 - "(%s id %llu), size=%u\n", rw, 4870 - (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 4871 - name->str, dev->devid, bio->bi_size); 4872 - rcu_read_unlock(); 4873 - #endif 4874 - bio->bi_bdev = dev->bdev; 4875 - if (async_submit) 4876 - 
schedule_bio(root, dev, rw, bio); 4877 - else 4878 - btrfsic_submit_bio(rw, bio); 4879 - } else { 4880 - bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 4881 - bio->bi_sector = logical >> 9; 4882 - bio_endio(bio, -EIO); 4883 - } 4306 + submit_stripe_bio(root, bbio, bio, 4307 + bbio->stripes[dev_nr].physical, dev_nr, rw, 4308 + async_submit); 4884 4309 dev_nr++; 4885 4310 } 4886 4311 return 0; 4887 4312 } 4888 4313 4889 - struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4314 + struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 4890 4315 u8 *uuid, u8 *fsid) 4891 4316 { 4892 4317 struct btrfs_device *device; 4893 4318 struct btrfs_fs_devices *cur_devices; 4894 4319 4895 - cur_devices = root->fs_info->fs_devices; 4320 + cur_devices = fs_info->fs_devices; 4896 4321 while (cur_devices) { 4897 4322 if (!fsid || 4898 4323 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { ··· 4969 4402 em->bdev = (struct block_device *)map; 4970 4403 em->start = logical; 4971 4404 em->len = length; 4405 + em->orig_start = 0; 4972 4406 em->block_start = 0; 4973 4407 em->block_len = em->len; 4974 4408 ··· 4987 4419 read_extent_buffer(leaf, uuid, (unsigned long) 4988 4420 btrfs_stripe_dev_uuid_nr(chunk, i), 4989 4421 BTRFS_UUID_SIZE); 4990 - map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 4991 - NULL); 4422 + map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, 4423 + uuid, NULL); 4992 4424 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 4993 4425 kfree(map); 4994 4426 free_extent_map(em); ··· 5029 4461 device->io_align = btrfs_device_io_align(leaf, dev_item); 5030 4462 device->io_width = btrfs_device_io_width(leaf, dev_item); 5031 4463 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 4464 + WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 4465 + device->is_tgtdev_for_dev_replace = 0; 5032 4466 5033 4467 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5034 4468 
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ··· 5108 4538 return ret; 5109 4539 } 5110 4540 5111 - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 4541 + device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 5112 4542 if (!device || !device->bdev) { 5113 4543 if (!btrfs_test_opt(root, DEGRADED)) 5114 4544 return -EIO; ··· 5141 4571 fill_device_from_item(leaf, dev_item, device); 5142 4572 device->dev_root = root->fs_info->dev_root; 5143 4573 device->in_fs_metadata = 1; 5144 - if (device->writeable) { 4574 + if (device->writeable && !device->is_tgtdev_for_dev_replace) { 5145 4575 device->fs_devices->total_rw_bytes += device->total_bytes; 5146 4576 spin_lock(&root->fs_info->free_chunk_lock); 5147 4577 root->fs_info->free_chunk_space += device->total_bytes - ··· 5500 4930 int i; 5501 4931 5502 4932 mutex_lock(&fs_devices->device_list_mutex); 5503 - dev = btrfs_find_device(root, stats->devid, NULL, NULL); 4933 + dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL); 5504 4934 mutex_unlock(&fs_devices->device_list_mutex); 5505 4935 5506 4936 if (!dev) { ··· 5526 4956 } 5527 4957 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 5528 4958 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 4959 + return 0; 4960 + } 4961 + 4962 + int btrfs_scratch_superblock(struct btrfs_device *device) 4963 + { 4964 + struct buffer_head *bh; 4965 + struct btrfs_super_block *disk_super; 4966 + 4967 + bh = btrfs_read_dev_super(device->bdev); 4968 + if (!bh) 4969 + return -EINVAL; 4970 + disk_super = (struct btrfs_super_block *)bh->b_data; 4971 + 4972 + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 4973 + set_buffer_dirty(bh); 4974 + sync_dirty_buffer(bh); 4975 + brelse(bh); 4976 + 5529 4977 return 0; 5530 4978 }
+30 -5
fs/btrfs/volumes.h
··· 50 50 int in_fs_metadata; 51 51 int missing; 52 52 int can_discard; 53 + int is_tgtdev_for_dev_replace; 53 54 54 55 spinlock_t io_lock; 55 56 ··· 89 88 u8 uuid[BTRFS_UUID_SIZE]; 90 89 91 90 /* per-device scrub information */ 92 - struct scrub_dev *scrub_device; 91 + struct scrub_ctx *scrub_device; 93 92 94 93 struct btrfs_work work; 95 94 struct rcu_head rcu; ··· 180 179 u64 total_avail; 181 180 }; 182 181 182 + struct btrfs_raid_attr { 183 + int sub_stripes; /* sub_stripes info for map */ 184 + int dev_stripes; /* stripes per dev */ 185 + int devs_max; /* max devs to use */ 186 + int devs_min; /* min devs needed */ 187 + int devs_increment; /* ndevs has to be a multiple of this */ 188 + int ncopies; /* how many copies to data has */ 189 + }; 190 + 183 191 struct map_lookup { 184 192 u64 type; 185 193 int io_align; ··· 258 248 struct btrfs_device *device, 259 249 u64 chunk_tree, u64 chunk_objectid, 260 250 u64 chunk_offset, u64 start, u64 num_bytes); 261 - int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 251 + int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 262 252 u64 logical, u64 *length, 263 253 struct btrfs_bio **bbio_ret, int mirror_num); 264 254 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, ··· 277 267 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 278 268 struct btrfs_fs_devices **fs_devices_ret); 279 269 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 280 - void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); 270 + void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, 271 + struct btrfs_fs_devices *fs_devices, int step); 272 + int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 273 + char *device_path, 274 + struct btrfs_device **device); 275 + int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 276 + struct btrfs_device **device); 281 277 int btrfs_add_device(struct btrfs_trans_handle *trans, 282 278 struct 
btrfs_root *root, 283 279 struct btrfs_device *device); 284 280 int btrfs_rm_device(struct btrfs_root *root, char *device_path); 285 281 void btrfs_cleanup_fs_uuids(void); 286 - int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 282 + int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); 287 283 int btrfs_grow_device(struct btrfs_trans_handle *trans, 288 284 struct btrfs_device *device, u64 new_size); 289 - struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 285 + struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 290 286 u8 *uuid, u8 *fsid); 291 287 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 292 288 int btrfs_init_new_device(struct btrfs_root *root, char *path); 289 + int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 290 + struct btrfs_device **device_out); 293 291 int btrfs_balance(struct btrfs_balance_control *bctl, 294 292 struct btrfs_ioctl_balance_args *bargs); 295 293 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); ··· 314 296 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 315 297 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 316 298 struct btrfs_fs_info *fs_info); 299 + void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 300 + struct btrfs_device *srcdev); 301 + void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 302 + struct btrfs_device *tgtdev); 303 + void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 304 + struct btrfs_device *tgtdev); 305 + int btrfs_scratch_superblock(struct btrfs_device *device); 317 306 318 307 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 319 308 int index)
+12 -1
fs/btrfs/xattr.c
··· 122 122 */ 123 123 if (!value) 124 124 goto out; 125 + } else { 126 + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), 127 + name, name_len, 0); 128 + if (IS_ERR(di)) { 129 + ret = PTR_ERR(di); 130 + goto out; 131 + } 132 + if (!di && !value) 133 + goto out; 134 + btrfs_release_path(path); 125 135 } 126 136 127 137 again: ··· 208 198 209 199 inode_inc_iversion(inode); 210 200 inode->i_ctime = CURRENT_TIME; 201 + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 211 202 ret = btrfs_update_inode(trans, root, inode); 212 203 BUG_ON(ret); 213 204 out: ··· 276 265 277 266 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 278 267 if (verify_dir_item(root, leaf, di)) 279 - continue; 268 + goto next; 280 269 281 270 name_len = btrfs_dir_name_len(leaf, di); 282 271 total_size += name_len + 1;
+2 -1
include/trace/events/btrfs.h
··· 45 45 46 46 #define show_root_type(obj) \ 47 47 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ 48 - (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" 48 + (obj >= BTRFS_ROOT_TREE_OBJECTID && \ 49 + obj <= BTRFS_CSUM_TREE_OBJECTID)) ? __show_root_type(obj) : "-" 49 50 50 51 #define BTRFS_GROUP_FLAGS \ 51 52 { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \