Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
"There are some new user features and the usual load of invisible
enhancements or cleanups.

New features:

- extend mount options to specify zlib compression level, -o
compress=zlib:9

- v2 of ioctl "extent to inode mapping", addressing a use case where
we want to retrieve more results, even if less accurate, and do the
postprocessing in userspace, aiding defragmentation or
deduplication tools

- populate compression heuristics logic, do data sampling and try to
guess compressibility by: looking for repeated patterns, counting
unique byte values and distribution, calculating Shannon entropy;
this will need more benchmarking and possibly fine tuning, but the
base should be good enough

- enable indexing for btrfs as lower filesystem in overlayfs

- speedup page cache readahead during send on large files

Internal enhancements:

- more sanity checks of b-tree items when reading them from disk

- more EINVAL/EUCLEAN fixups, missing BLK_STS_* conversion, other
errno or error handling fixes

- remove some homegrown IO-related logic, that's been obsoleted by
core block layer changes (batching, plug/unplug, own counters)

- add ref-verify, optional debugging feature to verify extent
reference accounting

- simplify code handling outstanding extents, make it more clear
where and how the accounting is done

- make delalloc reservations per-inode, simplify the code and make
the logic more straightforward

- extensive cleanup of delayed refs code

Notable fixes:

- fix send ioctl on 32bit with 64bit kernel"

* 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (102 commits)
btrfs: Fix bug for misused dev_t when lookup in dev state hash table.
Btrfs: heuristic: add Shannon entropy calculation
Btrfs: heuristic: add byte core set calculation
Btrfs: heuristic: add byte set calculation
Btrfs: heuristic: add detection of repeated data patterns
Btrfs: heuristic: implement sampling logic
Btrfs: heuristic: add bucket and sample counters and other defines
Btrfs: compression: separate heuristic/compression workspaces
btrfs: move btrfs_truncate_block out of trans handle
btrfs: don't call btrfs_start_delalloc_roots in flushoncommit
btrfs: track refs in a rb_tree instead of a list
btrfs: add a comp_refs() helper
btrfs: switch args for comp_*_refs
btrfs: make the delalloc block rsv per inode
btrfs: add tracepoints for outstanding extents mods
Btrfs: rework outstanding_extents
btrfs: increase output size for LOGICAL_INO_V2 ioctl
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
btrfs: send: remove unused code
...

+3361 -1561
+11
fs/btrfs/Kconfig
··· 91 91 any of the assertions trip. This is meant for btrfs developers only. 92 92 93 93 If unsure, say N. 94 + 95 + config BTRFS_FS_REF_VERIFY 96 + bool "Btrfs with the ref verify tool compiled in" 97 + depends on BTRFS_FS 98 + default n 99 + help 100 + Enable run-time extent reference verification instrumentation. This 101 + is meant to be used by btrfs developers for tracking down extent 102 + reference problems or verifying they didn't break something. 103 + 104 + If unsure, say N.
+2 -1
fs/btrfs/Makefile
··· 10 10 export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ 11 11 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 12 12 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ 13 - uuid-tree.o props.o hash.o free-space-tree.o 13 + uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o 14 14 15 15 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 16 16 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 17 + btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o 17 18 18 19 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ 19 20 tests/extent-buffer-tests.o tests/btrfs-tests.o \
+1 -1
fs/btrfs/async-thread.c
··· 67 67 static void normal_work_helper(struct btrfs_work *work); 68 68 69 69 #define BTRFS_WORK_HELPER(name) \ 70 - void btrfs_##name(struct work_struct *arg) \ 70 + noinline_for_stack void btrfs_##name(struct work_struct *arg) \ 71 71 { \ 72 72 struct btrfs_work *work = container_of(arg, struct btrfs_work, \ 73 73 normal_work); \
+44 -28
fs/btrfs/backref.c
··· 40 40 const struct extent_buffer *eb, 41 41 const struct btrfs_file_extent_item *fi, 42 42 u64 extent_item_pos, 43 - struct extent_inode_elem **eie) 43 + struct extent_inode_elem **eie, 44 + bool ignore_offset) 44 45 { 45 46 u64 offset = 0; 46 47 struct extent_inode_elem *e; 47 48 48 - if (!btrfs_file_extent_compression(eb, fi) && 49 + if (!ignore_offset && 50 + !btrfs_file_extent_compression(eb, fi) && 49 51 !btrfs_file_extent_encryption(eb, fi) && 50 52 !btrfs_file_extent_other_encoding(eb, fi)) { 51 53 u64 data_offset; ··· 86 84 87 85 static int find_extent_in_eb(const struct extent_buffer *eb, 88 86 u64 wanted_disk_byte, u64 extent_item_pos, 89 - struct extent_inode_elem **eie) 87 + struct extent_inode_elem **eie, 88 + bool ignore_offset) 90 89 { 91 90 u64 disk_byte; 92 91 struct btrfs_key key; ··· 116 113 if (disk_byte != wanted_disk_byte) 117 114 continue; 118 115 119 - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); 116 + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); 120 117 if (ret < 0) 121 118 return ret; 122 119 } ··· 422 419 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, 423 420 struct ulist *parents, struct prelim_ref *ref, 424 421 int level, u64 time_seq, const u64 *extent_item_pos, 425 - u64 total_refs) 422 + u64 total_refs, bool ignore_offset) 426 423 { 427 424 int ret = 0; 428 425 int slot; ··· 475 472 if (extent_item_pos) { 476 473 ret = check_extent_in_eb(&key, eb, fi, 477 474 *extent_item_pos, 478 - &eie); 475 + &eie, ignore_offset); 479 476 if (ret < 0) 480 477 break; 481 478 } ··· 513 510 static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, 514 511 struct btrfs_path *path, u64 time_seq, 515 512 struct prelim_ref *ref, struct ulist *parents, 516 - const u64 *extent_item_pos, u64 total_refs) 513 + const u64 *extent_item_pos, u64 total_refs, 514 + bool ignore_offset) 517 515 { 518 516 struct btrfs_root *root; 519 517 struct btrfs_key root_key; ··· 585 581 } 
586 582 587 583 ret = add_all_parents(root, path, parents, ref, level, time_seq, 588 - extent_item_pos, total_refs); 584 + extent_item_pos, total_refs, ignore_offset); 589 585 out: 590 586 path->lowest_level = 0; 591 587 btrfs_release_path(path); ··· 620 616 struct btrfs_path *path, u64 time_seq, 621 617 struct preftrees *preftrees, 622 618 const u64 *extent_item_pos, u64 total_refs, 623 - struct share_check *sc) 619 + struct share_check *sc, bool ignore_offset) 624 620 { 625 621 int err; 626 622 int ret = 0; ··· 665 661 } 666 662 err = resolve_indirect_ref(fs_info, path, time_seq, ref, 667 663 parents, extent_item_pos, 668 - total_refs); 664 + total_refs, ignore_offset); 669 665 /* 670 666 * we can only tolerate ENOENT,otherwise,we should catch error 671 667 * and return directly. ··· 773 769 struct btrfs_key key; 774 770 struct btrfs_key tmp_op_key; 775 771 struct btrfs_key *op_key = NULL; 772 + struct rb_node *n; 776 773 int count; 777 774 int ret = 0; 778 775 ··· 783 778 } 784 779 785 780 spin_lock(&head->lock); 786 - list_for_each_entry(node, &head->ref_list, list) { 781 + for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { 782 + node = rb_entry(n, struct btrfs_delayed_ref_node, 783 + ref_node); 787 784 if (node->seq > seq) 788 785 continue; 789 786 ··· 1114 1107 * 1115 1108 * Otherwise this returns 0 for success and <0 for an error. 1116 1109 * 1110 + * If ignore_offset is set to false, only extent refs whose offsets match 1111 + * extent_item_pos are returned. If true, every extent ref is returned 1112 + * and extent_item_pos is ignored. 
1113 + * 1117 1114 * FIXME some caching might speed things up 1118 1115 */ 1119 1116 static int find_parent_nodes(struct btrfs_trans_handle *trans, 1120 1117 struct btrfs_fs_info *fs_info, u64 bytenr, 1121 1118 u64 time_seq, struct ulist *refs, 1122 1119 struct ulist *roots, const u64 *extent_item_pos, 1123 - struct share_check *sc) 1120 + struct share_check *sc, bool ignore_offset) 1124 1121 { 1125 1122 struct btrfs_key key; 1126 1123 struct btrfs_path *path; ··· 1189 1178 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 1190 1179 if (head) { 1191 1180 if (!mutex_trylock(&head->mutex)) { 1192 - refcount_inc(&head->node.refs); 1181 + refcount_inc(&head->refs); 1193 1182 spin_unlock(&delayed_refs->lock); 1194 1183 1195 1184 btrfs_release_path(path); ··· 1200 1189 */ 1201 1190 mutex_lock(&head->mutex); 1202 1191 mutex_unlock(&head->mutex); 1203 - btrfs_put_delayed_ref(&head->node); 1192 + btrfs_put_delayed_ref_head(head); 1204 1193 goto again; 1205 1194 } 1206 1195 spin_unlock(&delayed_refs->lock); ··· 1246 1235 WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root)); 1247 1236 1248 1237 ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, 1249 - extent_item_pos, total_refs, sc); 1238 + extent_item_pos, total_refs, sc, ignore_offset); 1250 1239 if (ret) 1251 1240 goto out; 1252 1241 ··· 1293 1282 btrfs_tree_read_lock(eb); 1294 1283 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1295 1284 ret = find_extent_in_eb(eb, bytenr, 1296 - *extent_item_pos, &eie); 1285 + *extent_item_pos, &eie, ignore_offset); 1297 1286 btrfs_tree_read_unlock_blocking(eb); 1298 1287 free_extent_buffer(eb); 1299 1288 if (ret < 0) ··· 1361 1350 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, 1362 1351 struct btrfs_fs_info *fs_info, u64 bytenr, 1363 1352 u64 time_seq, struct ulist **leafs, 1364 - const u64 *extent_item_pos) 1353 + const u64 *extent_item_pos, bool ignore_offset) 1365 1354 { 1366 1355 int ret; 1367 1356 ··· 1370 1359 return -ENOMEM; 
1371 1360 1372 1361 ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, 1373 - *leafs, NULL, extent_item_pos, NULL); 1362 + *leafs, NULL, extent_item_pos, NULL, ignore_offset); 1374 1363 if (ret < 0 && ret != -ENOENT) { 1375 1364 free_leaf_list(*leafs); 1376 1365 return ret; ··· 1394 1383 */ 1395 1384 static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, 1396 1385 struct btrfs_fs_info *fs_info, u64 bytenr, 1397 - u64 time_seq, struct ulist **roots) 1386 + u64 time_seq, struct ulist **roots, 1387 + bool ignore_offset) 1398 1388 { 1399 1389 struct ulist *tmp; 1400 1390 struct ulist_node *node = NULL; ··· 1414 1402 ULIST_ITER_INIT(&uiter); 1415 1403 while (1) { 1416 1404 ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, 1417 - tmp, *roots, NULL, NULL); 1405 + tmp, *roots, NULL, NULL, ignore_offset); 1418 1406 if (ret < 0 && ret != -ENOENT) { 1419 1407 ulist_free(tmp); 1420 1408 ulist_free(*roots); ··· 1433 1421 1434 1422 int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 1435 1423 struct btrfs_fs_info *fs_info, u64 bytenr, 1436 - u64 time_seq, struct ulist **roots) 1424 + u64 time_seq, struct ulist **roots, 1425 + bool ignore_offset) 1437 1426 { 1438 1427 int ret; 1439 1428 1440 1429 if (!trans) 1441 1430 down_read(&fs_info->commit_root_sem); 1442 1431 ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, 1443 - time_seq, roots); 1432 + time_seq, roots, ignore_offset); 1444 1433 if (!trans) 1445 1434 up_read(&fs_info->commit_root_sem); 1446 1435 return ret; ··· 1496 1483 ULIST_ITER_INIT(&uiter); 1497 1484 while (1) { 1498 1485 ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, 1499 - roots, NULL, &shared); 1486 + roots, NULL, &shared, false); 1500 1487 if (ret == BACKREF_FOUND_SHARED) { 1501 1488 /* this is the only condition under which we return 1 */ 1502 1489 ret = 1; ··· 1890 1877 int iterate_extent_inodes(struct btrfs_fs_info *fs_info, 1891 1878 u64 extent_item_objectid, u64 extent_item_pos, 1892 1879 int 
search_commit_root, 1893 - iterate_extent_inodes_t *iterate, void *ctx) 1880 + iterate_extent_inodes_t *iterate, void *ctx, 1881 + bool ignore_offset) 1894 1882 { 1895 1883 int ret; 1896 1884 struct btrfs_trans_handle *trans = NULL; ··· 1917 1903 1918 1904 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, 1919 1905 tree_mod_seq_elem.seq, &refs, 1920 - &extent_item_pos); 1906 + &extent_item_pos, ignore_offset); 1921 1907 if (ret) 1922 1908 goto out; 1923 1909 1924 1910 ULIST_ITER_INIT(&ref_uiter); 1925 1911 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { 1926 1912 ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, 1927 - tree_mod_seq_elem.seq, &roots); 1913 + tree_mod_seq_elem.seq, &roots, 1914 + ignore_offset); 1928 1915 if (ret) 1929 1916 break; 1930 1917 ULIST_ITER_INIT(&root_uiter); ··· 1958 1943 1959 1944 int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, 1960 1945 struct btrfs_path *path, 1961 - iterate_extent_inodes_t *iterate, void *ctx) 1946 + iterate_extent_inodes_t *iterate, void *ctx, 1947 + bool ignore_offset) 1962 1948 { 1963 1949 int ret; 1964 1950 u64 extent_item_pos; ··· 1977 1961 extent_item_pos = logical - found_key.objectid; 1978 1962 ret = iterate_extent_inodes(fs_info, found_key.objectid, 1979 1963 extent_item_pos, search_commit_root, 1980 - iterate, ctx); 1964 + iterate, ctx, ignore_offset); 1981 1965 1982 1966 return ret; 1983 1967 }
+5 -3
fs/btrfs/backref.h
··· 43 43 int iterate_extent_inodes(struct btrfs_fs_info *fs_info, 44 44 u64 extent_item_objectid, 45 45 u64 extent_offset, int search_commit_root, 46 - iterate_extent_inodes_t *iterate, void *ctx); 46 + iterate_extent_inodes_t *iterate, void *ctx, 47 + bool ignore_offset); 47 48 48 49 int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, 49 50 struct btrfs_path *path, 50 - iterate_extent_inodes_t *iterate, void *ctx); 51 + iterate_extent_inodes_t *iterate, void *ctx, 52 + bool ignore_offset); 51 53 52 54 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); 53 55 54 56 int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 55 57 struct btrfs_fs_info *fs_info, u64 bytenr, 56 - u64 time_seq, struct ulist **roots); 58 + u64 time_seq, struct ulist **roots, bool ignore_offset); 57 59 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, 58 60 u32 name_len, unsigned long name_off, 59 61 struct extent_buffer *eb_in, u64 parent,
+20 -9
fs/btrfs/btrfs_inode.h
··· 36 36 #define BTRFS_INODE_ORPHAN_META_RESERVED 1 37 37 #define BTRFS_INODE_DUMMY 2 38 38 #define BTRFS_INODE_IN_DEFRAG 3 39 - #define BTRFS_INODE_DELALLOC_META_RESERVED 4 40 - #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 41 - #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 42 - #define BTRFS_INODE_NEEDS_FULL_SYNC 7 43 - #define BTRFS_INODE_COPY_EVERYTHING 8 44 - #define BTRFS_INODE_IN_DELALLOC_LIST 9 45 - #define BTRFS_INODE_READDIO_NEED_LOCK 10 46 - #define BTRFS_INODE_HAS_PROPS 11 39 + #define BTRFS_INODE_HAS_ORPHAN_ITEM 4 40 + #define BTRFS_INODE_HAS_ASYNC_EXTENT 5 41 + #define BTRFS_INODE_NEEDS_FULL_SYNC 6 42 + #define BTRFS_INODE_COPY_EVERYTHING 7 43 + #define BTRFS_INODE_IN_DELALLOC_LIST 8 44 + #define BTRFS_INODE_READDIO_NEED_LOCK 9 45 + #define BTRFS_INODE_HAS_PROPS 10 47 46 48 47 /* in memory btrfs inode */ 49 48 struct btrfs_inode { ··· 175 176 * of extent items we've reserved metadata for. 176 177 */ 177 178 unsigned outstanding_extents; 178 - unsigned reserved_extents; 179 + 180 + struct btrfs_block_rsv block_rsv; 179 181 180 182 /* 181 183 * Cached values of inode properties ··· 265 265 if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID) 266 266 return true; 267 267 return false; 268 + } 269 + 270 + static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, 271 + int mod) 272 + { 273 + lockdep_assert_held(&inode->lock); 274 + inode->outstanding_extents += mod; 275 + if (btrfs_is_free_space_inode(inode)) 276 + return; 277 + trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), 278 + mod); 268 279 } 269 280 270 281 static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
+4 -4
fs/btrfs/check-integrity.c
··· 613 613 struct btrfsic_dev_state_hashtable *h) 614 614 { 615 615 const unsigned int hashval = 616 - (((unsigned int)((uintptr_t)ds->bdev)) & 616 + (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & 617 617 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); 618 618 619 619 list_add(&ds->collision_resolving_node, h->table + hashval); ··· 2803 2803 mutex_lock(&btrfsic_mutex); 2804 2804 /* since btrfsic_submit_bio() is also called before 2805 2805 * btrfsic_mount(), this might return NULL */ 2806 - dev_state = btrfsic_dev_state_lookup(bio_dev(bio)); 2806 + dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno); 2807 2807 if (NULL != dev_state && 2808 2808 (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { 2809 2809 unsigned int i = 0; ··· 2913 2913 state = kvzalloc(sizeof(*state), GFP_KERNEL); 2914 2914 if (!state) { 2915 2915 pr_info("btrfs check-integrity: allocation failed!\n"); 2916 - return -1; 2916 + return -ENOMEM; 2917 2917 } 2918 2918 2919 2919 if (!btrfsic_is_initialized) { ··· 2945 2945 if (NULL == ds) { 2946 2946 pr_info("btrfs check-integrity: kmalloc() failed!\n"); 2947 2947 mutex_unlock(&btrfsic_mutex); 2948 - return -1; 2948 + return -ENOMEM; 2949 2949 } 2950 2950 ds->bdev = device->bdev; 2951 2951 ds->state = state;
+463 -30
fs/btrfs/compression.c
··· 33 33 #include <linux/bit_spinlock.h> 34 34 #include <linux/slab.h> 35 35 #include <linux/sched/mm.h> 36 + #include <linux/sort.h> 37 + #include <linux/log2.h> 36 38 #include "ctree.h" 37 39 #include "disk-io.h" 38 40 #include "transaction.h" ··· 257 255 cb->start, 258 256 cb->start + cb->len - 1, 259 257 NULL, 260 - bio->bi_status ? 0 : 1); 258 + bio->bi_status ? 259 + BLK_STS_OK : BLK_STS_NOTSUPP); 261 260 cb->compressed_pages[0]->mapping = NULL; 262 261 263 262 end_compressed_writeback(inode, cb); ··· 709 706 return ret; 710 707 } 711 708 712 - static struct { 709 + /* 710 + * Heuristic uses systematic sampling to collect data from the input data 711 + * range, the logic can be tuned by the following constants: 712 + * 713 + * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample 714 + * @SAMPLING_INTERVAL - range from which the sampled data can be collected 715 + */ 716 + #define SAMPLING_READ_SIZE (16) 717 + #define SAMPLING_INTERVAL (256) 718 + 719 + /* 720 + * For statistical analysis of the input data we consider bytes that form a 721 + * Galois Field of 256 objects. Each object has an attribute count, ie. how 722 + * many times the object appeared in the sample. 723 + */ 724 + #define BUCKET_SIZE (256) 725 + 726 + /* 727 + * The size of the sample is based on a statistical sampling rule of thumb. 728 + * The common way is to perform sampling tests as long as the number of 729 + * elements in each cell is at least 5. 730 + * 731 + * Instead of 5, we choose 32 to obtain more accurate results. 732 + * If the data contain the maximum number of symbols, which is 256, we obtain a 733 + * sample size bound by 8192. 734 + * 735 + * For a sample of at most 8KB of data per data range: 16 consecutive bytes 736 + * from up to 512 locations. 
737 + */ 738 + #define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ 739 + SAMPLING_READ_SIZE / SAMPLING_INTERVAL) 740 + 741 + struct bucket_item { 742 + u32 count; 743 + }; 744 + 745 + struct heuristic_ws { 746 + /* Partial copy of input data */ 747 + u8 *sample; 748 + u32 sample_size; 749 + /* Buckets store counters for each byte value */ 750 + struct bucket_item *bucket; 751 + struct list_head list; 752 + }; 753 + 754 + static void free_heuristic_ws(struct list_head *ws) 755 + { 756 + struct heuristic_ws *workspace; 757 + 758 + workspace = list_entry(ws, struct heuristic_ws, list); 759 + 760 + kvfree(workspace->sample); 761 + kfree(workspace->bucket); 762 + kfree(workspace); 763 + } 764 + 765 + static struct list_head *alloc_heuristic_ws(void) 766 + { 767 + struct heuristic_ws *ws; 768 + 769 + ws = kzalloc(sizeof(*ws), GFP_KERNEL); 770 + if (!ws) 771 + return ERR_PTR(-ENOMEM); 772 + 773 + ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); 774 + if (!ws->sample) 775 + goto fail; 776 + 777 + ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); 778 + if (!ws->bucket) 779 + goto fail; 780 + 781 + INIT_LIST_HEAD(&ws->list); 782 + return &ws->list; 783 + fail: 784 + free_heuristic_ws(&ws->list); 785 + return ERR_PTR(-ENOMEM); 786 + } 787 + 788 + struct workspaces_list { 713 789 struct list_head idle_ws; 714 790 spinlock_t ws_lock; 715 791 /* Number of free workspaces */ ··· 797 715 atomic_t total_ws; 798 716 /* Waiters for a free workspace */ 799 717 wait_queue_head_t ws_wait; 800 - } btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; 718 + }; 719 + 720 + static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; 721 + 722 + static struct workspaces_list btrfs_heuristic_ws; 801 723 802 724 static const struct btrfs_compress_op * const btrfs_compress_op[] = { 803 725 &btrfs_zlib_compress, ··· 811 725 812 726 void __init btrfs_init_compress(void) 813 727 { 728 + struct list_head *workspace; 814 729 int i; 815 730 816 - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) 
{ 817 - struct list_head *workspace; 731 + INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws); 732 + spin_lock_init(&btrfs_heuristic_ws.ws_lock); 733 + atomic_set(&btrfs_heuristic_ws.total_ws, 0); 734 + init_waitqueue_head(&btrfs_heuristic_ws.ws_wait); 818 735 736 + workspace = alloc_heuristic_ws(); 737 + if (IS_ERR(workspace)) { 738 + pr_warn( 739 + "BTRFS: cannot preallocate heuristic workspace, will try later\n"); 740 + } else { 741 + atomic_set(&btrfs_heuristic_ws.total_ws, 1); 742 + btrfs_heuristic_ws.free_ws = 1; 743 + list_add(workspace, &btrfs_heuristic_ws.idle_ws); 744 + } 745 + 746 + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { 819 747 INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); 820 748 spin_lock_init(&btrfs_comp_ws[i].ws_lock); 821 749 atomic_set(&btrfs_comp_ws[i].total_ws, 0); ··· 856 756 * Preallocation makes a forward progress guarantees and we do not return 857 757 * errors. 858 758 */ 859 - static struct list_head *find_workspace(int type) 759 + static struct list_head *__find_workspace(int type, bool heuristic) 860 760 { 861 761 struct list_head *workspace; 862 762 int cpus = num_online_cpus(); 863 763 int idx = type - 1; 864 764 unsigned nofs_flag; 765 + struct list_head *idle_ws; 766 + spinlock_t *ws_lock; 767 + atomic_t *total_ws; 768 + wait_queue_head_t *ws_wait; 769 + int *free_ws; 865 770 866 - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; 867 - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; 868 - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; 869 - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; 870 - int *free_ws = &btrfs_comp_ws[idx].free_ws; 771 + if (heuristic) { 772 + idle_ws = &btrfs_heuristic_ws.idle_ws; 773 + ws_lock = &btrfs_heuristic_ws.ws_lock; 774 + total_ws = &btrfs_heuristic_ws.total_ws; 775 + ws_wait = &btrfs_heuristic_ws.ws_wait; 776 + free_ws = &btrfs_heuristic_ws.free_ws; 777 + } else { 778 + idle_ws = &btrfs_comp_ws[idx].idle_ws; 779 + ws_lock = &btrfs_comp_ws[idx].ws_lock; 780 + total_ws = 
&btrfs_comp_ws[idx].total_ws; 781 + ws_wait = &btrfs_comp_ws[idx].ws_wait; 782 + free_ws = &btrfs_comp_ws[idx].free_ws; 783 + } 784 + 871 785 again: 872 786 spin_lock(ws_lock); 873 787 if (!list_empty(idle_ws)) { ··· 911 797 * context of btrfs_compress_bio/btrfs_compress_pages 912 798 */ 913 799 nofs_flag = memalloc_nofs_save(); 914 - workspace = btrfs_compress_op[idx]->alloc_workspace(); 800 + if (heuristic) 801 + workspace = alloc_heuristic_ws(); 802 + else 803 + workspace = btrfs_compress_op[idx]->alloc_workspace(); 915 804 memalloc_nofs_restore(nofs_flag); 916 805 917 806 if (IS_ERR(workspace)) { ··· 945 828 return workspace; 946 829 } 947 830 831 + static struct list_head *find_workspace(int type) 832 + { 833 + return __find_workspace(type, false); 834 + } 835 + 948 836 /* 949 837 * put a workspace struct back on the list or free it if we have enough 950 838 * idle ones sitting around 951 839 */ 952 - static void free_workspace(int type, struct list_head *workspace) 840 + static void __free_workspace(int type, struct list_head *workspace, 841 + bool heuristic) 953 842 { 954 843 int idx = type - 1; 955 - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; 956 - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; 957 - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; 958 - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; 959 - int *free_ws = &btrfs_comp_ws[idx].free_ws; 844 + struct list_head *idle_ws; 845 + spinlock_t *ws_lock; 846 + atomic_t *total_ws; 847 + wait_queue_head_t *ws_wait; 848 + int *free_ws; 849 + 850 + if (heuristic) { 851 + idle_ws = &btrfs_heuristic_ws.idle_ws; 852 + ws_lock = &btrfs_heuristic_ws.ws_lock; 853 + total_ws = &btrfs_heuristic_ws.total_ws; 854 + ws_wait = &btrfs_heuristic_ws.ws_wait; 855 + free_ws = &btrfs_heuristic_ws.free_ws; 856 + } else { 857 + idle_ws = &btrfs_comp_ws[idx].idle_ws; 858 + ws_lock = &btrfs_comp_ws[idx].ws_lock; 859 + total_ws = &btrfs_comp_ws[idx].total_ws; 860 + ws_wait = 
&btrfs_comp_ws[idx].ws_wait; 861 + free_ws = &btrfs_comp_ws[idx].free_ws; 862 + } 960 863 961 864 spin_lock(ws_lock); 962 865 if (*free_ws <= num_online_cpus()) { ··· 987 850 } 988 851 spin_unlock(ws_lock); 989 852 990 - btrfs_compress_op[idx]->free_workspace(workspace); 853 + if (heuristic) 854 + free_heuristic_ws(workspace); 855 + else 856 + btrfs_compress_op[idx]->free_workspace(workspace); 991 857 atomic_dec(total_ws); 992 858 wake: 993 859 /* ··· 1001 861 wake_up(ws_wait); 1002 862 } 1003 863 864 + static void free_workspace(int type, struct list_head *ws) 865 + { 866 + return __free_workspace(type, ws, false); 867 + } 868 + 1004 869 /* 1005 870 * cleanup function for module exit 1006 871 */ ··· 1013 868 { 1014 869 struct list_head *workspace; 1015 870 int i; 871 + 872 + while (!list_empty(&btrfs_heuristic_ws.idle_ws)) { 873 + workspace = btrfs_heuristic_ws.idle_ws.next; 874 + list_del(workspace); 875 + free_heuristic_ws(workspace); 876 + atomic_dec(&btrfs_heuristic_ws.total_ws); 877 + } 1016 878 1017 879 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { 1018 880 while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { ··· 1035 883 * Given an address space and start and length, compress the bytes into @pages 1036 884 * that are allocated on demand. 
1037 885 * 886 + * @type_level is encoded algorithm and level, where level 0 means whatever 887 + * default the algorithm chooses and is opaque here; 888 + * - compression algo are 0-3 889 + * - the level are bits 4-7 890 + * 1038 891 * @out_pages is an in/out parameter, holds maximum number of pages to allocate 1039 892 * and returns number of actually allocated pages 1040 893 * ··· 1054 897 * @max_out tells us the max number of bytes that we're allowed to 1055 898 * stuff into pages 1056 899 */ 1057 - int btrfs_compress_pages(int type, struct address_space *mapping, 900 + int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, 1058 901 u64 start, struct page **pages, 1059 902 unsigned long *out_pages, 1060 903 unsigned long *total_in, ··· 1062 905 { 1063 906 struct list_head *workspace; 1064 907 int ret; 908 + int type = type_level & 0xF; 1065 909 1066 910 workspace = find_workspace(type); 1067 911 912 + btrfs_compress_op[type - 1]->set_level(workspace, type_level); 1068 913 ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, 1069 914 start, pages, 1070 915 out_pages, ··· 1225 1066 } 1226 1067 1227 1068 /* 1069 + * Shannon Entropy calculation 1070 + * 1071 + * Pure byte distribution analysis fails to determine compressiability of data. 1072 + * Try calculating entropy to estimate the average minimum number of bits 1073 + * needed to encode the sampled data. 1074 + * 1075 + * For convenience, return the percentage of needed bits, instead of amount of 1076 + * bits directly. 1077 + * 1078 + * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy 1079 + * and can be compressible with high probability 1080 + * 1081 + * @ENTROPY_LVL_HIGH - data are not compressible with high probability 1082 + * 1083 + * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate. 
1084 + */ 1085 + #define ENTROPY_LVL_ACEPTABLE (65) 1086 + #define ENTROPY_LVL_HIGH (80) 1087 + 1088 + /* 1089 + * For increasead precision in shannon_entropy calculation, 1090 + * let's do pow(n, M) to save more digits after comma: 1091 + * 1092 + * - maximum int bit length is 64 1093 + * - ilog2(MAX_SAMPLE_SIZE) -> 13 1094 + * - 13 * 4 = 52 < 64 -> M = 4 1095 + * 1096 + * So use pow(n, 4). 1097 + */ 1098 + static inline u32 ilog2_w(u64 n) 1099 + { 1100 + return ilog2(n * n * n * n); 1101 + } 1102 + 1103 + static u32 shannon_entropy(struct heuristic_ws *ws) 1104 + { 1105 + const u32 entropy_max = 8 * ilog2_w(2); 1106 + u32 entropy_sum = 0; 1107 + u32 p, p_base, sz_base; 1108 + u32 i; 1109 + 1110 + sz_base = ilog2_w(ws->sample_size); 1111 + for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) { 1112 + p = ws->bucket[i].count; 1113 + p_base = ilog2_w(p); 1114 + entropy_sum += p * (sz_base - p_base); 1115 + } 1116 + 1117 + entropy_sum /= ws->sample_size; 1118 + return entropy_sum * 100 / entropy_max; 1119 + } 1120 + 1121 + /* Compare buckets by size, ascending */ 1122 + static int bucket_comp_rev(const void *lv, const void *rv) 1123 + { 1124 + const struct bucket_item *l = (const struct bucket_item *)lv; 1125 + const struct bucket_item *r = (const struct bucket_item *)rv; 1126 + 1127 + return r->count - l->count; 1128 + } 1129 + 1130 + /* 1131 + * Size of the core byte set - how many bytes cover 90% of the sample 1132 + * 1133 + * There are several types of structured binary data that use nearly all byte 1134 + * values. The distribution can be uniform and counts in all buckets will be 1135 + * nearly the same (eg. encrypted data). Unlikely to be compressible. 1136 + * 1137 + * Other possibility is normal (Gaussian) distribution, where the data could 1138 + * be potentially compressible, but we have to take a few more steps to decide 1139 + * how much. 
1140 + * 1141 + * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently, 1142 + * compression algo can easy fix that 1143 + * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high 1144 + * probability is not compressible 1145 + */ 1146 + #define BYTE_CORE_SET_LOW (64) 1147 + #define BYTE_CORE_SET_HIGH (200) 1148 + 1149 + static int byte_core_set_size(struct heuristic_ws *ws) 1150 + { 1151 + u32 i; 1152 + u32 coreset_sum = 0; 1153 + const u32 core_set_threshold = ws->sample_size * 90 / 100; 1154 + struct bucket_item *bucket = ws->bucket; 1155 + 1156 + /* Sort in reverse order */ 1157 + sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL); 1158 + 1159 + for (i = 0; i < BYTE_CORE_SET_LOW; i++) 1160 + coreset_sum += bucket[i].count; 1161 + 1162 + if (coreset_sum > core_set_threshold) 1163 + return i; 1164 + 1165 + for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { 1166 + coreset_sum += bucket[i].count; 1167 + if (coreset_sum > core_set_threshold) 1168 + break; 1169 + } 1170 + 1171 + return i; 1172 + } 1173 + 1174 + /* 1175 + * Count byte values in buckets. 1176 + * This heuristic can detect textual data (configs, xml, json, html, etc). 1177 + * Because in most text-like data byte set is restricted to limited number of 1178 + * possible characters, and that restriction in most cases makes data easy to 1179 + * compress. 1180 + * 1181 + * @BYTE_SET_THRESHOLD - consider all data within this byte set size: 1182 + * less - compressible 1183 + * more - need additional analysis 1184 + */ 1185 + #define BYTE_SET_THRESHOLD (64) 1186 + 1187 + static u32 byte_set_size(const struct heuristic_ws *ws) 1188 + { 1189 + u32 i; 1190 + u32 byte_set_size = 0; 1191 + 1192 + for (i = 0; i < BYTE_SET_THRESHOLD; i++) { 1193 + if (ws->bucket[i].count > 0) 1194 + byte_set_size++; 1195 + } 1196 + 1197 + /* 1198 + * Continue collecting count of byte values in buckets. 
If the byte 1199 + * set size is bigger then the threshold, it's pointless to continue, 1200 + * the detection technique would fail for this type of data. 1201 + */ 1202 + for (; i < BUCKET_SIZE; i++) { 1203 + if (ws->bucket[i].count > 0) { 1204 + byte_set_size++; 1205 + if (byte_set_size > BYTE_SET_THRESHOLD) 1206 + return byte_set_size; 1207 + } 1208 + } 1209 + 1210 + return byte_set_size; 1211 + } 1212 + 1213 + static bool sample_repeated_patterns(struct heuristic_ws *ws) 1214 + { 1215 + const u32 half_of_sample = ws->sample_size / 2; 1216 + const u8 *data = ws->sample; 1217 + 1218 + return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0; 1219 + } 1220 + 1221 + static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, 1222 + struct heuristic_ws *ws) 1223 + { 1224 + struct page *page; 1225 + u64 index, index_end; 1226 + u32 i, curr_sample_pos; 1227 + u8 *in_data; 1228 + 1229 + /* 1230 + * Compression handles the input data by chunks of 128KiB 1231 + * (defined by BTRFS_MAX_UNCOMPRESSED) 1232 + * 1233 + * We do the same for the heuristic and loop over the whole range. 1234 + * 1235 + * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will 1236 + * process no more than BTRFS_MAX_UNCOMPRESSED at a time. 
1237 + */ 1238 + if (end - start > BTRFS_MAX_UNCOMPRESSED) 1239 + end = start + BTRFS_MAX_UNCOMPRESSED; 1240 + 1241 + index = start >> PAGE_SHIFT; 1242 + index_end = end >> PAGE_SHIFT; 1243 + 1244 + /* Don't miss unaligned end */ 1245 + if (!IS_ALIGNED(end, PAGE_SIZE)) 1246 + index_end++; 1247 + 1248 + curr_sample_pos = 0; 1249 + while (index < index_end) { 1250 + page = find_get_page(inode->i_mapping, index); 1251 + in_data = kmap(page); 1252 + /* Handle case where the start is not aligned to PAGE_SIZE */ 1253 + i = start % PAGE_SIZE; 1254 + while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { 1255 + /* Don't sample any garbage from the last page */ 1256 + if (start > end - SAMPLING_READ_SIZE) 1257 + break; 1258 + memcpy(&ws->sample[curr_sample_pos], &in_data[i], 1259 + SAMPLING_READ_SIZE); 1260 + i += SAMPLING_INTERVAL; 1261 + start += SAMPLING_INTERVAL; 1262 + curr_sample_pos += SAMPLING_READ_SIZE; 1263 + } 1264 + kunmap(page); 1265 + put_page(page); 1266 + 1267 + index++; 1268 + } 1269 + 1270 + ws->sample_size = curr_sample_pos; 1271 + } 1272 + 1273 + /* 1228 1274 * Compression heuristic. 
1229 1275 * 1230 1276 * For now is's a naive and optimistic 'return true', we'll extend the logic to ··· 1446 1082 */ 1447 1083 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) 1448 1084 { 1449 - u64 index = start >> PAGE_SHIFT; 1450 - u64 end_index = end >> PAGE_SHIFT; 1451 - struct page *page; 1452 - int ret = 1; 1085 + struct list_head *ws_list = __find_workspace(0, true); 1086 + struct heuristic_ws *ws; 1087 + u32 i; 1088 + u8 byte; 1089 + int ret = 0; 1453 1090 1454 - while (index <= end_index) { 1455 - page = find_get_page(inode->i_mapping, index); 1456 - kmap(page); 1457 - kunmap(page); 1458 - put_page(page); 1459 - index++; 1091 + ws = list_entry(ws_list, struct heuristic_ws, list); 1092 + 1093 + heuristic_collect_sample(inode, start, end, ws); 1094 + 1095 + if (sample_repeated_patterns(ws)) { 1096 + ret = 1; 1097 + goto out; 1460 1098 } 1461 1099 1100 + memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); 1101 + 1102 + for (i = 0; i < ws->sample_size; i++) { 1103 + byte = ws->sample[i]; 1104 + ws->bucket[byte].count++; 1105 + } 1106 + 1107 + i = byte_set_size(ws); 1108 + if (i < BYTE_SET_THRESHOLD) { 1109 + ret = 2; 1110 + goto out; 1111 + } 1112 + 1113 + i = byte_core_set_size(ws); 1114 + if (i <= BYTE_CORE_SET_LOW) { 1115 + ret = 3; 1116 + goto out; 1117 + } 1118 + 1119 + if (i >= BYTE_CORE_SET_HIGH) { 1120 + ret = 0; 1121 + goto out; 1122 + } 1123 + 1124 + i = shannon_entropy(ws); 1125 + if (i <= ENTROPY_LVL_ACEPTABLE) { 1126 + ret = 4; 1127 + goto out; 1128 + } 1129 + 1130 + /* 1131 + * For the levels below ENTROPY_LVL_HIGH, additional analysis would be 1132 + * needed to give green light to compression. 1133 + * 1134 + * For now just assume that compression at that level is not worth the 1135 + * resources because: 1136 + * 1137 + * 1. it is possible to defrag the data later 1138 + * 1139 + * 2. the data would turn out to be hardly compressible, eg. 150 byte 1140 + * values, every bucket has counter at level ~54. 
The heuristic would 1141 + * be confused. This can happen when data have some internal repeated 1142 + * patterns like "abbacbbc...". This can be detected by analyzing 1143 + * pairs of bytes, which is too costly. 1144 + */ 1145 + if (i < ENTROPY_LVL_HIGH) { 1146 + ret = 5; 1147 + goto out; 1148 + } else { 1149 + ret = 0; 1150 + goto out; 1151 + } 1152 + 1153 + out: 1154 + __free_workspace(0, ws_list, true); 1462 1155 return ret; 1156 + } 1157 + 1158 + unsigned int btrfs_compress_str2level(const char *str) 1159 + { 1160 + if (strncmp(str, "zlib", 4) != 0) 1161 + return 0; 1162 + 1163 + /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */ 1164 + if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) 1165 + return str[5] - '0'; 1166 + 1167 + return 0; 1463 1168 }
+5 -1
fs/btrfs/compression.h
··· 76 76 void btrfs_init_compress(void); 77 77 void btrfs_exit_compress(void); 78 78 79 - int btrfs_compress_pages(int type, struct address_space *mapping, 79 + int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, 80 80 u64 start, struct page **pages, 81 81 unsigned long *out_pages, 82 82 unsigned long *total_in, ··· 94 94 unsigned long nr_pages); 95 95 blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 96 96 int mirror_num, unsigned long bio_flags); 97 + 98 + unsigned btrfs_compress_str2level(const char *str); 97 99 98 100 enum btrfs_compression_type { 99 101 BTRFS_COMPRESS_NONE = 0, ··· 126 124 struct page *dest_page, 127 125 unsigned long start_byte, 128 126 size_t srclen, size_t destlen); 127 + 128 + void (*set_level)(struct list_head *ws, unsigned int type); 129 129 }; 130 130 131 131 extern const struct btrfs_compress_op btrfs_zlib_compress;
+6 -11
fs/btrfs/ctree.c
··· 192 192 * tree until you end up with a lock on the root. A locked buffer 193 193 * is returned, with a reference held. 194 194 */ 195 - static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) 195 + struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) 196 196 { 197 197 struct extent_buffer *eb; 198 198 ··· 5496 5496 goto out; 5497 5497 } else if (left_end_reached) { 5498 5498 if (right_level == 0) { 5499 - ret = changed_cb(left_root, right_root, 5500 - left_path, right_path, 5499 + ret = changed_cb(left_path, right_path, 5501 5500 &right_key, 5502 5501 BTRFS_COMPARE_TREE_DELETED, 5503 5502 ctx); ··· 5507 5508 continue; 5508 5509 } else if (right_end_reached) { 5509 5510 if (left_level == 0) { 5510 - ret = changed_cb(left_root, right_root, 5511 - left_path, right_path, 5511 + ret = changed_cb(left_path, right_path, 5512 5512 &left_key, 5513 5513 BTRFS_COMPARE_TREE_NEW, 5514 5514 ctx); ··· 5521 5523 if (left_level == 0 && right_level == 0) { 5522 5524 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5523 5525 if (cmp < 0) { 5524 - ret = changed_cb(left_root, right_root, 5525 - left_path, right_path, 5526 + ret = changed_cb(left_path, right_path, 5526 5527 &left_key, 5527 5528 BTRFS_COMPARE_TREE_NEW, 5528 5529 ctx); ··· 5529 5532 goto out; 5530 5533 advance_left = ADVANCE; 5531 5534 } else if (cmp > 0) { 5532 - ret = changed_cb(left_root, right_root, 5533 - left_path, right_path, 5535 + ret = changed_cb(left_path, right_path, 5534 5536 &right_key, 5535 5537 BTRFS_COMPARE_TREE_DELETED, 5536 5538 ctx); ··· 5546 5550 result = BTRFS_COMPARE_TREE_CHANGED; 5547 5551 else 5548 5552 result = BTRFS_COMPARE_TREE_SAME; 5549 - ret = changed_cb(left_root, right_root, 5550 - left_path, right_path, 5553 + ret = changed_cb(left_path, right_path, 5551 5554 &left_key, result, ctx); 5552 5555 if (ret < 0) 5553 5556 goto out;
+18 -12
fs/btrfs/ctree.h
··· 523 523 }; 524 524 525 525 /* Once caching_thread() finds this much free space, it will wake up waiters. */ 526 - #define CACHING_CTL_WAKE_UP (1024 * 1024 * 2) 526 + #define CACHING_CTL_WAKE_UP SZ_2M 527 527 528 528 struct btrfs_io_ctl { 529 529 void *cur, *orig; ··· 763 763 * delayed dir index item 764 764 */ 765 765 struct btrfs_block_rsv global_block_rsv; 766 - /* block reservation for delay allocation */ 767 - struct btrfs_block_rsv delalloc_block_rsv; 768 766 /* block reservation for metadata operations */ 769 767 struct btrfs_block_rsv trans_block_rsv; 770 768 /* block reservation for chunk tree */ ··· 788 790 */ 789 791 unsigned long pending_changes; 790 792 unsigned long compress_type:4; 793 + unsigned int compress_level; 791 794 int commit_interval; 792 795 /* 793 796 * It is a suggestive number, the read side is safe even it gets a ··· 877 878 rwlock_t tree_mod_log_lock; 878 879 struct rb_root tree_mod_log; 879 880 880 - atomic_t nr_async_submits; 881 - atomic_t async_submit_draining; 882 - atomic_t nr_async_bios; 883 881 atomic_t async_delalloc_pages; 884 882 atomic_t open_ioctl_trans; 885 883 ··· 1096 1100 u32 nodesize; 1097 1101 u32 sectorsize; 1098 1102 u32 stripesize; 1103 + 1104 + #ifdef CONFIG_BTRFS_FS_REF_VERIFY 1105 + spinlock_t ref_verify_lock; 1106 + struct rb_root block_tree; 1107 + #endif 1099 1108 }; 1100 1109 1101 1110 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) ··· 1339 1338 #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) 1340 1339 #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) 1341 1340 #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) 1341 + #define BTRFS_MOUNT_REF_VERIFY (1 << 28) 1342 1342 1343 1343 #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 1344 1344 #define BTRFS_DEFAULT_MAX_INLINE (2048) ··· 2641 2639 struct extent_buffer *buf, 2642 2640 u64 parent, int last_ref); 2643 2641 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 2644 - u64 root_objectid, u64 owner, 2642 + struct btrfs_root *root, 
u64 owner, 2645 2643 u64 offset, u64 ram_bytes, 2646 2644 struct btrfs_key *ins); 2647 2645 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ··· 2660 2658 u64 bytenr, u64 num_bytes, u64 flags, 2661 2659 int level, int is_data); 2662 2660 int btrfs_free_extent(struct btrfs_trans_handle *trans, 2663 - struct btrfs_fs_info *fs_info, 2661 + struct btrfs_root *root, 2664 2662 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 2665 2663 u64 owner, u64 offset); 2666 2664 ··· 2672 2670 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2673 2671 struct btrfs_fs_info *fs_info); 2674 2672 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2675 - struct btrfs_fs_info *fs_info, 2673 + struct btrfs_root *root, 2676 2674 u64 bytenr, u64 num_bytes, u64 parent, 2677 2675 u64 root_objectid, u64 owner, u64 offset); 2678 2676 ··· 2746 2744 u64 *qgroup_reserved, bool use_global_rsv); 2747 2745 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 2748 2746 struct btrfs_block_rsv *rsv); 2747 + void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); 2748 + 2749 2749 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); 2750 2750 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); 2751 2751 int btrfs_delalloc_reserve_space(struct inode *inode, ··· 2755 2751 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); 2756 2752 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 2757 2753 unsigned short type); 2754 + void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, 2755 + struct btrfs_block_rsv *rsv, 2756 + unsigned short type); 2758 2757 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 2759 2758 struct btrfs_block_rsv *rsv); 2760 2759 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); ··· 2816 2809 const struct btrfs_key *new_key); 2817 2810 struct extent_buffer 
*btrfs_root_node(struct btrfs_root *root); 2818 2811 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); 2812 + struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); 2819 2813 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 2820 2814 struct btrfs_key *key, int lowest_level, 2821 2815 u64 min_trans); ··· 2829 2821 BTRFS_COMPARE_TREE_CHANGED, 2830 2822 BTRFS_COMPARE_TREE_SAME, 2831 2823 }; 2832 - typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, 2833 - struct btrfs_root *right_root, 2834 - struct btrfs_path *left_path, 2824 + typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, 2835 2825 struct btrfs_path *right_path, 2836 2826 struct btrfs_key *key, 2837 2827 enum btrfs_compare_tree_result result,
+1 -45
fs/btrfs/delayed-inode.c
··· 581 581 struct btrfs_block_rsv *dst_rsv; 582 582 u64 num_bytes; 583 583 int ret; 584 - bool release = false; 585 584 586 585 src_rsv = trans->block_rsv; 587 586 dst_rsv = &fs_info->delayed_block_rsv; 588 587 589 588 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 590 - 591 - /* 592 - * If our block_rsv is the delalloc block reserve then check and see if 593 - * we have our extra reservation for updating the inode. If not fall 594 - * through and try to reserve space quickly. 595 - * 596 - * We used to try and steal from the delalloc block rsv or the global 597 - * reserve, but we'd steal a full reservation, which isn't kind. We are 598 - * here through delalloc which means we've likely just cowed down close 599 - * to the leaf that contains the inode, so we would steal less just 600 - * doing the fallback inode update, so if we do end up having to steal 601 - * from the global block rsv we hopefully only steal one or two blocks 602 - * worth which is less likely to hurt us. 603 - */ 604 - if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { 605 - spin_lock(&inode->lock); 606 - if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 607 - &inode->runtime_flags)) 608 - release = true; 609 - else 610 - src_rsv = NULL; 611 - spin_unlock(&inode->lock); 612 - } 613 589 614 590 /* 615 591 * btrfs_dirty_inode will update the inode under btrfs_join_transaction ··· 594 618 * space. 595 619 * 596 620 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since 597 - * we're accounted for. 621 + * we always reserve enough to update the inode item. 598 622 */ 599 623 if (!src_rsv || (!trans->bytes_reserved && 600 624 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { ··· 619 643 } 620 644 621 645 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); 622 - 623 - /* 624 - * Migrate only takes a reservation, it doesn't touch the size of the 625 - * block_rsv. 
This is to simplify people who don't normally have things 626 - * migrated from their block rsv. If they go to release their 627 - * reservation, that will decrease the size as well, so if migrate 628 - * reduced size we'd end up with a negative size. But for the 629 - * delalloc_meta_reserved stuff we will only know to drop 1 reservation, 630 - * but we could in fact do this reserve/migrate dance several times 631 - * between the time we did the original reservation and we'd clean it 632 - * up. So to take care of this, release the space for the meta 633 - * reservation here. I think it may be time for a documentation page on 634 - * how block rsvs. work. 635 - */ 636 646 if (!ret) { 637 647 trace_btrfs_space_reservation(fs_info, "delayed_inode", 638 648 btrfs_ino(inode), num_bytes, 1); 639 649 node->bytes_reserved = num_bytes; 640 - } 641 - 642 - if (release) { 643 - trace_btrfs_space_reservation(fs_info, "delalloc", 644 - btrfs_ino(inode), num_bytes, 0); 645 - btrfs_block_rsv_release(fs_info, src_rsv, num_bytes); 646 650 } 647 651 648 652 return ret;
+143 -153
fs/btrfs/delayed-ref.c
··· 40 40 /* 41 41 * compare two delayed tree backrefs with same bytenr and type 42 42 */ 43 - static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, 44 - struct btrfs_delayed_tree_ref *ref1, int type) 43 + static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, 44 + struct btrfs_delayed_tree_ref *ref2) 45 45 { 46 - if (type == BTRFS_TREE_BLOCK_REF_KEY) { 46 + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { 47 47 if (ref1->root < ref2->root) 48 48 return -1; 49 49 if (ref1->root > ref2->root) ··· 60 60 /* 61 61 * compare two delayed data backrefs with same bytenr and type 62 62 */ 63 - static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, 64 - struct btrfs_delayed_data_ref *ref1) 63 + static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, 64 + struct btrfs_delayed_data_ref *ref2) 65 65 { 66 66 if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { 67 67 if (ref1->root < ref2->root) ··· 85 85 return 0; 86 86 } 87 87 88 + static int comp_refs(struct btrfs_delayed_ref_node *ref1, 89 + struct btrfs_delayed_ref_node *ref2, 90 + bool check_seq) 91 + { 92 + int ret = 0; 93 + 94 + if (ref1->type < ref2->type) 95 + return -1; 96 + if (ref1->type > ref2->type) 97 + return 1; 98 + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 99 + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) 100 + ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), 101 + btrfs_delayed_node_to_tree_ref(ref2)); 102 + else 103 + ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), 104 + btrfs_delayed_node_to_data_ref(ref2)); 105 + if (ret) 106 + return ret; 107 + if (check_seq) { 108 + if (ref1->seq < ref2->seq) 109 + return -1; 110 + if (ref1->seq > ref2->seq) 111 + return 1; 112 + } 113 + return 0; 114 + } 115 + 88 116 /* insert a new ref to head ref rbtree */ 89 117 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, 90 118 struct rb_node *node) ··· 124 96 u64 bytenr; 125 97 126 98 ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); 
127 - bytenr = ins->node.bytenr; 99 + bytenr = ins->bytenr; 128 100 while (*p) { 129 101 parent_node = *p; 130 102 entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, 131 103 href_node); 132 104 133 - if (bytenr < entry->node.bytenr) 105 + if (bytenr < entry->bytenr) 134 106 p = &(*p)->rb_left; 135 - else if (bytenr > entry->node.bytenr) 107 + else if (bytenr > entry->bytenr) 108 + p = &(*p)->rb_right; 109 + else 110 + return entry; 111 + } 112 + 113 + rb_link_node(node, parent_node, p); 114 + rb_insert_color(node, root); 115 + return NULL; 116 + } 117 + 118 + static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, 119 + struct btrfs_delayed_ref_node *ins) 120 + { 121 + struct rb_node **p = &root->rb_node; 122 + struct rb_node *node = &ins->ref_node; 123 + struct rb_node *parent_node = NULL; 124 + struct btrfs_delayed_ref_node *entry; 125 + 126 + while (*p) { 127 + int comp; 128 + 129 + parent_node = *p; 130 + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, 131 + ref_node); 132 + comp = comp_refs(ins, entry, true); 133 + if (comp < 0) 134 + p = &(*p)->rb_left; 135 + else if (comp > 0) 136 136 p = &(*p)->rb_right; 137 137 else 138 138 return entry; ··· 189 133 while (n) { 190 134 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 191 135 192 - if (bytenr < entry->node.bytenr) 136 + if (bytenr < entry->bytenr) 193 137 n = n->rb_left; 194 - else if (bytenr > entry->node.bytenr) 138 + else if (bytenr > entry->bytenr) 195 139 n = n->rb_right; 196 140 else 197 141 return entry; 198 142 } 199 143 if (entry && return_bigger) { 200 - if (bytenr > entry->node.bytenr) { 144 + if (bytenr > entry->bytenr) { 201 145 n = rb_next(&entry->href_node); 202 146 if (!n) 203 147 n = rb_first(root); ··· 220 164 if (mutex_trylock(&head->mutex)) 221 165 return 0; 222 166 223 - refcount_inc(&head->node.refs); 167 + refcount_inc(&head->refs); 224 168 spin_unlock(&delayed_refs->lock); 225 169 226 170 mutex_lock(&head->mutex); 227 171 
spin_lock(&delayed_refs->lock); 228 - if (!head->node.in_tree) { 172 + if (RB_EMPTY_NODE(&head->href_node)) { 229 173 mutex_unlock(&head->mutex); 230 - btrfs_put_delayed_ref(&head->node); 174 + btrfs_put_delayed_ref_head(head); 231 175 return -EAGAIN; 232 176 } 233 - btrfs_put_delayed_ref(&head->node); 177 + btrfs_put_delayed_ref_head(head); 234 178 return 0; 235 179 } 236 180 ··· 239 183 struct btrfs_delayed_ref_head *head, 240 184 struct btrfs_delayed_ref_node *ref) 241 185 { 242 - if (btrfs_delayed_ref_is_head(ref)) { 243 - head = btrfs_delayed_node_to_head(ref); 244 - rb_erase(&head->href_node, &delayed_refs->href_root); 245 - } else { 246 - assert_spin_locked(&head->lock); 247 - list_del(&ref->list); 248 - if (!list_empty(&ref->add_list)) 249 - list_del(&ref->add_list); 250 - } 186 + assert_spin_locked(&head->lock); 187 + rb_erase(&ref->ref_node, &head->ref_tree); 188 + RB_CLEAR_NODE(&ref->ref_node); 189 + if (!list_empty(&ref->add_list)) 190 + list_del(&ref->add_list); 251 191 ref->in_tree = 0; 252 192 btrfs_put_delayed_ref(ref); 253 193 atomic_dec(&delayed_refs->num_entries); ··· 258 206 u64 seq) 259 207 { 260 208 struct btrfs_delayed_ref_node *next; 209 + struct rb_node *node = rb_next(&ref->ref_node); 261 210 bool done = false; 262 211 263 - next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 264 - list); 265 - while (!done && &next->list != &head->ref_list) { 212 + while (!done && node) { 266 213 int mod; 267 - struct btrfs_delayed_ref_node *next2; 268 214 269 - next2 = list_next_entry(next, list); 270 - 271 - if (next == ref) 272 - goto next; 273 - 215 + next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 216 + node = rb_next(node); 274 217 if (seq && next->seq >= seq) 275 - goto next; 276 - 277 - if (next->type != ref->type) 278 - goto next; 279 - 280 - if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || 281 - ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && 282 - comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), 283 - 
btrfs_delayed_node_to_tree_ref(next), 284 - ref->type)) 285 - goto next; 286 - if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || 287 - ref->type == BTRFS_SHARED_DATA_REF_KEY) && 288 - comp_data_refs(btrfs_delayed_node_to_data_ref(ref), 289 - btrfs_delayed_node_to_data_ref(next))) 290 - goto next; 218 + break; 219 + if (comp_refs(ref, next, false)) 220 + break; 291 221 292 222 if (ref->action == next->action) { 293 223 mod = next->ref_mod; ··· 293 259 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || 294 260 ref->type == BTRFS_SHARED_BLOCK_REF_KEY); 295 261 } 296 - next: 297 - next = next2; 298 262 } 299 263 300 264 return done; ··· 304 272 struct btrfs_delayed_ref_head *head) 305 273 { 306 274 struct btrfs_delayed_ref_node *ref; 275 + struct rb_node *node; 307 276 u64 seq = 0; 308 277 309 278 assert_spin_locked(&head->lock); 310 279 311 - if (list_empty(&head->ref_list)) 280 + if (RB_EMPTY_ROOT(&head->ref_tree)) 312 281 return; 313 282 314 283 /* We don't have too many refs to merge for data. 
*/ ··· 326 293 } 327 294 spin_unlock(&fs_info->tree_mod_seq_lock); 328 295 329 - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 330 - list); 331 - while (&ref->list != &head->ref_list) { 296 + again: 297 + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { 298 + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 332 299 if (seq && ref->seq >= seq) 333 - goto next; 334 - 335 - if (merge_ref(trans, delayed_refs, head, ref, seq)) { 336 - if (list_empty(&head->ref_list)) 337 - break; 338 - ref = list_first_entry(&head->ref_list, 339 - struct btrfs_delayed_ref_node, 340 - list); 341 300 continue; 342 - } 343 - next: 344 - ref = list_next_entry(ref, list); 301 + if (merge_ref(trans, delayed_refs, head, ref, seq)) 302 + goto again; 345 303 } 346 304 } 347 305 ··· 404 380 head->processing = 1; 405 381 WARN_ON(delayed_refs->num_heads_ready == 0); 406 382 delayed_refs->num_heads_ready--; 407 - delayed_refs->run_delayed_start = head->node.bytenr + 408 - head->node.num_bytes; 383 + delayed_refs->run_delayed_start = head->bytenr + 384 + head->num_bytes; 409 385 return head; 410 386 } 411 387 ··· 415 391 * Return 0 for insert. 416 392 * Return >0 for merge. 
417 393 */ 418 - static int 419 - add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, 420 - struct btrfs_delayed_ref_root *root, 421 - struct btrfs_delayed_ref_head *href, 422 - struct btrfs_delayed_ref_node *ref) 394 + static int insert_delayed_ref(struct btrfs_trans_handle *trans, 395 + struct btrfs_delayed_ref_root *root, 396 + struct btrfs_delayed_ref_head *href, 397 + struct btrfs_delayed_ref_node *ref) 423 398 { 424 399 struct btrfs_delayed_ref_node *exist; 425 400 int mod; 426 401 int ret = 0; 427 402 428 403 spin_lock(&href->lock); 429 - /* Check whether we can merge the tail node with ref */ 430 - if (list_empty(&href->ref_list)) 431 - goto add_tail; 432 - exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, 433 - list); 434 - /* No need to compare bytenr nor is_head */ 435 - if (exist->type != ref->type || exist->seq != ref->seq) 436 - goto add_tail; 437 - 438 - if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || 439 - exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && 440 - comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), 441 - btrfs_delayed_node_to_tree_ref(ref), 442 - ref->type)) 443 - goto add_tail; 444 - if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || 445 - exist->type == BTRFS_SHARED_DATA_REF_KEY) && 446 - comp_data_refs(btrfs_delayed_node_to_data_ref(exist), 447 - btrfs_delayed_node_to_data_ref(ref))) 448 - goto add_tail; 404 + exist = tree_insert(&href->ref_tree, ref); 405 + if (!exist) 406 + goto inserted; 449 407 450 408 /* Now we are sure we can merge */ 451 409 ret = 1; ··· 458 452 drop_delayed_ref(trans, root, href, exist); 459 453 spin_unlock(&href->lock); 460 454 return ret; 461 - 462 - add_tail: 463 - list_add_tail(&ref->list, &href->ref_list); 455 + inserted: 464 456 if (ref->action == BTRFS_ADD_DELAYED_REF) 465 457 list_add_tail(&ref->add_list, &href->ref_add_list); 466 458 atomic_inc(&root->num_entries); ··· 473 469 */ 474 470 static noinline void 475 471 update_existing_head_ref(struct btrfs_delayed_ref_root 
*delayed_refs, 476 - struct btrfs_delayed_ref_node *existing, 477 - struct btrfs_delayed_ref_node *update, 472 + struct btrfs_delayed_ref_head *existing, 473 + struct btrfs_delayed_ref_head *update, 478 474 int *old_ref_mod_ret) 479 475 { 480 - struct btrfs_delayed_ref_head *existing_ref; 481 - struct btrfs_delayed_ref_head *ref; 482 476 int old_ref_mod; 483 477 484 - existing_ref = btrfs_delayed_node_to_head(existing); 485 - ref = btrfs_delayed_node_to_head(update); 486 - BUG_ON(existing_ref->is_data != ref->is_data); 478 + BUG_ON(existing->is_data != update->is_data); 487 479 488 - spin_lock(&existing_ref->lock); 489 - if (ref->must_insert_reserved) { 480 + spin_lock(&existing->lock); 481 + if (update->must_insert_reserved) { 490 482 /* if the extent was freed and then 491 483 * reallocated before the delayed ref 492 484 * entries were processed, we can end up ··· 490 490 * the must_insert_reserved flag set. 491 491 * Set it again here 492 492 */ 493 - existing_ref->must_insert_reserved = ref->must_insert_reserved; 493 + existing->must_insert_reserved = update->must_insert_reserved; 494 494 495 495 /* 496 496 * update the num_bytes so we make sure the accounting ··· 500 500 501 501 } 502 502 503 - if (ref->extent_op) { 504 - if (!existing_ref->extent_op) { 505 - existing_ref->extent_op = ref->extent_op; 503 + if (update->extent_op) { 504 + if (!existing->extent_op) { 505 + existing->extent_op = update->extent_op; 506 506 } else { 507 - if (ref->extent_op->update_key) { 508 - memcpy(&existing_ref->extent_op->key, 509 - &ref->extent_op->key, 510 - sizeof(ref->extent_op->key)); 511 - existing_ref->extent_op->update_key = true; 507 + if (update->extent_op->update_key) { 508 + memcpy(&existing->extent_op->key, 509 + &update->extent_op->key, 510 + sizeof(update->extent_op->key)); 511 + existing->extent_op->update_key = true; 512 512 } 513 - if (ref->extent_op->update_flags) { 514 - existing_ref->extent_op->flags_to_set |= 515 - ref->extent_op->flags_to_set; 516 - 
existing_ref->extent_op->update_flags = true; 513 + if (update->extent_op->update_flags) { 514 + existing->extent_op->flags_to_set |= 515 + update->extent_op->flags_to_set; 516 + existing->extent_op->update_flags = true; 517 517 } 518 - btrfs_free_delayed_extent_op(ref->extent_op); 518 + btrfs_free_delayed_extent_op(update->extent_op); 519 519 } 520 520 } 521 521 /* ··· 523 523 * only need the lock for this case cause we could be processing it 524 524 * currently, for refs we just added we know we're a-ok. 525 525 */ 526 - old_ref_mod = existing_ref->total_ref_mod; 526 + old_ref_mod = existing->total_ref_mod; 527 527 if (old_ref_mod_ret) 528 528 *old_ref_mod_ret = old_ref_mod; 529 529 existing->ref_mod += update->ref_mod; 530 - existing_ref->total_ref_mod += update->ref_mod; 530 + existing->total_ref_mod += update->ref_mod; 531 531 532 532 /* 533 533 * If we are going to from a positive ref mod to a negative or vice 534 534 * versa we need to make sure to adjust pending_csums accordingly. 
535 535 */ 536 - if (existing_ref->is_data) { 537 - if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) 536 + if (existing->is_data) { 537 + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) 538 538 delayed_refs->pending_csums -= existing->num_bytes; 539 - if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) 539 + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) 540 540 delayed_refs->pending_csums += existing->num_bytes; 541 541 } 542 - spin_unlock(&existing_ref->lock); 542 + spin_unlock(&existing->lock); 543 543 } 544 544 545 545 /* ··· 550 550 static noinline struct btrfs_delayed_ref_head * 551 551 add_delayed_ref_head(struct btrfs_fs_info *fs_info, 552 552 struct btrfs_trans_handle *trans, 553 - struct btrfs_delayed_ref_node *ref, 553 + struct btrfs_delayed_ref_head *head_ref, 554 554 struct btrfs_qgroup_extent_record *qrecord, 555 555 u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, 556 556 int action, int is_data, int *qrecord_inserted_ret, 557 557 int *old_ref_mod, int *new_ref_mod) 558 558 { 559 559 struct btrfs_delayed_ref_head *existing; 560 - struct btrfs_delayed_ref_head *head_ref = NULL; 561 560 struct btrfs_delayed_ref_root *delayed_refs; 562 561 int count_mod = 1; 563 562 int must_insert_reserved = 0; ··· 592 593 593 594 delayed_refs = &trans->transaction->delayed_refs; 594 595 595 - /* first set the basic ref node struct up */ 596 - refcount_set(&ref->refs, 1); 597 - ref->bytenr = bytenr; 598 - ref->num_bytes = num_bytes; 599 - ref->ref_mod = count_mod; 600 - ref->type = 0; 601 - ref->action = 0; 602 - ref->is_head = 1; 603 - ref->in_tree = 1; 604 - ref->seq = 0; 605 - 606 - head_ref = btrfs_delayed_node_to_head(ref); 596 + refcount_set(&head_ref->refs, 1); 597 + head_ref->bytenr = bytenr; 598 + head_ref->num_bytes = num_bytes; 599 + head_ref->ref_mod = count_mod; 607 600 head_ref->must_insert_reserved = must_insert_reserved; 608 601 head_ref->is_data = is_data; 609 - INIT_LIST_HEAD(&head_ref->ref_list); 602 + head_ref->ref_tree 
= RB_ROOT; 610 603 INIT_LIST_HEAD(&head_ref->ref_add_list); 604 + RB_CLEAR_NODE(&head_ref->href_node); 611 605 head_ref->processing = 0; 612 606 head_ref->total_ref_mod = count_mod; 613 607 head_ref->qgroup_reserved = 0; 614 608 head_ref->qgroup_ref_root = 0; 609 + spin_lock_init(&head_ref->lock); 610 + mutex_init(&head_ref->mutex); 615 611 616 612 /* Record qgroup extent info if provided */ 617 613 if (qrecord) { ··· 626 632 qrecord_inserted = 1; 627 633 } 628 634 629 - spin_lock_init(&head_ref->lock); 630 - mutex_init(&head_ref->mutex); 631 - 632 - trace_add_delayed_ref_head(fs_info, ref, head_ref, action); 635 + trace_add_delayed_ref_head(fs_info, head_ref, action); 633 636 634 637 existing = htree_insert(&delayed_refs->href_root, 635 638 &head_ref->href_node); 636 639 if (existing) { 637 640 WARN_ON(ref_root && reserved && existing->qgroup_ref_root 638 641 && existing->qgroup_reserved); 639 - update_existing_head_ref(delayed_refs, &existing->node, ref, 642 + update_existing_head_ref(delayed_refs, existing, head_ref, 640 643 old_ref_mod); 641 644 /* 642 645 * we've updated the existing ref, free the newly ··· 690 699 ref->is_head = 0; 691 700 ref->in_tree = 1; 692 701 ref->seq = seq; 693 - INIT_LIST_HEAD(&ref->list); 702 + RB_CLEAR_NODE(&ref->ref_node); 694 703 INIT_LIST_HEAD(&ref->add_list); 695 704 696 705 full_ref = btrfs_delayed_node_to_tree_ref(ref); ··· 704 713 705 714 trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); 706 715 707 - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); 716 + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); 708 717 709 718 /* 710 719 * XXX: memory should be freed at the same level allocated. 
··· 747 756 ref->is_head = 0; 748 757 ref->in_tree = 1; 749 758 ref->seq = seq; 750 - INIT_LIST_HEAD(&ref->list); 759 + RB_CLEAR_NODE(&ref->ref_node); 751 760 INIT_LIST_HEAD(&ref->add_list); 752 761 753 762 full_ref = btrfs_delayed_node_to_data_ref(ref); ··· 763 772 764 773 trace_add_delayed_data_ref(fs_info, ref, full_ref, action); 765 774 766 - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); 767 - 775 + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); 768 776 if (ret > 0) 769 777 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); 770 778 } ··· 811 821 * insert both the head node and the new ref without dropping 812 822 * the spin lock 813 823 */ 814 - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, 824 + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, 815 825 bytenr, num_bytes, 0, 0, action, 0, 816 826 &qrecord_inserted, old_ref_mod, 817 827 new_ref_mod); ··· 878 888 * insert both the head node and the new ref without dropping 879 889 * the spin lock 880 890 */ 881 - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, 891 + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, 882 892 bytenr, num_bytes, ref_root, reserved, 883 893 action, 1, &qrecord_inserted, 884 894 old_ref_mod, new_ref_mod); ··· 910 920 delayed_refs = &trans->transaction->delayed_refs; 911 921 spin_lock(&delayed_refs->lock); 912 922 913 - add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, 923 + add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, 914 924 num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, 915 925 extent_op->is_data, NULL, NULL, NULL); 916 926
+19 -35
fs/btrfs/delayed-ref.h
··· 26 26 #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ 27 27 #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ 28 28 29 - /* 30 - * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the 31 - * same ref_node structure. 32 - * Ref_head is in a higher logic level than tree/data ref, and duplicated 33 - * bytenr/num_bytes in ref_node is really a waste or memory, they should be 34 - * referred from ref_head. 35 - * This gets more disgusting after we use list to store tree/data ref in 36 - * ref_head. Must clean this mess up later. 37 - */ 38 29 struct btrfs_delayed_ref_node { 39 - /*data/tree ref use list, stored in ref_head->ref_list. */ 40 - struct list_head list; 30 + struct rb_node ref_node; 41 31 /* 42 32 * If action is BTRFS_ADD_DELAYED_REF, also link this node to 43 33 * ref_head->ref_add_list, then we do not need to iterate the ··· 81 91 * reference count modifications we've queued up. 82 92 */ 83 93 struct btrfs_delayed_ref_head { 84 - struct btrfs_delayed_ref_node node; 85 - 94 + u64 bytenr; 95 + u64 num_bytes; 96 + refcount_t refs; 86 97 /* 87 98 * the mutex is held while running the refs, and it is also 88 99 * held when checking the sum of reference modifications. ··· 91 100 struct mutex mutex; 92 101 93 102 spinlock_t lock; 94 - struct list_head ref_list; 103 + struct rb_root ref_tree; 95 104 /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ 96 105 struct list_head ref_add_list; 97 106 ··· 105 114 * this is meant to track if we need to do the csum accounting or not. 106 115 */ 107 116 int total_ref_mod; 117 + 118 + /* 119 + * This is the current outstanding mod references for this bytenr. This 120 + * is used with lookup_extent_info to get an accurate reference count 121 + * for a bytenr, so it is adjusted as delayed refs are run so that any 122 + * on disk reference count + ref_mod is accurate. 
123 + */ 124 + int ref_mod; 108 125 109 126 /* 110 127 * For qgroup reserved space freeing. ··· 233 234 case BTRFS_SHARED_DATA_REF_KEY: 234 235 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); 235 236 break; 236 - case 0: 237 - kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); 238 - break; 239 237 default: 240 238 BUG(); 241 239 } 242 240 } 241 + } 242 + 243 + static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) 244 + { 245 + if (refcount_dec_and_test(&head->refs)) 246 + kmem_cache_free(btrfs_delayed_ref_head_cachep, head); 243 247 } 244 248 245 249 int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ··· 285 283 u64 seq); 286 284 287 285 /* 288 - * a node might live in a head or a regular ref, this lets you 289 - * test for the proper type to use. 290 - */ 291 - static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) 292 - { 293 - return node->is_head; 294 - } 295 - 296 - /* 297 286 * helper functions to cast a node into its container 298 287 */ 299 288 static inline struct btrfs_delayed_tree_ref * 300 289 btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) 301 290 { 302 - WARN_ON(btrfs_delayed_ref_is_head(node)); 303 291 return container_of(node, struct btrfs_delayed_tree_ref, node); 304 292 } 305 293 306 294 static inline struct btrfs_delayed_data_ref * 307 295 btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) 308 296 { 309 - WARN_ON(btrfs_delayed_ref_is_head(node)); 310 297 return container_of(node, struct btrfs_delayed_data_ref, node); 311 - } 312 - 313 - static inline struct btrfs_delayed_ref_head * 314 - btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) 315 - { 316 - WARN_ON(!btrfs_delayed_ref_is_head(node)); 317 - return container_of(node, struct btrfs_delayed_ref_head, node); 318 298 } 319 299 #endif
+35 -192
fs/btrfs/disk-io.c
··· 50 50 #include "sysfs.h" 51 51 #include "qgroup.h" 52 52 #include "compression.h" 53 + #include "tree-checker.h" 54 + #include "ref-verify.h" 53 55 54 56 #ifdef CONFIG_X86 55 57 #include <asm/cpufeature.h> ··· 545 543 return ret; 546 544 } 547 545 548 - #define CORRUPT(reason, eb, root, slot) \ 549 - btrfs_crit(root->fs_info, \ 550 - "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ 551 - btrfs_header_level(eb) == 0 ? "leaf" : "node", \ 552 - reason, btrfs_header_bytenr(eb), root->objectid, slot) 553 - 554 - static noinline int check_leaf(struct btrfs_root *root, 555 - struct extent_buffer *leaf) 556 - { 557 - struct btrfs_fs_info *fs_info = root->fs_info; 558 - struct btrfs_key key; 559 - struct btrfs_key leaf_key; 560 - u32 nritems = btrfs_header_nritems(leaf); 561 - int slot; 562 - 563 - /* 564 - * Extent buffers from a relocation tree have a owner field that 565 - * corresponds to the subvolume tree they are based on. So just from an 566 - * extent buffer alone we can not find out what is the id of the 567 - * corresponding subvolume tree, so we can not figure out if the extent 568 - * buffer corresponds to the root of the relocation tree or not. So skip 569 - * this check for relocation trees. 570 - */ 571 - if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { 572 - struct btrfs_root *check_root; 573 - 574 - key.objectid = btrfs_header_owner(leaf); 575 - key.type = BTRFS_ROOT_ITEM_KEY; 576 - key.offset = (u64)-1; 577 - 578 - check_root = btrfs_get_fs_root(fs_info, &key, false); 579 - /* 580 - * The only reason we also check NULL here is that during 581 - * open_ctree() some roots has not yet been set up. 
582 - */ 583 - if (!IS_ERR_OR_NULL(check_root)) { 584 - struct extent_buffer *eb; 585 - 586 - eb = btrfs_root_node(check_root); 587 - /* if leaf is the root, then it's fine */ 588 - if (leaf != eb) { 589 - CORRUPT("non-root leaf's nritems is 0", 590 - leaf, check_root, 0); 591 - free_extent_buffer(eb); 592 - return -EIO; 593 - } 594 - free_extent_buffer(eb); 595 - } 596 - return 0; 597 - } 598 - 599 - if (nritems == 0) 600 - return 0; 601 - 602 - /* Check the 0 item */ 603 - if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != 604 - BTRFS_LEAF_DATA_SIZE(fs_info)) { 605 - CORRUPT("invalid item offset size pair", leaf, root, 0); 606 - return -EIO; 607 - } 608 - 609 - /* 610 - * Check to make sure each items keys are in the correct order and their 611 - * offsets make sense. We only have to loop through nritems-1 because 612 - * we check the current slot against the next slot, which verifies the 613 - * next slot's offset+size makes sense and that the current's slot 614 - * offset is correct. 615 - */ 616 - for (slot = 0; slot < nritems - 1; slot++) { 617 - btrfs_item_key_to_cpu(leaf, &leaf_key, slot); 618 - btrfs_item_key_to_cpu(leaf, &key, slot + 1); 619 - 620 - /* Make sure the keys are in the right order */ 621 - if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { 622 - CORRUPT("bad key order", leaf, root, slot); 623 - return -EIO; 624 - } 625 - 626 - /* 627 - * Make sure the offset and ends are right, remember that the 628 - * item data starts at the end of the leaf and grows towards the 629 - * front. 630 - */ 631 - if (btrfs_item_offset_nr(leaf, slot) != 632 - btrfs_item_end_nr(leaf, slot + 1)) { 633 - CORRUPT("slot offset bad", leaf, root, slot); 634 - return -EIO; 635 - } 636 - 637 - /* 638 - * Check to make sure that we don't point outside of the leaf, 639 - * just in case all the items are consistent to each other, but 640 - * all point outside of the leaf. 
641 - */ 642 - if (btrfs_item_end_nr(leaf, slot) > 643 - BTRFS_LEAF_DATA_SIZE(fs_info)) { 644 - CORRUPT("slot end outside of leaf", leaf, root, slot); 645 - return -EIO; 646 - } 647 - } 648 - 649 - return 0; 650 - } 651 - 652 - static int check_node(struct btrfs_root *root, struct extent_buffer *node) 653 - { 654 - unsigned long nr = btrfs_header_nritems(node); 655 - struct btrfs_key key, next_key; 656 - int slot; 657 - u64 bytenr; 658 - int ret = 0; 659 - 660 - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { 661 - btrfs_crit(root->fs_info, 662 - "corrupt node: block %llu root %llu nritems %lu", 663 - node->start, root->objectid, nr); 664 - return -EIO; 665 - } 666 - 667 - for (slot = 0; slot < nr - 1; slot++) { 668 - bytenr = btrfs_node_blockptr(node, slot); 669 - btrfs_node_key_to_cpu(node, &key, slot); 670 - btrfs_node_key_to_cpu(node, &next_key, slot + 1); 671 - 672 - if (!bytenr) { 673 - CORRUPT("invalid item slot", node, root, slot); 674 - ret = -EIO; 675 - goto out; 676 - } 677 - 678 - if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { 679 - CORRUPT("bad key order", node, root, slot); 680 - ret = -EIO; 681 - goto out; 682 - } 683 - } 684 - out: 685 - return ret; 686 - } 687 - 688 546 static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, 689 547 u64 phy_offset, struct page *page, 690 548 u64 start, u64 end, int mirror) ··· 610 748 * that we don't try and read the other copies of this block, just 611 749 * return -EIO. 
612 750 */ 613 - if (found_level == 0 && check_leaf(root, eb)) { 751 + if (found_level == 0 && btrfs_check_leaf(root, eb)) { 614 752 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 615 753 ret = -EIO; 616 754 } 617 755 618 - if (found_level > 0 && check_node(root, eb)) 756 + if (found_level > 0 && btrfs_check_node(root, eb)) 619 757 ret = -EIO; 620 758 621 759 if (!ret) ··· 741 879 742 880 static void run_one_async_done(struct btrfs_work *work) 743 881 { 744 - struct btrfs_fs_info *fs_info; 745 882 struct async_submit_bio *async; 746 - int limit; 747 883 748 884 async = container_of(work, struct async_submit_bio, work); 749 - fs_info = async->fs_info; 750 - 751 - limit = btrfs_async_submit_limit(fs_info); 752 - limit = limit * 2 / 3; 753 - 754 - /* 755 - * atomic_dec_return implies a barrier for waitqueue_active 756 - */ 757 - if (atomic_dec_return(&fs_info->nr_async_submits) < limit && 758 - waitqueue_active(&fs_info->async_submit_wait)) 759 - wake_up(&fs_info->async_submit_wait); 760 885 761 886 /* If an error occurred we just want to clean up the bio and move on */ 762 887 if (async->status) { ··· 791 942 792 943 async->status = 0; 793 944 794 - atomic_inc(&fs_info->nr_async_submits); 795 - 796 945 if (op_is_sync(bio->bi_opf)) 797 946 btrfs_set_work_high_priority(&async->work); 798 947 799 948 btrfs_queue_work(fs_info->workers, &async->work); 800 - 801 - while (atomic_read(&fs_info->async_submit_draining) && 802 - atomic_read(&fs_info->nr_async_submits)) { 803 - wait_event(fs_info->async_submit_wait, 804 - (atomic_read(&fs_info->nr_async_submits) == 0)); 805 - } 806 - 807 949 return 0; 808 950 } 809 951 ··· 845 1005 return ret; 846 1006 } 847 1007 848 - static int check_async_write(unsigned long bio_flags) 1008 + static int check_async_write(struct btrfs_inode *bi) 849 1009 { 850 - if (bio_flags & EXTENT_BIO_TREE_LOG) 1010 + if (atomic_read(&bi->sync_writers)) 851 1011 return 0; 852 1012 #ifdef CONFIG_X86 853 1013 if (static_cpu_has(X86_FEATURE_XMM4_2)) ··· 862 
1022 { 863 1023 struct inode *inode = private_data; 864 1024 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 865 - int async = check_async_write(bio_flags); 1025 + int async = check_async_write(BTRFS_I(inode)); 866 1026 blk_status_t ret; 867 1027 868 1028 if (bio_op(bio) != REQ_OP_WRITE) { ··· 2447 2607 goto fail_delalloc_bytes; 2448 2608 } 2449 2609 2450 - fs_info->btree_inode = new_inode(sb); 2451 - if (!fs_info->btree_inode) { 2452 - err = -ENOMEM; 2453 - goto fail_bio_counter; 2454 - } 2455 - 2456 - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2457 - 2458 2610 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 2459 2611 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); 2460 2612 INIT_LIST_HEAD(&fs_info->trans_list); ··· 2479 2647 btrfs_mapping_init(&fs_info->mapping_tree); 2480 2648 btrfs_init_block_rsv(&fs_info->global_block_rsv, 2481 2649 BTRFS_BLOCK_RSV_GLOBAL); 2482 - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, 2483 - BTRFS_BLOCK_RSV_DELALLOC); 2484 2650 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); 2485 2651 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); 2486 2652 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); 2487 2653 btrfs_init_block_rsv(&fs_info->delayed_block_rsv, 2488 2654 BTRFS_BLOCK_RSV_DELOPS); 2489 - atomic_set(&fs_info->nr_async_submits, 0); 2490 2655 atomic_set(&fs_info->async_delalloc_pages, 0); 2491 - atomic_set(&fs_info->async_submit_draining, 0); 2492 - atomic_set(&fs_info->nr_async_bios, 0); 2493 2656 atomic_set(&fs_info->defrag_running, 0); 2494 2657 atomic_set(&fs_info->qgroup_op_seq, 0); 2495 2658 atomic_set(&fs_info->reada_works_cnt, 0); ··· 2500 2673 /* readahead state */ 2501 2674 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); 2502 2675 spin_lock_init(&fs_info->reada_lock); 2676 + btrfs_init_ref_verify(fs_info); 2503 2677 2504 2678 fs_info->thread_pool_size = min_t(unsigned long, 2505 2679 
num_online_cpus() + 2, 8); 2506 2680 2507 2681 INIT_LIST_HEAD(&fs_info->ordered_roots); 2508 2682 spin_lock_init(&fs_info->ordered_root_lock); 2683 + 2684 + fs_info->btree_inode = new_inode(sb); 2685 + if (!fs_info->btree_inode) { 2686 + err = -ENOMEM; 2687 + goto fail_bio_counter; 2688 + } 2689 + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2690 + 2509 2691 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2510 2692 GFP_KERNEL); 2511 2693 if (!fs_info->delayed_root) { ··· 2731 2895 sb->s_bdi->congested_fn = btrfs_congested_fn; 2732 2896 sb->s_bdi->congested_data = fs_info; 2733 2897 sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; 2734 - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; 2898 + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; 2735 2899 sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); 2736 2900 sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); 2737 2901 2738 2902 sb->s_blocksize = sectorsize; 2739 2903 sb->s_blocksize_bits = blksize_bits(sectorsize); 2904 + memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); 2740 2905 2741 2906 mutex_lock(&fs_info->chunk_mutex); 2742 2907 ret = btrfs_read_sys_array(fs_info); ··· 2919 3082 ret = btrfs_read_qgroup_config(fs_info); 2920 3083 if (ret) 2921 3084 goto fail_trans_kthread; 3085 + 3086 + if (btrfs_build_ref_tree(fs_info)) 3087 + btrfs_err(fs_info, "couldn't build ref tree"); 2922 3088 2923 3089 /* do not make disk changes in broken FS or nologreplay is given */ 2924 3090 if (btrfs_super_log_root(disk_super) != 0 && ··· 3788 3948 cleanup_srcu_struct(&fs_info->subvol_srcu); 3789 3949 3790 3950 btrfs_free_stripe_hash_table(fs_info); 3951 + btrfs_free_ref_cache(fs_info); 3791 3952 3792 3953 __btrfs_free_block_rsv(root->orphan_block_rsv); 3793 3954 root->orphan_block_rsv = NULL; ··· 3848 4007 buf->len, 3849 4008 fs_info->dirty_metadata_batch); 3850 4009 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3851 - if 
(btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { 4010 + if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) { 3852 4011 btrfs_print_leaf(buf); 3853 4012 ASSERT(0); 3854 4013 } ··· 4113 4272 4114 4273 while ((node = rb_first(&delayed_refs->href_root)) != NULL) { 4115 4274 struct btrfs_delayed_ref_head *head; 4116 - struct btrfs_delayed_ref_node *tmp; 4275 + struct rb_node *n; 4117 4276 bool pin_bytes = false; 4118 4277 4119 4278 head = rb_entry(node, struct btrfs_delayed_ref_head, 4120 4279 href_node); 4121 4280 if (!mutex_trylock(&head->mutex)) { 4122 - refcount_inc(&head->node.refs); 4281 + refcount_inc(&head->refs); 4123 4282 spin_unlock(&delayed_refs->lock); 4124 4283 4125 4284 mutex_lock(&head->mutex); 4126 4285 mutex_unlock(&head->mutex); 4127 - btrfs_put_delayed_ref(&head->node); 4286 + btrfs_put_delayed_ref_head(head); 4128 4287 spin_lock(&delayed_refs->lock); 4129 4288 continue; 4130 4289 } 4131 4290 spin_lock(&head->lock); 4132 - list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, 4133 - list) { 4291 + while ((n = rb_first(&head->ref_tree)) != NULL) { 4292 + ref = rb_entry(n, struct btrfs_delayed_ref_node, 4293 + ref_node); 4134 4294 ref->in_tree = 0; 4135 - list_del(&ref->list); 4295 + rb_erase(&ref->ref_node, &head->ref_tree); 4296 + RB_CLEAR_NODE(&ref->ref_node); 4136 4297 if (!list_empty(&ref->add_list)) 4137 4298 list_del(&ref->add_list); 4138 4299 atomic_dec(&delayed_refs->num_entries); ··· 4147 4304 if (head->processing == 0) 4148 4305 delayed_refs->num_heads_ready--; 4149 4306 atomic_dec(&delayed_refs->num_entries); 4150 - head->node.in_tree = 0; 4151 4307 rb_erase(&head->href_node, &delayed_refs->href_root); 4308 + RB_CLEAR_NODE(&head->href_node); 4152 4309 spin_unlock(&head->lock); 4153 4310 spin_unlock(&delayed_refs->lock); 4154 4311 mutex_unlock(&head->mutex); 4155 4312 4156 4313 if (pin_bytes) 4157 - btrfs_pin_extent(fs_info, head->node.bytenr, 4158 - head->node.num_bytes, 1); 4159 - 
btrfs_put_delayed_ref(&head->node); 4314 + btrfs_pin_extent(fs_info, head->bytenr, 4315 + head->num_bytes, 1); 4316 + btrfs_put_delayed_ref_head(head); 4160 4317 cond_resched(); 4161 4318 spin_lock(&delayed_refs->lock); 4162 4319 }
+394 -441
fs/btrfs/extent-tree.c
··· 26 26 #include <linux/slab.h> 27 27 #include <linux/ratelimit.h> 28 28 #include <linux/percpu_counter.h> 29 + #include <linux/lockdep.h> 29 30 #include "hash.h" 30 31 #include "tree-log.h" 31 32 #include "disk-io.h" ··· 39 38 #include "math.h" 40 39 #include "sysfs.h" 41 40 #include "qgroup.h" 41 + #include "ref-verify.h" 42 42 43 43 #undef SCRAMBLE_DELAYED_REFS 44 44 ··· 63 61 CHUNK_ALLOC_FORCE = 2, 64 62 }; 65 63 66 - static int update_block_group(struct btrfs_trans_handle *trans, 67 - struct btrfs_fs_info *fs_info, u64 bytenr, 68 - u64 num_bytes, int alloc); 69 64 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 70 65 struct btrfs_fs_info *fs_info, 71 66 struct btrfs_delayed_ref_node *node, u64 parent, ··· 90 91 static void dump_space_info(struct btrfs_fs_info *fs_info, 91 92 struct btrfs_space_info *info, u64 bytes, 92 93 int dump_block_groups); 93 - static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, 94 - u64 ram_bytes, u64 num_bytes, int delalloc); 95 - static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, 96 - u64 num_bytes, int delalloc); 97 94 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 98 95 u64 num_bytes); 99 - static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, 100 - struct btrfs_space_info *space_info, 101 - u64 orig_bytes, 102 - enum btrfs_reserve_flush_enum flush, 103 - bool system_chunk); 104 96 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 105 97 struct btrfs_space_info *space_info, 106 98 u64 num_bytes); ··· 642 652 cache->cached = BTRFS_CACHE_FAST; 643 653 spin_unlock(&cache->lock); 644 654 645 - if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 655 + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { 646 656 mutex_lock(&caching_ctl->mutex); 647 657 ret = load_free_space_cache(fs_info, cache); 648 658 ··· 913 923 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 914 924 if (head) { 915 925 if 
(!mutex_trylock(&head->mutex)) { 916 - refcount_inc(&head->node.refs); 926 + refcount_inc(&head->refs); 917 927 spin_unlock(&delayed_refs->lock); 918 928 919 929 btrfs_release_path(path); ··· 924 934 */ 925 935 mutex_lock(&head->mutex); 926 936 mutex_unlock(&head->mutex); 927 - btrfs_put_delayed_ref(&head->node); 937 + btrfs_put_delayed_ref_head(head); 928 938 goto search_again; 929 939 } 930 940 spin_lock(&head->lock); ··· 933 943 else 934 944 BUG_ON(num_refs == 0); 935 945 936 - num_refs += head->node.ref_mod; 946 + num_refs += head->ref_mod; 937 947 spin_unlock(&head->lock); 938 948 mutex_unlock(&head->mutex); 939 949 } ··· 2179 2189 2180 2190 /* Can return -ENOMEM */ 2181 2191 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2182 - struct btrfs_fs_info *fs_info, 2192 + struct btrfs_root *root, 2183 2193 u64 bytenr, u64 num_bytes, u64 parent, 2184 2194 u64 root_objectid, u64 owner, u64 offset) 2185 2195 { 2196 + struct btrfs_fs_info *fs_info = root->fs_info; 2186 2197 int old_ref_mod, new_ref_mod; 2187 2198 int ret; 2188 2199 2189 2200 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2190 2201 root_objectid == BTRFS_TREE_LOG_OBJECTID); 2202 + 2203 + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, 2204 + owner, offset, BTRFS_ADD_DELAYED_REF); 2191 2205 2192 2206 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2193 2207 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, ··· 2338 2344 2339 2345 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2340 2346 struct btrfs_fs_info *fs_info, 2341 - struct btrfs_delayed_ref_node *node, 2347 + struct btrfs_delayed_ref_head *head, 2342 2348 struct btrfs_delayed_extent_op *extent_op) 2343 2349 { 2344 2350 struct btrfs_key key; ··· 2360 2366 if (!path) 2361 2367 return -ENOMEM; 2362 2368 2363 - key.objectid = node->bytenr; 2369 + key.objectid = head->bytenr; 2364 2370 2365 2371 if (metadata) { 2366 2372 key.type = BTRFS_METADATA_ITEM_KEY; 2367 2373 key.offset = extent_op->level; 2368 2374 
} else { 2369 2375 key.type = BTRFS_EXTENT_ITEM_KEY; 2370 - key.offset = node->num_bytes; 2376 + key.offset = head->num_bytes; 2371 2377 } 2372 2378 2373 2379 again: ··· 2384 2390 path->slots[0]--; 2385 2391 btrfs_item_key_to_cpu(path->nodes[0], &key, 2386 2392 path->slots[0]); 2387 - if (key.objectid == node->bytenr && 2393 + if (key.objectid == head->bytenr && 2388 2394 key.type == BTRFS_EXTENT_ITEM_KEY && 2389 - key.offset == node->num_bytes) 2395 + key.offset == head->num_bytes) 2390 2396 ret = 0; 2391 2397 } 2392 2398 if (ret > 0) { 2393 2399 btrfs_release_path(path); 2394 2400 metadata = 0; 2395 2401 2396 - key.objectid = node->bytenr; 2397 - key.offset = node->num_bytes; 2402 + key.objectid = head->bytenr; 2403 + key.offset = head->num_bytes; 2398 2404 key.type = BTRFS_EXTENT_ITEM_KEY; 2399 2405 goto again; 2400 2406 } ··· 2501 2507 return 0; 2502 2508 } 2503 2509 2504 - if (btrfs_delayed_ref_is_head(node)) { 2505 - struct btrfs_delayed_ref_head *head; 2506 - /* 2507 - * we've hit the end of the chain and we were supposed 2508 - * to insert this extent into the tree. 
But, it got 2509 - * deleted before we ever needed to insert it, so all 2510 - * we have to do is clean up the accounting 2511 - */ 2512 - BUG_ON(extent_op); 2513 - head = btrfs_delayed_node_to_head(node); 2514 - trace_run_delayed_ref_head(fs_info, node, head, node->action); 2515 - 2516 - if (head->total_ref_mod < 0) { 2517 - struct btrfs_block_group_cache *cache; 2518 - 2519 - cache = btrfs_lookup_block_group(fs_info, node->bytenr); 2520 - ASSERT(cache); 2521 - percpu_counter_add(&cache->space_info->total_bytes_pinned, 2522 - -node->num_bytes); 2523 - btrfs_put_block_group(cache); 2524 - } 2525 - 2526 - if (insert_reserved) { 2527 - btrfs_pin_extent(fs_info, node->bytenr, 2528 - node->num_bytes, 1); 2529 - if (head->is_data) { 2530 - ret = btrfs_del_csums(trans, fs_info, 2531 - node->bytenr, 2532 - node->num_bytes); 2533 - } 2534 - } 2535 - 2536 - /* Also free its reserved qgroup space */ 2537 - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, 2538 - head->qgroup_reserved); 2539 - return ret; 2540 - } 2541 - 2542 2510 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2543 2511 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2544 2512 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, ··· 2519 2563 { 2520 2564 struct btrfs_delayed_ref_node *ref; 2521 2565 2522 - if (list_empty(&head->ref_list)) 2566 + if (RB_EMPTY_ROOT(&head->ref_tree)) 2523 2567 return NULL; 2524 2568 2525 2569 /* ··· 2532 2576 return list_first_entry(&head->ref_add_list, 2533 2577 struct btrfs_delayed_ref_node, add_list); 2534 2578 2535 - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 2536 - list); 2579 + ref = rb_entry(rb_first(&head->ref_tree), 2580 + struct btrfs_delayed_ref_node, ref_node); 2537 2581 ASSERT(list_empty(&ref->add_list)); 2538 2582 return ref; 2583 + } 2584 + 2585 + static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 2586 + struct btrfs_delayed_ref_head *head) 2587 + { 2588 + spin_lock(&delayed_refs->lock); 2589 
+ head->processing = 0; 2590 + delayed_refs->num_heads_ready++; 2591 + spin_unlock(&delayed_refs->lock); 2592 + btrfs_delayed_ref_unlock(head); 2593 + } 2594 + 2595 + static int cleanup_extent_op(struct btrfs_trans_handle *trans, 2596 + struct btrfs_fs_info *fs_info, 2597 + struct btrfs_delayed_ref_head *head) 2598 + { 2599 + struct btrfs_delayed_extent_op *extent_op = head->extent_op; 2600 + int ret; 2601 + 2602 + if (!extent_op) 2603 + return 0; 2604 + head->extent_op = NULL; 2605 + if (head->must_insert_reserved) { 2606 + btrfs_free_delayed_extent_op(extent_op); 2607 + return 0; 2608 + } 2609 + spin_unlock(&head->lock); 2610 + ret = run_delayed_extent_op(trans, fs_info, head, extent_op); 2611 + btrfs_free_delayed_extent_op(extent_op); 2612 + return ret ? ret : 1; 2613 + } 2614 + 2615 + static int cleanup_ref_head(struct btrfs_trans_handle *trans, 2616 + struct btrfs_fs_info *fs_info, 2617 + struct btrfs_delayed_ref_head *head) 2618 + { 2619 + struct btrfs_delayed_ref_root *delayed_refs; 2620 + int ret; 2621 + 2622 + delayed_refs = &trans->transaction->delayed_refs; 2623 + 2624 + ret = cleanup_extent_op(trans, fs_info, head); 2625 + if (ret < 0) { 2626 + unselect_delayed_ref_head(delayed_refs, head); 2627 + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2628 + return ret; 2629 + } else if (ret) { 2630 + return ret; 2631 + } 2632 + 2633 + /* 2634 + * Need to drop our head ref lock and re-acquire the delayed ref lock 2635 + * and then re-check to make sure nobody got added. 
2636 + */ 2637 + spin_unlock(&head->lock); 2638 + spin_lock(&delayed_refs->lock); 2639 + spin_lock(&head->lock); 2640 + if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) { 2641 + spin_unlock(&head->lock); 2642 + spin_unlock(&delayed_refs->lock); 2643 + return 1; 2644 + } 2645 + delayed_refs->num_heads--; 2646 + rb_erase(&head->href_node, &delayed_refs->href_root); 2647 + RB_CLEAR_NODE(&head->href_node); 2648 + spin_unlock(&delayed_refs->lock); 2649 + spin_unlock(&head->lock); 2650 + atomic_dec(&delayed_refs->num_entries); 2651 + 2652 + trace_run_delayed_ref_head(fs_info, head, 0); 2653 + 2654 + if (head->total_ref_mod < 0) { 2655 + struct btrfs_block_group_cache *cache; 2656 + 2657 + cache = btrfs_lookup_block_group(fs_info, head->bytenr); 2658 + ASSERT(cache); 2659 + percpu_counter_add(&cache->space_info->total_bytes_pinned, 2660 + -head->num_bytes); 2661 + btrfs_put_block_group(cache); 2662 + 2663 + if (head->is_data) { 2664 + spin_lock(&delayed_refs->lock); 2665 + delayed_refs->pending_csums -= head->num_bytes; 2666 + spin_unlock(&delayed_refs->lock); 2667 + } 2668 + } 2669 + 2670 + if (head->must_insert_reserved) { 2671 + btrfs_pin_extent(fs_info, head->bytenr, 2672 + head->num_bytes, 1); 2673 + if (head->is_data) { 2674 + ret = btrfs_del_csums(trans, fs_info, head->bytenr, 2675 + head->num_bytes); 2676 + } 2677 + } 2678 + 2679 + /* Also free its reserved qgroup space */ 2680 + btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, 2681 + head->qgroup_reserved); 2682 + btrfs_delayed_ref_unlock(head); 2683 + btrfs_put_delayed_ref_head(head); 2684 + return 0; 2539 2685 } 2540 2686 2541 2687 /* ··· 2713 2655 if (ref && ref->seq && 2714 2656 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2715 2657 spin_unlock(&locked_ref->lock); 2716 - spin_lock(&delayed_refs->lock); 2717 - locked_ref->processing = 0; 2718 - delayed_refs->num_heads_ready++; 2719 - spin_unlock(&delayed_refs->lock); 2720 - btrfs_delayed_ref_unlock(locked_ref); 2658 + 
unselect_delayed_ref_head(delayed_refs, locked_ref); 2721 2659 locked_ref = NULL; 2722 2660 cond_resched(); 2723 2661 count++; ··· 2721 2667 } 2722 2668 2723 2669 /* 2724 - * record the must insert reserved flag before we 2725 - * drop the spin lock. 2670 + * We're done processing refs in this ref_head, clean everything 2671 + * up and move on to the next ref_head. 2672 + */ 2673 + if (!ref) { 2674 + ret = cleanup_ref_head(trans, fs_info, locked_ref); 2675 + if (ret > 0 ) { 2676 + /* We dropped our lock, we need to loop. */ 2677 + ret = 0; 2678 + continue; 2679 + } else if (ret) { 2680 + return ret; 2681 + } 2682 + locked_ref = NULL; 2683 + count++; 2684 + continue; 2685 + } 2686 + 2687 + actual_count++; 2688 + ref->in_tree = 0; 2689 + rb_erase(&ref->ref_node, &locked_ref->ref_tree); 2690 + RB_CLEAR_NODE(&ref->ref_node); 2691 + if (!list_empty(&ref->add_list)) 2692 + list_del(&ref->add_list); 2693 + /* 2694 + * When we play the delayed ref, also correct the ref_mod on 2695 + * head 2696 + */ 2697 + switch (ref->action) { 2698 + case BTRFS_ADD_DELAYED_REF: 2699 + case BTRFS_ADD_DELAYED_EXTENT: 2700 + locked_ref->ref_mod -= ref->ref_mod; 2701 + break; 2702 + case BTRFS_DROP_DELAYED_REF: 2703 + locked_ref->ref_mod += ref->ref_mod; 2704 + break; 2705 + default: 2706 + WARN_ON(1); 2707 + } 2708 + atomic_dec(&delayed_refs->num_entries); 2709 + 2710 + /* 2711 + * Record the must-insert_reserved flag before we drop the spin 2712 + * lock. 
2726 2713 */ 2727 2714 must_insert_reserved = locked_ref->must_insert_reserved; 2728 2715 locked_ref->must_insert_reserved = 0; 2729 2716 2730 2717 extent_op = locked_ref->extent_op; 2731 2718 locked_ref->extent_op = NULL; 2732 - 2733 - if (!ref) { 2734 - 2735 - 2736 - /* All delayed refs have been processed, Go ahead 2737 - * and send the head node to run_one_delayed_ref, 2738 - * so that any accounting fixes can happen 2739 - */ 2740 - ref = &locked_ref->node; 2741 - 2742 - if (extent_op && must_insert_reserved) { 2743 - btrfs_free_delayed_extent_op(extent_op); 2744 - extent_op = NULL; 2745 - } 2746 - 2747 - if (extent_op) { 2748 - spin_unlock(&locked_ref->lock); 2749 - ret = run_delayed_extent_op(trans, fs_info, 2750 - ref, extent_op); 2751 - btrfs_free_delayed_extent_op(extent_op); 2752 - 2753 - if (ret) { 2754 - /* 2755 - * Need to reset must_insert_reserved if 2756 - * there was an error so the abort stuff 2757 - * can cleanup the reserved space 2758 - * properly. 2759 - */ 2760 - if (must_insert_reserved) 2761 - locked_ref->must_insert_reserved = 1; 2762 - spin_lock(&delayed_refs->lock); 2763 - locked_ref->processing = 0; 2764 - delayed_refs->num_heads_ready++; 2765 - spin_unlock(&delayed_refs->lock); 2766 - btrfs_debug(fs_info, 2767 - "run_delayed_extent_op returned %d", 2768 - ret); 2769 - btrfs_delayed_ref_unlock(locked_ref); 2770 - return ret; 2771 - } 2772 - continue; 2773 - } 2774 - 2775 - /* 2776 - * Need to drop our head ref lock and re-acquire the 2777 - * delayed ref lock and then re-check to make sure 2778 - * nobody got added. 
2779 - */ 2780 - spin_unlock(&locked_ref->lock); 2781 - spin_lock(&delayed_refs->lock); 2782 - spin_lock(&locked_ref->lock); 2783 - if (!list_empty(&locked_ref->ref_list) || 2784 - locked_ref->extent_op) { 2785 - spin_unlock(&locked_ref->lock); 2786 - spin_unlock(&delayed_refs->lock); 2787 - continue; 2788 - } 2789 - ref->in_tree = 0; 2790 - delayed_refs->num_heads--; 2791 - rb_erase(&locked_ref->href_node, 2792 - &delayed_refs->href_root); 2793 - spin_unlock(&delayed_refs->lock); 2794 - } else { 2795 - actual_count++; 2796 - ref->in_tree = 0; 2797 - list_del(&ref->list); 2798 - if (!list_empty(&ref->add_list)) 2799 - list_del(&ref->add_list); 2800 - } 2801 - atomic_dec(&delayed_refs->num_entries); 2802 - 2803 - if (!btrfs_delayed_ref_is_head(ref)) { 2804 - /* 2805 - * when we play the delayed ref, also correct the 2806 - * ref_mod on head 2807 - */ 2808 - switch (ref->action) { 2809 - case BTRFS_ADD_DELAYED_REF: 2810 - case BTRFS_ADD_DELAYED_EXTENT: 2811 - locked_ref->node.ref_mod -= ref->ref_mod; 2812 - break; 2813 - case BTRFS_DROP_DELAYED_REF: 2814 - locked_ref->node.ref_mod += ref->ref_mod; 2815 - break; 2816 - default: 2817 - WARN_ON(1); 2818 - } 2819 - } 2820 2719 spin_unlock(&locked_ref->lock); 2821 2720 2822 2721 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, ··· 2777 2770 2778 2771 btrfs_free_delayed_extent_op(extent_op); 2779 2772 if (ret) { 2780 - spin_lock(&delayed_refs->lock); 2781 - locked_ref->processing = 0; 2782 - delayed_refs->num_heads_ready++; 2783 - spin_unlock(&delayed_refs->lock); 2784 - btrfs_delayed_ref_unlock(locked_ref); 2773 + unselect_delayed_ref_head(delayed_refs, locked_ref); 2785 2774 btrfs_put_delayed_ref(ref); 2786 2775 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2787 2776 ret); 2788 2777 return ret; 2789 2778 } 2790 2779 2791 - /* 2792 - * If this node is a head, that means all the refs in this head 2793 - * have been dealt with, and we will pick the next head to deal 2794 - * with, so we must unlock the 
head and drop it from the cluster 2795 - * list before we release it. 2796 - */ 2797 - if (btrfs_delayed_ref_is_head(ref)) { 2798 - if (locked_ref->is_data && 2799 - locked_ref->total_ref_mod < 0) { 2800 - spin_lock(&delayed_refs->lock); 2801 - delayed_refs->pending_csums -= ref->num_bytes; 2802 - spin_unlock(&delayed_refs->lock); 2803 - } 2804 - btrfs_delayed_ref_unlock(locked_ref); 2805 - locked_ref = NULL; 2806 - } 2807 2780 btrfs_put_delayed_ref(ref); 2808 2781 count++; 2809 2782 cond_resched(); ··· 3087 3100 spin_unlock(&delayed_refs->lock); 3088 3101 goto out; 3089 3102 } 3090 - 3091 - while (node) { 3092 - head = rb_entry(node, struct btrfs_delayed_ref_head, 3093 - href_node); 3094 - if (btrfs_delayed_ref_is_head(&head->node)) { 3095 - struct btrfs_delayed_ref_node *ref; 3096 - 3097 - ref = &head->node; 3098 - refcount_inc(&ref->refs); 3099 - 3100 - spin_unlock(&delayed_refs->lock); 3101 - /* 3102 - * Mutex was contended, block until it's 3103 - * released and try again 3104 - */ 3105 - mutex_lock(&head->mutex); 3106 - mutex_unlock(&head->mutex); 3107 - 3108 - btrfs_put_delayed_ref(ref); 3109 - cond_resched(); 3110 - goto again; 3111 - } else { 3112 - WARN_ON(1); 3113 - } 3114 - node = rb_next(node); 3115 - } 3103 + head = rb_entry(node, struct btrfs_delayed_ref_head, 3104 + href_node); 3105 + refcount_inc(&head->refs); 3116 3106 spin_unlock(&delayed_refs->lock); 3107 + 3108 + /* Mutex was contended, block until it's released and retry. 
*/ 3109 + mutex_lock(&head->mutex); 3110 + mutex_unlock(&head->mutex); 3111 + 3112 + btrfs_put_delayed_ref_head(head); 3117 3113 cond_resched(); 3118 3114 goto again; 3119 3115 } ··· 3139 3169 struct btrfs_delayed_data_ref *data_ref; 3140 3170 struct btrfs_delayed_ref_root *delayed_refs; 3141 3171 struct btrfs_transaction *cur_trans; 3172 + struct rb_node *node; 3142 3173 int ret = 0; 3143 3174 3144 3175 cur_trans = root->fs_info->running_transaction; ··· 3155 3184 } 3156 3185 3157 3186 if (!mutex_trylock(&head->mutex)) { 3158 - refcount_inc(&head->node.refs); 3187 + refcount_inc(&head->refs); 3159 3188 spin_unlock(&delayed_refs->lock); 3160 3189 3161 3190 btrfs_release_path(path); ··· 3166 3195 */ 3167 3196 mutex_lock(&head->mutex); 3168 3197 mutex_unlock(&head->mutex); 3169 - btrfs_put_delayed_ref(&head->node); 3198 + btrfs_put_delayed_ref_head(head); 3170 3199 return -EAGAIN; 3171 3200 } 3172 3201 spin_unlock(&delayed_refs->lock); 3173 3202 3174 3203 spin_lock(&head->lock); 3175 - list_for_each_entry(ref, &head->ref_list, list) { 3204 + /* 3205 + * XXX: We should replace this with a proper search function in the 3206 + * future. 
3207 + */ 3208 + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { 3209 + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 3176 3210 /* If it's a shared ref we know a cross reference exists */ 3177 3211 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3178 3212 ret = 1; ··· 3327 3351 int level; 3328 3352 int ret = 0; 3329 3353 int (*process_func)(struct btrfs_trans_handle *, 3330 - struct btrfs_fs_info *, 3354 + struct btrfs_root *, 3331 3355 u64, u64, u64, u64, u64, u64); 3332 3356 3333 3357 ··· 3367 3391 3368 3392 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3369 3393 key.offset -= btrfs_file_extent_offset(buf, fi); 3370 - ret = process_func(trans, fs_info, bytenr, num_bytes, 3394 + ret = process_func(trans, root, bytenr, num_bytes, 3371 3395 parent, ref_root, key.objectid, 3372 3396 key.offset); 3373 3397 if (ret) ··· 3375 3399 } else { 3376 3400 bytenr = btrfs_node_blockptr(buf, i); 3377 3401 num_bytes = fs_info->nodesize; 3378 - ret = process_func(trans, fs_info, bytenr, num_bytes, 3402 + ret = process_func(trans, root, bytenr, num_bytes, 3379 3403 parent, ref_root, level - 1, 0); 3380 3404 if (ret) 3381 3405 goto fail; ··· 4819 4843 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 4820 4844 u64 orig, bool wait_ordered) 4821 4845 { 4822 - struct btrfs_block_rsv *block_rsv; 4823 4846 struct btrfs_space_info *space_info; 4824 4847 struct btrfs_trans_handle *trans; 4825 4848 u64 delalloc_bytes; ··· 4834 4859 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4835 4860 4836 4861 trans = (struct btrfs_trans_handle *)current->journal_info; 4837 - block_rsv = &fs_info->delalloc_block_rsv; 4838 - space_info = block_rsv->space_info; 4862 + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4839 4863 4840 4864 delalloc_bytes = percpu_counter_sum_positive( 4841 4865 &fs_info->delalloc_bytes); ··· 4893 4919 } 4894 4920 } 4895 4921 4922 + struct reserve_ticket { 4923 + u64 bytes; 4924 + int error; 
4925 + struct list_head list; 4926 + wait_queue_head_t wait; 4927 + }; 4928 + 4896 4929 /** 4897 4930 * maybe_commit_transaction - possibly commit the transaction if its ok to 4898 4931 * @root - the root we're allocating for ··· 4911 4930 * will return -ENOSPC. 4912 4931 */ 4913 4932 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4914 - struct btrfs_space_info *space_info, 4915 - u64 bytes, int force) 4933 + struct btrfs_space_info *space_info) 4916 4934 { 4935 + struct reserve_ticket *ticket = NULL; 4917 4936 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4918 4937 struct btrfs_trans_handle *trans; 4938 + u64 bytes; 4919 4939 4920 4940 trans = (struct btrfs_trans_handle *)current->journal_info; 4921 4941 if (trans) 4922 4942 return -EAGAIN; 4923 4943 4924 - if (force) 4925 - goto commit; 4944 + spin_lock(&space_info->lock); 4945 + if (!list_empty(&space_info->priority_tickets)) 4946 + ticket = list_first_entry(&space_info->priority_tickets, 4947 + struct reserve_ticket, list); 4948 + else if (!list_empty(&space_info->tickets)) 4949 + ticket = list_first_entry(&space_info->tickets, 4950 + struct reserve_ticket, list); 4951 + bytes = (ticket) ? 
ticket->bytes : 0; 4952 + spin_unlock(&space_info->lock); 4953 + 4954 + if (!bytes) 4955 + return 0; 4926 4956 4927 4957 /* See if there is enough pinned space to make this reservation */ 4928 4958 if (percpu_counter_compare(&space_info->total_bytes_pinned, ··· 4948 4956 return -ENOSPC; 4949 4957 4950 4958 spin_lock(&delayed_rsv->lock); 4959 + if (delayed_rsv->size > bytes) 4960 + bytes = 0; 4961 + else 4962 + bytes -= delayed_rsv->size; 4951 4963 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4952 - bytes - delayed_rsv->size) < 0) { 4964 + bytes) < 0) { 4953 4965 spin_unlock(&delayed_rsv->lock); 4954 4966 return -ENOSPC; 4955 4967 } ··· 4966 4970 4967 4971 return btrfs_commit_transaction(trans); 4968 4972 } 4969 - 4970 - struct reserve_ticket { 4971 - u64 bytes; 4972 - int error; 4973 - struct list_head list; 4974 - wait_queue_head_t wait; 4975 - }; 4976 4973 4977 4974 /* 4978 4975 * Try to flush some data based on policy set by @state. This is only advisory ··· 5016 5027 ret = 0; 5017 5028 break; 5018 5029 case COMMIT_TRANS: 5019 - ret = may_commit_transaction(fs_info, space_info, 5020 - num_bytes, 0); 5030 + ret = may_commit_transaction(fs_info, space_info); 5021 5031 break; 5022 5032 default: 5023 5033 ret = -ENOSPC; ··· 5570 5582 } 5571 5583 } 5572 5584 5573 - static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5585 + static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5574 5586 struct btrfs_block_rsv *block_rsv, 5575 5587 struct btrfs_block_rsv *dest, u64 num_bytes) 5576 5588 { 5577 5589 struct btrfs_space_info *space_info = block_rsv->space_info; 5590 + u64 ret; 5578 5591 5579 5592 spin_lock(&block_rsv->lock); 5580 5593 if (num_bytes == (u64)-1) ··· 5590 5601 } 5591 5602 spin_unlock(&block_rsv->lock); 5592 5603 5604 + ret = num_bytes; 5593 5605 if (num_bytes > 0) { 5594 5606 if (dest) { 5595 5607 spin_lock(&dest->lock); ··· 5610 5620 space_info_add_old_bytes(fs_info, space_info, 5611 5621 num_bytes); 5612 5622 } 5623 
+ return ret; 5613 5624 } 5614 5625 5615 5626 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, ··· 5634 5643 rsv->type = type; 5635 5644 } 5636 5645 5646 + void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, 5647 + struct btrfs_block_rsv *rsv, 5648 + unsigned short type) 5649 + { 5650 + btrfs_init_block_rsv(rsv, type); 5651 + rsv->space_info = __find_space_info(fs_info, 5652 + BTRFS_BLOCK_GROUP_METADATA); 5653 + } 5654 + 5637 5655 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5638 5656 unsigned short type) 5639 5657 { ··· 5652 5652 if (!block_rsv) 5653 5653 return NULL; 5654 5654 5655 - btrfs_init_block_rsv(block_rsv, type); 5656 - block_rsv->space_info = __find_space_info(fs_info, 5657 - BTRFS_BLOCK_GROUP_METADATA); 5655 + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); 5658 5656 return block_rsv; 5659 5657 } 5660 5658 ··· 5735 5737 return ret; 5736 5738 } 5737 5739 5740 + /** 5741 + * btrfs_inode_rsv_refill - refill the inode block rsv. 5742 + * @inode - the inode we are refilling. 5743 + * @flush - the flusing restriction. 5744 + * 5745 + * Essentially the same as btrfs_block_rsv_refill, except it uses the 5746 + * block_rsv->size as the minimum size. We'll either refill the missing amount 5747 + * or return if we already have enough space. This will also handle the resreve 5748 + * tracepoint for the reserved amount. 
5749 + */ 5750 + int btrfs_inode_rsv_refill(struct btrfs_inode *inode, 5751 + enum btrfs_reserve_flush_enum flush) 5752 + { 5753 + struct btrfs_root *root = inode->root; 5754 + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5755 + u64 num_bytes = 0; 5756 + int ret = -ENOSPC; 5757 + 5758 + spin_lock(&block_rsv->lock); 5759 + if (block_rsv->reserved < block_rsv->size) 5760 + num_bytes = block_rsv->size - block_rsv->reserved; 5761 + spin_unlock(&block_rsv->lock); 5762 + 5763 + if (num_bytes == 0) 5764 + return 0; 5765 + 5766 + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5767 + if (!ret) { 5768 + block_rsv_add_bytes(block_rsv, num_bytes, 0); 5769 + trace_btrfs_space_reservation(root->fs_info, "delalloc", 5770 + btrfs_ino(inode), num_bytes, 1); 5771 + } 5772 + return ret; 5773 + } 5774 + 5775 + /** 5776 + * btrfs_inode_rsv_release - release any excessive reservation. 5777 + * @inode - the inode we need to release from. 5778 + * 5779 + * This is the same as btrfs_block_rsv_release, except that it handles the 5780 + * tracepoint for the reservation. 5781 + */ 5782 + void btrfs_inode_rsv_release(struct btrfs_inode *inode) 5783 + { 5784 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 5785 + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5786 + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5787 + u64 released = 0; 5788 + 5789 + /* 5790 + * Since we statically set the block_rsv->size we just want to say we 5791 + * are releasing 0 bytes, and then we'll just get the reservation over 5792 + * the size free'd. 
5793 + */ 5794 + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0); 5795 + if (released > 0) 5796 + trace_btrfs_space_reservation(fs_info, "delalloc", 5797 + btrfs_ino(inode), released, 0); 5798 + } 5799 + 5738 5800 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5739 5801 struct btrfs_block_rsv *block_rsv, 5740 5802 u64 num_bytes) ··· 5866 5808 5867 5809 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5868 5810 fs_info->global_block_rsv.space_info = space_info; 5869 - fs_info->delalloc_block_rsv.space_info = space_info; 5870 5811 fs_info->trans_block_rsv.space_info = space_info; 5871 5812 fs_info->empty_block_rsv.space_info = space_info; 5872 5813 fs_info->delayed_block_rsv.space_info = space_info; ··· 5885 5828 { 5886 5829 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5887 5830 (u64)-1); 5888 - WARN_ON(fs_info->delalloc_block_rsv.size > 0); 5889 - WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 5890 5831 WARN_ON(fs_info->trans_block_rsv.size > 0); 5891 5832 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5892 5833 WARN_ON(fs_info->chunk_block_rsv.size > 0); ··· 5896 5841 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 5897 5842 struct btrfs_fs_info *fs_info) 5898 5843 { 5899 - if (!trans->block_rsv) 5844 + if (!trans->block_rsv) { 5845 + ASSERT(!trans->bytes_reserved); 5900 5846 return; 5847 + } 5901 5848 5902 5849 if (!trans->bytes_reserved) 5903 5850 return; 5904 5851 5852 + ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); 5905 5853 trace_btrfs_space_reservation(fs_info, "transaction", 5906 5854 trans->transid, trans->bytes_reserved, 0); 5907 5855 btrfs_block_rsv_release(fs_info, trans->block_rsv, ··· 6026 5968 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 6027 5969 } 6028 5970 6029 - /** 6030 - * drop_outstanding_extent - drop an outstanding extent 6031 - * @inode: the inode we're dropping the extent for 6032 - * @num_bytes: the number of bytes we're 
releasing. 6033 - * 6034 - * This is called when we are freeing up an outstanding extent, either called 6035 - * after an error or after an extent is written. This will return the number of 6036 - * reserved extents that need to be freed. This must be called with 6037 - * BTRFS_I(inode)->lock held. 6038 - */ 6039 - static unsigned drop_outstanding_extent(struct btrfs_inode *inode, 6040 - u64 num_bytes) 5971 + static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, 5972 + struct btrfs_inode *inode) 6041 5973 { 6042 - unsigned drop_inode_space = 0; 6043 - unsigned dropped_extents = 0; 6044 - unsigned num_extents; 5974 + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5975 + u64 reserve_size = 0; 5976 + u64 csum_leaves; 5977 + unsigned outstanding_extents; 6045 5978 6046 - num_extents = count_max_extents(num_bytes); 6047 - ASSERT(num_extents); 6048 - ASSERT(inode->outstanding_extents >= num_extents); 6049 - inode->outstanding_extents -= num_extents; 5979 + lockdep_assert_held(&inode->lock); 5980 + outstanding_extents = inode->outstanding_extents; 5981 + if (outstanding_extents) 5982 + reserve_size = btrfs_calc_trans_metadata_size(fs_info, 5983 + outstanding_extents + 1); 5984 + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, 5985 + inode->csum_bytes); 5986 + reserve_size += btrfs_calc_trans_metadata_size(fs_info, 5987 + csum_leaves); 6050 5988 6051 - if (inode->outstanding_extents == 0 && 6052 - test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 6053 - &inode->runtime_flags)) 6054 - drop_inode_space = 1; 6055 - 6056 - /* 6057 - * If we have more or the same amount of outstanding extents than we have 6058 - * reserved then we need to leave the reserved extents count alone. 
6059 - */ 6060 - if (inode->outstanding_extents >= inode->reserved_extents) 6061 - return drop_inode_space; 6062 - 6063 - dropped_extents = inode->reserved_extents - inode->outstanding_extents; 6064 - inode->reserved_extents -= dropped_extents; 6065 - return dropped_extents + drop_inode_space; 6066 - } 6067 - 6068 - /** 6069 - * calc_csum_metadata_size - return the amount of metadata space that must be 6070 - * reserved/freed for the given bytes. 6071 - * @inode: the inode we're manipulating 6072 - * @num_bytes: the number of bytes in question 6073 - * @reserve: 1 if we are reserving space, 0 if we are freeing space 6074 - * 6075 - * This adjusts the number of csum_bytes in the inode and then returns the 6076 - * correct amount of metadata that must either be reserved or freed. We 6077 - * calculate how many checksums we can fit into one leaf and then divide the 6078 - * number of bytes that will need to be checksumed by this value to figure out 6079 - * how many checksums will be required. If we are adding bytes then the number 6080 - * may go up and we will return the number of additional bytes that must be 6081 - * reserved. If it is going down we will return the number of bytes that must 6082 - * be freed. 6083 - * 6084 - * This must be called with BTRFS_I(inode)->lock held. 
6085 - */ 6086 - static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, 6087 - int reserve) 6088 - { 6089 - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6090 - u64 old_csums, num_csums; 6091 - 6092 - if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) 6093 - return 0; 6094 - 6095 - old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); 6096 - if (reserve) 6097 - inode->csum_bytes += num_bytes; 6098 - else 6099 - inode->csum_bytes -= num_bytes; 6100 - num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); 6101 - 6102 - /* No change, no need to reserve more */ 6103 - if (old_csums == num_csums) 6104 - return 0; 6105 - 6106 - if (reserve) 6107 - return btrfs_calc_trans_metadata_size(fs_info, 6108 - num_csums - old_csums); 6109 - 6110 - return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); 5989 + spin_lock(&block_rsv->lock); 5990 + block_rsv->size = reserve_size; 5991 + spin_unlock(&block_rsv->lock); 6111 5992 } 6112 5993 6113 5994 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 6114 5995 { 6115 5996 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6116 5997 struct btrfs_root *root = inode->root; 6117 - struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; 6118 - u64 to_reserve = 0; 6119 - u64 csum_bytes; 6120 5998 unsigned nr_extents; 6121 5999 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 6122 6000 int ret = 0; 6123 6001 bool delalloc_lock = true; 6124 - u64 to_free = 0; 6125 - unsigned dropped; 6126 - bool release_extra = false; 6127 6002 6128 6003 /* If we are a free space inode we need to not flush since we will be in 6129 6004 * the middle of a transaction commit. We also don't need the delalloc ··· 6082 6091 6083 6092 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6084 6093 6094 + /* Add our new extents and calculate the new rsv size. 
*/ 6085 6095 spin_lock(&inode->lock); 6086 6096 nr_extents = count_max_extents(num_bytes); 6087 - inode->outstanding_extents += nr_extents; 6088 - 6089 - nr_extents = 0; 6090 - if (inode->outstanding_extents > inode->reserved_extents) 6091 - nr_extents += inode->outstanding_extents - 6092 - inode->reserved_extents; 6093 - 6094 - /* We always want to reserve a slot for updating the inode. */ 6095 - to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); 6096 - to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 6097 - csum_bytes = inode->csum_bytes; 6097 + btrfs_mod_outstanding_extents(inode, nr_extents); 6098 + inode->csum_bytes += num_bytes; 6099 + btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6098 6100 spin_unlock(&inode->lock); 6099 6101 6100 6102 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { ··· 6097 6113 goto out_fail; 6098 6114 } 6099 6115 6100 - ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); 6116 + ret = btrfs_inode_rsv_refill(inode, flush); 6101 6117 if (unlikely(ret)) { 6102 6118 btrfs_qgroup_free_meta(root, 6103 6119 nr_extents * fs_info->nodesize); 6104 6120 goto out_fail; 6105 6121 } 6106 6122 6107 - spin_lock(&inode->lock); 6108 - if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 6109 - &inode->runtime_flags)) { 6110 - to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); 6111 - release_extra = true; 6112 - } 6113 - inode->reserved_extents += nr_extents; 6114 - spin_unlock(&inode->lock); 6115 - 6116 6123 if (delalloc_lock) 6117 6124 mutex_unlock(&inode->delalloc_mutex); 6118 - 6119 - if (to_reserve) 6120 - trace_btrfs_space_reservation(fs_info, "delalloc", 6121 - btrfs_ino(inode), to_reserve, 1); 6122 - if (release_extra) 6123 - btrfs_block_rsv_release(fs_info, block_rsv, 6124 - btrfs_calc_trans_metadata_size(fs_info, 1)); 6125 6125 return 0; 6126 6126 6127 6127 out_fail: 6128 6128 spin_lock(&inode->lock); 6129 - dropped = drop_outstanding_extent(inode, num_bytes); 6130 - /* 6131 - * 
If the inodes csum_bytes is the same as the original 6132 - * csum_bytes then we know we haven't raced with any free()ers 6133 - * so we can just reduce our inodes csum bytes and carry on. 6134 - */ 6135 - if (inode->csum_bytes == csum_bytes) { 6136 - calc_csum_metadata_size(inode, num_bytes, 0); 6137 - } else { 6138 - u64 orig_csum_bytes = inode->csum_bytes; 6139 - u64 bytes; 6140 - 6141 - /* 6142 - * This is tricky, but first we need to figure out how much we 6143 - * freed from any free-ers that occurred during this 6144 - * reservation, so we reset ->csum_bytes to the csum_bytes 6145 - * before we dropped our lock, and then call the free for the 6146 - * number of bytes that were freed while we were trying our 6147 - * reservation. 6148 - */ 6149 - bytes = csum_bytes - inode->csum_bytes; 6150 - inode->csum_bytes = csum_bytes; 6151 - to_free = calc_csum_metadata_size(inode, bytes, 0); 6152 - 6153 - 6154 - /* 6155 - * Now we need to see how much we would have freed had we not 6156 - * been making this reservation and our ->csum_bytes were not 6157 - * artificially inflated. 6158 - */ 6159 - inode->csum_bytes = csum_bytes - num_bytes; 6160 - bytes = csum_bytes - orig_csum_bytes; 6161 - bytes = calc_csum_metadata_size(inode, bytes, 0); 6162 - 6163 - /* 6164 - * Now reset ->csum_bytes to what it should be. If bytes is 6165 - * more than to_free then we would have freed more space had we 6166 - * not had an artificially high ->csum_bytes, so we need to free 6167 - * the remainder. If bytes is the same or less then we don't 6168 - * need to do anything, the other free-ers did the correct 6169 - * thing. 
6170 - */ 6171 - inode->csum_bytes = orig_csum_bytes - num_bytes; 6172 - if (bytes > to_free) 6173 - to_free = bytes - to_free; 6174 - else 6175 - to_free = 0; 6176 - } 6129 + nr_extents = count_max_extents(num_bytes); 6130 + btrfs_mod_outstanding_extents(inode, -nr_extents); 6131 + inode->csum_bytes -= num_bytes; 6132 + btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6177 6133 spin_unlock(&inode->lock); 6178 - if (dropped) 6179 - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); 6180 6134 6181 - if (to_free) { 6182 - btrfs_block_rsv_release(fs_info, block_rsv, to_free); 6183 - trace_btrfs_space_reservation(fs_info, "delalloc", 6184 - btrfs_ino(inode), to_free, 0); 6185 - } 6135 + btrfs_inode_rsv_release(inode); 6186 6136 if (delalloc_lock) 6187 6137 mutex_unlock(&inode->delalloc_mutex); 6188 6138 return ret; ··· 6124 6206 6125 6207 /** 6126 6208 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 6127 - * @inode: the inode to release the reservation for 6128 - * @num_bytes: the number of bytes we're releasing 6209 + * @inode: the inode to release the reservation for. 6210 + * @num_bytes: the number of bytes we are releasing. 6129 6211 * 6130 6212 * This will release the metadata reservation for an inode. This can be called 6131 6213 * once we complete IO for a given set of bytes to release their metadata 6132 - * reservations. 6214 + * reservations, or on error for the same reason. 
6133 6215 */ 6134 6216 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) 6135 6217 { 6136 6218 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6137 - u64 to_free = 0; 6138 - unsigned dropped; 6139 6219 6140 6220 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6141 6221 spin_lock(&inode->lock); 6142 - dropped = drop_outstanding_extent(inode, num_bytes); 6143 - 6144 - if (num_bytes) 6145 - to_free = calc_csum_metadata_size(inode, num_bytes, 0); 6222 + inode->csum_bytes -= num_bytes; 6223 + btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6146 6224 spin_unlock(&inode->lock); 6147 - if (dropped > 0) 6148 - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); 6149 6225 6150 6226 if (btrfs_is_testing(fs_info)) 6151 6227 return; 6152 6228 6153 - trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), 6154 - to_free, 0); 6229 + btrfs_inode_rsv_release(inode); 6230 + } 6155 6231 6156 - btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); 6232 + /** 6233 + * btrfs_delalloc_release_extents - release our outstanding_extents 6234 + * @inode: the inode to balance the reservation for. 6235 + * @num_bytes: the number of bytes we originally reserved with 6236 + * 6237 + * When we reserve space we increase outstanding_extents for the extents we may 6238 + * add. Once we've set the range as delalloc or created our ordered extents we 6239 + * have outstanding_extents to track the real usage, so we use this to free our 6240 + * temporarily tracked outstanding_extents. This _must_ be used in conjunction 6241 + * with btrfs_delalloc_reserve_metadata. 
6242 + */ 6243 + void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) 6244 + { 6245 + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 6246 + unsigned num_extents; 6247 + 6248 + spin_lock(&inode->lock); 6249 + num_extents = count_max_extents(num_bytes); 6250 + btrfs_mod_outstanding_extents(inode, -num_extents); 6251 + btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6252 + spin_unlock(&inode->lock); 6253 + 6254 + if (btrfs_is_testing(fs_info)) 6255 + return; 6256 + 6257 + btrfs_inode_rsv_release(inode); 6157 6258 } 6158 6259 6159 6260 /** ··· 6219 6282 * @inode: inode we're releasing space for 6220 6283 * @start: start position of the space already reserved 6221 6284 * @len: the len of the space already reserved 6222 - * 6223 - * This must be matched with a call to btrfs_delalloc_reserve_space. This is 6224 - * called in the case that we don't need the metadata AND data reservations 6225 - * anymore. So if there is an error or we insert an inline extent. 6285 + * @release_bytes: the len of the space we consumed or didn't use 6226 6286 * 6227 6287 * This function will release the metadata space that was not used and will 6228 6288 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes ··· 6227 6293 * Also it will handle the qgroup reserved space. 
6228 6294 */ 6229 6295 void btrfs_delalloc_release_space(struct inode *inode, 6230 - struct extent_changeset *reserved, u64 start, u64 len) 6296 + struct extent_changeset *reserved, 6297 + u64 start, u64 len) 6231 6298 { 6232 6299 btrfs_delalloc_release_metadata(BTRFS_I(inode), len); 6233 6300 btrfs_free_reserved_data_space(inode, reserved, start, len); ··· 6893 6958 BUG_ON(!is_data && refs_to_drop != 1); 6894 6959 6895 6960 if (is_data) 6896 - skinny_metadata = 0; 6961 + skinny_metadata = false; 6897 6962 6898 6963 ret = lookup_extent_backref(trans, info, path, &iref, 6899 6964 bytenr, num_bytes, parent, ··· 7148 7213 goto out_delayed_unlock; 7149 7214 7150 7215 spin_lock(&head->lock); 7151 - if (!list_empty(&head->ref_list)) 7216 + if (!RB_EMPTY_ROOT(&head->ref_tree)) 7152 7217 goto out; 7153 7218 7154 7219 if (head->extent_op) { ··· 7169 7234 * at this point we have a head with no other entries. Go 7170 7235 * ahead and process it. 7171 7236 */ 7172 - head->node.in_tree = 0; 7173 7237 rb_erase(&head->href_node, &delayed_refs->href_root); 7174 - 7238 + RB_CLEAR_NODE(&head->href_node); 7175 7239 atomic_dec(&delayed_refs->num_entries); 7176 7240 7177 7241 /* ··· 7189 7255 ret = 1; 7190 7256 7191 7257 mutex_unlock(&head->mutex); 7192 - btrfs_put_delayed_ref(&head->node); 7258 + btrfs_put_delayed_ref_head(head); 7193 7259 return ret; 7194 7260 out: 7195 7261 spin_unlock(&head->lock); ··· 7211 7277 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7212 7278 int old_ref_mod, new_ref_mod; 7213 7279 7280 + btrfs_ref_tree_mod(root, buf->start, buf->len, parent, 7281 + root->root_key.objectid, 7282 + btrfs_header_level(buf), 0, 7283 + BTRFS_DROP_DELAYED_REF); 7214 7284 ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, 7215 7285 buf->len, parent, 7216 7286 root->root_key.objectid, ··· 7267 7329 7268 7330 /* Can return -ENOMEM */ 7269 7331 int btrfs_free_extent(struct btrfs_trans_handle *trans, 7270 - struct btrfs_fs_info *fs_info, 7332 + struct btrfs_root 
*root, 7271 7333 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 7272 7334 u64 owner, u64 offset) 7273 7335 { 7336 + struct btrfs_fs_info *fs_info = root->fs_info; 7274 7337 int old_ref_mod, new_ref_mod; 7275 7338 int ret; 7276 7339 7277 7340 if (btrfs_is_testing(fs_info)) 7278 7341 return 0; 7279 7342 7343 + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) 7344 + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, 7345 + root_objectid, owner, offset, 7346 + BTRFS_DROP_DELAYED_REF); 7280 7347 7281 7348 /* 7282 7349 * tree log blocks never actually go into the extent allocation ··· 8249 8306 } 8250 8307 8251 8308 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8252 - u64 root_objectid, u64 owner, 8309 + struct btrfs_root *root, u64 owner, 8253 8310 u64 offset, u64 ram_bytes, 8254 8311 struct btrfs_key *ins) 8255 8312 { 8256 - struct btrfs_fs_info *fs_info = trans->fs_info; 8313 + struct btrfs_fs_info *fs_info = root->fs_info; 8257 8314 int ret; 8258 8315 8259 - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 8316 + BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 8317 + 8318 + btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, 8319 + root->root_key.objectid, owner, offset, 8320 + BTRFS_ADD_DELAYED_EXTENT); 8260 8321 8261 8322 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, 8262 - ins->offset, 0, root_objectid, owner, 8323 + ins->offset, 0, 8324 + root->root_key.objectid, owner, 8263 8325 offset, ram_bytes, 8264 8326 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); 8265 8327 return ret; ··· 8486 8538 extent_op->is_data = false; 8487 8539 extent_op->level = level; 8488 8540 8541 + btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, 8542 + root_objectid, level, 0, 8543 + BTRFS_ADD_DELAYED_EXTENT); 8489 8544 ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, 8490 8545 ins.offset, parent, 8491 8546 root_objectid, level, ··· 8845 8894 ret); 8846 8895 } 8847 8896 } 8848 - ret = 
btrfs_free_extent(trans, fs_info, bytenr, blocksize, 8897 + ret = btrfs_free_extent(trans, root, bytenr, blocksize, 8849 8898 parent, root->root_key.objectid, 8850 8899 level - 1, 0); 8851 8900 if (ret) ··· 9262 9311 * don't have it in the radix (like when we recover after a power fail 9263 9312 * or unmount) so we don't leak memory. 9264 9313 */ 9265 - if (!for_reloc && root_dropped == false) 9314 + if (!for_reloc && !root_dropped) 9266 9315 btrfs_add_dead_root(root); 9267 9316 if (err && err != -EAGAIN) 9268 9317 btrfs_handle_fs_error(fs_info, err, NULL); ··· 9919 9968 return 0; 9920 9969 } 9921 9970 9922 - static void __link_block_group(struct btrfs_space_info *space_info, 9923 - struct btrfs_block_group_cache *cache) 9971 + static void link_block_group(struct btrfs_block_group_cache *cache) 9924 9972 { 9973 + struct btrfs_space_info *space_info = cache->space_info; 9925 9974 int index = get_block_group_index(cache); 9926 9975 bool first = false; 9927 9976 ··· 10129 10178 10130 10179 cache->space_info = space_info; 10131 10180 10132 - __link_block_group(space_info, cache); 10181 + link_block_group(cache); 10133 10182 10134 10183 set_avail_alloc_bits(info, cache->flags); 10135 10184 if (btrfs_chunk_readonly(info, cache->key.objectid)) { ··· 10288 10337 cache->bytes_super, &cache->space_info); 10289 10338 update_global_block_rsv(fs_info); 10290 10339 10291 - __link_block_group(cache->space_info, cache); 10340 + link_block_group(cache); 10292 10341 10293 10342 list_add_tail(&cache->bg_list, &trans->new_bgs); 10294 10343 ··· 10338 10387 * remove it. 10339 10388 */ 10340 10389 free_excluded_extents(fs_info, block_group); 10390 + btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, 10391 + block_group->key.offset); 10341 10392 10342 10393 memcpy(&key, &block_group->key, sizeof(key)); 10343 10394 index = get_block_group_index(block_group);
+18 -26
fs/btrfs/extent_io.c
··· 110 110 struct bio *bio; 111 111 struct extent_io_tree *tree; 112 112 get_extent_t *get_extent; 113 - unsigned long bio_flags; 114 113 115 114 /* tells writepage not to lock the state bits for this range 116 115 * it still does the unlocking ··· 2761 2762 */ 2762 2763 static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, 2763 2764 struct writeback_control *wbc, 2764 - struct page *page, sector_t sector, 2765 - size_t size, unsigned long offset, 2765 + struct page *page, u64 offset, 2766 + size_t size, unsigned long pg_offset, 2766 2767 struct block_device *bdev, 2767 2768 struct bio **bio_ret, 2768 2769 bio_end_io_t end_io_func, ··· 2776 2777 int contig = 0; 2777 2778 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2778 2779 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2780 + sector_t sector = offset >> 9; 2779 2781 2780 2782 if (bio_ret && *bio_ret) { 2781 2783 bio = *bio_ret; ··· 2787 2787 2788 2788 if (prev_bio_flags != bio_flags || !contig || 2789 2789 force_bio_submit || 2790 - merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2791 - bio_add_page(bio, page, page_size, offset) < page_size) { 2790 + merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) || 2791 + bio_add_page(bio, page, page_size, pg_offset) < page_size) { 2792 2792 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2793 2793 if (ret < 0) { 2794 2794 *bio_ret = NULL; ··· 2802 2802 } 2803 2803 } 2804 2804 2805 - bio = btrfs_bio_alloc(bdev, (u64)sector << 9); 2806 - bio_add_page(bio, page, page_size, offset); 2805 + bio = btrfs_bio_alloc(bdev, offset); 2806 + bio_add_page(bio, page, page_size, pg_offset); 2807 2807 bio->bi_end_io = end_io_func; 2808 2808 bio->bi_private = tree; 2809 2809 bio->bi_write_hint = page->mapping->host->i_write_hint; ··· 2893 2893 u64 last_byte = i_size_read(inode); 2894 2894 u64 block_start; 2895 2895 u64 cur_end; 2896 - sector_t sector; 2897 2896 struct extent_map *em; 2898 2897 struct block_device *bdev; 
2899 2898 int ret = 0; ··· 2928 2929 } 2929 2930 while (cur <= end) { 2930 2931 bool force_bio_submit = false; 2932 + u64 offset; 2931 2933 2932 2934 if (cur >= last_byte) { 2933 2935 char *userpage; ··· 2968 2968 iosize = ALIGN(iosize, blocksize); 2969 2969 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2970 2970 disk_io_size = em->block_len; 2971 - sector = em->block_start >> 9; 2971 + offset = em->block_start; 2972 2972 } else { 2973 - sector = (em->block_start + extent_offset) >> 9; 2973 + offset = em->block_start + extent_offset; 2974 2974 disk_io_size = iosize; 2975 2975 } 2976 2976 bdev = em->bdev; ··· 3063 3063 } 3064 3064 3065 3065 ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, 3066 - page, sector, disk_io_size, pg_offset, 3067 - bdev, bio, 3066 + page, offset, disk_io_size, 3067 + pg_offset, bdev, bio, 3068 3068 end_bio_extent_readpage, mirror_num, 3069 3069 *bio_flags, 3070 3070 this_bio_flag, ··· 3325 3325 u64 extent_offset; 3326 3326 u64 block_start; 3327 3327 u64 iosize; 3328 - sector_t sector; 3329 3328 struct extent_map *em; 3330 3329 struct block_device *bdev; 3331 3330 size_t pg_offset = 0; ··· 3367 3368 3368 3369 while (cur <= end) { 3369 3370 u64 em_end; 3371 + u64 offset; 3370 3372 3371 3373 if (cur >= i_size) { 3372 3374 if (tree->ops && tree->ops->writepage_end_io_hook) ··· 3389 3389 BUG_ON(end < cur); 3390 3390 iosize = min(em_end - cur, end - cur + 1); 3391 3391 iosize = ALIGN(iosize, blocksize); 3392 - sector = (em->block_start + extent_offset) >> 9; 3392 + offset = em->block_start + extent_offset; 3393 3393 bdev = em->bdev; 3394 3394 block_start = em->block_start; 3395 3395 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); ··· 3432 3432 } 3433 3433 3434 3434 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3435 - page, sector, iosize, pg_offset, 3435 + page, offset, iosize, pg_offset, 3436 3436 bdev, &epd->bio, 3437 3437 end_bio_extent_writepage, 3438 3438 0, 0, 0, false); ··· 3716 3716 u64 offset = 
eb->start; 3717 3717 u32 nritems; 3718 3718 unsigned long i, num_pages; 3719 - unsigned long bio_flags = 0; 3720 3719 unsigned long start, end; 3721 3720 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; 3722 3721 int ret = 0; ··· 3723 3724 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3724 3725 num_pages = num_extent_pages(eb->start, eb->len); 3725 3726 atomic_set(&eb->io_pages, num_pages); 3726 - if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3727 - bio_flags = EXTENT_BIO_TREE_LOG; 3728 3727 3729 3728 /* set btree blocks beyond nritems with 0 to avoid stale content. */ 3730 3729 nritems = btrfs_header_nritems(eb); ··· 3746 3749 clear_page_dirty_for_io(p); 3747 3750 set_page_writeback(p); 3748 3751 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3749 - p, offset >> 9, PAGE_SIZE, 0, bdev, 3752 + p, offset, PAGE_SIZE, 0, bdev, 3750 3753 &epd->bio, 3751 3754 end_bio_extent_buffer_writepage, 3752 - 0, epd->bio_flags, bio_flags, false); 3753 - epd->bio_flags = bio_flags; 3755 + 0, 0, 0, false); 3754 3756 if (ret) { 3755 3757 set_btree_ioerr(p); 3756 3758 if (PageWriteback(p)) ··· 3786 3790 .tree = tree, 3787 3791 .extent_locked = 0, 3788 3792 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3789 - .bio_flags = 0, 3790 3793 }; 3791 3794 int ret = 0; 3792 3795 int done = 0; ··· 4058 4063 if (epd->bio) { 4059 4064 int ret; 4060 4065 4061 - ret = submit_one_bio(epd->bio, 0, epd->bio_flags); 4066 + ret = submit_one_bio(epd->bio, 0, 0); 4062 4067 BUG_ON(ret < 0); /* -ENOMEM */ 4063 4068 epd->bio = NULL; 4064 4069 } ··· 4081 4086 .get_extent = get_extent, 4082 4087 .extent_locked = 0, 4083 4088 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4084 - .bio_flags = 0, 4085 4089 }; 4086 4090 4087 4091 ret = __extent_writepage(page, wbc, &epd); ··· 4105 4111 .get_extent = get_extent, 4106 4112 .extent_locked = 1, 4107 4113 .sync_io = mode == WB_SYNC_ALL, 4108 - .bio_flags = 0, 4109 4114 }; 4110 4115 struct writeback_control wbc_writepages = { 4111 4116 
.sync_mode = mode, ··· 4144 4151 .get_extent = get_extent, 4145 4152 .extent_locked = 0, 4146 4153 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4147 - .bio_flags = 0, 4148 4154 }; 4149 4155 4150 4156 ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd,
-1
fs/btrfs/extent_io.h
··· 34 34 * type for this bio 35 35 */ 36 36 #define EXTENT_BIO_COMPRESSED 1 37 - #define EXTENT_BIO_TREE_LOG 2 38 37 #define EXTENT_BIO_FLAG_SHIFT 16 39 38 40 39 /* these are bit numbers for test/set bit */
+21 -29
fs/btrfs/file.c
··· 856 856 btrfs_mark_buffer_dirty(leaf); 857 857 858 858 if (update_refs && disk_bytenr > 0) { 859 - ret = btrfs_inc_extent_ref(trans, fs_info, 859 + ret = btrfs_inc_extent_ref(trans, root, 860 860 disk_bytenr, num_bytes, 0, 861 861 root->root_key.objectid, 862 862 new_key.objectid, ··· 940 940 extent_end = ALIGN(extent_end, 941 941 fs_info->sectorsize); 942 942 } else if (update_refs && disk_bytenr > 0) { 943 - ret = btrfs_free_extent(trans, fs_info, 943 + ret = btrfs_free_extent(trans, root, 944 944 disk_bytenr, num_bytes, 0, 945 945 root->root_key.objectid, 946 946 key.objectid, key.offset - ··· 1234 1234 extent_end - split); 1235 1235 btrfs_mark_buffer_dirty(leaf); 1236 1236 1237 - ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes, 1237 + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 1238 1238 0, root->root_key.objectid, 1239 1239 ino, orig_offset); 1240 1240 if (ret) { ··· 1268 1268 extent_end = other_end; 1269 1269 del_slot = path->slots[0] + 1; 1270 1270 del_nr++; 1271 - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, 1271 + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1272 1272 0, root->root_key.objectid, 1273 1273 ino, orig_offset); 1274 1274 if (ret) { ··· 1288 1288 key.offset = other_start; 1289 1289 del_slot = path->slots[0]; 1290 1290 del_nr++; 1291 - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, 1291 + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1292 1292 0, root->root_key.objectid, 1293 1293 ino, orig_offset); 1294 1294 if (ret) { ··· 1590 1590 int ret = 0; 1591 1591 bool only_release_metadata = false; 1592 1592 bool force_page_uptodate = false; 1593 - bool need_unlock; 1594 1593 1595 1594 nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), 1596 1595 PAGE_SIZE / (sizeof(struct page *))); ··· 1612 1613 size_t copied; 1613 1614 size_t dirty_sectors; 1614 1615 size_t num_sectors; 1616 + int extents_locked; 1615 1617 1616 1618 WARN_ON(num_pages > nrptrs); 1617 1619 ··· 1656 
1656 } 1657 1657 } 1658 1658 1659 + WARN_ON(reserve_bytes == 0); 1659 1660 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 1660 1661 reserve_bytes); 1661 1662 if (ret) { ··· 1670 1669 } 1671 1670 1672 1671 release_bytes = reserve_bytes; 1673 - need_unlock = false; 1674 1672 again: 1675 1673 /* 1676 1674 * This is going to setup the pages array with the number of ··· 1679 1679 ret = prepare_pages(inode, pages, num_pages, 1680 1680 pos, write_bytes, 1681 1681 force_page_uptodate); 1682 - if (ret) 1682 + if (ret) { 1683 + btrfs_delalloc_release_extents(BTRFS_I(inode), 1684 + reserve_bytes); 1683 1685 break; 1686 + } 1684 1687 1685 - ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages, 1688 + extents_locked = lock_and_cleanup_extent_if_need( 1689 + BTRFS_I(inode), pages, 1686 1690 num_pages, pos, write_bytes, &lockstart, 1687 1691 &lockend, &cached_state); 1688 - if (ret < 0) { 1689 - if (ret == -EAGAIN) 1692 + if (extents_locked < 0) { 1693 + if (extents_locked == -EAGAIN) 1690 1694 goto again; 1695 + btrfs_delalloc_release_extents(BTRFS_I(inode), 1696 + reserve_bytes); 1697 + ret = extents_locked; 1691 1698 break; 1692 - } else if (ret > 0) { 1693 - need_unlock = true; 1694 - ret = 0; 1695 1699 } 1696 1700 1697 1701 copied = btrfs_copy_from_user(pos, write_bytes, pages, i); ··· 1722 1718 PAGE_SIZE); 1723 1719 } 1724 1720 1725 - /* 1726 - * If we had a short copy we need to release the excess delaloc 1727 - * bytes we reserved. We need to increment outstanding_extents 1728 - * because btrfs_delalloc_release_space and 1729 - * btrfs_delalloc_release_metadata will decrement it, but 1730 - * we still have an outstanding extent for the chunk we actually 1731 - * managed to copy. 
1732 - */ 1733 1721 if (num_sectors > dirty_sectors) { 1734 1722 /* release everything except the sectors we dirtied */ 1735 1723 release_bytes -= dirty_sectors << 1736 1724 fs_info->sb->s_blocksize_bits; 1737 - if (copied > 0) { 1738 - spin_lock(&BTRFS_I(inode)->lock); 1739 - BTRFS_I(inode)->outstanding_extents++; 1740 - spin_unlock(&BTRFS_I(inode)->lock); 1741 - } 1742 1725 if (only_release_metadata) { 1743 1726 btrfs_delalloc_release_metadata(BTRFS_I(inode), 1744 1727 release_bytes); ··· 1747 1756 if (copied > 0) 1748 1757 ret = btrfs_dirty_pages(inode, pages, dirty_pages, 1749 1758 pos, copied, NULL); 1750 - if (need_unlock) 1759 + if (extents_locked) 1751 1760 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1752 1761 lockstart, lockend, &cached_state, 1753 1762 GFP_NOFS); 1763 + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); 1754 1764 if (ret) { 1755 1765 btrfs_drop_pages(pages, num_pages); 1756 1766 break; ··· 2038 2046 struct btrfs_trans_handle *trans; 2039 2047 struct btrfs_log_ctx ctx; 2040 2048 int ret = 0, err; 2041 - bool full_sync = 0; 2049 + bool full_sync = false; 2042 2050 u64 len; 2043 2051 2044 2052 /*
-4
fs/btrfs/free-space-tree.c
··· 1286 1286 struct btrfs_block_group_cache *block_group, 1287 1287 struct btrfs_path *path) 1288 1288 { 1289 - u64 start, end; 1290 1289 int ret; 1291 - 1292 - start = block_group->key.objectid; 1293 - end = block_group->key.objectid + block_group->key.offset; 1294 1290 1295 1291 block_group->needs_free_space = 0; 1296 1292
+2 -1
fs/btrfs/inode-map.c
··· 500 500 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 501 501 prealloc, prealloc, &alloc_hint); 502 502 if (ret) { 503 - btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc); 503 + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); 504 504 goto out_put; 505 505 } 506 506 507 507 ret = btrfs_write_out_ino_cache(root, trans, path, inode); 508 + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); 508 509 out_put: 509 510 iput(inode); 510 511 out_release:
+115 -212
fs/btrfs/inode.c
··· 42 42 #include <linux/blkdev.h> 43 43 #include <linux/posix_acl_xattr.h> 44 44 #include <linux/uio.h> 45 + #include <linux/magic.h> 45 46 #include "ctree.h" 46 47 #include "disk-io.h" 47 48 #include "transaction.h" ··· 68 67 }; 69 68 70 69 struct btrfs_dio_data { 71 - u64 outstanding_extents; 72 70 u64 reserve; 73 71 u64 unsubmitted_oe_range_start; 74 72 u64 unsubmitted_oe_range_end; ··· 316 316 btrfs_free_path(path); 317 317 return PTR_ERR(trans); 318 318 } 319 - trans->block_rsv = &fs_info->delalloc_block_rsv; 319 + trans->block_rsv = &BTRFS_I(inode)->block_rsv; 320 320 321 321 if (compressed_size && compressed_pages) 322 322 extent_item_size = btrfs_file_extent_calc_inline_size( ··· 348 348 } 349 349 350 350 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 351 - btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start); 352 351 btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); 353 352 out: 354 353 /* ··· 457 458 { 458 459 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 459 460 struct btrfs_root *root = BTRFS_I(inode)->root; 460 - u64 num_bytes; 461 461 u64 blocksize = fs_info->sectorsize; 462 462 u64 actual_end; 463 463 u64 isize = i_size_read(inode); ··· 506 508 507 509 total_compressed = min_t(unsigned long, total_compressed, 508 510 BTRFS_MAX_UNCOMPRESSED); 509 - num_bytes = ALIGN(end - start + 1, blocksize); 510 - num_bytes = max(blocksize, num_bytes); 511 511 total_in = 0; 512 512 ret = 0; 513 513 ··· 538 542 */ 539 543 extent_range_clear_dirty_for_io(inode, start, end); 540 544 redirty = 1; 541 - ret = btrfs_compress_pages(compress_type, 545 + 546 + /* Compression level is applied here and only here */ 547 + ret = btrfs_compress_pages( 548 + compress_type | (fs_info->compress_level << 4), 542 549 inode->i_mapping, start, 543 550 pages, 544 551 &nr_pages, ··· 569 570 cont: 570 571 if (start == 0) { 571 572 /* lets try to make an inline extent */ 572 - if (ret || total_in < (actual_end - start)) { 573 
+ if (ret || total_in < actual_end) { 573 574 /* we didn't compress the entire range, try 574 575 * to make an uncompressed inline extent. 575 576 */ ··· 583 584 } 584 585 if (ret <= 0) { 585 586 unsigned long clear_flags = EXTENT_DELALLOC | 586 - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; 587 + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | 588 + EXTENT_DO_ACCOUNTING; 587 589 unsigned long page_error_op; 588 590 589 - clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; 590 591 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; 591 592 592 593 /* 593 594 * inline extent creation worked or returned error, 594 595 * we don't need to create any more async work items. 595 596 * Unlock and free up our temp pages. 597 + * 598 + * We use DO_ACCOUNTING here because we need the 599 + * delalloc_release_metadata to be done _after_ we drop 600 + * our outstanding extent for clearing delalloc for this 601 + * range. 596 602 */ 597 603 extent_clear_unlock_delalloc(inode, start, end, end, 598 604 NULL, clear_flags, ··· 606 602 PAGE_SET_WRITEBACK | 607 603 page_error_op | 608 604 PAGE_END_WRITEBACK); 609 - if (ret == 0) 610 - btrfs_free_reserved_data_space_noquota(inode, 611 - start, 612 - end - start + 1); 613 605 goto free_pages_out; 614 606 } 615 607 } ··· 625 625 */ 626 626 total_in = ALIGN(total_in, PAGE_SIZE); 627 627 if (total_compressed + blocksize <= total_in) { 628 - num_bytes = total_in; 629 628 *num_added += 1; 630 629 631 630 /* ··· 632 633 * allocation on disk for these compressed pages, and 633 634 * will submit them to the elevator. 
634 635 */ 635 - add_async_extent(async_cow, start, num_bytes, 636 + add_async_extent(async_cow, start, total_in, 636 637 total_compressed, pages, nr_pages, 637 638 compress_type); 638 639 639 - if (start + num_bytes < end) { 640 - start += num_bytes; 640 + if (start + total_in < end) { 641 + start += total_in; 641 642 pages = NULL; 642 643 cond_resched(); 643 644 goto again; ··· 981 982 ret = cow_file_range_inline(root, inode, start, end, 0, 982 983 BTRFS_COMPRESS_NONE, NULL); 983 984 if (ret == 0) { 985 + /* 986 + * We use DO_ACCOUNTING here because we need the 987 + * delalloc_release_metadata to be run _after_ we drop 988 + * our outstanding extent for clearing delalloc for this 989 + * range. 990 + */ 984 991 extent_clear_unlock_delalloc(inode, start, end, 985 992 delalloc_end, NULL, 986 993 EXTENT_LOCKED | EXTENT_DELALLOC | 987 - EXTENT_DELALLOC_NEW | 988 - EXTENT_DEFRAG, PAGE_UNLOCK | 994 + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | 995 + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | 989 996 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | 990 997 PAGE_END_WRITEBACK); 991 - btrfs_free_reserved_data_space_noquota(inode, start, 992 - end - start + 1); 993 998 *nr_written = *nr_written + 994 999 (end - start + PAGE_SIZE) / PAGE_SIZE; 995 1000 *page_started = 1; ··· 1228 1225 atomic_add(nr_pages, &fs_info->async_delalloc_pages); 1229 1226 1230 1227 btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); 1231 - 1232 - while (atomic_read(&fs_info->async_submit_draining) && 1233 - atomic_read(&fs_info->async_delalloc_pages)) { 1234 - wait_event(fs_info->async_submit_wait, 1235 - (atomic_read(&fs_info->async_delalloc_pages) == 1236 - 0)); 1237 - } 1238 1228 1239 1229 *nr_written += nr_pages; 1240 1230 start = cur_end + 1; ··· 1631 1635 } 1632 1636 1633 1637 spin_lock(&BTRFS_I(inode)->lock); 1634 - BTRFS_I(inode)->outstanding_extents++; 1638 + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); 1635 1639 spin_unlock(&BTRFS_I(inode)->lock); 1636 1640 } 1637 1641 ··· 1661 1665 /* we're 
not bigger than the max, unreserve the space and go */ 1662 1666 if (new_size <= BTRFS_MAX_EXTENT_SIZE) { 1663 1667 spin_lock(&BTRFS_I(inode)->lock); 1664 - BTRFS_I(inode)->outstanding_extents--; 1668 + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 1665 1669 spin_unlock(&BTRFS_I(inode)->lock); 1666 1670 return; 1667 1671 } ··· 1692 1696 return; 1693 1697 1694 1698 spin_lock(&BTRFS_I(inode)->lock); 1695 - BTRFS_I(inode)->outstanding_extents--; 1699 + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 1696 1700 spin_unlock(&BTRFS_I(inode)->lock); 1697 1701 } 1698 1702 ··· 1762 1766 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1763 1767 struct btrfs_root *root = BTRFS_I(inode)->root; 1764 1768 u64 len = state->end + 1 - state->start; 1769 + u32 num_extents = count_max_extents(len); 1765 1770 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); 1766 1771 1767 - if (*bits & EXTENT_FIRST_DELALLOC) { 1768 - *bits &= ~EXTENT_FIRST_DELALLOC; 1769 - } else { 1770 - spin_lock(&BTRFS_I(inode)->lock); 1771 - BTRFS_I(inode)->outstanding_extents++; 1772 - spin_unlock(&BTRFS_I(inode)->lock); 1773 - } 1772 + spin_lock(&BTRFS_I(inode)->lock); 1773 + btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); 1774 + spin_unlock(&BTRFS_I(inode)->lock); 1774 1775 1775 1776 /* For sanity tests */ 1776 1777 if (btrfs_is_testing(fs_info)) ··· 1821 1828 struct btrfs_root *root = inode->root; 1822 1829 bool do_list = !btrfs_is_free_space_inode(inode); 1823 1830 1824 - if (*bits & EXTENT_FIRST_DELALLOC) { 1825 - *bits &= ~EXTENT_FIRST_DELALLOC; 1826 - } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { 1827 - spin_lock(&inode->lock); 1828 - inode->outstanding_extents -= num_extents; 1829 - spin_unlock(&inode->lock); 1830 - } 1831 + spin_lock(&inode->lock); 1832 + btrfs_mod_outstanding_extents(inode, -num_extents); 1833 + spin_unlock(&inode->lock); 1831 1834 1832 1835 /* 1833 1836 * We don't reserve metadata space for space cache inodes so we ··· 2094 2105 0); 
2095 2106 ClearPageChecked(page); 2096 2107 set_page_dirty(page); 2108 + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 2097 2109 out: 2098 2110 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 2099 2111 &cached_state, GFP_NOFS); ··· 2219 2229 if (ret < 0) 2220 2230 goto out; 2221 2231 qg_released = ret; 2222 - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, 2223 - btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); 2232 + ret = btrfs_alloc_reserved_file_extent(trans, root, 2233 + btrfs_ino(BTRFS_I(inode)), 2234 + file_pos, qg_released, &ins); 2224 2235 out: 2225 2236 btrfs_free_path(path); 2226 2237 ··· 2455 2464 ret = iterate_inodes_from_logical(old->bytenr + 2456 2465 old->extent_offset, fs_info, 2457 2466 path, record_one_backref, 2458 - old); 2467 + old, false); 2459 2468 if (ret < 0 && ret != -ENOENT) 2460 2469 return false; 2461 2470 ··· 2673 2682 inode_add_bytes(inode, len); 2674 2683 btrfs_release_path(path); 2675 2684 2676 - ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr, 2685 + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, 2677 2686 new->disk_len, 0, 2678 2687 backref->root_id, backref->inum, 2679 2688 new->file_pos); /* start - extent_offset */ ··· 2955 2964 trans = NULL; 2956 2965 goto out; 2957 2966 } 2958 - trans->block_rsv = &fs_info->delalloc_block_rsv; 2967 + trans->block_rsv = &BTRFS_I(inode)->block_rsv; 2959 2968 ret = btrfs_update_inode_fallback(trans, root, inode); 2960 2969 if (ret) /* -ENOMEM or corruption */ 2961 2970 btrfs_abort_transaction(trans, ret); ··· 2991 3000 goto out; 2992 3001 } 2993 3002 2994 - trans->block_rsv = &fs_info->delalloc_block_rsv; 3003 + trans->block_rsv = &BTRFS_I(inode)->block_rsv; 2995 3004 2996 3005 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2997 3006 compress_type = ordered_extent->compress_type; ··· 3049 3058 0, &cached_state, GFP_NOFS); 3050 3059 } 3051 3060 3052 - if (root != fs_info->tree_root) 3053 - 
btrfs_delalloc_release_metadata(BTRFS_I(inode), 3054 - ordered_extent->len); 3055 3061 if (trans) 3056 3062 btrfs_end_transaction(trans); 3057 3063 ··· 4360 4372 4361 4373 } 4362 4374 4363 - static int truncate_inline_extent(struct inode *inode, 4364 - struct btrfs_path *path, 4365 - struct btrfs_key *found_key, 4366 - const u64 item_end, 4367 - const u64 new_size) 4368 - { 4369 - struct extent_buffer *leaf = path->nodes[0]; 4370 - int slot = path->slots[0]; 4371 - struct btrfs_file_extent_item *fi; 4372 - u32 size = (u32)(new_size - found_key->offset); 4373 - struct btrfs_root *root = BTRFS_I(inode)->root; 4374 - 4375 - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4376 - 4377 - if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { 4378 - loff_t offset = new_size; 4379 - loff_t page_end = ALIGN(offset, PAGE_SIZE); 4380 - 4381 - /* 4382 - * Zero out the remaining of the last page of our inline extent, 4383 - * instead of directly truncating our inline extent here - that 4384 - * would be much more complex (decompressing all the data, then 4385 - * compressing the truncated data, which might be bigger than 4386 - * the size of the inline extent, resize the extent, etc). 4387 - * We release the path because to get the page we might need to 4388 - * read the extent item from disk (data not in the page cache). 4389 - */ 4390 - btrfs_release_path(path); 4391 - return btrfs_truncate_block(inode, offset, page_end - offset, 4392 - 0); 4393 - } 4394 - 4395 - btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4396 - size = btrfs_file_extent_calc_inline_size(size); 4397 - btrfs_truncate_item(root->fs_info, path, size, 1); 4398 - 4399 - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4400 - inode_sub_bytes(inode, item_end + 1 - new_size); 4401 - 4402 - return 0; 4403 - } 4375 + /* 4376 + * Return this if we need to call truncate_block for the last bit of the 4377 + * truncate. 
4378 + */ 4379 + #define NEED_TRUNCATE_BLOCK 1 4404 4380 4405 4381 /* 4406 4382 * this can truncate away extent items, csum items and directory items. ··· 4403 4451 int err = 0; 4404 4452 u64 ino = btrfs_ino(BTRFS_I(inode)); 4405 4453 u64 bytes_deleted = 0; 4406 - bool be_nice = 0; 4407 - bool should_throttle = 0; 4408 - bool should_end = 0; 4454 + bool be_nice = false; 4455 + bool should_throttle = false; 4456 + bool should_end = false; 4409 4457 4410 4458 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4411 4459 ··· 4415 4463 */ 4416 4464 if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && 4417 4465 test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4418 - be_nice = 1; 4466 + be_nice = true; 4419 4467 4420 4468 path = btrfs_alloc_path(); 4421 4469 if (!path) ··· 4525 4573 if (found_type != BTRFS_EXTENT_DATA_KEY) 4526 4574 goto delete; 4527 4575 4528 - if (del_item) 4529 - last_size = found_key.offset; 4530 - else 4531 - last_size = new_size; 4532 - 4533 4576 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4534 4577 u64 num_dec; 4535 4578 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); ··· 4566 4619 */ 4567 4620 if (!del_item && 4568 4621 btrfs_file_extent_encryption(leaf, fi) == 0 && 4569 - btrfs_file_extent_other_encoding(leaf, fi) == 0) { 4622 + btrfs_file_extent_other_encoding(leaf, fi) == 0 && 4623 + btrfs_file_extent_compression(leaf, fi) == 0) { 4624 + u32 size = (u32)(new_size - found_key.offset); 4570 4625 4626 + btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4627 + size = btrfs_file_extent_calc_inline_size(size); 4628 + btrfs_truncate_item(root->fs_info, path, size, 1); 4629 + } else if (!del_item) { 4571 4630 /* 4572 - * Need to release path in order to truncate a 4573 - * compressed extent. So delete any accumulated 4574 - * extent items so far. 4631 + * We have to bail so the last_size is set to 4632 + * just before this extent. 
4575 4633 */ 4576 - if (btrfs_file_extent_compression(leaf, fi) != 4577 - BTRFS_COMPRESS_NONE && pending_del_nr) { 4578 - err = btrfs_del_items(trans, root, path, 4579 - pending_del_slot, 4580 - pending_del_nr); 4581 - if (err) { 4582 - btrfs_abort_transaction(trans, 4583 - err); 4584 - goto error; 4585 - } 4586 - pending_del_nr = 0; 4587 - } 4588 - 4589 - err = truncate_inline_extent(inode, path, 4590 - &found_key, 4591 - item_end, 4592 - new_size); 4593 - if (err) { 4594 - btrfs_abort_transaction(trans, err); 4595 - goto error; 4596 - } 4597 - } else if (test_bit(BTRFS_ROOT_REF_COWS, 4598 - &root->state)) { 4599 - inode_sub_bytes(inode, item_end + 1 - new_size); 4634 + err = NEED_TRUNCATE_BLOCK; 4635 + break; 4600 4636 } 4637 + 4638 + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4639 + inode_sub_bytes(inode, item_end + 1 - new_size); 4601 4640 } 4602 4641 delete: 4642 + if (del_item) 4643 + last_size = found_key.offset; 4644 + else 4645 + last_size = new_size; 4603 4646 if (del_item) { 4604 4647 if (!pending_del_nr) { 4605 4648 /* no pending yet, add ourselves */ ··· 4606 4669 } else { 4607 4670 break; 4608 4671 } 4609 - should_throttle = 0; 4672 + should_throttle = false; 4610 4673 4611 4674 if (found_extent && 4612 4675 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4613 4676 root == fs_info->tree_root)) { 4614 4677 btrfs_set_path_blocking(path); 4615 4678 bytes_deleted += extent_num_bytes; 4616 - ret = btrfs_free_extent(trans, fs_info, extent_start, 4679 + ret = btrfs_free_extent(trans, root, extent_start, 4617 4680 extent_num_bytes, 0, 4618 4681 btrfs_header_owner(leaf), 4619 4682 ino, extent_offset); ··· 4625 4688 if (be_nice) { 4626 4689 if (truncate_space_check(trans, root, 4627 4690 extent_num_bytes)) { 4628 - should_end = 1; 4691 + should_end = true; 4629 4692 } 4630 4693 if (btrfs_should_throttle_delayed_refs(trans, 4631 4694 fs_info)) 4632 - should_throttle = 1; 4695 + should_throttle = true; 4633 4696 } 4634 4697 } 4635 4698 ··· 4738 4801 (!len 
|| ((len & (blocksize - 1)) == 0))) 4739 4802 goto out; 4740 4803 4804 + block_start = round_down(from, blocksize); 4805 + block_end = block_start + blocksize - 1; 4806 + 4741 4807 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 4742 - round_down(from, blocksize), blocksize); 4808 + block_start, blocksize); 4743 4809 if (ret) 4744 4810 goto out; 4745 4811 ··· 4750 4810 page = find_or_create_page(mapping, index, mask); 4751 4811 if (!page) { 4752 4812 btrfs_delalloc_release_space(inode, data_reserved, 4753 - round_down(from, blocksize), 4754 - blocksize); 4813 + block_start, blocksize); 4814 + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); 4755 4815 ret = -ENOMEM; 4756 4816 goto out; 4757 4817 } 4758 - 4759 - block_start = round_down(from, blocksize); 4760 - block_end = block_start + blocksize - 1; 4761 4818 4762 4819 if (!PageUptodate(page)) { 4763 4820 ret = btrfs_readpage(NULL, page); ··· 4820 4883 if (ret) 4821 4884 btrfs_delalloc_release_space(inode, data_reserved, block_start, 4822 4885 blocksize); 4886 + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); 4823 4887 unlock_page(page); 4824 4888 put_page(page); 4825 4889 out: ··· 7735 7797 return em; 7736 7798 } 7737 7799 7738 - static void adjust_dio_outstanding_extents(struct inode *inode, 7739 - struct btrfs_dio_data *dio_data, 7740 - const u64 len) 7741 - { 7742 - unsigned num_extents = count_max_extents(len); 7743 - 7744 - /* 7745 - * If we have an outstanding_extents count still set then we're 7746 - * within our reservation, otherwise we need to adjust our inode 7747 - * counter appropriately. 7748 - */ 7749 - if (dio_data->outstanding_extents >= num_extents) { 7750 - dio_data->outstanding_extents -= num_extents; 7751 - } else { 7752 - /* 7753 - * If dio write length has been split due to no large enough 7754 - * contiguous space, we need to compensate our inode counter 7755 - * appropriately. 
7756 - */ 7757 - u64 num_needed = num_extents - dio_data->outstanding_extents; 7758 - 7759 - spin_lock(&BTRFS_I(inode)->lock); 7760 - BTRFS_I(inode)->outstanding_extents += num_needed; 7761 - spin_unlock(&BTRFS_I(inode)->lock); 7762 - } 7763 - } 7764 - 7765 7800 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 7766 7801 struct buffer_head *bh_result, int create) 7767 7802 { ··· 7896 7985 if (!dio_data->overwrite && start + len > i_size_read(inode)) 7897 7986 i_size_write(inode, start + len); 7898 7987 7899 - adjust_dio_outstanding_extents(inode, dio_data, len); 7900 7988 WARN_ON(dio_data->reserve < len); 7901 7989 dio_data->reserve -= len; 7902 7990 dio_data->unsubmitted_oe_range_end = start + len; ··· 7925 8015 err: 7926 8016 if (dio_data) 7927 8017 current->journal_info = dio_data; 7928 - /* 7929 - * Compensate the delalloc release we do in btrfs_direct_IO() when we 7930 - * write less data then expected, so that we don't underflow our inode's 7931 - * outstanding extents counter. 
7932 - */ 7933 - if (create && dio_data) 7934 - adjust_dio_outstanding_extents(inode, dio_data, len); 7935 - 7936 8018 return ret; 7937 8019 } 7938 8020 ··· 8397 8495 if (dip->errors) { 8398 8496 bio_io_error(dip->orig_bio); 8399 8497 } else { 8400 - dip->dio_bio->bi_status = 0; 8498 + dip->dio_bio->bi_status = BLK_STS_OK; 8401 8499 bio_endio(dip->orig_bio); 8402 8500 } 8403 8501 out: ··· 8479 8577 goto err; 8480 8578 } 8481 8579 map: 8482 - ret = btrfs_map_bio(fs_info, bio, 0, async_submit); 8580 + ret = btrfs_map_bio(fs_info, bio, 0, 0); 8483 8581 err: 8484 8582 bio_put(bio); 8485 8583 return ret; ··· 8688 8786 } 8689 8787 8690 8788 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, 8691 - struct kiocb *iocb, 8692 8789 const struct iov_iter *iter, loff_t offset) 8693 8790 { 8694 8791 int seg; ··· 8734 8833 bool relock = false; 8735 8834 ssize_t ret; 8736 8835 8737 - if (check_direct_IO(fs_info, iocb, iter, offset)) 8836 + if (check_direct_IO(fs_info, iter, offset)) 8738 8837 return 0; 8739 8838 8740 8839 inode_dio_begin(inode); ··· 8769 8868 offset, count); 8770 8869 if (ret) 8771 8870 goto out; 8772 - dio_data.outstanding_extents = count_max_extents(count); 8773 8871 8774 8872 /* 8775 8873 * We need to know how many extents we reserved so that we can ··· 8815 8915 } else if (ret >= 0 && (size_t)ret < count) 8816 8916 btrfs_delalloc_release_space(inode, data_reserved, 8817 8917 offset, count - (size_t)ret); 8918 + btrfs_delalloc_release_extents(BTRFS_I(inode), count); 8818 8919 } 8819 8920 out: 8820 8921 if (wakeup) ··· 9133 9232 fs_info->sectorsize); 9134 9233 if (reserved_space < PAGE_SIZE) { 9135 9234 end = page_start + reserved_space - 1; 9136 - spin_lock(&BTRFS_I(inode)->lock); 9137 - BTRFS_I(inode)->outstanding_extents++; 9138 - spin_unlock(&BTRFS_I(inode)->lock); 9139 9235 btrfs_delalloc_release_space(inode, data_reserved, 9140 9236 page_start, PAGE_SIZE - reserved_space); 9141 9237 } ··· 9184 9286 9185 9287 out_unlock: 9186 9288 if (!ret) { 
9289 + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 9187 9290 sb_end_pagefault(inode->i_sb); 9188 9291 extent_changeset_free(data_reserved); 9189 9292 return VM_FAULT_LOCKED; 9190 9293 } 9191 9294 unlock_page(page); 9192 9295 out: 9296 + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 9193 9297 btrfs_delalloc_release_space(inode, data_reserved, page_start, 9194 9298 reserved_space); 9195 9299 out_noreserve: ··· 9287 9387 ret = btrfs_truncate_inode_items(trans, root, inode, 9288 9388 inode->i_size, 9289 9389 BTRFS_EXTENT_DATA_KEY); 9390 + trans->block_rsv = &fs_info->trans_block_rsv; 9290 9391 if (ret != -ENOSPC && ret != -EAGAIN) { 9291 9392 err = ret; 9292 9393 break; 9293 9394 } 9294 9395 9295 - trans->block_rsv = &fs_info->trans_block_rsv; 9296 9396 ret = btrfs_update_inode(trans, root, inode); 9297 9397 if (ret) { 9298 9398 err = ret; ··· 9314 9414 rsv, min_size, 0); 9315 9415 BUG_ON(ret); /* shouldn't happen */ 9316 9416 trans->block_rsv = rsv; 9417 + } 9418 + 9419 + /* 9420 + * We can't call btrfs_truncate_block inside a trans handle as we could 9421 + * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know 9422 + * we've truncated everything except the last little bit, and can do 9423 + * btrfs_truncate_block and then update the disk_i_size. 
9424 + */ 9425 + if (ret == NEED_TRUNCATE_BLOCK) { 9426 + btrfs_end_transaction(trans); 9427 + btrfs_btree_balance_dirty(fs_info); 9428 + 9429 + ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); 9430 + if (ret) 9431 + goto out; 9432 + trans = btrfs_start_transaction(root, 1); 9433 + if (IS_ERR(trans)) { 9434 + ret = PTR_ERR(trans); 9435 + goto out; 9436 + } 9437 + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 9317 9438 } 9318 9439 9319 9440 if (ret == 0 && inode->i_nlink > 0) { ··· 9401 9480 9402 9481 struct inode *btrfs_alloc_inode(struct super_block *sb) 9403 9482 { 9483 + struct btrfs_fs_info *fs_info = btrfs_sb(sb); 9404 9484 struct btrfs_inode *ei; 9405 9485 struct inode *inode; 9406 9486 ··· 9428 9506 9429 9507 spin_lock_init(&ei->lock); 9430 9508 ei->outstanding_extents = 0; 9431 - ei->reserved_extents = 0; 9432 - 9509 + if (sb->s_magic != BTRFS_TEST_MAGIC) 9510 + btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, 9511 + BTRFS_BLOCK_RSV_DELALLOC); 9433 9512 ei->runtime_flags = 0; 9434 9513 ei->prop_compress = BTRFS_COMPRESS_NONE; 9435 9514 ei->defrag_compress = BTRFS_COMPRESS_NONE; ··· 9480 9557 9481 9558 WARN_ON(!hlist_empty(&inode->i_dentry)); 9482 9559 WARN_ON(inode->i_data.nrpages); 9560 + WARN_ON(BTRFS_I(inode)->block_rsv.reserved); 9561 + WARN_ON(BTRFS_I(inode)->block_rsv.size); 9483 9562 WARN_ON(BTRFS_I(inode)->outstanding_extents); 9484 - WARN_ON(BTRFS_I(inode)->reserved_extents); 9485 9563 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 9486 9564 WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); 9487 9565 WARN_ON(BTRFS_I(inode)->csum_bytes); ··· 10261 10337 ret = __start_delalloc_inodes(root, delay_iput, -1); 10262 10338 if (ret > 0) 10263 10339 ret = 0; 10264 - /* 10265 - * the filemap_flush will queue IO into the worker threads, but 10266 - * we have to make sure the IO is actually started and that 10267 - * ordered extents get created before we return 10268 - */ 10269 - atomic_inc(&fs_info->async_submit_draining); 10270 - while 
(atomic_read(&fs_info->nr_async_submits) || 10271 - atomic_read(&fs_info->async_delalloc_pages)) { 10272 - wait_event(fs_info->async_submit_wait, 10273 - (atomic_read(&fs_info->nr_async_submits) == 0 && 10274 - atomic_read(&fs_info->async_delalloc_pages) == 0)); 10275 - } 10276 - atomic_dec(&fs_info->async_submit_draining); 10277 10340 return ret; 10278 10341 } 10279 10342 ··· 10302 10391 spin_unlock(&fs_info->delalloc_root_lock); 10303 10392 10304 10393 ret = 0; 10305 - atomic_inc(&fs_info->async_submit_draining); 10306 - while (atomic_read(&fs_info->nr_async_submits) || 10307 - atomic_read(&fs_info->async_delalloc_pages)) { 10308 - wait_event(fs_info->async_submit_wait, 10309 - (atomic_read(&fs_info->nr_async_submits) == 0 && 10310 - atomic_read(&fs_info->async_delalloc_pages) == 0)); 10311 - } 10312 - atomic_dec(&fs_info->async_submit_draining); 10313 10394 out: 10314 10395 if (!list_empty_careful(&splice)) { 10315 10396 spin_lock(&fs_info->delalloc_root_lock);
+102 -54
fs/btrfs/ioctl.c
··· 86 86 struct btrfs_ioctl_received_subvol_args_32) 87 87 #endif 88 88 89 + #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 90 + struct btrfs_ioctl_send_args_32 { 91 + __s64 send_fd; /* in */ 92 + __u64 clone_sources_count; /* in */ 93 + compat_uptr_t clone_sources; /* in */ 94 + __u64 parent_root; /* in */ 95 + __u64 flags; /* in */ 96 + __u64 reserved[4]; /* in */ 97 + } __attribute__ ((__packed__)); 98 + 99 + #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ 100 + struct btrfs_ioctl_send_args_32) 101 + #endif 89 102 90 103 static int btrfs_clone(struct inode *src, struct inode *inode, 91 104 u64 off, u64 olen, u64 olen_aligned, u64 destoff, ··· 622 609 return ret; 623 610 } 624 611 625 - static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root) 626 - { 627 - s64 writers; 628 - DEFINE_WAIT(wait); 629 - 630 - do { 631 - prepare_to_wait(&root->subv_writers->wait, &wait, 632 - TASK_UNINTERRUPTIBLE); 633 - 634 - writers = percpu_counter_sum(&root->subv_writers->counter); 635 - if (writers) 636 - schedule(); 637 - 638 - finish_wait(&root->subv_writers->wait, &wait); 639 - } while (writers); 640 - } 641 - 642 612 static int create_snapshot(struct btrfs_root *root, struct inode *dir, 643 613 struct dentry *dentry, 644 614 u64 *async_transid, bool readonly, ··· 650 654 651 655 atomic_inc(&root->will_be_snapshotted); 652 656 smp_mb__after_atomic(); 653 - btrfs_wait_for_no_snapshotting_writes(root); 657 + /* wait for no snapshot writes */ 658 + wait_event(root->subv_writers->wait, 659 + percpu_counter_sum(&root->subv_writers->counter) == 0); 654 660 655 661 ret = btrfs_start_delalloc_inodes(root, 0); 656 662 if (ret) ··· 1217 1219 unlock_page(pages[i]); 1218 1220 put_page(pages[i]); 1219 1221 } 1222 + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); 1220 1223 extent_changeset_free(data_reserved); 1221 1224 return i_done; 1222 1225 out: ··· 1228 1229 btrfs_delalloc_release_space(inode, data_reserved, 1229 1230 start_index 
<< PAGE_SHIFT, 1230 1231 page_cnt << PAGE_SHIFT); 1232 + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); 1231 1233 extent_changeset_free(data_reserved); 1232 1234 return ret; 1233 1235 ··· 1418 1418 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1419 1419 &BTRFS_I(inode)->runtime_flags)) 1420 1420 filemap_flush(inode->i_mapping); 1421 - } 1422 - 1423 - if (do_compress) { 1424 - /* the filemap_flush will queue IO into the worker threads, but 1425 - * we have to make sure the IO is actually started and that 1426 - * ordered extents get created before we return 1427 - */ 1428 - atomic_inc(&fs_info->async_submit_draining); 1429 - while (atomic_read(&fs_info->nr_async_submits) || 1430 - atomic_read(&fs_info->async_delalloc_pages)) { 1431 - wait_event(fs_info->async_submit_wait, 1432 - (atomic_read(&fs_info->nr_async_submits) == 0 && 1433 - atomic_read(&fs_info->async_delalloc_pages) == 0)); 1434 - } 1435 - atomic_dec(&fs_info->async_submit_draining); 1436 1421 } 1437 1422 1438 1423 if (range->compress_type == BTRFS_COMPRESS_LZO) { ··· 1827 1842 1828 1843 ret = btrfs_update_root(trans, fs_info->tree_root, 1829 1844 &root->root_key, &root->root_item); 1845 + if (ret < 0) { 1846 + btrfs_end_transaction(trans); 1847 + goto out_reset; 1848 + } 1830 1849 1831 - btrfs_commit_transaction(trans); 1850 + ret = btrfs_commit_transaction(trans); 1851 + 1832 1852 out_reset: 1833 1853 if (ret) 1834 1854 btrfs_set_root_flags(&root->root_item, root_flags); ··· 2169 2179 2170 2180 inode = file_inode(file); 2171 2181 ret = search_ioctl(inode, &args.key, &buf_size, 2172 - (char *)(&uarg->buf[0])); 2182 + (char __user *)(&uarg->buf[0])); 2173 2183 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) 2174 2184 ret = -EFAULT; 2175 2185 else if (ret == -EOVERFLOW && ··· 3696 3706 if (disko) { 3697 3707 inode_add_bytes(inode, datal); 3698 3708 ret = btrfs_inc_extent_ref(trans, 3699 - fs_info, 3709 + root, 3700 3710 disko, diskl, 0, 3701 3711 
root->root_key.objectid, 3702 3712 btrfs_ino(BTRFS_I(inode)), ··· 4119 4129 struct btrfs_ioctl_space_info *dest_orig; 4120 4130 struct btrfs_ioctl_space_info __user *user_dest; 4121 4131 struct btrfs_space_info *info; 4122 - u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 4123 - BTRFS_BLOCK_GROUP_SYSTEM, 4124 - BTRFS_BLOCK_GROUP_METADATA, 4125 - BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; 4132 + static const u64 types[] = { 4133 + BTRFS_BLOCK_GROUP_DATA, 4134 + BTRFS_BLOCK_GROUP_SYSTEM, 4135 + BTRFS_BLOCK_GROUP_METADATA, 4136 + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA 4137 + }; 4126 4138 int num_types = 4; 4127 4139 int alloc_size; 4128 4140 int ret = 0; ··· 4496 4504 ipath->fspath->val[i] = rel_ptr; 4497 4505 } 4498 4506 4499 - ret = copy_to_user((void *)(unsigned long)ipa->fspath, 4500 - (void *)(unsigned long)ipath->fspath, size); 4507 + ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, 4508 + ipath->fspath, size); 4501 4509 if (ret) { 4502 4510 ret = -EFAULT; 4503 4511 goto out; ··· 4532 4540 } 4533 4541 4534 4542 static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, 4535 - void __user *arg) 4543 + void __user *arg, int version) 4536 4544 { 4537 4545 int ret = 0; 4538 4546 int size; 4539 4547 struct btrfs_ioctl_logical_ino_args *loi; 4540 4548 struct btrfs_data_container *inodes = NULL; 4541 4549 struct btrfs_path *path = NULL; 4550 + bool ignore_offset; 4542 4551 4543 4552 if (!capable(CAP_SYS_ADMIN)) 4544 4553 return -EPERM; ··· 4548 4555 if (IS_ERR(loi)) 4549 4556 return PTR_ERR(loi); 4550 4557 4558 + if (version == 1) { 4559 + ignore_offset = false; 4560 + size = min_t(u32, loi->size, SZ_64K); 4561 + } else { 4562 + /* All reserved bits must be 0 for now */ 4563 + if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { 4564 + ret = -EINVAL; 4565 + goto out_loi; 4566 + } 4567 + /* Only accept flags we have defined so far */ 4568 + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { 4569 + ret = -EINVAL; 4570 
+ goto out_loi; 4571 + } 4572 + ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; 4573 + size = min_t(u32, loi->size, SZ_16M); 4574 + } 4575 + 4551 4576 path = btrfs_alloc_path(); 4552 4577 if (!path) { 4553 4578 ret = -ENOMEM; 4554 4579 goto out; 4555 4580 } 4556 4581 4557 - size = min_t(u32, loi->size, SZ_64K); 4558 4582 inodes = init_data_container(size); 4559 4583 if (IS_ERR(inodes)) { 4560 4584 ret = PTR_ERR(inodes); ··· 4580 4570 } 4581 4571 4582 4572 ret = iterate_inodes_from_logical(loi->logical, fs_info, path, 4583 - build_ino_list, inodes); 4573 + build_ino_list, inodes, ignore_offset); 4584 4574 if (ret == -EINVAL) 4585 4575 ret = -ENOENT; 4586 4576 if (ret < 0) 4587 4577 goto out; 4588 4578 4589 - ret = copy_to_user((void *)(unsigned long)loi->inodes, 4590 - (void *)(unsigned long)inodes, size); 4579 + ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, 4580 + size); 4591 4581 if (ret) 4592 4582 ret = -EFAULT; 4593 4583 4594 4584 out: 4595 4585 btrfs_free_path(path); 4596 4586 kvfree(inodes); 4587 + out_loi: 4597 4588 kfree(loi); 4598 4589 4599 4590 return ret; ··· 5171 5160 root->root_key.objectid); 5172 5161 if (ret < 0 && ret != -EEXIST) { 5173 5162 btrfs_abort_transaction(trans, ret); 5163 + btrfs_end_transaction(trans); 5174 5164 goto out; 5175 5165 } 5176 5166 } 5177 5167 ret = btrfs_commit_transaction(trans); 5178 - if (ret < 0) { 5179 - btrfs_abort_transaction(trans, ret); 5180 - goto out; 5181 - } 5182 - 5183 5168 out: 5184 5169 up_write(&fs_info->subvol_sem); 5185 5170 mnt_drop_write_file(file); ··· 5497 5490 return ret; 5498 5491 } 5499 5492 5493 + static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) 5494 + { 5495 + struct btrfs_ioctl_send_args *arg; 5496 + int ret; 5497 + 5498 + if (compat) { 5499 + #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5500 + struct btrfs_ioctl_send_args_32 args32; 5501 + 5502 + ret = copy_from_user(&args32, argp, sizeof(args32)); 5503 + if (ret) 
5504 + return -EFAULT; 5505 + arg = kzalloc(sizeof(*arg), GFP_KERNEL); 5506 + if (!arg) 5507 + return -ENOMEM; 5508 + arg->send_fd = args32.send_fd; 5509 + arg->clone_sources_count = args32.clone_sources_count; 5510 + arg->clone_sources = compat_ptr(args32.clone_sources); 5511 + arg->parent_root = args32.parent_root; 5512 + arg->flags = args32.flags; 5513 + memcpy(arg->reserved, args32.reserved, 5514 + sizeof(args32.reserved)); 5515 + #else 5516 + return -ENOTTY; 5517 + #endif 5518 + } else { 5519 + arg = memdup_user(argp, sizeof(*arg)); 5520 + if (IS_ERR(arg)) 5521 + return PTR_ERR(arg); 5522 + } 5523 + ret = btrfs_ioctl_send(file, arg); 5524 + kfree(arg); 5525 + return ret; 5526 + } 5527 + 5500 5528 long btrfs_ioctl(struct file *file, unsigned int 5501 5529 cmd, unsigned long arg) 5502 5530 { ··· 5596 5554 case BTRFS_IOC_INO_PATHS: 5597 5555 return btrfs_ioctl_ino_to_path(root, argp); 5598 5556 case BTRFS_IOC_LOGICAL_INO: 5599 - return btrfs_ioctl_logical_to_ino(fs_info, argp); 5557 + return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); 5558 + case BTRFS_IOC_LOGICAL_INO_V2: 5559 + return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); 5600 5560 case BTRFS_IOC_SPACE_INFO: 5601 5561 return btrfs_ioctl_space_info(fs_info, argp); 5602 5562 case BTRFS_IOC_SYNC: { ··· 5639 5595 return btrfs_ioctl_set_received_subvol_32(file, argp); 5640 5596 #endif 5641 5597 case BTRFS_IOC_SEND: 5642 - return btrfs_ioctl_send(file, argp); 5598 + return _btrfs_ioctl_send(file, argp, false); 5599 + #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5600 + case BTRFS_IOC_SEND_32: 5601 + return _btrfs_ioctl_send(file, argp, true); 5602 + #endif 5643 5603 case BTRFS_IOC_GET_DEV_STATS: 5644 5604 return btrfs_ioctl_get_dev_stats(fs_info, argp); 5645 5605 case BTRFS_IOC_QUOTA_CTL:
+5
fs/btrfs/lzo.c
··· 430 430 return ret; 431 431 } 432 432 433 + static void lzo_set_level(struct list_head *ws, unsigned int type) 434 + { 435 + } 436 + 433 437 const struct btrfs_compress_op btrfs_lzo_compress = { 434 438 .alloc_workspace = lzo_alloc_workspace, 435 439 .free_workspace = lzo_free_workspace, 436 440 .compress_pages = lzo_compress_pages, 437 441 .decompress_bio = lzo_decompress_bio, 438 442 .decompress = lzo_decompress, 443 + .set_level = lzo_set_level, 439 444 };
+19 -2
fs/btrfs/ordered-data.c
··· 242 242 } 243 243 spin_unlock(&root->ordered_extent_lock); 244 244 245 + /* 246 + * We don't need the count_max_extents here, we can assume that all of 247 + * that work has been done at higher layers, so this is truly the 248 + * smallest the extent is going to get. 249 + */ 250 + spin_lock(&BTRFS_I(inode)->lock); 251 + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); 252 + spin_unlock(&BTRFS_I(inode)->lock); 253 + 245 254 return 0; 246 255 } 247 256 ··· 600 591 { 601 592 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 602 593 struct btrfs_ordered_inode_tree *tree; 603 - struct btrfs_root *root = BTRFS_I(inode)->root; 594 + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); 595 + struct btrfs_root *root = btrfs_inode->root; 604 596 struct rb_node *node; 605 597 bool dec_pending_ordered = false; 606 598 607 - tree = &BTRFS_I(inode)->ordered_tree; 599 + /* This is paired with btrfs_add_ordered_extent. */ 600 + spin_lock(&btrfs_inode->lock); 601 + btrfs_mod_outstanding_extents(btrfs_inode, -1); 602 + spin_unlock(&btrfs_inode->lock); 603 + if (root != fs_info->tree_root) 604 + btrfs_delalloc_release_metadata(btrfs_inode, entry->len); 605 + 606 + tree = &btrfs_inode->ordered_tree; 608 607 spin_lock_irq(&tree->lock); 609 608 node = &entry->rb_node; 610 609 rb_erase(node, &tree->tree);
+4 -4
fs/btrfs/qgroup.c
··· 1441 1441 u64 bytenr = qrecord->bytenr; 1442 1442 int ret; 1443 1443 1444 - ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root); 1444 + ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false); 1445 1445 if (ret < 0) 1446 1446 return ret; 1447 1447 ··· 2031 2031 /* Search commit root to find old_roots */ 2032 2032 ret = btrfs_find_all_roots(NULL, fs_info, 2033 2033 record->bytenr, 0, 2034 - &record->old_roots); 2034 + &record->old_roots, false); 2035 2035 if (ret < 0) 2036 2036 goto cleanup; 2037 2037 } ··· 2042 2042 * root. It's safe inside commit_transaction(). 2043 2043 */ 2044 2044 ret = btrfs_find_all_roots(trans, fs_info, 2045 - record->bytenr, SEQ_LAST, &new_roots); 2045 + record->bytenr, SEQ_LAST, &new_roots, false); 2046 2046 if (ret < 0) 2047 2047 goto cleanup; 2048 2048 if (qgroup_to_skip) { ··· 2570 2570 num_bytes = found.offset; 2571 2571 2572 2572 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, 2573 - &roots); 2573 + &roots, false); 2574 2574 if (ret < 0) 2575 2575 goto out; 2576 2576 /* For rescan, just pass old_roots as NULL */
+27 -3
fs/btrfs/raid56.c
··· 1326 1326 1327 1327 cleanup: 1328 1328 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1329 + 1330 + while ((bio = bio_list_pop(&bio_list))) 1331 + bio_put(bio); 1329 1332 } 1330 1333 1331 1334 /* ··· 1585 1582 1586 1583 cleanup: 1587 1584 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1585 + 1586 + while ((bio = bio_list_pop(&bio_list))) 1587 + bio_put(bio); 1588 + 1588 1589 return -EIO; 1589 1590 1590 1591 finish: ··· 2114 2107 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2115 2108 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 2116 2109 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2110 + 2111 + while ((bio = bio_list_pop(&bio_list))) 2112 + bio_put(bio); 2113 + 2117 2114 return -EIO; 2118 2115 } 2119 2116 ··· 2242 2231 ASSERT(!bio->bi_iter.bi_size); 2243 2232 rbio->operation = BTRFS_RBIO_PARITY_SCRUB; 2244 2233 2245 - for (i = 0; i < rbio->real_stripes; i++) { 2234 + /* 2235 + * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted 2236 + * to the end position, so this search can start from the first parity 2237 + * stripe. 
2238 + */ 2239 + for (i = rbio->nr_data; i < rbio->real_stripes; i++) { 2246 2240 if (bbio->stripes[i].dev == scrub_dev) { 2247 2241 rbio->scrubp = i; 2248 2242 break; 2249 2243 } 2250 2244 } 2245 + ASSERT(i < rbio->real_stripes); 2251 2246 2252 2247 /* Now we just support the sectorsize equals to page size */ 2253 2248 ASSERT(fs_info->sectorsize == PAGE_SIZE); ··· 2471 2454 2472 2455 cleanup: 2473 2456 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2457 + 2458 + while ((bio = bio_list_pop(&bio_list))) 2459 + bio_put(bio); 2474 2460 } 2475 2461 2476 2462 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) ··· 2583 2563 int stripe; 2584 2564 struct bio *bio; 2585 2565 2566 + bio_list_init(&bio_list); 2567 + 2586 2568 ret = alloc_rbio_essential_pages(rbio); 2587 2569 if (ret) 2588 2570 goto cleanup; 2589 - 2590 - bio_list_init(&bio_list); 2591 2571 2592 2572 atomic_set(&rbio->error, 0); 2593 2573 /* ··· 2656 2636 2657 2637 cleanup: 2658 2638 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2639 + 2640 + while ((bio = bio_list_pop(&bio_list))) 2641 + bio_put(bio); 2642 + 2659 2643 return; 2660 2644 2661 2645 finish:
+1031
fs/btrfs/ref-verify.c
··· 1 + /* 2 + * Copyright (C) 2014 Facebook. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public 6 + * License v2 as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public 14 + * License along with this program; if not, write to the 15 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 + * Boston, MA 021110-1307, USA. 17 + */ 18 + 19 + #include <linux/sched.h> 20 + #include <linux/stacktrace.h> 21 + #include "ctree.h" 22 + #include "disk-io.h" 23 + #include "locking.h" 24 + #include "delayed-ref.h" 25 + #include "ref-verify.h" 26 + 27 + /* 28 + * Used to keep track the roots and number of refs each root has for a given 29 + * bytenr. This just tracks the number of direct references, no shared 30 + * references. 31 + */ 32 + struct root_entry { 33 + u64 root_objectid; 34 + u64 num_refs; 35 + struct rb_node node; 36 + }; 37 + 38 + /* 39 + * These are meant to represent what should exist in the extent tree, these can 40 + * be used to verify the extent tree is consistent as these should all match 41 + * what the extent tree says. 42 + */ 43 + struct ref_entry { 44 + u64 root_objectid; 45 + u64 parent; 46 + u64 owner; 47 + u64 offset; 48 + u64 num_refs; 49 + struct rb_node node; 50 + }; 51 + 52 + #define MAX_TRACE 16 53 + 54 + /* 55 + * Whenever we add/remove a reference we record the action. The action maps 56 + * back to the delayed ref action. We hold the ref we are changing in the 57 + * action so we can account for the history properly, and we record the root we 58 + * were called with since it could be different from ref_root. 
We also store 59 + * stack traces because thats how I roll. 60 + */ 61 + struct ref_action { 62 + int action; 63 + u64 root; 64 + struct ref_entry ref; 65 + struct list_head list; 66 + unsigned long trace[MAX_TRACE]; 67 + unsigned int trace_len; 68 + }; 69 + 70 + /* 71 + * One of these for every block we reference, it holds the roots and references 72 + * to it as well as all of the ref actions that have occured to it. We never 73 + * free it until we unmount the file system in order to make sure re-allocations 74 + * are happening properly. 75 + */ 76 + struct block_entry { 77 + u64 bytenr; 78 + u64 len; 79 + u64 num_refs; 80 + int metadata; 81 + int from_disk; 82 + struct rb_root roots; 83 + struct rb_root refs; 84 + struct rb_node node; 85 + struct list_head actions; 86 + }; 87 + 88 + static struct block_entry *insert_block_entry(struct rb_root *root, 89 + struct block_entry *be) 90 + { 91 + struct rb_node **p = &root->rb_node; 92 + struct rb_node *parent_node = NULL; 93 + struct block_entry *entry; 94 + 95 + while (*p) { 96 + parent_node = *p; 97 + entry = rb_entry(parent_node, struct block_entry, node); 98 + if (entry->bytenr > be->bytenr) 99 + p = &(*p)->rb_left; 100 + else if (entry->bytenr < be->bytenr) 101 + p = &(*p)->rb_right; 102 + else 103 + return entry; 104 + } 105 + 106 + rb_link_node(&be->node, parent_node, p); 107 + rb_insert_color(&be->node, root); 108 + return NULL; 109 + } 110 + 111 + static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) 112 + { 113 + struct rb_node *n; 114 + struct block_entry *entry = NULL; 115 + 116 + n = root->rb_node; 117 + while (n) { 118 + entry = rb_entry(n, struct block_entry, node); 119 + if (entry->bytenr < bytenr) 120 + n = n->rb_right; 121 + else if (entry->bytenr > bytenr) 122 + n = n->rb_left; 123 + else 124 + return entry; 125 + } 126 + return NULL; 127 + } 128 + 129 + static struct root_entry *insert_root_entry(struct rb_root *root, 130 + struct root_entry *re) 131 + { 132 + struct 
rb_node **p = &root->rb_node; 133 + struct rb_node *parent_node = NULL; 134 + struct root_entry *entry; 135 + 136 + while (*p) { 137 + parent_node = *p; 138 + entry = rb_entry(parent_node, struct root_entry, node); 139 + if (entry->root_objectid > re->root_objectid) 140 + p = &(*p)->rb_left; 141 + else if (entry->root_objectid < re->root_objectid) 142 + p = &(*p)->rb_right; 143 + else 144 + return entry; 145 + } 146 + 147 + rb_link_node(&re->node, parent_node, p); 148 + rb_insert_color(&re->node, root); 149 + return NULL; 150 + 151 + } 152 + 153 + static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) 154 + { 155 + if (ref1->root_objectid < ref2->root_objectid) 156 + return -1; 157 + if (ref1->root_objectid > ref2->root_objectid) 158 + return 1; 159 + if (ref1->parent < ref2->parent) 160 + return -1; 161 + if (ref1->parent > ref2->parent) 162 + return 1; 163 + if (ref1->owner < ref2->owner) 164 + return -1; 165 + if (ref1->owner > ref2->owner) 166 + return 1; 167 + if (ref1->offset < ref2->offset) 168 + return -1; 169 + if (ref1->offset > ref2->offset) 170 + return 1; 171 + return 0; 172 + } 173 + 174 + static struct ref_entry *insert_ref_entry(struct rb_root *root, 175 + struct ref_entry *ref) 176 + { 177 + struct rb_node **p = &root->rb_node; 178 + struct rb_node *parent_node = NULL; 179 + struct ref_entry *entry; 180 + int cmp; 181 + 182 + while (*p) { 183 + parent_node = *p; 184 + entry = rb_entry(parent_node, struct ref_entry, node); 185 + cmp = comp_refs(entry, ref); 186 + if (cmp > 0) 187 + p = &(*p)->rb_left; 188 + else if (cmp < 0) 189 + p = &(*p)->rb_right; 190 + else 191 + return entry; 192 + } 193 + 194 + rb_link_node(&ref->node, parent_node, p); 195 + rb_insert_color(&ref->node, root); 196 + return NULL; 197 + 198 + } 199 + 200 + static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) 201 + { 202 + struct rb_node *n; 203 + struct root_entry *entry = NULL; 204 + 205 + n = root->rb_node; 206 + while (n) { 207 + entry 
= rb_entry(n, struct root_entry, node); 208 + if (entry->root_objectid < objectid) 209 + n = n->rb_right; 210 + else if (entry->root_objectid > objectid) 211 + n = n->rb_left; 212 + else 213 + return entry; 214 + } 215 + return NULL; 216 + } 217 + 218 + #ifdef CONFIG_STACKTRACE 219 + static void __save_stack_trace(struct ref_action *ra) 220 + { 221 + struct stack_trace stack_trace; 222 + 223 + stack_trace.max_entries = MAX_TRACE; 224 + stack_trace.nr_entries = 0; 225 + stack_trace.entries = ra->trace; 226 + stack_trace.skip = 2; 227 + save_stack_trace(&stack_trace); 228 + ra->trace_len = stack_trace.nr_entries; 229 + } 230 + 231 + static void __print_stack_trace(struct btrfs_fs_info *fs_info, 232 + struct ref_action *ra) 233 + { 234 + struct stack_trace trace; 235 + 236 + if (ra->trace_len == 0) { 237 + btrfs_err(fs_info, " ref-verify: no stacktrace"); 238 + return; 239 + } 240 + trace.nr_entries = ra->trace_len; 241 + trace.entries = ra->trace; 242 + print_stack_trace(&trace, 2); 243 + } 244 + #else 245 + static void inline __save_stack_trace(struct ref_action *ra) 246 + { 247 + } 248 + 249 + static void inline __print_stack_trace(struct btrfs_fs_info *fs_info, 250 + struct ref_action *ra) 251 + { 252 + btrfs_err(fs_info, " ref-verify: no stacktrace support"); 253 + } 254 + #endif 255 + 256 + static void free_block_entry(struct block_entry *be) 257 + { 258 + struct root_entry *re; 259 + struct ref_entry *ref; 260 + struct ref_action *ra; 261 + struct rb_node *n; 262 + 263 + while ((n = rb_first(&be->roots))) { 264 + re = rb_entry(n, struct root_entry, node); 265 + rb_erase(&re->node, &be->roots); 266 + kfree(re); 267 + } 268 + 269 + while((n = rb_first(&be->refs))) { 270 + ref = rb_entry(n, struct ref_entry, node); 271 + rb_erase(&ref->node, &be->refs); 272 + kfree(ref); 273 + } 274 + 275 + while (!list_empty(&be->actions)) { 276 + ra = list_first_entry(&be->actions, struct ref_action, 277 + list); 278 + list_del(&ra->list); 279 + kfree(ra); 280 + } 281 + 
kfree(be); 282 + } 283 + 284 + static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, 285 + u64 bytenr, u64 len, 286 + u64 root_objectid) 287 + { 288 + struct block_entry *be = NULL, *exist; 289 + struct root_entry *re = NULL; 290 + 291 + re = kzalloc(sizeof(struct root_entry), GFP_KERNEL); 292 + be = kzalloc(sizeof(struct block_entry), GFP_KERNEL); 293 + if (!be || !re) { 294 + kfree(re); 295 + kfree(be); 296 + return ERR_PTR(-ENOMEM); 297 + } 298 + be->bytenr = bytenr; 299 + be->len = len; 300 + 301 + re->root_objectid = root_objectid; 302 + re->num_refs = 0; 303 + 304 + spin_lock(&fs_info->ref_verify_lock); 305 + exist = insert_block_entry(&fs_info->block_tree, be); 306 + if (exist) { 307 + if (root_objectid) { 308 + struct root_entry *exist_re; 309 + 310 + exist_re = insert_root_entry(&exist->roots, re); 311 + if (exist_re) 312 + kfree(re); 313 + } 314 + kfree(be); 315 + return exist; 316 + } 317 + 318 + be->num_refs = 0; 319 + be->metadata = 0; 320 + be->from_disk = 0; 321 + be->roots = RB_ROOT; 322 + be->refs = RB_ROOT; 323 + INIT_LIST_HEAD(&be->actions); 324 + if (root_objectid) 325 + insert_root_entry(&be->roots, re); 326 + else 327 + kfree(re); 328 + return be; 329 + } 330 + 331 + static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root, 332 + u64 parent, u64 bytenr, int level) 333 + { 334 + struct block_entry *be; 335 + struct root_entry *re; 336 + struct ref_entry *ref = NULL, *exist; 337 + 338 + ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL); 339 + if (!ref) 340 + return -ENOMEM; 341 + 342 + if (parent) 343 + ref->root_objectid = 0; 344 + else 345 + ref->root_objectid = ref_root; 346 + ref->parent = parent; 347 + ref->owner = level; 348 + ref->offset = 0; 349 + ref->num_refs = 1; 350 + 351 + be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root); 352 + if (IS_ERR(be)) { 353 + kfree(ref); 354 + return PTR_ERR(be); 355 + } 356 + be->num_refs++; 357 + be->from_disk = 1; 358 + be->metadata = 1; 359 + 360 + if 
(!parent) { 361 + ASSERT(ref_root); 362 + re = lookup_root_entry(&be->roots, ref_root); 363 + ASSERT(re); 364 + re->num_refs++; 365 + } 366 + exist = insert_ref_entry(&be->refs, ref); 367 + if (exist) { 368 + exist->num_refs++; 369 + kfree(ref); 370 + } 371 + spin_unlock(&fs_info->ref_verify_lock); 372 + 373 + return 0; 374 + } 375 + 376 + static int add_shared_data_ref(struct btrfs_fs_info *fs_info, 377 + u64 parent, u32 num_refs, u64 bytenr, 378 + u64 num_bytes) 379 + { 380 + struct block_entry *be; 381 + struct ref_entry *ref; 382 + 383 + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); 384 + if (!ref) 385 + return -ENOMEM; 386 + be = add_block_entry(fs_info, bytenr, num_bytes, 0); 387 + if (IS_ERR(be)) { 388 + kfree(ref); 389 + return PTR_ERR(be); 390 + } 391 + be->num_refs += num_refs; 392 + 393 + ref->parent = parent; 394 + ref->num_refs = num_refs; 395 + if (insert_ref_entry(&be->refs, ref)) { 396 + spin_unlock(&fs_info->ref_verify_lock); 397 + btrfs_err(fs_info, "existing shared ref when reading from disk?"); 398 + kfree(ref); 399 + return -EINVAL; 400 + } 401 + spin_unlock(&fs_info->ref_verify_lock); 402 + return 0; 403 + } 404 + 405 + static int add_extent_data_ref(struct btrfs_fs_info *fs_info, 406 + struct extent_buffer *leaf, 407 + struct btrfs_extent_data_ref *dref, 408 + u64 bytenr, u64 num_bytes) 409 + { 410 + struct block_entry *be; 411 + struct ref_entry *ref; 412 + struct root_entry *re; 413 + u64 ref_root = btrfs_extent_data_ref_root(leaf, dref); 414 + u64 owner = btrfs_extent_data_ref_objectid(leaf, dref); 415 + u64 offset = btrfs_extent_data_ref_offset(leaf, dref); 416 + u32 num_refs = btrfs_extent_data_ref_count(leaf, dref); 417 + 418 + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); 419 + if (!ref) 420 + return -ENOMEM; 421 + be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); 422 + if (IS_ERR(be)) { 423 + kfree(ref); 424 + return PTR_ERR(be); 425 + } 426 + be->num_refs += num_refs; 427 + 428 + ref->parent = 0; 429 + 
ref->owner = owner; 430 + ref->root_objectid = ref_root; 431 + ref->offset = offset; 432 + ref->num_refs = num_refs; 433 + if (insert_ref_entry(&be->refs, ref)) { 434 + spin_unlock(&fs_info->ref_verify_lock); 435 + btrfs_err(fs_info, "existing ref when reading from disk?"); 436 + kfree(ref); 437 + return -EINVAL; 438 + } 439 + 440 + re = lookup_root_entry(&be->roots, ref_root); 441 + if (!re) { 442 + spin_unlock(&fs_info->ref_verify_lock); 443 + btrfs_err(fs_info, "missing root in new block entry?"); 444 + return -EINVAL; 445 + } 446 + re->num_refs += num_refs; 447 + spin_unlock(&fs_info->ref_verify_lock); 448 + return 0; 449 + } 450 + 451 + static int process_extent_item(struct btrfs_fs_info *fs_info, 452 + struct btrfs_path *path, struct btrfs_key *key, 453 + int slot, int *tree_block_level) 454 + { 455 + struct btrfs_extent_item *ei; 456 + struct btrfs_extent_inline_ref *iref; 457 + struct btrfs_extent_data_ref *dref; 458 + struct btrfs_shared_data_ref *sref; 459 + struct extent_buffer *leaf = path->nodes[0]; 460 + u32 item_size = btrfs_item_size_nr(leaf, slot); 461 + unsigned long end, ptr; 462 + u64 offset, flags, count; 463 + int type, ret; 464 + 465 + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 466 + flags = btrfs_extent_flags(leaf, ei); 467 + 468 + if ((key->type == BTRFS_EXTENT_ITEM_KEY) && 469 + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 470 + struct btrfs_tree_block_info *info; 471 + 472 + info = (struct btrfs_tree_block_info *)(ei + 1); 473 + *tree_block_level = btrfs_tree_block_level(leaf, info); 474 + iref = (struct btrfs_extent_inline_ref *)(info + 1); 475 + } else { 476 + if (key->type == BTRFS_METADATA_ITEM_KEY) 477 + *tree_block_level = key->offset; 478 + iref = (struct btrfs_extent_inline_ref *)(ei + 1); 479 + } 480 + 481 + ptr = (unsigned long)iref; 482 + end = (unsigned long)ei + item_size; 483 + while (ptr < end) { 484 + iref = (struct btrfs_extent_inline_ref *)ptr; 485 + type = btrfs_extent_inline_ref_type(leaf, iref); 486 + 
offset = btrfs_extent_inline_ref_offset(leaf, iref); 487 + switch (type) { 488 + case BTRFS_TREE_BLOCK_REF_KEY: 489 + ret = add_tree_block(fs_info, offset, 0, key->objectid, 490 + *tree_block_level); 491 + break; 492 + case BTRFS_SHARED_BLOCK_REF_KEY: 493 + ret = add_tree_block(fs_info, 0, offset, key->objectid, 494 + *tree_block_level); 495 + break; 496 + case BTRFS_EXTENT_DATA_REF_KEY: 497 + dref = (struct btrfs_extent_data_ref *)(&iref->offset); 498 + ret = add_extent_data_ref(fs_info, leaf, dref, 499 + key->objectid, key->offset); 500 + break; 501 + case BTRFS_SHARED_DATA_REF_KEY: 502 + sref = (struct btrfs_shared_data_ref *)(iref + 1); 503 + count = btrfs_shared_data_ref_count(leaf, sref); 504 + ret = add_shared_data_ref(fs_info, offset, count, 505 + key->objectid, key->offset); 506 + break; 507 + default: 508 + btrfs_err(fs_info, "invalid key type in iref"); 509 + ret = -EINVAL; 510 + break; 511 + } 512 + if (ret) 513 + break; 514 + ptr += btrfs_extent_inline_ref_size(type); 515 + } 516 + return ret; 517 + } 518 + 519 + static int process_leaf(struct btrfs_root *root, 520 + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes) 521 + { 522 + struct btrfs_fs_info *fs_info = root->fs_info; 523 + struct extent_buffer *leaf = path->nodes[0]; 524 + struct btrfs_extent_data_ref *dref; 525 + struct btrfs_shared_data_ref *sref; 526 + u32 count; 527 + int i = 0, tree_block_level = 0, ret; 528 + struct btrfs_key key; 529 + int nritems = btrfs_header_nritems(leaf); 530 + 531 + for (i = 0; i < nritems; i++) { 532 + btrfs_item_key_to_cpu(leaf, &key, i); 533 + switch (key.type) { 534 + case BTRFS_EXTENT_ITEM_KEY: 535 + *num_bytes = key.offset; 536 + case BTRFS_METADATA_ITEM_KEY: 537 + *bytenr = key.objectid; 538 + ret = process_extent_item(fs_info, path, &key, i, 539 + &tree_block_level); 540 + break; 541 + case BTRFS_TREE_BLOCK_REF_KEY: 542 + ret = add_tree_block(fs_info, key.offset, 0, 543 + key.objectid, tree_block_level); 544 + break; 545 + case 
BTRFS_SHARED_BLOCK_REF_KEY: 546 + ret = add_tree_block(fs_info, 0, key.offset, 547 + key.objectid, tree_block_level); 548 + break; 549 + case BTRFS_EXTENT_DATA_REF_KEY: 550 + dref = btrfs_item_ptr(leaf, i, 551 + struct btrfs_extent_data_ref); 552 + ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr, 553 + *num_bytes); 554 + break; 555 + case BTRFS_SHARED_DATA_REF_KEY: 556 + sref = btrfs_item_ptr(leaf, i, 557 + struct btrfs_shared_data_ref); 558 + count = btrfs_shared_data_ref_count(leaf, sref); 559 + ret = add_shared_data_ref(fs_info, key.offset, count, 560 + *bytenr, *num_bytes); 561 + break; 562 + default: 563 + break; 564 + } 565 + if (ret) 566 + break; 567 + } 568 + return ret; 569 + } 570 + 571 + /* Walk down to the leaf from the given level */ 572 + static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, 573 + int level, u64 *bytenr, u64 *num_bytes) 574 + { 575 + struct btrfs_fs_info *fs_info = root->fs_info; 576 + struct extent_buffer *eb; 577 + u64 block_bytenr, gen; 578 + int ret = 0; 579 + 580 + while (level >= 0) { 581 + if (level) { 582 + block_bytenr = btrfs_node_blockptr(path->nodes[level], 583 + path->slots[level]); 584 + gen = btrfs_node_ptr_generation(path->nodes[level], 585 + path->slots[level]); 586 + eb = read_tree_block(fs_info, block_bytenr, gen); 587 + if (IS_ERR(eb)) 588 + return PTR_ERR(eb); 589 + if (!extent_buffer_uptodate(eb)) { 590 + free_extent_buffer(eb); 591 + return -EIO; 592 + } 593 + btrfs_tree_read_lock(eb); 594 + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 595 + path->nodes[level-1] = eb; 596 + path->slots[level-1] = 0; 597 + path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; 598 + } else { 599 + ret = process_leaf(root, path, bytenr, num_bytes); 600 + if (ret) 601 + break; 602 + } 603 + level--; 604 + } 605 + return ret; 606 + } 607 + 608 + /* Walk up to the next node that needs to be processed */ 609 + static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, 610 + int *level) 611 + { 
612 + int l; 613 + 614 + for (l = 0; l < BTRFS_MAX_LEVEL; l++) { 615 + if (!path->nodes[l]) 616 + continue; 617 + if (l) { 618 + path->slots[l]++; 619 + if (path->slots[l] < 620 + btrfs_header_nritems(path->nodes[l])) { 621 + *level = l; 622 + return 0; 623 + } 624 + } 625 + btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]); 626 + free_extent_buffer(path->nodes[l]); 627 + path->nodes[l] = NULL; 628 + path->slots[l] = 0; 629 + path->locks[l] = 0; 630 + } 631 + 632 + return 1; 633 + } 634 + 635 + static void dump_ref_action(struct btrfs_fs_info *fs_info, 636 + struct ref_action *ra) 637 + { 638 + btrfs_err(fs_info, 639 + " Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", 640 + ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent, 641 + ra->ref.owner, ra->ref.offset, ra->ref.num_refs); 642 + __print_stack_trace(fs_info, ra); 643 + } 644 + 645 + /* 646 + * Dumps all the information from the block entry to printk, it's going to be 647 + * awesome. 
648 + */ 649 + static void dump_block_entry(struct btrfs_fs_info *fs_info, 650 + struct block_entry *be) 651 + { 652 + struct ref_entry *ref; 653 + struct root_entry *re; 654 + struct ref_action *ra; 655 + struct rb_node *n; 656 + 657 + btrfs_err(fs_info, 658 + "dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d", 659 + be->bytenr, be->len, be->num_refs, be->metadata, 660 + be->from_disk); 661 + 662 + for (n = rb_first(&be->refs); n; n = rb_next(n)) { 663 + ref = rb_entry(n, struct ref_entry, node); 664 + btrfs_err(fs_info, 665 + " ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", 666 + ref->root_objectid, ref->parent, ref->owner, 667 + ref->offset, ref->num_refs); 668 + } 669 + 670 + for (n = rb_first(&be->roots); n; n = rb_next(n)) { 671 + re = rb_entry(n, struct root_entry, node); 672 + btrfs_err(fs_info, " root entry %llu, num_refs %llu", 673 + re->root_objectid, re->num_refs); 674 + } 675 + 676 + list_for_each_entry(ra, &be->actions, list) 677 + dump_ref_action(fs_info, ra); 678 + } 679 + 680 + /* 681 + * btrfs_ref_tree_mod: called when we modify a ref for a bytenr 682 + * @root: the root we are making this modification from. 683 + * @bytenr: the bytenr we are modifying. 684 + * @num_bytes: number of bytes. 685 + * @parent: the parent bytenr. 686 + * @ref_root: the original root owner of the bytenr. 687 + * @owner: level in the case of metadata, inode in the case of data. 688 + * @offset: 0 for metadata, file offset for data. 689 + * @action: the action that we are doing, this is the same as the delayed ref 690 + * action. 691 + * 692 + * This will add an action item to the given bytenr and do sanity checks to make 693 + * sure we haven't messed something up. If we are making a new allocation and 694 + * this block entry has history we will delete all previous actions as long as 695 + * our sanity checks pass as they are no longer needed. 
696 + */ 697 + int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, 698 + u64 parent, u64 ref_root, u64 owner, u64 offset, 699 + int action) 700 + { 701 + struct btrfs_fs_info *fs_info = root->fs_info; 702 + struct ref_entry *ref = NULL, *exist; 703 + struct ref_action *ra = NULL; 704 + struct block_entry *be = NULL; 705 + struct root_entry *re = NULL; 706 + int ret = 0; 707 + bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; 708 + 709 + if (!btrfs_test_opt(root->fs_info, REF_VERIFY)) 710 + return 0; 711 + 712 + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); 713 + ra = kmalloc(sizeof(struct ref_action), GFP_NOFS); 714 + if (!ra || !ref) { 715 + kfree(ref); 716 + kfree(ra); 717 + ret = -ENOMEM; 718 + goto out; 719 + } 720 + 721 + if (parent) { 722 + ref->parent = parent; 723 + } else { 724 + ref->root_objectid = ref_root; 725 + ref->owner = owner; 726 + ref->offset = offset; 727 + } 728 + ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; 729 + 730 + memcpy(&ra->ref, ref, sizeof(struct ref_entry)); 731 + /* 732 + * Save the extra info from the delayed ref in the ref action to make it 733 + * easier to figure out what is happening. The real ref's we add to the 734 + * ref tree need to reflect what we save on disk so it matches any 735 + * on-disk refs we pre-loaded. 736 + */ 737 + ra->ref.owner = owner; 738 + ra->ref.offset = offset; 739 + ra->ref.root_objectid = ref_root; 740 + __save_stack_trace(ra); 741 + 742 + INIT_LIST_HEAD(&ra->list); 743 + ra->action = action; 744 + ra->root = root->objectid; 745 + 746 + /* 747 + * This is an allocation, preallocate the block_entry in case we haven't 748 + * used it before. 749 + */ 750 + ret = -EINVAL; 751 + if (action == BTRFS_ADD_DELAYED_EXTENT) { 752 + /* 753 + * For subvol_create we'll just pass in whatever the parent root 754 + * is and the new root objectid, so let's not treat the passed 755 + * in root as if it really has a ref for this bytenr. 
756 + */ 757 + be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root); 758 + if (IS_ERR(be)) { 759 + kfree(ra); 760 + ret = PTR_ERR(be); 761 + goto out; 762 + } 763 + be->num_refs++; 764 + if (metadata) 765 + be->metadata = 1; 766 + 767 + if (be->num_refs != 1) { 768 + btrfs_err(fs_info, 769 + "re-allocated a block that still has references to it!"); 770 + dump_block_entry(fs_info, be); 771 + dump_ref_action(fs_info, ra); 772 + goto out_unlock; 773 + } 774 + 775 + while (!list_empty(&be->actions)) { 776 + struct ref_action *tmp; 777 + 778 + tmp = list_first_entry(&be->actions, struct ref_action, 779 + list); 780 + list_del(&tmp->list); 781 + kfree(tmp); 782 + } 783 + } else { 784 + struct root_entry *tmp; 785 + 786 + if (!parent) { 787 + re = kmalloc(sizeof(struct root_entry), GFP_NOFS); 788 + if (!re) { 789 + kfree(ref); 790 + kfree(ra); 791 + ret = -ENOMEM; 792 + goto out; 793 + } 794 + /* 795 + * This is the root that is modifying us, so it's the 796 + * one we want to lookup below when we modify the 797 + * re->num_refs. 
798 + */ 799 + ref_root = root->objectid; 800 + re->root_objectid = root->objectid; 801 + re->num_refs = 0; 802 + } 803 + 804 + spin_lock(&root->fs_info->ref_verify_lock); 805 + be = lookup_block_entry(&root->fs_info->block_tree, bytenr); 806 + if (!be) { 807 + btrfs_err(fs_info, 808 + "trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", 809 + action, (unsigned long long)bytenr, 810 + (unsigned long long)num_bytes); 811 + dump_ref_action(fs_info, ra); 812 + kfree(ref); 813 + kfree(ra); 814 + goto out_unlock; 815 + } 816 + 817 + if (!parent) { 818 + tmp = insert_root_entry(&be->roots, re); 819 + if (tmp) { 820 + kfree(re); 821 + re = tmp; 822 + } 823 + } 824 + } 825 + 826 + exist = insert_ref_entry(&be->refs, ref); 827 + if (exist) { 828 + if (action == BTRFS_DROP_DELAYED_REF) { 829 + if (exist->num_refs == 0) { 830 + btrfs_err(fs_info, 831 + "dropping a ref for a existing root that doesn't have a ref on the block"); 832 + dump_block_entry(fs_info, be); 833 + dump_ref_action(fs_info, ra); 834 + kfree(ra); 835 + goto out_unlock; 836 + } 837 + exist->num_refs--; 838 + if (exist->num_refs == 0) { 839 + rb_erase(&exist->node, &be->refs); 840 + kfree(exist); 841 + } 842 + } else if (!be->metadata) { 843 + exist->num_refs++; 844 + } else { 845 + btrfs_err(fs_info, 846 + "attempting to add another ref for an existing ref on a tree block"); 847 + dump_block_entry(fs_info, be); 848 + dump_ref_action(fs_info, ra); 849 + kfree(ra); 850 + goto out_unlock; 851 + } 852 + kfree(ref); 853 + } else { 854 + if (action == BTRFS_DROP_DELAYED_REF) { 855 + btrfs_err(fs_info, 856 + "dropping a ref for a root that doesn't have a ref on the block"); 857 + dump_block_entry(fs_info, be); 858 + dump_ref_action(fs_info, ra); 859 + kfree(ra); 860 + goto out_unlock; 861 + } 862 + } 863 + 864 + if (!parent && !re) { 865 + re = lookup_root_entry(&be->roots, ref_root); 866 + if (!re) { 867 + /* 868 + * This shouldn't happen because we will add our re 869 + * above 
when we lookup the be with !parent, but just in 870 + * case catch this case so we don't panic because I 871 + * didn't think of some other corner case. 872 + */ 873 + btrfs_err(fs_info, "failed to find root %llu for %llu", 874 + root->objectid, be->bytenr); 875 + dump_block_entry(fs_info, be); 876 + dump_ref_action(fs_info, ra); 877 + kfree(ra); 878 + goto out_unlock; 879 + } 880 + } 881 + if (action == BTRFS_DROP_DELAYED_REF) { 882 + if (re) 883 + re->num_refs--; 884 + be->num_refs--; 885 + } else if (action == BTRFS_ADD_DELAYED_REF) { 886 + be->num_refs++; 887 + if (re) 888 + re->num_refs++; 889 + } 890 + list_add_tail(&ra->list, &be->actions); 891 + ret = 0; 892 + out_unlock: 893 + spin_unlock(&root->fs_info->ref_verify_lock); 894 + out: 895 + if (ret) 896 + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); 897 + return ret; 898 + } 899 + 900 + /* Free up the ref cache */ 901 + void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) 902 + { 903 + struct block_entry *be; 904 + struct rb_node *n; 905 + 906 + if (!btrfs_test_opt(fs_info, REF_VERIFY)) 907 + return; 908 + 909 + spin_lock(&fs_info->ref_verify_lock); 910 + while ((n = rb_first(&fs_info->block_tree))) { 911 + be = rb_entry(n, struct block_entry, node); 912 + rb_erase(&be->node, &fs_info->block_tree); 913 + free_block_entry(be); 914 + cond_resched_lock(&fs_info->ref_verify_lock); 915 + } 916 + spin_unlock(&fs_info->ref_verify_lock); 917 + } 918 + 919 + void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, 920 + u64 len) 921 + { 922 + struct block_entry *be = NULL, *entry; 923 + struct rb_node *n; 924 + 925 + if (!btrfs_test_opt(fs_info, REF_VERIFY)) 926 + return; 927 + 928 + spin_lock(&fs_info->ref_verify_lock); 929 + n = fs_info->block_tree.rb_node; 930 + while (n) { 931 + entry = rb_entry(n, struct block_entry, node); 932 + if (entry->bytenr < start) { 933 + n = n->rb_right; 934 + } else if (entry->bytenr > start) { 935 + n = n->rb_left; 936 + } else { 937 + be = entry; 938 + break; 
939 + } 940 + /* We want to get as close to start as possible */ 941 + if (be == NULL || 942 + (entry->bytenr < start && be->bytenr > start) || 943 + (entry->bytenr < start && entry->bytenr > be->bytenr)) 944 + be = entry; 945 + } 946 + 947 + /* 948 + * Could have an empty block group, maybe have something to check for 949 + * this case to verify we were actually empty? 950 + */ 951 + if (!be) { 952 + spin_unlock(&fs_info->ref_verify_lock); 953 + return; 954 + } 955 + 956 + n = &be->node; 957 + while (n) { 958 + be = rb_entry(n, struct block_entry, node); 959 + n = rb_next(n); 960 + if (be->bytenr < start && be->bytenr + be->len > start) { 961 + btrfs_err(fs_info, 962 + "block entry overlaps a block group [%llu,%llu]!", 963 + start, len); 964 + dump_block_entry(fs_info, be); 965 + continue; 966 + } 967 + if (be->bytenr < start) 968 + continue; 969 + if (be->bytenr >= start + len) 970 + break; 971 + if (be->bytenr + be->len > start + len) { 972 + btrfs_err(fs_info, 973 + "block entry overlaps a block group [%llu,%llu]!", 974 + start, len); 975 + dump_block_entry(fs_info, be); 976 + } 977 + rb_erase(&be->node, &fs_info->block_tree); 978 + free_block_entry(be); 979 + } 980 + spin_unlock(&fs_info->ref_verify_lock); 981 + } 982 + 983 + /* Walk down all roots and build the ref tree, meant to be called at mount */ 984 + int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) 985 + { 986 + struct btrfs_path *path; 987 + struct btrfs_root *root; 988 + struct extent_buffer *eb; 989 + u64 bytenr = 0, num_bytes = 0; 990 + int ret, level; 991 + 992 + if (!btrfs_test_opt(fs_info, REF_VERIFY)) 993 + return 0; 994 + 995 + path = btrfs_alloc_path(); 996 + if (!path) 997 + return -ENOMEM; 998 + 999 + eb = btrfs_read_lock_root_node(fs_info->extent_root); 1000 + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1001 + level = btrfs_header_level(eb); 1002 + path->nodes[level] = eb; 1003 + path->slots[level] = 0; 1004 + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 1005 + 1006 + while 
(1) { 1007 + /* 1008 + * We have to keep track of the bytenr/num_bytes we last hit 1009 + * because we could have run out of space for an inline ref, and 1010 + * would have had to add a ref key item which may appear on a 1011 + * different leaf from the original extent item. 1012 + */ 1013 + ret = walk_down_tree(fs_info->extent_root, path, level, 1014 + &bytenr, &num_bytes); 1015 + if (ret) 1016 + break; 1017 + ret = walk_up_tree(root, path, &level); 1018 + if (ret < 0) 1019 + break; 1020 + if (ret > 0) { 1021 + ret = 0; 1022 + break; 1023 + } 1024 + } 1025 + if (ret) { 1026 + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); 1027 + btrfs_free_ref_cache(fs_info); 1028 + } 1029 + btrfs_free_path(path); 1030 + return ret; 1031 + }
+62
fs/btrfs/ref-verify.h
··· 1 + /* 2 + * Copyright (C) 2014 Facebook. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public 6 + * License v2 as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public 14 + * License along with this program; if not, write to the 15 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 + * Boston, MA 021110-1307, USA. 17 + */ 18 + #ifndef __REF_VERIFY__ 19 + #define __REF_VERIFY__ 20 + 21 + #ifdef CONFIG_BTRFS_FS_REF_VERIFY 22 + int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); 23 + void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); 24 + int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, 25 + u64 parent, u64 ref_root, u64 owner, u64 offset, 26 + int action); 27 + void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, 28 + u64 len); 29 + 30 + static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) 31 + { 32 + spin_lock_init(&fs_info->ref_verify_lock); 33 + fs_info->block_tree = RB_ROOT; 34 + } 35 + #else 36 + static inline int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) 37 + { 38 + return 0; 39 + } 40 + 41 + static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) 42 + { 43 + } 44 + 45 + static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, 46 + u64 num_bytes, u64 parent, u64 ref_root, 47 + u64 owner, u64 offset, int action) 48 + { 49 + return 0; 50 + } 51 + 52 + static inline void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, 53 + u64 start, u64 len) 54 + { 55 + } 56 + 57 + static inline void 
btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) 58 + { 59 + } 60 + 61 + #endif /* CONFIG_BTRFS_FS_REF_VERIFY */ 62 + #endif /* __REF_VERIFY__ */
+10 -7
fs/btrfs/relocation.c
··· 1742 1742 dirty = 1; 1743 1743 1744 1744 key.offset -= btrfs_file_extent_offset(leaf, fi); 1745 - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, 1745 + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, 1746 1746 num_bytes, parent, 1747 1747 btrfs_header_owner(leaf), 1748 1748 key.objectid, key.offset); ··· 1751 1751 break; 1752 1752 } 1753 1753 1754 - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, 1754 + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1755 1755 parent, btrfs_header_owner(leaf), 1756 1756 key.objectid, key.offset); 1757 1757 if (ret) { ··· 1952 1952 path->slots[level], old_ptr_gen); 1953 1953 btrfs_mark_buffer_dirty(path->nodes[level]); 1954 1954 1955 - ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr, 1955 + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, 1956 1956 blocksize, path->nodes[level]->start, 1957 1957 src->root_key.objectid, level - 1, 0); 1958 1958 BUG_ON(ret); 1959 - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, 1959 + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, 1960 1960 blocksize, 0, dest->root_key.objectid, 1961 1961 level - 1, 0); 1962 1962 BUG_ON(ret); 1963 1963 1964 - ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize, 1964 + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, 1965 1965 path->nodes[level]->start, 1966 1966 src->root_key.objectid, level - 1, 0); 1967 1967 BUG_ON(ret); 1968 1968 1969 - ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize, 1969 + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 1970 1970 0, dest->root_key.objectid, level - 1, 1971 1971 0); 1972 1972 BUG_ON(ret); ··· 2808 2808 trans->transid); 2809 2809 btrfs_mark_buffer_dirty(upper->eb); 2810 2810 2811 - ret = btrfs_inc_extent_ref(trans, root->fs_info, 2811 + ret = btrfs_inc_extent_ref(trans, root, 2812 2812 node->eb->start, blocksize, 2813 2813 upper->eb->start, 2814 2814 btrfs_header_owner(upper->eb), ··· 3246 3246 put_page(page); 3247 3247 
btrfs_delalloc_release_metadata(BTRFS_I(inode), 3248 3248 PAGE_SIZE); 3249 + btrfs_delalloc_release_extents(BTRFS_I(inode), 3250 + PAGE_SIZE); 3249 3251 ret = -EIO; 3250 3252 goto out; 3251 3253 } ··· 3277 3275 put_page(page); 3278 3276 3279 3277 index++; 3278 + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 3280 3279 balance_dirty_pages_ratelimited(inode->i_mapping); 3281 3280 btrfs_throttle(fs_info); 3282 3281 }
-4
fs/btrfs/root-tree.c
··· 226 226 struct btrfs_root *root; 227 227 int err = 0; 228 228 int ret; 229 - bool can_recover = true; 230 - 231 - if (sb_rdonly(fs_info->sb)) 232 - can_recover = false; 233 229 234 230 path = btrfs_alloc_path(); 235 231 if (!path)
+11 -11
fs/btrfs/scrub.c
··· 231 231 struct btrfs_path *path; 232 232 u64 extent_item_size; 233 233 const char *errstr; 234 - sector_t sector; 234 + u64 physical; 235 235 u64 logical; 236 236 struct btrfs_device *dev; 237 237 }; ··· 797 797 */ 798 798 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 799 799 btrfs_warn_in_rcu(fs_info, 800 - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", 800 + "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", 801 801 swarn->errstr, swarn->logical, 802 802 rcu_str_deref(swarn->dev->name), 803 - (unsigned long long)swarn->sector, 803 + swarn->physical, 804 804 root, inum, offset, 805 805 min(isize - offset, (u64)PAGE_SIZE), nlink, 806 806 (char *)(unsigned long)ipath->fspath->val[i]); ··· 810 810 811 811 err: 812 812 btrfs_warn_in_rcu(fs_info, 813 - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", 813 + "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", 814 814 swarn->errstr, swarn->logical, 815 815 rcu_str_deref(swarn->dev->name), 816 - (unsigned long long)swarn->sector, 816 + swarn->physical, 817 817 root, inum, offset, ret); 818 818 819 819 free_ipath(ipath); ··· 845 845 if (!path) 846 846 return; 847 847 848 - swarn.sector = (sblock->pagev[0]->physical) >> 9; 848 + swarn.physical = sblock->pagev[0]->physical; 849 849 swarn.logical = sblock->pagev[0]->logical; 850 850 swarn.errstr = errstr; 851 851 swarn.dev = NULL; ··· 868 868 item_size, &ref_root, 869 869 &ref_level); 870 870 btrfs_warn_in_rcu(fs_info, 871 - "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu", 871 + "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", 872 872 errstr, swarn.logical, 873 873 rcu_str_deref(dev->name), 874 - (unsigned long long)swarn.sector, 874 + 
swarn.physical, 875 875 ref_level ? "node" : "leaf", 876 876 ret < 0 ? -1 : ref_level, 877 877 ret < 0 ? -1 : ref_root); ··· 883 883 swarn.dev = dev; 884 884 iterate_extent_inodes(fs_info, found_key.objectid, 885 885 extent_item_pos, 1, 886 - scrub_print_warning_inode, &swarn); 886 + scrub_print_warning_inode, &swarn, false); 887 887 } 888 888 889 889 out: ··· 1047 1047 * can be found. 1048 1048 */ 1049 1049 ret = iterate_inodes_from_logical(fixup->logical, fs_info, path, 1050 - scrub_fixup_readpage, fixup); 1050 + scrub_fixup_readpage, fixup, false); 1051 1051 if (ret < 0) { 1052 1052 uncorrectable = 1; 1053 1053 goto out; ··· 4390 4390 } 4391 4391 4392 4392 ret = iterate_inodes_from_logical(logical, fs_info, path, 4393 - record_inode_for_nocow, nocow_ctx); 4393 + record_inode_for_nocow, nocow_ctx, false); 4394 4394 if (ret != 0 && ret != -ENOENT) { 4395 4395 btrfs_warn(fs_info, 4396 4396 "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
+33 -41
fs/btrfs/send.c
··· 26 26 #include <linux/radix-tree.h> 27 27 #include <linux/vmalloc.h> 28 28 #include <linux/string.h> 29 + #include <linux/compat.h> 29 30 30 31 #include "send.h" 31 32 #include "backref.h" ··· 993 992 * path must point to the dir item when called. 994 993 */ 995 994 static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, 996 - struct btrfs_key *found_key, 997 995 iterate_dir_item_t iterate, void *ctx) 998 996 { 999 997 int ret = 0; ··· 1271 1271 */ 1272 1272 if (ino >= bctx->cur_objectid) 1273 1273 return 0; 1274 - #if 0 1275 - if (ino > bctx->cur_objectid) 1276 - return 0; 1277 - if (offset + bctx->extent_len > bctx->cur_offset) 1278 - return 0; 1279 - #endif 1280 1274 } 1281 1275 1282 1276 bctx->found++; ··· 1423 1429 extent_item_pos = 0; 1424 1430 ret = iterate_extent_inodes(fs_info, found_key.objectid, 1425 1431 extent_item_pos, 1, __iterate_backrefs, 1426 - backref_ctx); 1432 + backref_ctx, false); 1427 1433 1428 1434 if (ret < 0) 1429 1435 goto out; ··· 4100 4106 return ret; 4101 4107 } 4102 4108 4103 - static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, 4104 - struct fs_path *name, void *ctx, struct list_head *refs) 4109 + static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, 4110 + void *ctx, struct list_head *refs) 4105 4111 { 4106 4112 int ret = 0; 4107 4113 struct send_ctx *sctx = ctx; ··· 4137 4143 void *ctx) 4138 4144 { 4139 4145 struct send_ctx *sctx = ctx; 4140 - return record_ref(sctx->send_root, num, dir, index, name, 4141 - ctx, &sctx->new_refs); 4146 + return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); 4142 4147 } 4143 4148 4144 4149 ··· 4146 4153 void *ctx) 4147 4154 { 4148 4155 struct send_ctx *sctx = ctx; 4149 - return record_ref(sctx->parent_root, num, dir, index, name, 4150 - ctx, &sctx->deleted_refs); 4156 + return record_ref(sctx->parent_root, dir, name, ctx, 4157 + &sctx->deleted_refs); 4151 4158 } 4152 4159 4153 4160 static int 
record_new_ref(struct send_ctx *sctx) ··· 4491 4498 int ret = 0; 4492 4499 4493 4500 ret = iterate_dir_item(sctx->send_root, sctx->left_path, 4494 - sctx->cmp_key, __process_new_xattr, sctx); 4501 + __process_new_xattr, sctx); 4495 4502 4496 4503 return ret; 4497 4504 } ··· 4499 4506 static int process_deleted_xattr(struct send_ctx *sctx) 4500 4507 { 4501 4508 return iterate_dir_item(sctx->parent_root, sctx->right_path, 4502 - sctx->cmp_key, __process_deleted_xattr, sctx); 4509 + __process_deleted_xattr, sctx); 4503 4510 } 4504 4511 4505 4512 struct find_xattr_ctx { ··· 4544 4551 ctx.found_data = NULL; 4545 4552 ctx.found_data_len = 0; 4546 4553 4547 - ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); 4554 + ret = iterate_dir_item(root, path, __find_xattr, &ctx); 4548 4555 if (ret < 0) 4549 4556 return ret; 4550 4557 ··· 4614 4621 int ret = 0; 4615 4622 4616 4623 ret = iterate_dir_item(sctx->send_root, sctx->left_path, 4617 - sctx->cmp_key, __process_changed_new_xattr, sctx); 4624 + __process_changed_new_xattr, sctx); 4618 4625 if (ret < 0) 4619 4626 goto out; 4620 4627 ret = iterate_dir_item(sctx->parent_root, sctx->right_path, 4621 - sctx->cmp_key, __process_changed_deleted_xattr, sctx); 4628 + __process_changed_deleted_xattr, sctx); 4622 4629 4623 4630 out: 4624 4631 return ret; ··· 4668 4675 goto out; 4669 4676 } 4670 4677 4671 - ret = iterate_dir_item(root, path, &found_key, 4672 - __process_new_xattr, sctx); 4678 + ret = iterate_dir_item(root, path, __process_new_xattr, sctx); 4673 4679 if (ret < 0) 4674 4680 goto out; 4675 4681 ··· 4715 4723 /* initial readahead */ 4716 4724 memset(&sctx->ra, 0, sizeof(struct file_ra_state)); 4717 4725 file_ra_state_init(&sctx->ra, inode->i_mapping); 4718 - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, NULL, index, 4719 - last_index - index + 1); 4720 4726 4721 4727 while (index <= last_index) { 4722 4728 unsigned cur_len = min_t(unsigned, len, 4723 4729 PAGE_SIZE - pg_offset); 4724 - page = 
find_or_create_page(inode->i_mapping, index, GFP_KERNEL); 4730 + 4731 + page = find_lock_page(inode->i_mapping, index); 4725 4732 if (!page) { 4726 - ret = -ENOMEM; 4727 - break; 4733 + page_cache_sync_readahead(inode->i_mapping, &sctx->ra, 4734 + NULL, index, last_index + 1 - index); 4735 + 4736 + page = find_or_create_page(inode->i_mapping, index, 4737 + GFP_KERNEL); 4738 + if (!page) { 4739 + ret = -ENOMEM; 4740 + break; 4741 + } 4742 + } 4743 + 4744 + if (PageReadahead(page)) { 4745 + page_cache_async_readahead(inode->i_mapping, &sctx->ra, 4746 + NULL, page, index, last_index + 1 - index); 4728 4747 } 4729 4748 4730 4749 if (!PageUptodate(page)) { ··· 6165 6162 * Updates compare related fields in sctx and simply forwards to the actual 6166 6163 * changed_xxx functions. 6167 6164 */ 6168 - static int changed_cb(struct btrfs_root *left_root, 6169 - struct btrfs_root *right_root, 6170 - struct btrfs_path *left_path, 6165 + static int changed_cb(struct btrfs_path *left_path, 6171 6166 struct btrfs_path *right_path, 6172 6167 struct btrfs_key *key, 6173 6168 enum btrfs_compare_tree_result result, ··· 6247 6246 slot = path->slots[0]; 6248 6247 btrfs_item_key_to_cpu(eb, &found_key, slot); 6249 6248 6250 - ret = changed_cb(send_root, NULL, path, NULL, 6251 - &found_key, BTRFS_COMPARE_TREE_NEW, sctx); 6249 + ret = changed_cb(path, NULL, &found_key, 6250 + BTRFS_COMPARE_TREE_NEW, sctx); 6252 6251 if (ret < 0) 6253 6252 goto out; 6254 6253 ··· 6366 6365 spin_unlock(&root->root_item_lock); 6367 6366 } 6368 6367 6369 - long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) 6368 + long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) 6370 6369 { 6371 6370 int ret = 0; 6372 6371 struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; 6373 6372 struct btrfs_fs_info *fs_info = send_root->fs_info; 6374 6373 struct btrfs_root *clone_root; 6375 - struct btrfs_ioctl_send_args *arg = NULL; 6376 6374 struct btrfs_key key; 6377 6375 
struct send_ctx *sctx = NULL; 6378 6376 u32 i; ··· 6404 6404 */ 6405 6405 if (!btrfs_root_readonly(send_root)) { 6406 6406 ret = -EPERM; 6407 - goto out; 6408 - } 6409 - 6410 - arg = memdup_user(arg_, sizeof(*arg)); 6411 - if (IS_ERR(arg)) { 6412 - ret = PTR_ERR(arg); 6413 - arg = NULL; 6414 6407 goto out; 6415 6408 } 6416 6409 ··· 6647 6654 if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) 6648 6655 btrfs_root_dec_send_in_progress(sctx->parent_root); 6649 6656 6650 - kfree(arg); 6651 6657 kvfree(clone_sources_tmp); 6652 6658 6653 6659 if (sctx) {
+1 -1
fs/btrfs/send.h
··· 130 130 #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) 131 131 132 132 #ifdef __KERNEL__ 133 - long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); 133 + long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); 134 134 #endif
+29 -8
fs/btrfs/super.c
··· 202 202 203 203 void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 204 204 { 205 - struct super_block *sb = fs_info->sb; 206 205 char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; 207 206 struct va_format vaf; 208 207 va_list args; ··· 227 228 vaf.va = &args; 228 229 229 230 if (__ratelimit(ratelimit)) 230 - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); 231 + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, 232 + fs_info ? fs_info->sb->s_id : "<unknown>", &vaf); 231 233 232 234 va_end(args); 233 235 } ··· 292 292 vaf.va = &args; 293 293 294 294 errstr = btrfs_decode_error(errno); 295 - if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) 295 + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) 296 296 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", 297 297 s_id, function, line, &vaf, errno, errstr); 298 298 ··· 325 325 Opt_nologreplay, Opt_norecovery, 326 326 #ifdef CONFIG_BTRFS_DEBUG 327 327 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, 328 + #endif 329 + #ifdef CONFIG_BTRFS_FS_REF_VERIFY 330 + Opt_ref_verify, 328 331 #endif 329 332 Opt_err, 330 333 }; ··· 389 386 {Opt_fragment_data, "fragment=data"}, 390 387 {Opt_fragment_metadata, "fragment=metadata"}, 391 388 {Opt_fragment_all, "fragment=all"}, 389 + #endif 390 + #ifdef CONFIG_BTRFS_FS_REF_VERIFY 391 + {Opt_ref_verify, "ref_verify"}, 392 392 #endif 393 393 {Opt_err, NULL}, 394 394 }; ··· 508 502 strncmp(args[0].from, "zlib", 4) == 0) { 509 503 compress_type = "zlib"; 510 504 info->compress_type = BTRFS_COMPRESS_ZLIB; 505 + info->compress_level = 506 + btrfs_compress_str2level(args[0].from); 511 507 btrfs_set_opt(info->mount_opt, COMPRESS); 512 508 btrfs_clear_opt(info->mount_opt, NODATACOW); 513 509 btrfs_clear_opt(info->mount_opt, NODATASUM); ··· 557 549 compress_force != saved_compress_force)) || 558 550 (!btrfs_test_opt(info, COMPRESS) && 559 551 no_compress == 1)) { 560 - 
btrfs_info(info, "%s %s compression", 552 + btrfs_info(info, "%s %s compression, level %d", 561 553 (compress_force) ? "force" : "use", 562 - compress_type); 554 + compress_type, info->compress_level); 563 555 } 564 556 compress_force = false; 565 557 break; ··· 831 823 case Opt_fragment_data: 832 824 btrfs_info(info, "fragmenting data"); 833 825 btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); 826 + break; 827 + #endif 828 + #ifdef CONFIG_BTRFS_FS_REF_VERIFY 829 + case Opt_ref_verify: 830 + btrfs_info(info, "doing ref verification"); 831 + btrfs_set_opt(info->mount_opt, REF_VERIFY); 834 832 break; 835 833 #endif 836 834 case Opt_err: ··· 1219 1205 * happens. The pending operations are delayed to the 1220 1206 * next commit after thawing. 1221 1207 */ 1222 - if (__sb_start_write(sb, SB_FREEZE_WRITE, false)) 1223 - __sb_end_write(sb, SB_FREEZE_WRITE); 1208 + if (sb_start_write_trylock(sb)) 1209 + sb_end_write(sb); 1224 1210 else 1225 1211 return 0; 1226 1212 trans = btrfs_start_transaction(root, 0); ··· 1260 1246 seq_printf(seq, ",compress-force=%s", compress_type); 1261 1247 else 1262 1248 seq_printf(seq, ",compress=%s", compress_type); 1249 + if (info->compress_level) 1250 + seq_printf(seq, ":%d", info->compress_level); 1263 1251 } 1264 1252 if (btrfs_test_opt(info, NOSSD)) 1265 1253 seq_puts(seq, ",nossd"); ··· 1321 1305 if (btrfs_test_opt(info, FRAGMENT_METADATA)) 1322 1306 seq_puts(seq, ",fragment=metadata"); 1323 1307 #endif 1308 + if (btrfs_test_opt(info, REF_VERIFY)) 1309 + seq_puts(seq, ",ref_verify"); 1324 1310 seq_printf(seq, ",subvolid=%llu", 1325 1311 BTRFS_I(d_inode(dentry))->root->root_key.objectid); 1326 1312 seq_puts(seq, ",subvol="); ··· 2130 2112 * succeed even if the Avail is zero. But this is better than the other 2131 2113 * way around. 
2132 2114 */ 2133 - thresh = 4 * 1024 * 1024; 2115 + thresh = SZ_4M; 2134 2116 2135 2117 if (!mixed && total_free_meta - thresh < block_rsv->size) 2136 2118 buf->f_bavail = 0; ··· 2336 2318 #endif 2337 2319 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2338 2320 ", integrity-checker=on" 2321 + #endif 2322 + #ifdef CONFIG_BTRFS_FS_REF_VERIFY 2323 + ", ref-verify=on" 2339 2324 #endif 2340 2325 "\n", 2341 2326 btrfs_crc32c_impl());
+32 -31
fs/btrfs/sysfs.c
··· 247 247 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 248 248 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); 249 249 } 250 - BTRFS_ATTR(global_rsv_size, global_rsv_size_show); 250 + BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show); 251 251 252 252 static ssize_t global_rsv_reserved_show(struct kobject *kobj, 253 253 struct kobj_attribute *a, char *buf) ··· 256 256 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 257 257 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); 258 258 } 259 - BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show); 259 + BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show); 260 260 261 261 #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) 262 262 #define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) 263 263 264 264 static ssize_t raid_bytes_show(struct kobject *kobj, 265 265 struct kobj_attribute *attr, char *buf); 266 - BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); 267 - BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); 266 + BTRFS_ATTR(raid, total_bytes, raid_bytes_show); 267 + BTRFS_ATTR(raid, used_bytes, raid_bytes_show); 268 268 269 269 static ssize_t raid_bytes_show(struct kobject *kobj, 270 270 struct kobj_attribute *attr, char *buf) ··· 277 277 278 278 down_read(&sinfo->groups_sem); 279 279 list_for_each_entry(block_group, &sinfo->block_groups[index], list) { 280 - if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) 280 + if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) 281 281 val += block_group->key.offset; 282 282 else 283 283 val += btrfs_block_group_used(&block_group->item); ··· 287 287 } 288 288 289 289 static struct attribute *raid_attributes[] = { 290 - BTRFS_RAID_ATTR_PTR(total_bytes), 291 - BTRFS_RAID_ATTR_PTR(used_bytes), 290 + BTRFS_ATTR_PTR(raid, total_bytes), 291 + BTRFS_ATTR_PTR(raid, used_bytes), 292 292 NULL 293 293 }; 294 294 ··· 311 311 struct 
btrfs_space_info *sinfo = to_space_info(kobj); \ 312 312 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ 313 313 } \ 314 - BTRFS_ATTR(field, btrfs_space_info_show_##field) 314 + BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) 315 315 316 316 static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, 317 317 struct kobj_attribute *a, ··· 331 331 SPACE_INFO_ATTR(bytes_readonly); 332 332 SPACE_INFO_ATTR(disk_used); 333 333 SPACE_INFO_ATTR(disk_total); 334 - BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); 334 + BTRFS_ATTR(space_info, total_bytes_pinned, 335 + btrfs_space_info_show_total_bytes_pinned); 335 336 336 337 static struct attribute *space_info_attrs[] = { 337 - BTRFS_ATTR_PTR(flags), 338 - BTRFS_ATTR_PTR(total_bytes), 339 - BTRFS_ATTR_PTR(bytes_used), 340 - BTRFS_ATTR_PTR(bytes_pinned), 341 - BTRFS_ATTR_PTR(bytes_reserved), 342 - BTRFS_ATTR_PTR(bytes_may_use), 343 - BTRFS_ATTR_PTR(bytes_readonly), 344 - BTRFS_ATTR_PTR(disk_used), 345 - BTRFS_ATTR_PTR(disk_total), 346 - BTRFS_ATTR_PTR(total_bytes_pinned), 338 + BTRFS_ATTR_PTR(space_info, flags), 339 + BTRFS_ATTR_PTR(space_info, total_bytes), 340 + BTRFS_ATTR_PTR(space_info, bytes_used), 341 + BTRFS_ATTR_PTR(space_info, bytes_pinned), 342 + BTRFS_ATTR_PTR(space_info, bytes_reserved), 343 + BTRFS_ATTR_PTR(space_info, bytes_may_use), 344 + BTRFS_ATTR_PTR(space_info, bytes_readonly), 345 + BTRFS_ATTR_PTR(space_info, disk_used), 346 + BTRFS_ATTR_PTR(space_info, disk_total), 347 + BTRFS_ATTR_PTR(space_info, total_bytes_pinned), 347 348 NULL, 348 349 }; 349 350 ··· 362 361 }; 363 362 364 363 static const struct attribute *allocation_attrs[] = { 365 - BTRFS_ATTR_PTR(global_rsv_reserved), 366 - BTRFS_ATTR_PTR(global_rsv_size), 364 + BTRFS_ATTR_PTR(allocation, global_rsv_reserved), 365 + BTRFS_ATTR_PTR(allocation, global_rsv_size), 367 366 NULL, 368 367 }; 369 368 ··· 416 415 417 416 return len; 418 417 } 419 - BTRFS_ATTR_RW(label, btrfs_label_show, 
btrfs_label_store); 418 + BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store); 420 419 421 420 static ssize_t btrfs_nodesize_show(struct kobject *kobj, 422 421 struct kobj_attribute *a, char *buf) ··· 426 425 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); 427 426 } 428 427 429 - BTRFS_ATTR(nodesize, btrfs_nodesize_show); 428 + BTRFS_ATTR(, nodesize, btrfs_nodesize_show); 430 429 431 430 static ssize_t btrfs_sectorsize_show(struct kobject *kobj, 432 431 struct kobj_attribute *a, char *buf) ··· 437 436 fs_info->super_copy->sectorsize); 438 437 } 439 438 440 - BTRFS_ATTR(sectorsize, btrfs_sectorsize_show); 439 + BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); 441 440 442 441 static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, 443 442 struct kobj_attribute *a, char *buf) ··· 448 447 fs_info->super_copy->sectorsize); 449 448 } 450 449 451 - BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); 450 + BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); 452 451 453 452 static ssize_t quota_override_show(struct kobject *kobj, 454 453 struct kobj_attribute *a, char *buf) ··· 488 487 return len; 489 488 } 490 489 491 - BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); 490 + BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); 492 491 493 492 static const struct attribute *btrfs_attrs[] = { 494 - BTRFS_ATTR_PTR(label), 495 - BTRFS_ATTR_PTR(nodesize), 496 - BTRFS_ATTR_PTR(sectorsize), 497 - BTRFS_ATTR_PTR(clone_alignment), 498 - BTRFS_ATTR_PTR(quota_override), 493 + BTRFS_ATTR_PTR(, label), 494 + BTRFS_ATTR_PTR(, nodesize), 495 + BTRFS_ATTR_PTR(, sectorsize), 496 + BTRFS_ATTR_PTR(, clone_alignment), 497 + BTRFS_ATTR_PTR(, quota_override), 499 498 NULL, 500 499 }; 501 500
+11 -15
fs/btrfs/sysfs.h
··· 21 21 .store = _store, \ 22 22 } 23 23 24 - #define BTRFS_ATTR_RW(_name, _show, _store) \ 25 - static struct kobj_attribute btrfs_attr_##_name = \ 24 + #define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ 25 + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ 26 26 __INIT_KOBJ_ATTR(_name, 0644, _show, _store) 27 27 28 - #define BTRFS_ATTR(_name, _show) \ 29 - static struct kobj_attribute btrfs_attr_##_name = \ 28 + #define BTRFS_ATTR(_prefix, _name, _show) \ 29 + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ 30 30 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) 31 31 32 - #define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) 33 - 34 - #define BTRFS_RAID_ATTR(_name, _show) \ 35 - static struct kobj_attribute btrfs_raid_attr_##_name = \ 36 - __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) 37 - 38 - #define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) 32 + #define BTRFS_ATTR_PTR(_prefix, _name) \ 33 + (&btrfs_attr_##_prefix##_##_name.attr) 39 34 40 35 41 36 struct btrfs_feature_attr { ··· 39 44 u64 feature_bit; 40 45 }; 41 46 42 - #define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ 43 - static struct btrfs_feature_attr btrfs_attr_##_name = { \ 47 + #define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ 48 + static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ 44 49 .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ 45 50 btrfs_feature_attr_show, \ 46 51 btrfs_feature_attr_store), \ 47 52 .feature_set = _feature_set, \ 48 - .feature_bit = _prefix ##_## _feature_bit, \ 53 + .feature_bit = _feature_prefix ##_## _feature_bit, \ 49 54 } 50 - #define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) 55 + #define BTRFS_FEAT_ATTR_PTR(_name) \ 56 + (&btrfs_attr_features_##_name.kobj_attr.attr) 51 57 52 58 #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ 53 59 BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
+2 -1
fs/btrfs/tests/free-space-tree-tests.c
··· 500 500 path = btrfs_alloc_path(); 501 501 if (!path) { 502 502 test_msg("Couldn't allocate path\n"); 503 - return -ENOMEM; 503 + ret = -ENOMEM; 504 + goto out; 504 505 } 505 506 506 507 ret = add_block_group_free_space(&trans, root->fs_info, cache);
+5 -15
fs/btrfs/tests/inode-tests.c
··· 770 770 offset = em->start + em->len; 771 771 free_extent_map(em); 772 772 773 - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, 4096 * 1024, 0); 773 + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); 774 774 if (IS_ERR(em)) { 775 775 test_msg("Got an error when we shouldn't have\n"); 776 776 goto out; ··· 968 968 btrfs_test_inode_set_ops(inode); 969 969 970 970 /* [BTRFS_MAX_EXTENT_SIZE] */ 971 - BTRFS_I(inode)->outstanding_extents++; 972 971 ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 973 972 NULL, 0); 974 973 if (ret) { ··· 982 983 } 983 984 984 985 /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ 985 - BTRFS_I(inode)->outstanding_extents++; 986 986 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, 987 987 BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, 988 988 NULL, 0); ··· 1001 1003 BTRFS_MAX_EXTENT_SIZE >> 1, 1002 1004 (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, 1003 1005 EXTENT_DELALLOC | EXTENT_DIRTY | 1004 - EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, 1006 + EXTENT_UPTODATE, 0, 0, 1005 1007 NULL, GFP_KERNEL); 1006 1008 if (ret) { 1007 1009 test_msg("clear_extent_bit returned %d\n", ret); ··· 1015 1017 } 1016 1018 1017 1019 /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ 1018 - BTRFS_I(inode)->outstanding_extents++; 1019 1020 ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, 1020 1021 (BTRFS_MAX_EXTENT_SIZE >> 1) 1021 1022 + sectorsize - 1, ··· 1032 1035 1033 1036 /* 1034 1037 * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] 1035 - * 1036 - * I'm artificially adding 2 to outstanding_extents because in the 1037 - * buffered IO case we'd add things up as we go, but I don't feel like 1038 - * doing that here, this isn't the interesting case we want to test. 
1039 1038 */ 1040 - BTRFS_I(inode)->outstanding_extents += 2; 1041 1039 ret = btrfs_set_extent_delalloc(inode, 1042 1040 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, 1043 1041 (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, ··· 1051 1059 /* 1052 1060 * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] 1053 1061 */ 1054 - BTRFS_I(inode)->outstanding_extents++; 1055 1062 ret = btrfs_set_extent_delalloc(inode, 1056 1063 BTRFS_MAX_EXTENT_SIZE + sectorsize, 1057 1064 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); ··· 1070 1079 BTRFS_MAX_EXTENT_SIZE + sectorsize, 1071 1080 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 1072 1081 EXTENT_DIRTY | EXTENT_DELALLOC | 1073 - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, 1082 + EXTENT_UPTODATE, 0, 0, 1074 1083 NULL, GFP_KERNEL); 1075 1084 if (ret) { 1076 1085 test_msg("clear_extent_bit returned %d\n", ret); ··· 1087 1096 * Refill the hole again just for good measure, because I thought it 1088 1097 * might fail and I'd rather satisfy my paranoia at this point. 1089 1098 */ 1090 - BTRFS_I(inode)->outstanding_extents++; 1091 1099 ret = btrfs_set_extent_delalloc(inode, 1092 1100 BTRFS_MAX_EXTENT_SIZE + sectorsize, 1093 1101 BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); ··· 1104 1114 /* Empty */ 1105 1115 ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, 1106 1116 EXTENT_DIRTY | EXTENT_DELALLOC | 1107 - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, 1117 + EXTENT_UPTODATE, 0, 0, 1108 1118 NULL, GFP_KERNEL); 1109 1119 if (ret) { 1110 1120 test_msg("clear_extent_bit returned %d\n", ret); ··· 1121 1131 if (ret) 1122 1132 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, 1123 1133 EXTENT_DIRTY | EXTENT_DELALLOC | 1124 - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, 1134 + EXTENT_UPTODATE, 0, 0, 1125 1135 NULL, GFP_KERNEL); 1126 1136 iput(inode); 1127 1137 btrfs_free_dummy_root(root);
+20 -10
fs/btrfs/tests/qgroup-tests.c
··· 240 240 * we can only call btrfs_qgroup_account_extent() directly to test 241 241 * quota. 242 242 */ 243 - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 243 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, 244 + false); 244 245 if (ret) { 245 246 ulist_free(old_roots); 246 247 test_msg("Couldn't find old roots: %d\n", ret); ··· 253 252 if (ret) 254 253 return ret; 255 254 256 - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 255 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, 256 + false); 257 257 if (ret) { 258 258 ulist_free(old_roots); 259 259 ulist_free(new_roots); ··· 277 275 old_roots = NULL; 278 276 new_roots = NULL; 279 277 280 - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 278 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, 279 + false); 281 280 if (ret) { 282 281 ulist_free(old_roots); 283 282 test_msg("Couldn't find old roots: %d\n", ret); ··· 289 286 if (ret) 290 287 return -EINVAL; 291 288 292 - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 289 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, 290 + false); 293 291 if (ret) { 294 292 ulist_free(old_roots); 295 293 ulist_free(new_roots); ··· 341 337 return ret; 342 338 } 343 339 344 - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 340 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, 341 + false); 345 342 if (ret) { 346 343 ulist_free(old_roots); 347 344 test_msg("Couldn't find old roots: %d\n", ret); ··· 354 349 if (ret) 355 350 return ret; 356 351 357 - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 352 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, 353 + false); 358 354 if (ret) { 359 355 ulist_free(old_roots); 360 356 ulist_free(new_roots); ··· 376 370 return -EINVAL; 377 371 } 378 372 379 - ret = 
btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 373 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, 374 + false); 380 375 if (ret) { 381 376 ulist_free(old_roots); 382 377 test_msg("Couldn't find old roots: %d\n", ret); ··· 389 382 if (ret) 390 383 return ret; 391 384 392 - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 385 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, 386 + false); 393 387 if (ret) { 394 388 ulist_free(old_roots); 395 389 ulist_free(new_roots); ··· 417 409 return -EINVAL; 418 410 } 419 411 420 - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); 412 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, 413 + false); 421 414 if (ret) { 422 415 ulist_free(old_roots); 423 416 test_msg("Couldn't find old roots: %d\n", ret); ··· 430 421 if (ret) 431 422 return ret; 432 423 433 - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); 424 + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, 425 + false); 434 426 if (ret) { 435 427 ulist_free(old_roots); 436 428 ulist_free(new_roots);
+13 -3
fs/btrfs/transaction.c
··· 797 797 { 798 798 struct btrfs_fs_info *fs_info = trans->fs_info; 799 799 800 - if (fs_info->global_block_rsv.space_info->full && 801 - btrfs_check_space_for_delayed_refs(trans, fs_info)) 800 + if (btrfs_check_space_for_delayed_refs(trans, fs_info)) 802 801 return 1; 803 802 804 803 return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); ··· 949 950 u64 start = 0; 950 951 u64 end; 951 952 953 + atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers); 952 954 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 953 955 mark, &cached_state)) { 954 956 bool wait_writeback = false; ··· 985 985 cond_resched(); 986 986 start = end + 1; 987 987 } 988 + atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers); 988 989 return werr; 989 990 } 990 991 ··· 1916 1915 1917 1916 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1918 1917 { 1918 + /* 1919 + * We use writeback_inodes_sb here because if we used 1920 + * btrfs_start_delalloc_roots we would deadlock with fs freeze. 1921 + * Currently are holding the fs freeze lock, if we do an async flush 1922 + * we'll do btrfs_join_transaction() and deadlock because we need to 1923 + * wait for the fs freeze lock. Using the direct flushing we benefit 1924 + * from already being in a transaction and our join_transaction doesn't 1925 + * have to re-take the fs freeze lock. 1926 + */ 1919 1927 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) 1920 - return btrfs_start_delalloc_roots(fs_info, 1, -1); 1928 + writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); 1921 1929 return 0; 1922 1930 } 1923 1931
+425
fs/btrfs/tree-checker.c
··· 1 + /* 2 + * Copyright (C) Qu Wenruo 2017. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public 6 + * License v2 as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public 14 + * License along with this program. 15 + */ 16 + 17 + /* 18 + * The module is used to catch unexpected/corrupted tree block data. 19 + * Such behavior can be caused either by a fuzzed image or bugs. 20 + * 21 + * The objective is to do leaf/node validation checks when tree block is read 22 + * from disk, and check *every* possible member, so other code won't 23 + * need to checking them again. 24 + * 25 + * Due to the potential and unwanted damage, every checker needs to be 26 + * carefully reviewed otherwise so it does not prevent mount of valid images. 27 + */ 28 + 29 + #include "ctree.h" 30 + #include "tree-checker.h" 31 + #include "disk-io.h" 32 + #include "compression.h" 33 + 34 + /* 35 + * Error message should follow the following format: 36 + * corrupt <type>: <identifier>, <reason>[, <bad_value>] 37 + * 38 + * @type: leaf or node 39 + * @identifier: the necessary info to locate the leaf/node. 40 + * It's recommened to decode key.objecitd/offset if it's 41 + * meaningful. 42 + * @reason: describe the error 43 + * @bad_value: optional, it's recommened to output bad value and its 44 + * expected value (range). 45 + * 46 + * Since comma is used to separate the components, only space is allowed 47 + * inside each component. 48 + */ 49 + 50 + /* 51 + * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. 
52 + * Allows callers to customize the output. 53 + */ 54 + __printf(4, 5) 55 + static void generic_err(const struct btrfs_root *root, 56 + const struct extent_buffer *eb, int slot, 57 + const char *fmt, ...) 58 + { 59 + struct va_format vaf; 60 + va_list args; 61 + 62 + va_start(args, fmt); 63 + 64 + vaf.fmt = fmt; 65 + vaf.va = &args; 66 + 67 + btrfs_crit(root->fs_info, 68 + "corrupt %s: root=%llu block=%llu slot=%d, %pV", 69 + btrfs_header_level(eb) == 0 ? "leaf" : "node", 70 + root->objectid, btrfs_header_bytenr(eb), slot, &vaf); 71 + va_end(args); 72 + } 73 + 74 + /* 75 + * Customized reporter for extent data item, since its key objectid and 76 + * offset has its own meaning. 77 + */ 78 + __printf(4, 5) 79 + static void file_extent_err(const struct btrfs_root *root, 80 + const struct extent_buffer *eb, int slot, 81 + const char *fmt, ...) 82 + { 83 + struct btrfs_key key; 84 + struct va_format vaf; 85 + va_list args; 86 + 87 + btrfs_item_key_to_cpu(eb, &key, slot); 88 + va_start(args, fmt); 89 + 90 + vaf.fmt = fmt; 91 + vaf.va = &args; 92 + 93 + btrfs_crit(root->fs_info, 94 + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", 95 + btrfs_header_level(eb) == 0 ? 
"leaf" : "node", root->objectid, 96 + btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); 97 + va_end(args); 98 + } 99 + 100 + /* 101 + * Return 0 if the btrfs_file_extent_##name is aligned to @alignment 102 + * Else return 1 103 + */ 104 + #define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \ 105 + ({ \ 106 + if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ 107 + file_extent_err((root), (leaf), (slot), \ 108 + "invalid %s for file extent, have %llu, should be aligned to %u", \ 109 + (#name), btrfs_file_extent_##name((leaf), (fi)), \ 110 + (alignment)); \ 111 + (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ 112 + }) 113 + 114 + static int check_extent_data_item(struct btrfs_root *root, 115 + struct extent_buffer *leaf, 116 + struct btrfs_key *key, int slot) 117 + { 118 + struct btrfs_file_extent_item *fi; 119 + u32 sectorsize = root->fs_info->sectorsize; 120 + u32 item_size = btrfs_item_size_nr(leaf, slot); 121 + 122 + if (!IS_ALIGNED(key->offset, sectorsize)) { 123 + file_extent_err(root, leaf, slot, 124 + "unaligned file_offset for file extent, have %llu should be aligned to %u", 125 + key->offset, sectorsize); 126 + return -EUCLEAN; 127 + } 128 + 129 + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 130 + 131 + if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { 132 + file_extent_err(root, leaf, slot, 133 + "invalid type for file extent, have %u expect range [0, %u]", 134 + btrfs_file_extent_type(leaf, fi), 135 + BTRFS_FILE_EXTENT_TYPES); 136 + return -EUCLEAN; 137 + } 138 + 139 + /* 140 + * Support for new compression/encrption must introduce incompat flag, 141 + * and must be caught in open_ctree(). 
142 + */ 143 + if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { 144 + file_extent_err(root, leaf, slot, 145 + "invalid compression for file extent, have %u expect range [0, %u]", 146 + btrfs_file_extent_compression(leaf, fi), 147 + BTRFS_COMPRESS_TYPES); 148 + return -EUCLEAN; 149 + } 150 + if (btrfs_file_extent_encryption(leaf, fi)) { 151 + file_extent_err(root, leaf, slot, 152 + "invalid encryption for file extent, have %u expect 0", 153 + btrfs_file_extent_encryption(leaf, fi)); 154 + return -EUCLEAN; 155 + } 156 + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { 157 + /* Inline extent must have 0 as key offset */ 158 + if (key->offset) { 159 + file_extent_err(root, leaf, slot, 160 + "invalid file_offset for inline file extent, have %llu expect 0", 161 + key->offset); 162 + return -EUCLEAN; 163 + } 164 + 165 + /* Compressed inline extent has no on-disk size, skip it */ 166 + if (btrfs_file_extent_compression(leaf, fi) != 167 + BTRFS_COMPRESS_NONE) 168 + return 0; 169 + 170 + /* Uncompressed inline extent size must match item size */ 171 + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + 172 + btrfs_file_extent_ram_bytes(leaf, fi)) { 173 + file_extent_err(root, leaf, slot, 174 + "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", 175 + item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + 176 + btrfs_file_extent_ram_bytes(leaf, fi)); 177 + return -EUCLEAN; 178 + } 179 + return 0; 180 + } 181 + 182 + /* Regular or preallocated extent has fixed item size */ 183 + if (item_size != sizeof(*fi)) { 184 + file_extent_err(root, leaf, slot, 185 + "invalid item size for reg/prealloc file extent, have %u expect %zu", 186 + item_size, sizeof(*fi)); 187 + return -EUCLEAN; 188 + } 189 + if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) || 190 + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) || 191 + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) || 192 + 
CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) || 193 + CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize)) 194 + return -EUCLEAN; 195 + return 0; 196 + } 197 + 198 + static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, 199 + struct btrfs_key *key, int slot) 200 + { 201 + u32 sectorsize = root->fs_info->sectorsize; 202 + u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); 203 + 204 + if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { 205 + generic_err(root, leaf, slot, 206 + "invalid key objectid for csum item, have %llu expect %llu", 207 + key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); 208 + return -EUCLEAN; 209 + } 210 + if (!IS_ALIGNED(key->offset, sectorsize)) { 211 + generic_err(root, leaf, slot, 212 + "unaligned key offset for csum item, have %llu should be aligned to %u", 213 + key->offset, sectorsize); 214 + return -EUCLEAN; 215 + } 216 + if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { 217 + generic_err(root, leaf, slot, 218 + "unaligned item size for csum item, have %u should be aligned to %u", 219 + btrfs_item_size_nr(leaf, slot), csumsize); 220 + return -EUCLEAN; 221 + } 222 + return 0; 223 + } 224 + 225 + /* 226 + * Common point to switch the item-specific validation. 
227 + */ 228 + static int check_leaf_item(struct btrfs_root *root, 229 + struct extent_buffer *leaf, 230 + struct btrfs_key *key, int slot) 231 + { 232 + int ret = 0; 233 + 234 + switch (key->type) { 235 + case BTRFS_EXTENT_DATA_KEY: 236 + ret = check_extent_data_item(root, leaf, key, slot); 237 + break; 238 + case BTRFS_EXTENT_CSUM_KEY: 239 + ret = check_csum_item(root, leaf, key, slot); 240 + break; 241 + } 242 + return ret; 243 + } 244 + 245 + int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) 246 + { 247 + struct btrfs_fs_info *fs_info = root->fs_info; 248 + /* No valid key type is 0, so all key should be larger than this key */ 249 + struct btrfs_key prev_key = {0, 0, 0}; 250 + struct btrfs_key key; 251 + u32 nritems = btrfs_header_nritems(leaf); 252 + int slot; 253 + 254 + /* 255 + * Extent buffers from a relocation tree have a owner field that 256 + * corresponds to the subvolume tree they are based on. So just from an 257 + * extent buffer alone we can not find out what is the id of the 258 + * corresponding subvolume tree, so we can not figure out if the extent 259 + * buffer corresponds to the root of the relocation tree or not. So 260 + * skip this check for relocation trees. 261 + */ 262 + if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { 263 + struct btrfs_root *check_root; 264 + 265 + key.objectid = btrfs_header_owner(leaf); 266 + key.type = BTRFS_ROOT_ITEM_KEY; 267 + key.offset = (u64)-1; 268 + 269 + check_root = btrfs_get_fs_root(fs_info, &key, false); 270 + /* 271 + * The only reason we also check NULL here is that during 272 + * open_ctree() some roots has not yet been set up. 
273 + */ 274 + if (!IS_ERR_OR_NULL(check_root)) { 275 + struct extent_buffer *eb; 276 + 277 + eb = btrfs_root_node(check_root); 278 + /* if leaf is the root, then it's fine */ 279 + if (leaf != eb) { 280 + generic_err(check_root, leaf, 0, 281 + "invalid nritems, have %u should not be 0 for non-root leaf", 282 + nritems); 283 + free_extent_buffer(eb); 284 + return -EUCLEAN; 285 + } 286 + free_extent_buffer(eb); 287 + } 288 + return 0; 289 + } 290 + 291 + if (nritems == 0) 292 + return 0; 293 + 294 + /* 295 + * Check the following things to make sure this is a good leaf, and 296 + * leaf users won't need to bother with similar sanity checks: 297 + * 298 + * 1) key ordering 299 + * 2) item offset and size 300 + * No overlap, no hole, all inside the leaf. 301 + * 3) item content 302 + * If possible, do comprehensive sanity check. 303 + * NOTE: All checks must only rely on the item data itself. 304 + */ 305 + for (slot = 0; slot < nritems; slot++) { 306 + u32 item_end_expected; 307 + int ret; 308 + 309 + btrfs_item_key_to_cpu(leaf, &key, slot); 310 + 311 + /* Make sure the keys are in the right order */ 312 + if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { 313 + generic_err(root, leaf, slot, 314 + "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", 315 + prev_key.objectid, prev_key.type, 316 + prev_key.offset, key.objectid, key.type, 317 + key.offset); 318 + return -EUCLEAN; 319 + } 320 + 321 + /* 322 + * Make sure the offset and ends are right, remember that the 323 + * item data starts at the end of the leaf and grows towards the 324 + * front. 
325 + */ 326 + if (slot == 0) 327 + item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); 328 + else 329 + item_end_expected = btrfs_item_offset_nr(leaf, 330 + slot - 1); 331 + if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { 332 + generic_err(root, leaf, slot, 333 + "unexpected item end, have %u expect %u", 334 + btrfs_item_end_nr(leaf, slot), 335 + item_end_expected); 336 + return -EUCLEAN; 337 + } 338 + 339 + /* 340 + * Check to make sure that we don't point outside of the leaf, 341 + * just in case all the items are consistent to each other, but 342 + * all point outside of the leaf. 343 + */ 344 + if (btrfs_item_end_nr(leaf, slot) > 345 + BTRFS_LEAF_DATA_SIZE(fs_info)) { 346 + generic_err(root, leaf, slot, 347 + "slot end outside of leaf, have %u expect range [0, %u]", 348 + btrfs_item_end_nr(leaf, slot), 349 + BTRFS_LEAF_DATA_SIZE(fs_info)); 350 + return -EUCLEAN; 351 + } 352 + 353 + /* Also check if the item pointer overlaps with btrfs item. */ 354 + if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > 355 + btrfs_item_ptr_offset(leaf, slot)) { 356 + generic_err(root, leaf, slot, 357 + "slot overlaps with its data, item end %lu data start %lu", 358 + btrfs_item_nr_offset(slot) + 359 + sizeof(struct btrfs_item), 360 + btrfs_item_ptr_offset(leaf, slot)); 361 + return -EUCLEAN; 362 + } 363 + 364 + /* Check if the item size and content meet other criteria */ 365 + ret = check_leaf_item(root, leaf, &key, slot); 366 + if (ret < 0) 367 + return ret; 368 + 369 + prev_key.objectid = key.objectid; 370 + prev_key.type = key.type; 371 + prev_key.offset = key.offset; 372 + } 373 + 374 + return 0; 375 + } 376 + 377 + int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) 378 + { 379 + unsigned long nr = btrfs_header_nritems(node); 380 + struct btrfs_key key, next_key; 381 + int slot; 382 + u64 bytenr; 383 + int ret = 0; 384 + 385 + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { 386 + btrfs_crit(root->fs_info, 387 + "corrupt 
node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", 388 + root->objectid, node->start, 389 + nr == 0 ? "small" : "large", nr, 390 + BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)); 391 + return -EUCLEAN; 392 + } 393 + 394 + for (slot = 0; slot < nr - 1; slot++) { 395 + bytenr = btrfs_node_blockptr(node, slot); 396 + btrfs_node_key_to_cpu(node, &key, slot); 397 + btrfs_node_key_to_cpu(node, &next_key, slot + 1); 398 + 399 + if (!bytenr) { 400 + generic_err(root, node, slot, 401 + "invalid NULL node pointer"); 402 + ret = -EUCLEAN; 403 + goto out; 404 + } 405 + if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) { 406 + generic_err(root, node, slot, 407 + "unaligned pointer, have %llu should be aligned to %u", 408 + bytenr, root->fs_info->sectorsize); 409 + ret = -EUCLEAN; 410 + goto out; 411 + } 412 + 413 + if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { 414 + generic_err(root, node, slot, 415 + "bad key order, current (%llu %u %llu) next (%llu %u %llu)", 416 + key.objectid, key.type, key.offset, 417 + next_key.objectid, next_key.type, 418 + next_key.offset); 419 + ret = -EUCLEAN; 420 + goto out; 421 + } 422 + } 423 + out: 424 + return ret; 425 + }
+26
fs/btrfs/tree-checker.h
··· 1 + /* 2 + * Copyright (C) Qu Wenruo 2017. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public 6 + * License v2 as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public 14 + * License along with this program. 15 + */ 16 + 17 + #ifndef __BTRFS_TREE_CHECKER__ 18 + #define __BTRFS_TREE_CHECKER__ 19 + 20 + #include "ctree.h" 21 + #include "extent_io.h" 22 + 23 + int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf); 24 + int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); 25 + 26 + #endif
+18 -18
fs/btrfs/tree-log.c
··· 717 717 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 718 718 ins.offset); 719 719 if (ret == 0) { 720 - ret = btrfs_inc_extent_ref(trans, fs_info, 720 + ret = btrfs_inc_extent_ref(trans, root, 721 721 ins.objectid, ins.offset, 722 722 0, root->root_key.objectid, 723 723 key->objectid, offset); ··· 2699 2699 * so we know that if ours is more than 2 older than the 2700 2700 * current transaction, we're done 2701 2701 */ 2702 - do { 2702 + for (;;) { 2703 2703 prepare_to_wait(&root->log_commit_wait[index], 2704 2704 &wait, TASK_UNINTERRUPTIBLE); 2705 + 2706 + if (!(root->log_transid_committed < transid && 2707 + atomic_read(&root->log_commit[index]))) 2708 + break; 2709 + 2705 2710 mutex_unlock(&root->log_mutex); 2706 - 2707 - if (root->log_transid_committed < transid && 2708 - atomic_read(&root->log_commit[index])) 2709 - schedule(); 2710 - 2711 - finish_wait(&root->log_commit_wait[index], &wait); 2711 + schedule(); 2712 2712 mutex_lock(&root->log_mutex); 2713 - } while (root->log_transid_committed < transid && 2714 - atomic_read(&root->log_commit[index])); 2713 + } 2714 + finish_wait(&root->log_commit_wait[index], &wait); 2715 2715 } 2716 2716 2717 2717 static void wait_for_writer(struct btrfs_root *root) 2718 2718 { 2719 2719 DEFINE_WAIT(wait); 2720 2720 2721 - while (atomic_read(&root->log_writers)) { 2722 - prepare_to_wait(&root->log_writer_wait, 2723 - &wait, TASK_UNINTERRUPTIBLE); 2721 + for (;;) { 2722 + prepare_to_wait(&root->log_writer_wait, &wait, 2723 + TASK_UNINTERRUPTIBLE); 2724 + if (!atomic_read(&root->log_writers)) 2725 + break; 2726 + 2724 2727 mutex_unlock(&root->log_mutex); 2725 - if (atomic_read(&root->log_writers)) 2726 - schedule(); 2727 - finish_wait(&root->log_writer_wait, &wait); 2728 + schedule(); 2728 2729 mutex_lock(&root->log_mutex); 2729 2730 } 2731 + finish_wait(&root->log_writer_wait, &wait); 2730 2732 } 2731 2733 2732 2734 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, ··· 4647 4645 struct btrfs_key 
min_key; 4648 4646 struct btrfs_key max_key; 4649 4647 struct btrfs_root *log = root->log_root; 4650 - struct extent_buffer *src = NULL; 4651 4648 LIST_HEAD(logged_list); 4652 4649 u64 last_extent = 0; 4653 4650 int err = 0; ··· 4889 4888 goto next_slot; 4890 4889 } 4891 4890 4892 - src = path->nodes[0]; 4893 4891 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 4894 4892 ins_nr++; 4895 4893 goto next_slot;
+88 -82
fs/btrfs/volumes.c
··· 360 360 int again = 0; 361 361 unsigned long num_run; 362 362 unsigned long batch_run = 0; 363 - unsigned long limit; 364 363 unsigned long last_waited = 0; 365 364 int force_reg = 0; 366 365 int sync_pending = 0; ··· 374 375 blk_start_plug(&plug); 375 376 376 377 bdi = device->bdev->bd_bdi; 377 - limit = btrfs_async_submit_limit(fs_info); 378 - limit = limit * 2 / 3; 379 378 380 379 loop: 381 380 spin_lock(&device->io_lock); ··· 439 442 cur = pending; 440 443 pending = pending->bi_next; 441 444 cur->bi_next = NULL; 442 - 443 - /* 444 - * atomic_dec_return implies a barrier for waitqueue_active 445 - */ 446 - if (atomic_dec_return(&fs_info->nr_async_bios) < limit && 447 - waitqueue_active(&fs_info->async_submit_wait)) 448 - wake_up(&fs_info->async_submit_wait); 449 445 450 446 BUG_ON(atomic_read(&cur->__bi_cnt) == 0); 451 447 ··· 507 517 &device->work); 508 518 goto done; 509 519 } 510 - /* unplug every 64 requests just for good measure */ 511 - if (batch_run % 64 == 0) { 512 - blk_finish_plug(&plug); 513 - blk_start_plug(&plug); 514 - sync_pending = 0; 515 - } 516 520 } 517 521 518 522 cond_resched(); ··· 531 547 } 532 548 533 549 534 - void btrfs_free_stale_device(struct btrfs_device *cur_dev) 550 + static void btrfs_free_stale_device(struct btrfs_device *cur_dev) 535 551 { 536 552 struct btrfs_fs_devices *fs_devs; 537 553 struct btrfs_device *dev; ··· 1052 1068 return ret; 1053 1069 } 1054 1070 1055 - void btrfs_release_disk_super(struct page *page) 1071 + static void btrfs_release_disk_super(struct page *page) 1056 1072 { 1057 1073 kunmap(page); 1058 1074 put_page(page); 1059 1075 } 1060 1076 1061 - int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, 1062 - struct page **page, struct btrfs_super_block **disk_super) 1077 + static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, 1078 + struct page **page, 1079 + struct btrfs_super_block **disk_super) 1063 1080 { 1064 1081 void *p; 1065 1082 pgoff_t index; ··· 1802 1817 return 0; 
1803 1818 } 1804 1819 1805 - struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, 1806 - struct btrfs_device *device) 1820 + static struct btrfs_device * btrfs_find_next_active_device( 1821 + struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 1807 1822 { 1808 1823 struct btrfs_device *next_device; 1809 1824 ··· 2016 2031 } 2017 2032 2018 2033 btrfs_close_bdev(srcdev); 2019 - 2020 2034 call_rcu(&srcdev->rcu, free_device); 2021 - 2022 - /* 2023 - * unless fs_devices is seed fs, num_devices shouldn't go 2024 - * zero 2025 - */ 2026 - BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); 2027 2035 2028 2036 /* if this is no devs we rather delete the fs_devices */ 2029 2037 if (!fs_devices->num_devices) { 2030 2038 struct btrfs_fs_devices *tmp_fs_devices; 2039 + 2040 + /* 2041 + * On a mounted FS, num_devices can't be zero unless it's a 2042 + * seed. In case of a seed device being replaced, the replace 2043 + * target added to the sprout FS, so there will be no more 2044 + * device left under the seed FS. 
2045 + */ 2046 + ASSERT(fs_devices->seeding); 2031 2047 2032 2048 tmp_fs_devices = fs_info->fs_devices; 2033 2049 while (tmp_fs_devices) { ··· 2309 2323 u64 tmp; 2310 2324 int seeding_dev = 0; 2311 2325 int ret = 0; 2326 + bool unlocked = false; 2312 2327 2313 2328 if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) 2314 2329 return -EROFS; ··· 2386 2399 if (seeding_dev) { 2387 2400 sb->s_flags &= ~MS_RDONLY; 2388 2401 ret = btrfs_prepare_sprout(fs_info); 2389 - BUG_ON(ret); /* -ENOMEM */ 2402 + if (ret) { 2403 + btrfs_abort_transaction(trans, ret); 2404 + goto error_trans; 2405 + } 2390 2406 } 2391 2407 2392 2408 device->fs_devices = fs_info->fs_devices; ··· 2435 2445 mutex_unlock(&fs_info->chunk_mutex); 2436 2446 if (ret) { 2437 2447 btrfs_abort_transaction(trans, ret); 2438 - goto error_trans; 2448 + goto error_sysfs; 2439 2449 } 2440 2450 } 2441 2451 2442 2452 ret = btrfs_add_device(trans, fs_info, device); 2443 2453 if (ret) { 2444 2454 btrfs_abort_transaction(trans, ret); 2445 - goto error_trans; 2455 + goto error_sysfs; 2446 2456 } 2447 2457 2448 2458 if (seeding_dev) { ··· 2451 2461 ret = btrfs_finish_sprout(trans, fs_info); 2452 2462 if (ret) { 2453 2463 btrfs_abort_transaction(trans, ret); 2454 - goto error_trans; 2464 + goto error_sysfs; 2455 2465 } 2456 2466 2457 2467 /* Sprouting would change fsid of the mounted root, ··· 2469 2479 if (seeding_dev) { 2470 2480 mutex_unlock(&uuid_mutex); 2471 2481 up_write(&sb->s_umount); 2482 + unlocked = true; 2472 2483 2473 2484 if (ret) /* transaction commit */ 2474 2485 return ret; ··· 2482 2491 if (IS_ERR(trans)) { 2483 2492 if (PTR_ERR(trans) == -ENOENT) 2484 2493 return 0; 2485 - return PTR_ERR(trans); 2494 + ret = PTR_ERR(trans); 2495 + trans = NULL; 2496 + goto error_sysfs; 2486 2497 } 2487 2498 ret = btrfs_commit_transaction(trans); 2488 2499 } ··· 2493 2500 update_dev_time(device_path); 2494 2501 return ret; 2495 2502 2496 - error_trans: 2497 - btrfs_end_transaction(trans); 2498 - 
rcu_string_free(device->name); 2503 + error_sysfs: 2499 2504 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); 2505 + error_trans: 2506 + if (seeding_dev) 2507 + sb->s_flags |= MS_RDONLY; 2508 + if (trans) 2509 + btrfs_end_transaction(trans); 2510 + rcu_string_free(device->name); 2500 2511 kfree(device); 2501 2512 error: 2502 2513 blkdev_put(bdev, FMODE_EXCL); 2503 - if (seeding_dev) { 2514 + if (seeding_dev && !unlocked) { 2504 2515 mutex_unlock(&uuid_mutex); 2505 2516 up_write(&sb->s_umount); 2506 2517 } ··· 4810 4813 em_tree = &info->mapping_tree.map_tree; 4811 4814 write_lock(&em_tree->lock); 4812 4815 ret = add_extent_mapping(em_tree, em, 0); 4813 - if (!ret) { 4814 - list_add_tail(&em->list, &trans->transaction->pending_chunks); 4815 - refcount_inc(&em->refs); 4816 - } 4817 - write_unlock(&em_tree->lock); 4818 4816 if (ret) { 4817 + write_unlock(&em_tree->lock); 4819 4818 free_extent_map(em); 4820 4819 goto error; 4821 4820 } 4821 + 4822 + list_add_tail(&em->list, &trans->transaction->pending_chunks); 4823 + refcount_inc(&em->refs); 4824 + write_unlock(&em_tree->lock); 4822 4825 4823 4826 ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); 4824 4827 if (ret) ··· 5692 5695 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5693 5696 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5694 5697 &stripe_index); 5695 - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) 5698 + if (!need_full_stripe(op)) 5696 5699 mirror_num = 1; 5697 5700 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5698 - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) 5701 + if (need_full_stripe(op)) 5699 5702 num_stripes = map->num_stripes; 5700 5703 else if (mirror_num) 5701 5704 stripe_index = mirror_num - 1; ··· 5708 5711 } 5709 5712 5710 5713 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5711 - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { 5714 + if (need_full_stripe(op)) { 5712 5715 num_stripes = map->num_stripes; 5713 
5716 } else if (mirror_num) { 5714 5717 stripe_index = mirror_num - 1; ··· 5722 5725 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5723 5726 stripe_index *= map->sub_stripes; 5724 5727 5725 - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) 5728 + if (need_full_stripe(op)) 5726 5729 num_stripes = map->sub_stripes; 5727 5730 else if (mirror_num) 5728 5731 stripe_index += mirror_num - 1; ··· 5737 5740 } 5738 5741 5739 5742 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5740 - if (need_raid_map && 5741 - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || 5742 - mirror_num > 1)) { 5743 + if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 5743 5744 /* push stripe_nr back to the start of the full stripe */ 5744 5745 stripe_nr = div64_u64(raid56_full_stripe_start, 5745 5746 stripe_len * nr_data_stripes(map)); ··· 5764 5769 /* We distribute the parity blocks across stripes */ 5765 5770 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 5766 5771 &stripe_index); 5767 - if ((op != BTRFS_MAP_WRITE && 5768 - op != BTRFS_MAP_GET_READ_MIRRORS) && 5769 - mirror_num <= 1) 5772 + if (!need_full_stripe(op) && mirror_num <= 1) 5770 5773 mirror_num = 1; 5771 5774 } 5772 5775 } else { ··· 6026 6033 * this bio is actually up to date, we didn't 6027 6034 * go over the max number of errors 6028 6035 */ 6029 - bio->bi_status = 0; 6036 + bio->bi_status = BLK_STS_OK; 6030 6037 } 6031 6038 6032 6039 btrfs_end_bbio(bbio, bio); ··· 6062 6069 return; 6063 6070 } 6064 6071 6065 - /* 6066 - * nr_async_bios allows us to reliably return congestion to the 6067 - * higher layers. 
Otherwise, the async bio makes it appear we have 6068 - * made progress against dirty pages when we've really just put it 6069 - * on a queue for later 6070 - */ 6071 - atomic_inc(&fs_info->nr_async_bios); 6072 6072 WARN_ON(bio->bi_next); 6073 6073 bio->bi_next = NULL; 6074 6074 ··· 6130 6144 6131 6145 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6132 6146 bio->bi_iter.bi_sector = logical >> 9; 6133 - bio->bi_status = BLK_STS_IOERR; 6147 + if (atomic_read(&bbio->error) > bbio->max_errors) 6148 + bio->bi_status = BLK_STS_IOERR; 6149 + else 6150 + bio->bi_status = BLK_STS_OK; 6134 6151 btrfs_end_bbio(bbio, bio); 6135 6152 } 6136 6153 } ··· 6238 6249 6239 6250 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6240 6251 if (IS_ERR(device)) 6241 - return NULL; 6252 + return device; 6242 6253 6243 6254 list_add(&device->dev_list, &fs_devices->devices); 6244 6255 device->fs_devices = fs_devices; ··· 6366 6377 return 0; 6367 6378 } 6368 6379 6380 + static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6381 + u64 devid, u8 *uuid, bool error) 6382 + { 6383 + if (error) 6384 + btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6385 + devid, uuid); 6386 + else 6387 + btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6388 + devid, uuid); 6389 + } 6390 + 6369 6391 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 6370 6392 struct extent_buffer *leaf, 6371 6393 struct btrfs_chunk *chunk) ··· 6447 6447 if (!map->stripes[i].dev && 6448 6448 !btrfs_test_opt(fs_info, DEGRADED)) { 6449 6449 free_extent_map(em); 6450 - btrfs_report_missing_device(fs_info, devid, uuid); 6451 - return -EIO; 6450 + btrfs_report_missing_device(fs_info, devid, uuid, true); 6451 + return -ENOENT; 6452 6452 } 6453 6453 if (!map->stripes[i].dev) { 6454 6454 map->stripes[i].dev = 6455 6455 add_missing_dev(fs_info->fs_devices, devid, 6456 6456 uuid); 6457 - if (!map->stripes[i].dev) { 6457 + if (IS_ERR(map->stripes[i].dev)) { 6458 6458 
free_extent_map(em); 6459 - return -EIO; 6459 + btrfs_err(fs_info, 6460 + "failed to init missing dev %llu: %ld", 6461 + devid, PTR_ERR(map->stripes[i].dev)); 6462 + return PTR_ERR(map->stripes[i].dev); 6460 6463 } 6461 - btrfs_report_missing_device(fs_info, devid, uuid); 6464 + btrfs_report_missing_device(fs_info, devid, uuid, false); 6462 6465 } 6463 6466 map->stripes[i].dev->in_fs_metadata = 1; 6464 6467 } ··· 6580 6577 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 6581 6578 if (!device) { 6582 6579 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6583 - btrfs_report_missing_device(fs_info, devid, dev_uuid); 6584 - return -EIO; 6580 + btrfs_report_missing_device(fs_info, devid, 6581 + dev_uuid, true); 6582 + return -ENOENT; 6585 6583 } 6586 6584 6587 6585 device = add_missing_dev(fs_devices, devid, dev_uuid); 6588 - if (!device) 6589 - return -ENOMEM; 6590 - btrfs_report_missing_device(fs_info, devid, dev_uuid); 6586 + if (IS_ERR(device)) { 6587 + btrfs_err(fs_info, 6588 + "failed to add missing dev %llu: %ld", 6589 + devid, PTR_ERR(device)); 6590 + return PTR_ERR(device); 6591 + } 6592 + btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 6591 6593 } else { 6592 6594 if (!device->bdev) { 6593 - btrfs_report_missing_device(fs_info, devid, dev_uuid); 6594 - if (!btrfs_test_opt(fs_info, DEGRADED)) 6595 - return -EIO; 6595 + if (!btrfs_test_opt(fs_info, DEGRADED)) { 6596 + btrfs_report_missing_device(fs_info, 6597 + devid, dev_uuid, true); 6598 + return -ENOENT; 6599 + } 6600 + btrfs_report_missing_device(fs_info, devid, 6601 + dev_uuid, false); 6596 6602 } 6597 6603 6598 6604 if(!device->bdev && !device->missing) { ··· 6766 6754 clear_extent_buffer_uptodate(sb); 6767 6755 free_extent_buffer_stale(sb); 6768 6756 return -EIO; 6769 - } 6770 - 6771 - void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, 6772 - u8 *uuid) 6773 - { 6774 - btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); 6775 6757 } 6776 6758 
6777 6759 /*
-2
fs/btrfs/volumes.h
··· 542 542 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); 543 543 544 544 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info); 545 - void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, 546 - u8 *uuid); 547 545 548 546 #endif
+14 -1
fs/btrfs/zlib.c
··· 37 37 z_stream strm; 38 38 char *buf; 39 39 struct list_head list; 40 + int level; 40 41 }; 41 42 42 43 static void zlib_free_workspace(struct list_head *ws) ··· 97 96 *total_out = 0; 98 97 *total_in = 0; 99 98 100 - if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) { 99 + if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) { 101 100 pr_warn("BTRFS: deflateInit failed\n"); 102 101 ret = -EIO; 103 102 goto out; ··· 403 402 return ret; 404 403 } 405 404 405 + static void zlib_set_level(struct list_head *ws, unsigned int type) 406 + { 407 + struct workspace *workspace = list_entry(ws, struct workspace, list); 408 + unsigned level = (type & 0xF0) >> 4; 409 + 410 + if (level > 9) 411 + level = 9; 412 + 413 + workspace->level = level > 0 ? level : 3; 414 + } 415 + 406 416 const struct btrfs_compress_op btrfs_zlib_compress = { 407 417 .alloc_workspace = zlib_alloc_workspace, 408 418 .free_workspace = zlib_free_workspace, 409 419 .compress_pages = zlib_compress_pages, 410 420 .decompress_bio = zlib_decompress_bio, 411 421 .decompress = zlib_decompress, 422 + .set_level = zlib_set_level, 412 423 };
+5
fs/btrfs/zstd.c
··· 423 423 return ret; 424 424 } 425 425 426 + static void zstd_set_level(struct list_head *ws, unsigned int type) 427 + { 428 + } 429 + 426 430 const struct btrfs_compress_op btrfs_zstd_compress = { 427 431 .alloc_workspace = zstd_alloc_workspace, 428 432 .free_workspace = zstd_free_workspace, 429 433 .compress_pages = zstd_compress_pages, 430 434 .decompress_bio = zstd_decompress_bio, 431 435 .decompress = zstd_decompress, 436 + .set_level = zstd_set_level, 432 437 };
+33 -8
include/trace/events/btrfs.h
··· 29 29 struct btrfs_qgroup; 30 30 struct prelim_ref; 31 31 32 + TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS_NR); 33 + TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS); 34 + TRACE_DEFINE_ENUM(FLUSH_DELALLOC); 35 + TRACE_DEFINE_ENUM(FLUSH_DELALLOC_WAIT); 36 + TRACE_DEFINE_ENUM(ALLOC_CHUNK); 37 + TRACE_DEFINE_ENUM(COMMIT_TRANS); 38 + 32 39 #define show_ref_type(type) \ 33 40 __print_symbolic(type, \ 34 41 { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ ··· 799 792 DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, 800 793 801 794 TP_PROTO(const struct btrfs_fs_info *fs_info, 802 - const struct btrfs_delayed_ref_node *ref, 803 795 const struct btrfs_delayed_ref_head *head_ref, 804 796 int action), 805 797 806 - TP_ARGS(fs_info, ref, head_ref, action), 798 + TP_ARGS(fs_info, head_ref, action), 807 799 808 800 TP_STRUCT__entry_btrfs( 809 801 __field( u64, bytenr ) ··· 812 806 ), 813 807 814 808 TP_fast_assign_btrfs(fs_info, 815 - __entry->bytenr = ref->bytenr; 816 - __entry->num_bytes = ref->num_bytes; 809 + __entry->bytenr = head_ref->bytenr; 810 + __entry->num_bytes = head_ref->num_bytes; 817 811 __entry->action = action; 818 812 __entry->is_data = head_ref->is_data; 819 813 ), ··· 828 822 DEFINE_EVENT(btrfs_delayed_ref_head, add_delayed_ref_head, 829 823 830 824 TP_PROTO(const struct btrfs_fs_info *fs_info, 831 - const struct btrfs_delayed_ref_node *ref, 832 825 const struct btrfs_delayed_ref_head *head_ref, 833 826 int action), 834 827 835 - TP_ARGS(fs_info, ref, head_ref, action) 828 + TP_ARGS(fs_info, head_ref, action) 836 829 ); 837 830 838 831 DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head, 839 832 840 833 TP_PROTO(const struct btrfs_fs_info *fs_info, 841 - const struct btrfs_delayed_ref_node *ref, 842 834 const struct btrfs_delayed_ref_head *head_ref, 843 835 int action), 844 836 845 - TP_ARGS(fs_info, ref, head_ref, action) 837 + TP_ARGS(fs_info, head_ref, action) 846 838 ); 847 839 848 840 #define show_chunk_type(type) \ ··· 1696 1692 TP_ARGS(fs_info, oldref, newref, 
tree_size) 1697 1693 ); 1698 1694 1695 + TRACE_EVENT(btrfs_inode_mod_outstanding_extents, 1696 + TP_PROTO(struct btrfs_root *root, u64 ino, int mod), 1697 + 1698 + TP_ARGS(root, ino, mod), 1699 + 1700 + TP_STRUCT__entry_btrfs( 1701 + __field( u64, root_objectid ) 1702 + __field( u64, ino ) 1703 + __field( int, mod ) 1704 + ), 1705 + 1706 + TP_fast_assign_btrfs(root->fs_info, 1707 + __entry->root_objectid = root->objectid; 1708 + __entry->ino = ino; 1709 + __entry->mod = mod; 1710 + ), 1711 + 1712 + TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d", 1713 + show_root_type(__entry->root_objectid), 1714 + (unsigned long long)__entry->ino, __entry->mod) 1715 + ); 1699 1716 #endif /* _TRACE_BTRFS_H */ 1700 1717 1701 1718 /* This part must be outside protection */
+7 -1
include/uapi/linux/btrfs.h
··· 609 609 struct btrfs_ioctl_logical_ino_args { 610 610 __u64 logical; /* in */ 611 611 __u64 size; /* in */ 612 - __u64 reserved[4]; 612 + __u64 reserved[3]; /* must be 0 for now */ 613 + __u64 flags; /* in, v2 only */ 613 614 /* struct btrfs_data_container *inodes; out */ 614 615 __u64 inodes; 615 616 }; 617 + /* Return every ref to the extent, not just those containing logical block. 618 + * Requires logical == extent bytenr. */ 619 + #define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0) 616 620 617 621 enum btrfs_dev_stat_values { 618 622 /* disk I/O failure stats */ ··· 840 836 struct btrfs_ioctl_feature_flags[3]) 841 837 #define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \ 842 838 struct btrfs_ioctl_vol_args_v2) 839 + #define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, \ 840 + struct btrfs_ioctl_logical_ino_args) 843 841 844 842 #endif /* _UAPI_LINUX_BTRFS_H */
+1
include/uapi/linux/btrfs_tree.h
··· 733 733 #define BTRFS_FILE_EXTENT_INLINE 0 734 734 #define BTRFS_FILE_EXTENT_REG 1 735 735 #define BTRFS_FILE_EXTENT_PREALLOC 2 736 + #define BTRFS_FILE_EXTENT_TYPES 2 736 737 737 738 struct btrfs_file_extent_item { 738 739 /*