Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (39 commits)
Btrfs: deal with errors from updating the tree log
Btrfs: allow subvol deletion by unprivileged user with -o user_subvol_rm_allowed
Btrfs: make SNAP_DESTROY async
Btrfs: add SNAP_CREATE_ASYNC ioctl
Btrfs: add START_SYNC, WAIT_SYNC ioctls
Btrfs: async transaction commit
Btrfs: fix deadlock in btrfs_commit_transaction
Btrfs: fix lockdep warning on clone ioctl
Btrfs: fix clone ioctl where range is adjacent to extent
Btrfs: fix delalloc checks in clone ioctl
Btrfs: drop unused variable in block_alloc_rsv
Btrfs: cleanup warnings from gcc 4.6 (nonbugs)
Btrfs: Fix variables set but not read (bugs found by gcc 4.6)
Btrfs: Use ERR_CAST helpers
Btrfs: use memdup_user helpers
Btrfs: fix raid code for removing missing drives
Btrfs: Switch the extent buffer rbtree into a radix tree
Btrfs: restructure try_release_extent_buffer()
Btrfs: use the flusher threads for delalloc throttling
Btrfs: tune the chunk allocation to 5% of the FS as metadata
...

Fix up trivial conflicts in fs/btrfs/super.c and fs/fs-writeback.c, and
remove use of INIT_RCU_HEAD in fs/btrfs/extent_io.c (that init macro was
useless and removed in commit 5e8067adfdba: "rcu head remove init")
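
A hedged sketch of that INIT_RCU_HEAD removal (illustrative placement only, not the exact resolution hunk): struct rcu_head needs no initialization before it is handed to call_rcu(), so the initializer the btrfs branch carried in extent_io.c is simply deleted; the call_rcu() line shown for context is the one this merge adds for freeing extent buffers.

	-	INIT_RCU_HEAD(&eb->rcu_head);	/* no-op: rcu_head needs no init */
	 	call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);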

Overall diffstat for the merge: 2411 insertions(+), 525 deletions(-).

fs/btrfs/compression.c (+0 -2)
  end_compressed_bio_read() drops its unused extent_io_tree local and the
  assignment that set it; the decompression path never used the io_tree.

fs/btrfs/ctree.c (+30 -27)
  Removes variables that were set but never read (nritems, err_on_enospc,
  orig_ptr, slot_orig and similar, flagged by gcc 4.6) and reworks
  read_block_for_search(): when a cached block's pages are up to date but the
  generation check fails, it now re-reads the block for the wanted generation
  without dropping locks, and if that read still does not match it frees the
  buffer, releases the path and returns -EIO.

fs/btrfs/ctree.h (+89 -11)
  Adds the on-disk free-space-cache format: BTRFS_FREE_SPACE_OBJECTID,
  btrfs_free_space_entry/btrfs_free_space_header plus their SETGET helpers, a
  cache_generation superblock field carved out of the reserved area, the
  btrfs_disk_cache_state enum and per-block-group inode/iref/disk_cache_state
  members, and a disk_total counter in btrfs_space_info. It also fixes
  BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL to (1ULL << 1), adds the MIXED_GROUPS
  incompat bit and btrfs_mixed_space_info(), the SPACE_CACHE, CLEAR_CACHE and
  USER_SUBVOL_RM_ALLOWED mount flags, the transaction_blocked_wait queue and
  endio_freespace_worker, drops the retries argument from
  btrfs_trans_reserve_metadata()/btrfs_block_rsv_add(), and declares
  btrfs_put_block_group_cache() and btrfs_prealloc_file_range_trans().

fs/btrfs/dir-item.c (+1 -1)
  The dir-item deletion path now returns the result of btrfs_truncate_item()
  instead of unconditionally returning 0.

fs/btrfs/disk-io.c (+19 -13)
  Drops several set-but-unused locals, documents and implements the three-way
  metadata argument to btrfs_bio_wq_end_io() (0 = data, 1 = normal metadata,
  2 = free-space cache) by routing cache writes to a new single-threaded
  endio_freespace_worker, initializes the transaction_blocked_wait queue, runs
  orphan cleanup on the tree root as well as fs_root at mount, and calls
  btrfs_put_block_group_cache() when closing the filesystem.

fs/btrfs/extent-tree.c (+545 -159)
  The bulk of the free-space-cache and reservation work. cache_block_group()
  gains trans/load_cache_only arguments and first tries load_free_space_cache()
  when not in a commit; cache_save_setup() creates or truncates the per-block-
  group cache inode, skips groups smaller than 100MB, preallocates 16 pages'
  worth of space per gigabyte of block group and marks the group
  BTRFS_DC_SETUP. btrfs_write_dirty_block_groups() now makes three passes:
  cache setup for BTRFS_DC_CLEAR groups, the existing dirty block-group item
  writes, and btrfs_write_out_cache() for BTRFS_DC_NEED_WRITE groups. Space
  accounting grows a disk_total counter, metadata chunk allocation is tuned to
  about 5% of the filesystem via div_factor_fine(), and mixed data+metadata
  space infos force mixed chunks and disable the allocation clusters.
  maybe_allocate_chunk() and should_retry_reserve() are removed;
  shrink_delalloc() now kicks the flusher threads with
  writeback_inodes_sb_nr_if_idle(), and reserve_metadata_bytes() is rewritten
  to reserve optimistically, flush synchronously, retry twice and finally
  commit the transaction before giving up with -ENOSPC.
  btrfs_put_block_group_cache() drops cached free-space inodes at unmount, and
  removing a block group now also orphans and deletes its free-space inode and
  free-space item.
+85 -87
fs/btrfs/extent_io.c
··· 104 104 struct address_space *mapping, gfp_t mask) 105 105 { 106 106 tree->state = RB_ROOT; 107 - tree->buffer = RB_ROOT; 107 + INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 108 108 tree->ops = NULL; 109 109 tree->dirty_bytes = 0; 110 110 spin_lock_init(&tree->lock); ··· 233 233 if (!ret) 234 234 return prev; 235 235 return ret; 236 - } 237 - 238 - static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, 239 - u64 offset, struct rb_node *node) 240 - { 241 - struct rb_root *root = &tree->buffer; 242 - struct rb_node **p = &root->rb_node; 243 - struct rb_node *parent = NULL; 244 - struct extent_buffer *eb; 245 - 246 - while (*p) { 247 - parent = *p; 248 - eb = rb_entry(parent, struct extent_buffer, rb_node); 249 - 250 - if (offset < eb->start) 251 - p = &(*p)->rb_left; 252 - else if (offset > eb->start) 253 - p = &(*p)->rb_right; 254 - else 255 - return eb; 256 - } 257 - 258 - rb_link_node(node, parent, p); 259 - rb_insert_color(node, root); 260 - return NULL; 261 - } 262 - 263 - static struct extent_buffer *buffer_search(struct extent_io_tree *tree, 264 - u64 offset) 265 - { 266 - struct rb_root *root = &tree->buffer; 267 - struct rb_node *n = root->rb_node; 268 - struct extent_buffer *eb; 269 - 270 - while (n) { 271 - eb = rb_entry(n, struct extent_buffer, rb_node); 272 - if (offset < eb->start) 273 - n = n->rb_left; 274 - else if (offset > eb->start) 275 - n = n->rb_right; 276 - else 277 - return eb; 278 - } 279 - return NULL; 280 236 } 281 237 282 238 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, ··· 1857 1901 struct page *page = bvec->bv_page; 1858 1902 struct extent_io_tree *tree = bio->bi_private; 1859 1903 u64 start; 1860 - u64 end; 1861 1904 1862 1905 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1863 - end = start + bvec->bv_len - 1; 1864 1906 1865 1907 bio->bi_private = NULL; 1866 1908 ··· 2158 2204 u64 last_byte = i_size_read(inode); 2159 2205 u64 block_start; 2160 2206 u64 iosize; 2161 - u64 unlock_start; 2162 2207 sector_t sector; 2163 2208 struct extent_state *cached_state = NULL; 2164 2209 struct extent_map *em; ··· 2282 2329 if (tree->ops && tree->ops->writepage_end_io_hook) 2283 2330 tree->ops->writepage_end_io_hook(page, start, 2284 2331 page_end, NULL, 1); 2285 - unlock_start = page_end + 1; 2286 2332 goto done; 2287 2333 } 2288 2334 ··· 2292 2340 if (tree->ops && tree->ops->writepage_end_io_hook) 2293 2341 tree->ops->writepage_end_io_hook(page, cur, 2294 2342 page_end, NULL, 1); 2295 - unlock_start = page_end + 1; 2296 2343 break; 2297 2344 } 2298 2345 em = epd->get_extent(inode, page, pg_offset, cur, ··· 2338 2387 2339 2388 cur += iosize; 2340 2389 pg_offset += iosize; 2341 - unlock_start = cur; 2342 2390 continue; 2343 2391 } 2344 2392 /* leave this out until we have a page_mkwrite call */ ··· 2423 2473 pgoff_t index; 2424 2474 pgoff_t end; /* Inclusive */ 2425 2475 int scanned = 0; 2426 - int range_whole = 0; 2427 2476 2428 2477 pagevec_init(&pvec, 0); 2429 2478 if (wbc->range_cyclic) { ··· 2431 2482 } else { 2432 2483 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2433 2484 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2434 - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2435 - range_whole = 1; 2436 2485 scanned = 1; 2437 2486 } 2438 2487 retry: ··· 2770 2823 NULL, 1, 2771 2824 end_bio_extent_preparewrite, 0, 2772 2825 0, 0); 2826 + if (ret && !err) 2827 + err = ret; 2773 2828 iocount++; 2774 2829 block_start = block_start + iosize; 2775 2830 } else { ··· 3053 3104 
kmem_cache_free(extent_buffer_cache, eb); 3054 3105 } 3055 3106 3107 + /* 3108 + * Helper for releasing extent buffer page. 3109 + */ 3110 + static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 3111 + unsigned long start_idx) 3112 + { 3113 + unsigned long index; 3114 + struct page *page; 3115 + 3116 + if (!eb->first_page) 3117 + return; 3118 + 3119 + index = num_extent_pages(eb->start, eb->len); 3120 + if (start_idx >= index) 3121 + return; 3122 + 3123 + do { 3124 + index--; 3125 + page = extent_buffer_page(eb, index); 3126 + if (page) 3127 + page_cache_release(page); 3128 + } while (index != start_idx); 3129 + } 3130 + 3131 + /* 3132 + * Helper for releasing the extent buffer. 3133 + */ 3134 + static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 3135 + { 3136 + btrfs_release_extent_buffer_page(eb, 0); 3137 + __free_extent_buffer(eb); 3138 + } 3139 + 3056 3140 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3057 3141 u64 start, unsigned long len, 3058 3142 struct page *page0, ··· 3099 3117 struct page *p; 3100 3118 struct address_space *mapping = tree->mapping; 3101 3119 int uptodate = 1; 3120 + int ret; 3102 3121 3103 - spin_lock(&tree->buffer_lock); 3104 - eb = buffer_search(tree, start); 3105 - if (eb) { 3106 - atomic_inc(&eb->refs); 3107 - spin_unlock(&tree->buffer_lock); 3122 + rcu_read_lock(); 3123 + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3124 + if (eb && atomic_inc_not_zero(&eb->refs)) { 3125 + rcu_read_unlock(); 3108 3126 mark_page_accessed(eb->first_page); 3109 3127 return eb; 3110 3128 } 3111 - spin_unlock(&tree->buffer_lock); 3129 + rcu_read_unlock(); 3112 3130 3113 3131 eb = __alloc_extent_buffer(tree, start, len, mask); 3114 3132 if (!eb) ··· 3147 3165 if (uptodate) 3148 3166 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3149 3167 3168 + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 3169 + if (ret) 3170 + goto free_eb; 3171 + 3150 3172 spin_lock(&tree->buffer_lock); 3151 - exists = buffer_tree_insert(tree, start, &eb->rb_node); 3152 - if (exists) { 3173 + ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 3174 + if (ret == -EEXIST) { 3175 + exists = radix_tree_lookup(&tree->buffer, 3176 + start >> PAGE_CACHE_SHIFT); 3153 3177 /* add one reference for the caller */ 3154 3178 atomic_inc(&exists->refs); 3155 3179 spin_unlock(&tree->buffer_lock); 3180 + radix_tree_preload_end(); 3156 3181 goto free_eb; 3157 3182 } 3158 3183 /* add one reference for the tree */ 3159 3184 atomic_inc(&eb->refs); 3160 3185 spin_unlock(&tree->buffer_lock); 3186 + radix_tree_preload_end(); 3161 3187 return eb; 3162 3188 3163 3189 free_eb: 3164 3190 if (!atomic_dec_and_test(&eb->refs)) 3165 3191 return exists; 3166 - for (index = 1; index < i; index++) 3167 - page_cache_release(extent_buffer_page(eb, index)); 3168 - page_cache_release(extent_buffer_page(eb, 0)); 3169 - __free_extent_buffer(eb); 3192 + btrfs_release_extent_buffer(eb); 3170 3193 return exists; 3171 3194 } 3172 3195 ··· 3181 3194 { 3182 3195 struct extent_buffer *eb; 3183 3196 3184 - spin_lock(&tree->buffer_lock); 3185 - eb = buffer_search(tree, start); 3186 - if (eb) 3187 - atomic_inc(&eb->refs); 3188 - spin_unlock(&tree->buffer_lock); 3189 - 3190 - if (eb) 3197 + rcu_read_lock(); 3198 + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3199 + if (eb && atomic_inc_not_zero(&eb->refs)) { 3200 + rcu_read_unlock(); 3191 3201 mark_page_accessed(eb->first_page); 3202 + return eb; 3203 + } 3204 + rcu_read_unlock(); 
3192 3205 3193 - return eb; 3206 + return NULL; 3194 3207 } 3195 3208 3196 3209 void free_extent_buffer(struct extent_buffer *eb) ··· 3820 3833 } 3821 3834 } 3822 3835 3836 + static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3837 + { 3838 + struct extent_buffer *eb = 3839 + container_of(head, struct extent_buffer, rcu_head); 3840 + 3841 + btrfs_release_extent_buffer(eb); 3842 + } 3843 + 3823 3844 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3824 3845 { 3825 3846 u64 start = page_offset(page); 3826 3847 struct extent_buffer *eb; 3827 3848 int ret = 1; 3828 - unsigned long i; 3829 - unsigned long num_pages; 3830 3849 3831 3850 spin_lock(&tree->buffer_lock); 3832 - eb = buffer_search(tree, start); 3851 + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3833 3852 if (!eb) 3834 3853 goto out; 3835 3854 3836 - if (atomic_read(&eb->refs) > 1) { 3837 - ret = 0; 3838 - goto out; 3839 - } 3840 3855 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3841 3856 ret = 0; 3842 3857 goto out; 3843 3858 } 3844 - /* at this point we can safely release the extent buffer */ 3845 - num_pages = num_extent_pages(eb->start, eb->len); 3846 - for (i = 0; i < num_pages; i++) 3847 - page_cache_release(extent_buffer_page(eb, i)); 3848 - rb_erase(&eb->rb_node, &tree->buffer); 3849 - __free_extent_buffer(eb); 3859 + 3860 + /* 3861 + * set @eb->refs to 0 if it is already 1, and then release the @eb. 3862 + * Or go back. 3863 + */ 3864 + if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { 3865 + ret = 0; 3866 + goto out; 3867 + } 3868 + 3869 + radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3850 3870 out: 3851 3871 spin_unlock(&tree->buffer_lock); 3872 + 3873 + /* at this point we can safely release the extent buffer */ 3874 + if (atomic_read(&eb->refs) == 0) 3875 + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 3852 3876 return ret; 3853 3877 }
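The extent_io.c hunks above drop the extent-buffer rbtree (and its buffer_search()/buffer_tree_insert() helpers) in favour of a radix tree indexed by start >> PAGE_CACHE_SHIFT, so lookups can run under rcu_read_lock() instead of tree->buffer_lock and only keep a buffer whose refcount has not already hit zero. A minimal sketch of that lookup pattern, assuming the usual kernel headers and the struct layout from extent_io.h:

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include "extent_io.h"          /* struct extent_io_tree, struct extent_buffer */

/* Lockless lookup: return a referenced buffer, or NULL if absent or dying. */
static struct extent_buffer *lookup_eb(struct extent_io_tree *tree, u64 start)
{
        struct extent_buffer *eb;

        rcu_read_lock();
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                /* refs was non-zero, so the buffer cannot be freed under us */
                rcu_read_unlock();
                return eb;
        }
        rcu_read_unlock();
        return NULL;
}

Insertion still happens under tree->buffer_lock, with radix_tree_preload() called first so the radix tree nodes are allocated outside the spinlock, as alloc_extent_buffer() does above.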
+2 -2
fs/btrfs/extent_io.h
··· 85 85 86 86 struct extent_io_tree { 87 87 struct rb_root state; 88 - struct rb_root buffer; 88 + struct radix_tree_root buffer; 89 89 struct address_space *mapping; 90 90 u64 dirty_bytes; 91 91 spinlock_t lock; ··· 123 123 unsigned long bflags; 124 124 atomic_t refs; 125 125 struct list_head leak_list; 126 - struct rb_node rb_node; 126 + struct rcu_head rcu_head; 127 127 128 128 /* the spinlock is used to protect most operations */ 129 129 spinlock_t lock;
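The extent_io.h change mirrors that: the per-buffer rb_node becomes an rcu_head, so a buffer unlinked from the radix tree can be freed only after a grace period and the lockless readers above can finish safely. A simplified (not verbatim) sketch of the deferred-free side, with free_eb() standing in for the real page release plus __free_extent_buffer():

#include <linux/rcupdate.h>
#include "extent_io.h"

/* stand-in for releasing the buffer's pages and the struct itself */
static void free_eb(struct extent_buffer *eb)
{
        kfree(eb);      /* the real code returns it to extent_buffer_cache */
}

static void release_eb_rcu(struct rcu_head *head)
{
        struct extent_buffer *eb =
                container_of(head, struct extent_buffer, rcu_head);

        free_eb(eb);
}

static void drop_eb(struct extent_io_tree *tree, struct extent_buffer *eb)
{
        spin_lock(&tree->buffer_lock);
        radix_tree_delete(&tree->buffer, eb->start >> PAGE_CACHE_SHIFT);
        spin_unlock(&tree->buffer_lock);

        /* RCU readers may still hold eb; defer the free past the grace period */
        call_rcu(&eb->rcu_head, release_eb_rcu);
}

The real try_release_extent_buffer() additionally drops the last reference with atomic_cmpxchg(&eb->refs, 1, 0) before deleting from the radix tree; that step is omitted here for brevity.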
+2 -2
fs/btrfs/extent_map.c
··· 335 335 goto out; 336 336 } 337 337 if (IS_ERR(rb_node)) { 338 - em = ERR_PTR(PTR_ERR(rb_node)); 338 + em = ERR_CAST(rb_node); 339 339 goto out; 340 340 } 341 341 em = rb_entry(rb_node, struct extent_map, rb_node); ··· 384 384 goto out; 385 385 } 386 386 if (IS_ERR(rb_node)) { 387 - em = ERR_PTR(PTR_ERR(rb_node)); 387 + em = ERR_CAST(rb_node); 388 388 goto out; 389 389 } 390 390 em = rb_entry(rb_node, struct extent_map, rb_node);
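The extent_map.c hunks are the ERR_CAST cleanup from this series: ERR_CAST() re-types a pointer that already encodes an errno, replacing the ERR_PTR(PTR_ERR(...)) round trip. Illustrative only, loosely mirroring the lookup hunks above:

#include <linux/err.h>
#include <linux/rbtree.h>
#include "extent_map.h"         /* struct extent_map */

static struct extent_map *em_from_rb(struct rb_node *node)
{
        if (IS_ERR(node))
                return ERR_CAST(node); /* same errno, different pointer type */

        return rb_entry(node, struct extent_map, rb_node);
}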
+751
fs/btrfs/free-space-cache.c
··· 23 23 #include "ctree.h" 24 24 #include "free-space-cache.h" 25 25 #include "transaction.h" 26 + #include "disk-io.h" 26 27 27 28 #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 28 29 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 30 + 31 + static void recalculate_thresholds(struct btrfs_block_group_cache 32 + *block_group); 33 + static int link_free_space(struct btrfs_block_group_cache *block_group, 34 + struct btrfs_free_space *info); 35 + 36 + struct inode *lookup_free_space_inode(struct btrfs_root *root, 37 + struct btrfs_block_group_cache 38 + *block_group, struct btrfs_path *path) 39 + { 40 + struct btrfs_key key; 41 + struct btrfs_key location; 42 + struct btrfs_disk_key disk_key; 43 + struct btrfs_free_space_header *header; 44 + struct extent_buffer *leaf; 45 + struct inode *inode = NULL; 46 + int ret; 47 + 48 + spin_lock(&block_group->lock); 49 + if (block_group->inode) 50 + inode = igrab(block_group->inode); 51 + spin_unlock(&block_group->lock); 52 + if (inode) 53 + return inode; 54 + 55 + key.objectid = BTRFS_FREE_SPACE_OBJECTID; 56 + key.offset = block_group->key.objectid; 57 + key.type = 0; 58 + 59 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 60 + if (ret < 0) 61 + return ERR_PTR(ret); 62 + if (ret > 0) { 63 + btrfs_release_path(root, path); 64 + return ERR_PTR(-ENOENT); 65 + } 66 + 67 + leaf = path->nodes[0]; 68 + header = btrfs_item_ptr(leaf, path->slots[0], 69 + struct btrfs_free_space_header); 70 + btrfs_free_space_key(leaf, header, &disk_key); 71 + btrfs_disk_key_to_cpu(&location, &disk_key); 72 + btrfs_release_path(root, path); 73 + 74 + inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); 75 + if (!inode) 76 + return ERR_PTR(-ENOENT); 77 + if (IS_ERR(inode)) 78 + return inode; 79 + if (is_bad_inode(inode)) { 80 + iput(inode); 81 + return ERR_PTR(-ENOENT); 82 + } 83 + 84 + spin_lock(&block_group->lock); 85 + if (!root->fs_info->closing) { 86 + block_group->inode = igrab(inode); 87 + block_group->iref = 1; 88 + } 89 + spin_unlock(&block_group->lock); 90 + 91 + return inode; 92 + } 93 + 94 + int create_free_space_inode(struct btrfs_root *root, 95 + struct btrfs_trans_handle *trans, 96 + struct btrfs_block_group_cache *block_group, 97 + struct btrfs_path *path) 98 + { 99 + struct btrfs_key key; 100 + struct btrfs_disk_key disk_key; 101 + struct btrfs_free_space_header *header; 102 + struct btrfs_inode_item *inode_item; 103 + struct extent_buffer *leaf; 104 + u64 objectid; 105 + int ret; 106 + 107 + ret = btrfs_find_free_objectid(trans, root, 0, &objectid); 108 + if (ret < 0) 109 + return ret; 110 + 111 + ret = btrfs_insert_empty_inode(trans, root, path, objectid); 112 + if (ret) 113 + return ret; 114 + 115 + leaf = path->nodes[0]; 116 + inode_item = btrfs_item_ptr(leaf, path->slots[0], 117 + struct btrfs_inode_item); 118 + btrfs_item_key(leaf, &disk_key, path->slots[0]); 119 + memset_extent_buffer(leaf, 0, (unsigned long)inode_item, 120 + sizeof(*inode_item)); 121 + btrfs_set_inode_generation(leaf, inode_item, trans->transid); 122 + btrfs_set_inode_size(leaf, inode_item, 0); 123 + btrfs_set_inode_nbytes(leaf, inode_item, 0); 124 + btrfs_set_inode_uid(leaf, inode_item, 0); 125 + btrfs_set_inode_gid(leaf, inode_item, 0); 126 + btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 127 + btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 128 + BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); 129 + btrfs_set_inode_nlink(leaf, inode_item, 1); 130 + btrfs_set_inode_transid(leaf, inode_item, trans->transid); 131 + btrfs_set_inode_block_group(leaf, 
inode_item, 132 + block_group->key.objectid); 133 + btrfs_mark_buffer_dirty(leaf); 134 + btrfs_release_path(root, path); 135 + 136 + key.objectid = BTRFS_FREE_SPACE_OBJECTID; 137 + key.offset = block_group->key.objectid; 138 + key.type = 0; 139 + 140 + ret = btrfs_insert_empty_item(trans, root, path, &key, 141 + sizeof(struct btrfs_free_space_header)); 142 + if (ret < 0) { 143 + btrfs_release_path(root, path); 144 + return ret; 145 + } 146 + leaf = path->nodes[0]; 147 + header = btrfs_item_ptr(leaf, path->slots[0], 148 + struct btrfs_free_space_header); 149 + memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); 150 + btrfs_set_free_space_key(leaf, header, &disk_key); 151 + btrfs_mark_buffer_dirty(leaf); 152 + btrfs_release_path(root, path); 153 + 154 + return 0; 155 + } 156 + 157 + int btrfs_truncate_free_space_cache(struct btrfs_root *root, 158 + struct btrfs_trans_handle *trans, 159 + struct btrfs_path *path, 160 + struct inode *inode) 161 + { 162 + loff_t oldsize; 163 + int ret = 0; 164 + 165 + trans->block_rsv = root->orphan_block_rsv; 166 + ret = btrfs_block_rsv_check(trans, root, 167 + root->orphan_block_rsv, 168 + 0, 5); 169 + if (ret) 170 + return ret; 171 + 172 + oldsize = i_size_read(inode); 173 + btrfs_i_size_write(inode, 0); 174 + truncate_pagecache(inode, oldsize, 0); 175 + 176 + /* 177 + * We don't need an orphan item because truncating the free space cache 178 + * will never be split across transactions. 179 + */ 180 + ret = btrfs_truncate_inode_items(trans, root, inode, 181 + 0, BTRFS_EXTENT_DATA_KEY); 182 + if (ret) { 183 + WARN_ON(1); 184 + return ret; 185 + } 186 + 187 + return btrfs_update_inode(trans, root, inode); 188 + } 189 + 190 + static int readahead_cache(struct inode *inode) 191 + { 192 + struct file_ra_state *ra; 193 + unsigned long last_index; 194 + 195 + ra = kzalloc(sizeof(*ra), GFP_NOFS); 196 + if (!ra) 197 + return -ENOMEM; 198 + 199 + file_ra_state_init(ra, inode->i_mapping); 200 + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 201 + 202 + page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); 203 + 204 + kfree(ra); 205 + 206 + return 0; 207 + } 208 + 209 + int load_free_space_cache(struct btrfs_fs_info *fs_info, 210 + struct btrfs_block_group_cache *block_group) 211 + { 212 + struct btrfs_root *root = fs_info->tree_root; 213 + struct inode *inode; 214 + struct btrfs_free_space_header *header; 215 + struct extent_buffer *leaf; 216 + struct page *page; 217 + struct btrfs_path *path; 218 + u32 *checksums = NULL, *crc; 219 + char *disk_crcs = NULL; 220 + struct btrfs_key key; 221 + struct list_head bitmaps; 222 + u64 num_entries; 223 + u64 num_bitmaps; 224 + u64 generation; 225 + u32 cur_crc = ~(u32)0; 226 + pgoff_t index = 0; 227 + unsigned long first_page_offset; 228 + int num_checksums; 229 + int ret = 0; 230 + 231 + /* 232 + * If we're unmounting then just return, since this does a search on the 233 + * normal root and not the commit root and we could deadlock. 234 + */ 235 + smp_mb(); 236 + if (fs_info->closing) 237 + return 0; 238 + 239 + /* 240 + * If this block group has been marked to be cleared for one reason or 241 + * another then we can't trust the on disk cache, so just return. 
242 + */ 243 + spin_lock(&block_group->lock); 244 + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 245 + spin_unlock(&block_group->lock); 246 + return 0; 247 + } 248 + spin_unlock(&block_group->lock); 249 + 250 + INIT_LIST_HEAD(&bitmaps); 251 + 252 + path = btrfs_alloc_path(); 253 + if (!path) 254 + return 0; 255 + 256 + inode = lookup_free_space_inode(root, block_group, path); 257 + if (IS_ERR(inode)) { 258 + btrfs_free_path(path); 259 + return 0; 260 + } 261 + 262 + /* Nothing in the space cache, goodbye */ 263 + if (!i_size_read(inode)) { 264 + btrfs_free_path(path); 265 + goto out; 266 + } 267 + 268 + key.objectid = BTRFS_FREE_SPACE_OBJECTID; 269 + key.offset = block_group->key.objectid; 270 + key.type = 0; 271 + 272 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 273 + if (ret) { 274 + btrfs_free_path(path); 275 + goto out; 276 + } 277 + 278 + leaf = path->nodes[0]; 279 + header = btrfs_item_ptr(leaf, path->slots[0], 280 + struct btrfs_free_space_header); 281 + num_entries = btrfs_free_space_entries(leaf, header); 282 + num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 283 + generation = btrfs_free_space_generation(leaf, header); 284 + btrfs_free_path(path); 285 + 286 + if (BTRFS_I(inode)->generation != generation) { 287 + printk(KERN_ERR "btrfs: free space inode generation (%llu) did" 288 + " not match free space cache generation (%llu) for " 289 + "block group %llu\n", 290 + (unsigned long long)BTRFS_I(inode)->generation, 291 + (unsigned long long)generation, 292 + (unsigned long long)block_group->key.objectid); 293 + goto out; 294 + } 295 + 296 + if (!num_entries) 297 + goto out; 298 + 299 + /* Setup everything for doing checksumming */ 300 + num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; 301 + checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); 302 + if (!checksums) 303 + goto out; 304 + first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 305 + disk_crcs = kzalloc(first_page_offset, GFP_NOFS); 306 + if (!disk_crcs) 307 + goto out; 308 + 309 + ret = readahead_cache(inode); 310 + if (ret) { 311 + ret = 0; 312 + goto out; 313 + } 314 + 315 + while (1) { 316 + struct btrfs_free_space_entry *entry; 317 + struct btrfs_free_space *e; 318 + void *addr; 319 + unsigned long offset = 0; 320 + unsigned long start_offset = 0; 321 + int need_loop = 0; 322 + 323 + if (!num_entries && !num_bitmaps) 324 + break; 325 + 326 + if (index == 0) { 327 + start_offset = first_page_offset; 328 + offset = start_offset; 329 + } 330 + 331 + page = grab_cache_page(inode->i_mapping, index); 332 + if (!page) { 333 + ret = 0; 334 + goto free_cache; 335 + } 336 + 337 + if (!PageUptodate(page)) { 338 + btrfs_readpage(NULL, page); 339 + lock_page(page); 340 + if (!PageUptodate(page)) { 341 + unlock_page(page); 342 + page_cache_release(page); 343 + printk(KERN_ERR "btrfs: error reading free " 344 + "space cache: %llu\n", 345 + (unsigned long long) 346 + block_group->key.objectid); 347 + goto free_cache; 348 + } 349 + } 350 + addr = kmap(page); 351 + 352 + if (index == 0) { 353 + u64 *gen; 354 + 355 + memcpy(disk_crcs, addr, first_page_offset); 356 + gen = addr + (sizeof(u32) * num_checksums); 357 + if (*gen != BTRFS_I(inode)->generation) { 358 + printk(KERN_ERR "btrfs: space cache generation" 359 + " (%llu) does not match inode (%llu) " 360 + "for block group %llu\n", 361 + (unsigned long long)*gen, 362 + (unsigned long long) 363 + BTRFS_I(inode)->generation, 364 + (unsigned long long) 365 + block_group->key.objectid); 366 + kunmap(page); 367 + 
unlock_page(page); 368 + page_cache_release(page); 369 + goto free_cache; 370 + } 371 + crc = (u32 *)disk_crcs; 372 + } 373 + entry = addr + start_offset; 374 + 375 + /* First lets check our crc before we do anything fun */ 376 + cur_crc = ~(u32)0; 377 + cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, 378 + PAGE_CACHE_SIZE - start_offset); 379 + btrfs_csum_final(cur_crc, (char *)&cur_crc); 380 + if (cur_crc != *crc) { 381 + printk(KERN_ERR "btrfs: crc mismatch for page %lu in " 382 + "block group %llu\n", index, 383 + (unsigned long long)block_group->key.objectid); 384 + kunmap(page); 385 + unlock_page(page); 386 + page_cache_release(page); 387 + goto free_cache; 388 + } 389 + crc++; 390 + 391 + while (1) { 392 + if (!num_entries) 393 + break; 394 + 395 + need_loop = 1; 396 + e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 397 + if (!e) { 398 + kunmap(page); 399 + unlock_page(page); 400 + page_cache_release(page); 401 + goto free_cache; 402 + } 403 + 404 + e->offset = le64_to_cpu(entry->offset); 405 + e->bytes = le64_to_cpu(entry->bytes); 406 + if (!e->bytes) { 407 + kunmap(page); 408 + kfree(e); 409 + unlock_page(page); 410 + page_cache_release(page); 411 + goto free_cache; 412 + } 413 + 414 + if (entry->type == BTRFS_FREE_SPACE_EXTENT) { 415 + spin_lock(&block_group->tree_lock); 416 + ret = link_free_space(block_group, e); 417 + spin_unlock(&block_group->tree_lock); 418 + BUG_ON(ret); 419 + } else { 420 + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 421 + if (!e->bitmap) { 422 + kunmap(page); 423 + kfree(e); 424 + unlock_page(page); 425 + page_cache_release(page); 426 + goto free_cache; 427 + } 428 + spin_lock(&block_group->tree_lock); 429 + ret = link_free_space(block_group, e); 430 + block_group->total_bitmaps++; 431 + recalculate_thresholds(block_group); 432 + spin_unlock(&block_group->tree_lock); 433 + list_add_tail(&e->list, &bitmaps); 434 + } 435 + 436 + num_entries--; 437 + offset += sizeof(struct btrfs_free_space_entry); 438 + if (offset + sizeof(struct btrfs_free_space_entry) >= 439 + PAGE_CACHE_SIZE) 440 + break; 441 + entry++; 442 + } 443 + 444 + /* 445 + * We read an entry out of this page, we need to move on to the 446 + * next page. 447 + */ 448 + if (need_loop) { 449 + kunmap(page); 450 + goto next; 451 + } 452 + 453 + /* 454 + * We add the bitmaps at the end of the entries in order that 455 + * the bitmap entries are added to the cache. 
456 + */ 457 + e = list_entry(bitmaps.next, struct btrfs_free_space, list); 458 + list_del_init(&e->list); 459 + memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 460 + kunmap(page); 461 + num_bitmaps--; 462 + next: 463 + unlock_page(page); 464 + page_cache_release(page); 465 + index++; 466 + } 467 + 468 + ret = 1; 469 + out: 470 + kfree(checksums); 471 + kfree(disk_crcs); 472 + iput(inode); 473 + return ret; 474 + 475 + free_cache: 476 + /* This cache is bogus, make sure it gets cleared */ 477 + spin_lock(&block_group->lock); 478 + block_group->disk_cache_state = BTRFS_DC_CLEAR; 479 + spin_unlock(&block_group->lock); 480 + btrfs_remove_free_space_cache(block_group); 481 + goto out; 482 + } 483 + 484 + int btrfs_write_out_cache(struct btrfs_root *root, 485 + struct btrfs_trans_handle *trans, 486 + struct btrfs_block_group_cache *block_group, 487 + struct btrfs_path *path) 488 + { 489 + struct btrfs_free_space_header *header; 490 + struct extent_buffer *leaf; 491 + struct inode *inode; 492 + struct rb_node *node; 493 + struct list_head *pos, *n; 494 + struct page *page; 495 + struct extent_state *cached_state = NULL; 496 + struct list_head bitmap_list; 497 + struct btrfs_key key; 498 + u64 bytes = 0; 499 + u32 *crc, *checksums; 500 + pgoff_t index = 0, last_index = 0; 501 + unsigned long first_page_offset; 502 + int num_checksums; 503 + int entries = 0; 504 + int bitmaps = 0; 505 + int ret = 0; 506 + 507 + root = root->fs_info->tree_root; 508 + 509 + INIT_LIST_HEAD(&bitmap_list); 510 + 511 + spin_lock(&block_group->lock); 512 + if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 513 + spin_unlock(&block_group->lock); 514 + return 0; 515 + } 516 + spin_unlock(&block_group->lock); 517 + 518 + inode = lookup_free_space_inode(root, block_group, path); 519 + if (IS_ERR(inode)) 520 + return 0; 521 + 522 + if (!i_size_read(inode)) { 523 + iput(inode); 524 + return 0; 525 + } 526 + 527 + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 528 + filemap_write_and_wait(inode->i_mapping); 529 + btrfs_wait_ordered_range(inode, inode->i_size & 530 + ~(root->sectorsize - 1), (u64)-1); 531 + 532 + /* We need a checksum per page. */ 533 + num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; 534 + crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); 535 + if (!crc) { 536 + iput(inode); 537 + return 0; 538 + } 539 + 540 + /* Since the first page has all of our checksums and our generation we 541 + * need to calculate the offset into the page that we can start writing 542 + * our entries. 543 + */ 544 + first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 545 + 546 + node = rb_first(&block_group->free_space_offset); 547 + if (!node) 548 + goto out_free; 549 + 550 + /* 551 + * Lock all pages first so we can lock the extent safely. 552 + * 553 + * NOTE: Because we hold the ref the entire time we're going to write to 554 + * the page find_get_page should never fail, so we don't do a check 555 + * after find_get_page at this point. Just putting this here so people 556 + * know and don't freak out. 
557 + */ 558 + while (index <= last_index) { 559 + page = grab_cache_page(inode->i_mapping, index); 560 + if (!page) { 561 + pgoff_t i = 0; 562 + 563 + while (i < index) { 564 + page = find_get_page(inode->i_mapping, i); 565 + unlock_page(page); 566 + page_cache_release(page); 567 + page_cache_release(page); 568 + i++; 569 + } 570 + goto out_free; 571 + } 572 + index++; 573 + } 574 + 575 + index = 0; 576 + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 577 + 0, &cached_state, GFP_NOFS); 578 + 579 + /* Write out the extent entries */ 580 + do { 581 + struct btrfs_free_space_entry *entry; 582 + void *addr; 583 + unsigned long offset = 0; 584 + unsigned long start_offset = 0; 585 + 586 + if (index == 0) { 587 + start_offset = first_page_offset; 588 + offset = start_offset; 589 + } 590 + 591 + page = find_get_page(inode->i_mapping, index); 592 + 593 + addr = kmap(page); 594 + entry = addr + start_offset; 595 + 596 + memset(addr, 0, PAGE_CACHE_SIZE); 597 + while (1) { 598 + struct btrfs_free_space *e; 599 + 600 + e = rb_entry(node, struct btrfs_free_space, offset_index); 601 + entries++; 602 + 603 + entry->offset = cpu_to_le64(e->offset); 604 + entry->bytes = cpu_to_le64(e->bytes); 605 + if (e->bitmap) { 606 + entry->type = BTRFS_FREE_SPACE_BITMAP; 607 + list_add_tail(&e->list, &bitmap_list); 608 + bitmaps++; 609 + } else { 610 + entry->type = BTRFS_FREE_SPACE_EXTENT; 611 + } 612 + node = rb_next(node); 613 + if (!node) 614 + break; 615 + offset += sizeof(struct btrfs_free_space_entry); 616 + if (offset + sizeof(struct btrfs_free_space_entry) >= 617 + PAGE_CACHE_SIZE) 618 + break; 619 + entry++; 620 + } 621 + *crc = ~(u32)0; 622 + *crc = btrfs_csum_data(root, addr + start_offset, *crc, 623 + PAGE_CACHE_SIZE - start_offset); 624 + kunmap(page); 625 + 626 + btrfs_csum_final(*crc, (char *)crc); 627 + crc++; 628 + 629 + bytes += PAGE_CACHE_SIZE; 630 + 631 + ClearPageChecked(page); 632 + set_page_extent_mapped(page); 633 + SetPageUptodate(page); 634 + set_page_dirty(page); 635 + 636 + /* 637 + * We need to release our reference we got for grab_cache_page, 638 + * except for the first page which will hold our checksums, we 639 + * do that below. 
640 + */ 641 + if (index != 0) { 642 + unlock_page(page); 643 + page_cache_release(page); 644 + } 645 + 646 + page_cache_release(page); 647 + 648 + index++; 649 + } while (node); 650 + 651 + /* Write out the bitmaps */ 652 + list_for_each_safe(pos, n, &bitmap_list) { 653 + void *addr; 654 + struct btrfs_free_space *entry = 655 + list_entry(pos, struct btrfs_free_space, list); 656 + 657 + page = find_get_page(inode->i_mapping, index); 658 + 659 + addr = kmap(page); 660 + memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); 661 + *crc = ~(u32)0; 662 + *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); 663 + kunmap(page); 664 + btrfs_csum_final(*crc, (char *)crc); 665 + crc++; 666 + bytes += PAGE_CACHE_SIZE; 667 + 668 + ClearPageChecked(page); 669 + set_page_extent_mapped(page); 670 + SetPageUptodate(page); 671 + set_page_dirty(page); 672 + unlock_page(page); 673 + page_cache_release(page); 674 + page_cache_release(page); 675 + list_del_init(&entry->list); 676 + index++; 677 + } 678 + 679 + /* Zero out the rest of the pages just to make sure */ 680 + while (index <= last_index) { 681 + void *addr; 682 + 683 + page = find_get_page(inode->i_mapping, index); 684 + 685 + addr = kmap(page); 686 + memset(addr, 0, PAGE_CACHE_SIZE); 687 + kunmap(page); 688 + ClearPageChecked(page); 689 + set_page_extent_mapped(page); 690 + SetPageUptodate(page); 691 + set_page_dirty(page); 692 + unlock_page(page); 693 + page_cache_release(page); 694 + page_cache_release(page); 695 + bytes += PAGE_CACHE_SIZE; 696 + index++; 697 + } 698 + 699 + btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state); 700 + 701 + /* Write the checksums and trans id to the first page */ 702 + { 703 + void *addr; 704 + u64 *gen; 705 + 706 + page = find_get_page(inode->i_mapping, 0); 707 + 708 + addr = kmap(page); 709 + memcpy(addr, checksums, sizeof(u32) * num_checksums); 710 + gen = addr + (sizeof(u32) * num_checksums); 711 + *gen = trans->transid; 712 + kunmap(page); 713 + ClearPageChecked(page); 714 + set_page_extent_mapped(page); 715 + SetPageUptodate(page); 716 + set_page_dirty(page); 717 + unlock_page(page); 718 + page_cache_release(page); 719 + page_cache_release(page); 720 + } 721 + BTRFS_I(inode)->generation = trans->transid; 722 + 723 + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 724 + i_size_read(inode) - 1, &cached_state, GFP_NOFS); 725 + 726 + filemap_write_and_wait(inode->i_mapping); 727 + 728 + key.objectid = BTRFS_FREE_SPACE_OBJECTID; 729 + key.offset = block_group->key.objectid; 730 + key.type = 0; 731 + 732 + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 733 + if (ret < 0) { 734 + ret = 0; 735 + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 736 + EXTENT_DIRTY | EXTENT_DELALLOC | 737 + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); 738 + goto out_free; 739 + } 740 + leaf = path->nodes[0]; 741 + if (ret > 0) { 742 + struct btrfs_key found_key; 743 + BUG_ON(!path->slots[0]); 744 + path->slots[0]--; 745 + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 746 + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 747 + found_key.offset != block_group->key.objectid) { 748 + ret = 0; 749 + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 750 + EXTENT_DIRTY | EXTENT_DELALLOC | 751 + EXTENT_DO_ACCOUNTING, 0, 0, NULL, 752 + GFP_NOFS); 753 + btrfs_release_path(root, path); 754 + goto out_free; 755 + } 756 + } 757 + header = btrfs_item_ptr(leaf, path->slots[0], 758 + struct btrfs_free_space_header); 759 + btrfs_set_free_space_entries(leaf, header, entries); 760 + 
btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 761 + btrfs_set_free_space_generation(leaf, header, trans->transid); 762 + btrfs_mark_buffer_dirty(leaf); 763 + btrfs_release_path(root, path); 764 + 765 + ret = 1; 766 + 767 + out_free: 768 + if (ret == 0) { 769 + invalidate_inode_pages2_range(inode->i_mapping, 0, index); 770 + spin_lock(&block_group->lock); 771 + block_group->disk_cache_state = BTRFS_DC_ERROR; 772 + spin_unlock(&block_group->lock); 773 + BTRFS_I(inode)->generation = 0; 774 + } 775 + kfree(checksums); 776 + btrfs_update_inode(trans, root, inode); 777 + iput(inode); 778 + return ret; 779 + } 29 780 30 781 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, 31 782 u64 offset)
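The new free-space-cache reader and writer agree on a simple file layout: page 0 of the cache inode starts with one crc32 per page of the file followed by a u64 generation, and the free space entries and bitmaps begin right after that header (and at offset 0 on every later page). A small sketch restating the offset math that load_free_space_cache() and btrfs_write_out_cache() compute above:

#include <linux/fs.h>
#include <linux/pagemap.h>

/* offset of the first entry on page 0 of the free space cache file */
static unsigned long cache_first_page_offset(struct inode *inode)
{
        int num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;

        /* one u32 crc per page, then the u64 transaction generation */
        return sizeof(u32) * num_checksums + sizeof(u64);
}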
+18
fs/btrfs/free-space-cache.h
··· 27 27 struct list_head list; 28 28 }; 29 29 30 + struct inode *lookup_free_space_inode(struct btrfs_root *root, 31 + struct btrfs_block_group_cache 32 + *block_group, struct btrfs_path *path); 33 + int create_free_space_inode(struct btrfs_root *root, 34 + struct btrfs_trans_handle *trans, 35 + struct btrfs_block_group_cache *block_group, 36 + struct btrfs_path *path); 37 + 38 + int btrfs_truncate_free_space_cache(struct btrfs_root *root, 39 + struct btrfs_trans_handle *trans, 40 + struct btrfs_path *path, 41 + struct inode *inode); 42 + int load_free_space_cache(struct btrfs_fs_info *fs_info, 43 + struct btrfs_block_group_cache *block_group); 44 + int btrfs_write_out_cache(struct btrfs_root *root, 45 + struct btrfs_trans_handle *trans, 46 + struct btrfs_block_group_cache *block_group, 47 + struct btrfs_path *path); 30 48 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 31 49 u64 bytenr, u64 size); 32 50 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+153 -49
fs/btrfs/inode.c
··· 319 319 struct btrfs_root *root = BTRFS_I(inode)->root; 320 320 struct btrfs_trans_handle *trans; 321 321 u64 num_bytes; 322 - u64 orig_start; 323 - u64 disk_num_bytes; 324 322 u64 blocksize = root->sectorsize; 325 323 u64 actual_end; 326 324 u64 isize = i_size_read(inode); ··· 332 334 unsigned long max_uncompressed = 128 * 1024; 333 335 int i; 334 336 int will_compress; 335 - 336 - orig_start = start; 337 337 338 338 actual_end = min_t(u64, isize, end + 1); 339 339 again: ··· 367 371 total_compressed = min(total_compressed, max_uncompressed); 368 372 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 369 373 num_bytes = max(blocksize, num_bytes); 370 - disk_num_bytes = num_bytes; 371 374 total_in = 0; 372 375 ret = 0; 373 376 ··· 462 467 if (total_compressed >= total_in) { 463 468 will_compress = 0; 464 469 } else { 465 - disk_num_bytes = total_compressed; 466 470 num_bytes = total_in; 467 471 } 468 472 } ··· 751 757 u64 disk_num_bytes; 752 758 u64 cur_alloc_size; 753 759 u64 blocksize = root->sectorsize; 754 - u64 actual_end; 755 - u64 isize = i_size_read(inode); 756 760 struct btrfs_key ins; 757 761 struct extent_map *em; 758 762 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 759 763 int ret = 0; 760 764 765 + BUG_ON(root == root->fs_info->tree_root); 761 766 trans = btrfs_join_transaction(root, 1); 762 767 BUG_ON(!trans); 763 768 btrfs_set_trans_block_group(trans, inode); 764 769 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 765 - 766 - actual_end = min_t(u64, isize, end + 1); 767 770 768 771 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 769 772 num_bytes = max(blocksize, num_bytes); ··· 1026 1035 int type; 1027 1036 int nocow; 1028 1037 int check_prev = 1; 1038 + bool nolock = false; 1029 1039 1030 1040 path = btrfs_alloc_path(); 1031 1041 BUG_ON(!path); 1032 - trans = btrfs_join_transaction(root, 1); 1042 + if (root == root->fs_info->tree_root) { 1043 + nolock = true; 1044 + trans = btrfs_join_transaction_nolock(root, 1); 1045 + } else { 1046 + trans = btrfs_join_transaction(root, 1); 1047 + } 1033 1048 BUG_ON(!trans); 1034 1049 1035 1050 cow_start = (u64)-1; ··· 1208 1211 BUG_ON(ret); 1209 1212 } 1210 1213 1211 - ret = btrfs_end_transaction(trans, root); 1212 - BUG_ON(ret); 1214 + if (nolock) { 1215 + ret = btrfs_end_transaction_nolock(trans, root); 1216 + BUG_ON(ret); 1217 + } else { 1218 + ret = btrfs_end_transaction(trans, root); 1219 + BUG_ON(ret); 1220 + } 1213 1221 btrfs_free_path(path); 1214 1222 return 0; 1215 1223 } ··· 1291 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1292 1290 struct btrfs_root *root = BTRFS_I(inode)->root; 1293 1291 u64 len = state->end + 1 - state->start; 1292 + int do_list = (root->root_key.objectid != 1293 + BTRFS_ROOT_TREE_OBJECTID); 1294 1294 1295 1295 if (*bits & EXTENT_FIRST_DELALLOC) 1296 1296 *bits &= ~EXTENT_FIRST_DELALLOC; ··· 1302 1298 spin_lock(&root->fs_info->delalloc_lock); 1303 1299 BTRFS_I(inode)->delalloc_bytes += len; 1304 1300 root->fs_info->delalloc_bytes += len; 1305 - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 + if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1306 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1307 1303 &root->fs_info->delalloc_inodes); 1308 1304 } ··· 1325 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1326 1322 struct btrfs_root *root = BTRFS_I(inode)->root; 1327 1323 u64 len = state->end + 1 - state->start; 1324 + int do_list = (root->root_key.objectid != 1325 + 
BTRFS_ROOT_TREE_OBJECTID); 1328 1326 1329 1327 if (*bits & EXTENT_FIRST_DELALLOC) 1330 1328 *bits &= ~EXTENT_FIRST_DELALLOC; ··· 1336 1330 if (*bits & EXTENT_DO_ACCOUNTING) 1337 1331 btrfs_delalloc_release_metadata(inode, len); 1338 1332 1339 - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) 1333 + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1334 + && do_list) 1340 1335 btrfs_free_reserved_data_space(inode, len); 1341 1336 1342 1337 spin_lock(&root->fs_info->delalloc_lock); 1343 1338 root->fs_info->delalloc_bytes -= len; 1344 1339 BTRFS_I(inode)->delalloc_bytes -= len; 1345 1340 1346 - if (BTRFS_I(inode)->delalloc_bytes == 0 && 1341 + if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1347 1342 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1348 1343 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1349 1344 } ··· 1379 1372 1380 1373 if (map_length < length + size) 1381 1374 return 1; 1382 - return 0; 1375 + return ret; 1383 1376 } 1384 1377 1385 1378 /* ··· 1433 1426 1434 1427 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1435 1428 1436 - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1429 + if (root == root->fs_info->tree_root) 1430 + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); 1431 + else 1432 + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1437 1433 BUG_ON(ret); 1438 1434 1439 1435 if (!(rw & REQ_WRITE)) { ··· 1672 1662 struct extent_state *cached_state = NULL; 1673 1663 int compressed = 0; 1674 1664 int ret; 1665 + bool nolock = false; 1675 1666 1676 1667 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 1677 1668 end - start + 1); ··· 1680 1669 return 0; 1681 1670 BUG_ON(!ordered_extent); 1682 1671 1672 + nolock = (root == root->fs_info->tree_root); 1673 + 1683 1674 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1684 1675 BUG_ON(!list_empty(&ordered_extent->list)); 1685 1676 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1686 1677 if (!ret) { 1687 - trans = btrfs_join_transaction(root, 1); 1678 + if (nolock) 1679 + trans = btrfs_join_transaction_nolock(root, 1); 1680 + else 1681 + trans = btrfs_join_transaction(root, 1); 1682 + BUG_ON(!trans); 1688 1683 btrfs_set_trans_block_group(trans, inode); 1689 1684 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1690 1685 ret = btrfs_update_inode(trans, root, inode); ··· 1703 1686 ordered_extent->file_offset + ordered_extent->len - 1, 1704 1687 0, &cached_state, GFP_NOFS); 1705 1688 1706 - trans = btrfs_join_transaction(root, 1); 1689 + if (nolock) 1690 + trans = btrfs_join_transaction_nolock(root, 1); 1691 + else 1692 + trans = btrfs_join_transaction(root, 1); 1707 1693 btrfs_set_trans_block_group(trans, inode); 1708 1694 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1709 1695 ··· 1720 1700 ordered_extent->len); 1721 1701 BUG_ON(ret); 1722 1702 } else { 1703 + BUG_ON(root == root->fs_info->tree_root); 1723 1704 ret = insert_reserved_file_extent(trans, inode, 1724 1705 ordered_extent->file_offset, 1725 1706 ordered_extent->start, ··· 1745 1724 ret = btrfs_update_inode(trans, root, inode); 1746 1725 BUG_ON(ret); 1747 1726 out: 1748 - btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1749 - if (trans) 1750 - btrfs_end_transaction(trans, root); 1727 + if (nolock) { 1728 + if (trans) 1729 + btrfs_end_transaction_nolock(trans, root); 1730 + } else { 1731 + btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1732 + if (trans) 1733 + btrfs_end_transaction(trans, root); 1734 + } 1735 + 1751 1736 /* once for 
us */ 1752 1737 btrfs_put_ordered_extent(ordered_extent); 1753 1738 /* once for the tree */ ··· 2264 2237 { 2265 2238 struct btrfs_path *path; 2266 2239 struct extent_buffer *leaf; 2267 - struct btrfs_item *item; 2268 2240 struct btrfs_key key, found_key; 2269 2241 struct btrfs_trans_handle *trans; 2270 2242 struct inode *inode; ··· 2301 2275 2302 2276 /* pull out the item */ 2303 2277 leaf = path->nodes[0]; 2304 - item = btrfs_item_nr(leaf, path->slots[0]); 2305 2278 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2306 2279 2307 2280 /* make sure the item matches what we want */ ··· 2676 2651 2677 2652 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2678 2653 dir, index); 2679 - BUG_ON(ret); 2654 + if (ret == -ENOENT) 2655 + ret = 0; 2680 2656 err: 2681 2657 btrfs_free_path(path); 2682 2658 if (ret) ··· 2698 2672 { 2699 2673 struct extent_buffer *eb; 2700 2674 int level; 2701 - int ret; 2702 2675 u64 refs = 1; 2676 + int uninitialized_var(ret); 2703 2677 2704 2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2705 2679 if (!path->nodes[level]) ··· 2712 2686 if (refs > 1) 2713 2687 return 1; 2714 2688 } 2715 - return 0; 2689 + return ret; /* XXX callers? */ 2716 2690 } 2717 2691 2718 2692 /* ··· 3222 3196 3223 3197 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3224 3198 3225 - if (root->ref_cows) 3199 + if (root->ref_cows || root == root->fs_info->tree_root) 3226 3200 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3227 3201 3228 3202 path = btrfs_alloc_path(); ··· 3370 3344 } else { 3371 3345 break; 3372 3346 } 3373 - if (found_extent && root->ref_cows) { 3347 + if (found_extent && (root->ref_cows || 3348 + root == root->fs_info->tree_root)) { 3374 3349 btrfs_set_path_blocking(path); 3375 3350 ret = btrfs_free_extent(trans, root, extent_start, 3376 3351 extent_num_bytes, 0, ··· 3702 3675 int ret; 3703 3676 3704 3677 truncate_inode_pages(&inode->i_data, 0); 3705 - if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0) 3678 + if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3679 + root == root->fs_info->tree_root)) 3706 3680 goto no_delete; 3707 3681 3708 3682 if (is_bad_inode(inode)) { ··· 3916 3888 } 3917 3889 spin_unlock(&root->inode_lock); 3918 3890 3919 - if (empty && btrfs_root_refs(&root->root_item) == 0) { 3891 + /* 3892 + * Free space cache has inodes in the tree root, but the tree root has a 3893 + * root_refs of 0, so this could end up dropping the tree root as a 3894 + * snapshot, so we need the extra !root->fs_info->tree_root check to 3895 + * make sure we don't drop it. 
3896 + */ 3897 + if (empty && btrfs_root_refs(&root->root_item) == 0 && 3898 + root != root->fs_info->tree_root) { 3920 3899 synchronize_srcu(&root->fs_info->subvol_srcu); 3921 3900 spin_lock(&root->inode_lock); 3922 3901 empty = RB_EMPTY_ROOT(&root->inode_tree); ··· 4317 4282 struct btrfs_root *root = BTRFS_I(inode)->root; 4318 4283 struct btrfs_trans_handle *trans; 4319 4284 int ret = 0; 4285 + bool nolock = false; 4320 4286 4321 4287 if (BTRFS_I(inode)->dummy_inode) 4322 4288 return 0; 4323 4289 4290 + smp_mb(); 4291 + nolock = (root->fs_info->closing && root == root->fs_info->tree_root); 4292 + 4324 4293 if (wbc->sync_mode == WB_SYNC_ALL) { 4325 - trans = btrfs_join_transaction(root, 1); 4294 + if (nolock) 4295 + trans = btrfs_join_transaction_nolock(root, 1); 4296 + else 4297 + trans = btrfs_join_transaction(root, 1); 4326 4298 btrfs_set_trans_block_group(trans, inode); 4327 - ret = btrfs_commit_transaction(trans, root); 4299 + if (nolock) 4300 + ret = btrfs_end_transaction_nolock(trans, root); 4301 + else 4302 + ret = btrfs_commit_transaction(trans, root); 4328 4303 } 4329 4304 return ret; 4330 4305 } ··· 5690 5645 struct btrfs_root *root = BTRFS_I(inode)->root; 5691 5646 struct btrfs_dio_private *dip; 5692 5647 struct bio_vec *bvec = bio->bi_io_vec; 5693 - u64 start; 5694 5648 int skip_sum; 5695 5649 int write = rw & REQ_WRITE; 5696 5650 int ret = 0; ··· 5715 5671 dip->inode = inode; 5716 5672 dip->logical_offset = file_offset; 5717 5673 5718 - start = dip->logical_offset; 5719 5674 dip->bytes = 0; 5720 5675 do { 5721 5676 dip->bytes += bvec->bv_len; ··· 6351 6308 spin_unlock(&root->fs_info->ordered_extent_lock); 6352 6309 } 6353 6310 6311 + if (root == root->fs_info->tree_root) { 6312 + struct btrfs_block_group_cache *block_group; 6313 + 6314 + block_group = btrfs_lookup_block_group(root->fs_info, 6315 + BTRFS_I(inode)->block_group); 6316 + if (block_group && block_group->inode == inode) { 6317 + spin_lock(&block_group->lock); 6318 + block_group->inode = NULL; 6319 + spin_unlock(&block_group->lock); 6320 + btrfs_put_block_group(block_group); 6321 + } else if (block_group) { 6322 + btrfs_put_block_group(block_group); 6323 + } 6324 + } 6325 + 6354 6326 spin_lock(&root->orphan_lock); 6355 6327 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6356 6328 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", ··· 6398 6340 { 6399 6341 struct btrfs_root *root = BTRFS_I(inode)->root; 6400 6342 6401 - if (btrfs_root_refs(&root->root_item) == 0) 6343 + if (btrfs_root_refs(&root->root_item) == 0 && 6344 + root != root->fs_info->tree_root) 6402 6345 return 1; 6403 6346 else 6404 6347 return generic_drop_inode(inode); ··· 6668 6609 return 0; 6669 6610 } 6670 6611 6671 - int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) 6612 + int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, 6613 + int sync) 6672 6614 { 6673 6615 struct btrfs_inode *binode; 6674 6616 struct inode *inode = NULL; ··· 6691 6631 spin_unlock(&root->fs_info->delalloc_lock); 6692 6632 6693 6633 if (inode) { 6694 - write_inode_now(inode, 0); 6634 + if (sync) { 6635 + filemap_write_and_wait(inode->i_mapping); 6636 + /* 6637 + * We have to do this because compression doesn't 6638 + * actually set PG_writeback until it submits the pages 6639 + * for IO, which happens in an async thread, so we could 6640 + * race and not actually wait for any writeback pages 6641 + * because they've not been submitted yet. 
Technically 6642 + * this could still be the case for the ordered stuff 6643 + * since the async thread may not have started to do its 6644 + * work yet. If this becomes the case then we need to 6645 + * figure out a way to make sure that in writepage we 6646 + * wait for any async pages to be submitted before 6647 + * returning so that fdatawait does what its supposed to 6648 + * do. 6649 + */ 6650 + btrfs_wait_ordered_range(inode, 0, (u64)-1); 6651 + } else { 6652 + filemap_flush(inode->i_mapping); 6653 + } 6695 6654 if (delay_iput) 6696 6655 btrfs_add_delayed_iput(inode); 6697 6656 else ··· 6836 6757 return err; 6837 6758 } 6838 6759 6839 - int btrfs_prealloc_file_range(struct inode *inode, int mode, 6840 - u64 start, u64 num_bytes, u64 min_size, 6841 - loff_t actual_len, u64 *alloc_hint) 6760 + static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 6761 + u64 start, u64 num_bytes, u64 min_size, 6762 + loff_t actual_len, u64 *alloc_hint, 6763 + struct btrfs_trans_handle *trans) 6842 6764 { 6843 - struct btrfs_trans_handle *trans; 6844 6765 struct btrfs_root *root = BTRFS_I(inode)->root; 6845 6766 struct btrfs_key ins; 6846 6767 u64 cur_offset = start; 6847 6768 int ret = 0; 6769 + bool own_trans = true; 6848 6770 6771 + if (trans) 6772 + own_trans = false; 6849 6773 while (num_bytes > 0) { 6850 - trans = btrfs_start_transaction(root, 3); 6851 - if (IS_ERR(trans)) { 6852 - ret = PTR_ERR(trans); 6853 - break; 6774 + if (own_trans) { 6775 + trans = btrfs_start_transaction(root, 3); 6776 + if (IS_ERR(trans)) { 6777 + ret = PTR_ERR(trans); 6778 + break; 6779 + } 6854 6780 } 6855 6781 6856 6782 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 6857 6783 0, *alloc_hint, (u64)-1, &ins, 1); 6858 6784 if (ret) { 6859 - btrfs_end_transaction(trans, root); 6785 + if (own_trans) 6786 + btrfs_end_transaction(trans, root); 6860 6787 break; 6861 6788 } 6862 6789 ··· 6895 6810 ret = btrfs_update_inode(trans, root, inode); 6896 6811 BUG_ON(ret); 6897 6812 6898 - btrfs_end_transaction(trans, root); 6813 + if (own_trans) 6814 + btrfs_end_transaction(trans, root); 6899 6815 } 6900 6816 return ret; 6817 + } 6818 + 6819 + int btrfs_prealloc_file_range(struct inode *inode, int mode, 6820 + u64 start, u64 num_bytes, u64 min_size, 6821 + loff_t actual_len, u64 *alloc_hint) 6822 + { 6823 + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 6824 + min_size, actual_len, alloc_hint, 6825 + NULL); 6826 + } 6827 + 6828 + int btrfs_prealloc_file_range_trans(struct inode *inode, 6829 + struct btrfs_trans_handle *trans, int mode, 6830 + u64 start, u64 num_bytes, u64 min_size, 6831 + loff_t actual_len, u64 *alloc_hint) 6832 + { 6833 + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 6834 + min_size, actual_len, alloc_hint, trans); 6901 6835 } 6902 6836 6903 6837 static long btrfs_fallocate(struct inode *inode, int mode,
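Most of the inode.c churn is the same special case repeated: the free space cache inodes live in the tree root, and code that runs while a commit is already in progress must not join the transaction the normal way or it deadlocks, so those paths use the _nolock join/end variants added elsewhere in this series. A condensed sketch of the pattern, simplified from run_delalloc_nocow(), btrfs_finish_ordered_io() and btrfs_write_inode() above:

#include "ctree.h"
#include "transaction.h"

/* join the running transaction, using the nolock variant for the tree root */
static struct btrfs_trans_handle *join_trans(struct btrfs_root *root)
{
        if (root == root->fs_info->tree_root)
                return btrfs_join_transaction_nolock(root, 1);

        return btrfs_join_transaction(root, 1);
}

The handle is torn down symmetrically, with btrfs_end_transaction_nolock() on the tree-root path and btrfs_end_transaction() everywhere else.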
+319 -77
fs/btrfs/ioctl.c
··· 224 224 225 225 static noinline int create_subvol(struct btrfs_root *root, 226 226 struct dentry *dentry, 227 - char *name, int namelen) 227 + char *name, int namelen, 228 + u64 *async_transid) 228 229 { 229 230 struct btrfs_trans_handle *trans; 230 231 struct btrfs_key key; ··· 339 338 340 339 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 341 340 fail: 342 - err = btrfs_commit_transaction(trans, root); 341 + if (async_transid) { 342 + *async_transid = trans->transid; 343 + err = btrfs_commit_transaction_async(trans, root, 1); 344 + } else { 345 + err = btrfs_commit_transaction(trans, root); 346 + } 343 347 if (err && !ret) 344 348 ret = err; 345 349 return ret; 346 350 } 347 351 348 - static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) 352 + static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 353 + char *name, int namelen, u64 *async_transid) 349 354 { 350 355 struct inode *inode; 351 356 struct btrfs_pending_snapshot *pending_snapshot; ··· 380 373 381 374 list_add(&pending_snapshot->list, 382 375 &trans->transaction->pending_snapshots); 383 - ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); 376 + if (async_transid) { 377 + *async_transid = trans->transid; 378 + ret = btrfs_commit_transaction_async(trans, 379 + root->fs_info->extent_root, 1); 380 + } else { 381 + ret = btrfs_commit_transaction(trans, 382 + root->fs_info->extent_root); 383 + } 384 384 BUG_ON(ret); 385 385 386 386 ret = pending_snapshot->error; ··· 409 395 return ret; 410 396 } 411 397 398 + /* copy of check_sticky in fs/namei.c() 399 + * It's inline, so penalty for filesystems that don't use sticky bit is 400 + * minimal. 401 + */ 402 + static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) 403 + { 404 + uid_t fsuid = current_fsuid(); 405 + 406 + if (!(dir->i_mode & S_ISVTX)) 407 + return 0; 408 + if (inode->i_uid == fsuid) 409 + return 0; 410 + if (dir->i_uid == fsuid) 411 + return 0; 412 + return !capable(CAP_FOWNER); 413 + } 414 + 415 + /* copy of may_delete in fs/namei.c() 416 + * Check whether we can remove a link victim from directory dir, check 417 + * whether the type of victim is right. 418 + * 1. We can't do it if dir is read-only (done in permission()) 419 + * 2. We should have write and exec permissions on dir 420 + * 3. We can't remove anything from append-only dir 421 + * 4. We can't do anything with immutable dir (done in permission()) 422 + * 5. If the sticky bit on dir is set we should either 423 + * a. be owner of dir, or 424 + * b. be owner of victim, or 425 + * c. have CAP_FOWNER capability 426 + * 6. If the victim is append-only or immutable we can't do antyhing with 427 + * links pointing to it. 428 + * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. 429 + * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. 430 + * 9. We can't remove a root or mountpoint. 431 + * 10. We don't allow removal of NFS sillyrenamed files; it's handled by 432 + * nfs_async_unlink(). 
433 + */ 434 + 435 + static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) 436 + { 437 + int error; 438 + 439 + if (!victim->d_inode) 440 + return -ENOENT; 441 + 442 + BUG_ON(victim->d_parent->d_inode != dir); 443 + audit_inode_child(victim, dir); 444 + 445 + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 446 + if (error) 447 + return error; 448 + if (IS_APPEND(dir)) 449 + return -EPERM; 450 + if (btrfs_check_sticky(dir, victim->d_inode)|| 451 + IS_APPEND(victim->d_inode)|| 452 + IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) 453 + return -EPERM; 454 + if (isdir) { 455 + if (!S_ISDIR(victim->d_inode->i_mode)) 456 + return -ENOTDIR; 457 + if (IS_ROOT(victim)) 458 + return -EBUSY; 459 + } else if (S_ISDIR(victim->d_inode->i_mode)) 460 + return -EISDIR; 461 + if (IS_DEADDIR(dir)) 462 + return -ENOENT; 463 + if (victim->d_flags & DCACHE_NFSFS_RENAMED) 464 + return -EBUSY; 465 + return 0; 466 + } 467 + 412 468 /* copy of may_create in fs/namei.c() */ 413 469 static inline int btrfs_may_create(struct inode *dir, struct dentry *child) 414 470 { ··· 496 412 */ 497 413 static noinline int btrfs_mksubvol(struct path *parent, 498 414 char *name, int namelen, 499 - struct btrfs_root *snap_src) 415 + struct btrfs_root *snap_src, 416 + u64 *async_transid) 500 417 { 501 418 struct inode *dir = parent->dentry->d_inode; 502 419 struct dentry *dentry; ··· 528 443 goto out_up_read; 529 444 530 445 if (snap_src) { 531 - error = create_snapshot(snap_src, dentry); 446 + error = create_snapshot(snap_src, dentry, 447 + name, namelen, async_transid); 532 448 } else { 533 449 error = create_subvol(BTRFS_I(dir)->root, dentry, 534 - name, namelen); 450 + name, namelen, async_transid); 535 451 } 536 452 if (!error) 537 453 fsnotify_mkdir(dir, dentry); ··· 794 708 char *sizestr; 795 709 char *devstr = NULL; 796 710 int ret = 0; 797 - int namelen; 798 711 int mod = 0; 799 712 800 713 if (root->fs_info->sb->s_flags & MS_RDONLY) ··· 807 722 return PTR_ERR(vol_args); 808 723 809 724 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 810 - namelen = strlen(vol_args->name); 811 725 812 726 mutex_lock(&root->fs_info->volume_mutex); 813 727 sizestr = vol_args->name; ··· 885 801 return ret; 886 802 } 887 803 888 - static noinline int btrfs_ioctl_snap_create(struct file *file, 889 - void __user *arg, int subvol) 804 + static noinline int btrfs_ioctl_snap_create_transid(struct file *file, 805 + char *name, 806 + unsigned long fd, 807 + int subvol, 808 + u64 *transid) 890 809 { 891 810 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 892 - struct btrfs_ioctl_vol_args *vol_args; 893 811 struct file *src_file; 894 812 int namelen; 895 813 int ret = 0; ··· 899 813 if (root->fs_info->sb->s_flags & MS_RDONLY) 900 814 return -EROFS; 901 815 902 - vol_args = memdup_user(arg, sizeof(*vol_args)); 903 - if (IS_ERR(vol_args)) 904 - return PTR_ERR(vol_args); 905 - 906 - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 907 - namelen = strlen(vol_args->name); 908 - if (strchr(vol_args->name, '/')) { 816 + namelen = strlen(name); 817 + if (strchr(name, '/')) { 909 818 ret = -EINVAL; 910 819 goto out; 911 820 } 912 821 913 822 if (subvol) { 914 - ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 915 - NULL); 823 + ret = btrfs_mksubvol(&file->f_path, name, namelen, 824 + NULL, transid); 916 825 } else { 917 826 struct inode *src_inode; 918 - src_file = fget(vol_args->fd); 827 + src_file = fget(fd); 919 828 if (!src_file) { 920 829 ret = -EINVAL; 921 830 goto out; ··· 924 843 
fput(src_file); 925 844 goto out; 926 845 } 927 - ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 928 - BTRFS_I(src_inode)->root); 846 + ret = btrfs_mksubvol(&file->f_path, name, namelen, 847 + BTRFS_I(src_inode)->root, 848 + transid); 929 849 fput(src_file); 930 850 } 931 851 out: 852 + return ret; 853 + } 854 + 855 + static noinline int btrfs_ioctl_snap_create(struct file *file, 856 + void __user *arg, int subvol, 857 + int async) 858 + { 859 + struct btrfs_ioctl_vol_args *vol_args = NULL; 860 + struct btrfs_ioctl_async_vol_args *async_vol_args = NULL; 861 + char *name; 862 + u64 fd; 863 + u64 transid = 0; 864 + int ret; 865 + 866 + if (async) { 867 + async_vol_args = memdup_user(arg, sizeof(*async_vol_args)); 868 + if (IS_ERR(async_vol_args)) 869 + return PTR_ERR(async_vol_args); 870 + 871 + name = async_vol_args->name; 872 + fd = async_vol_args->fd; 873 + async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; 874 + } else { 875 + vol_args = memdup_user(arg, sizeof(*vol_args)); 876 + if (IS_ERR(vol_args)) 877 + return PTR_ERR(vol_args); 878 + name = vol_args->name; 879 + fd = vol_args->fd; 880 + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 881 + } 882 + 883 + ret = btrfs_ioctl_snap_create_transid(file, name, fd, 884 + subvol, &transid); 885 + 886 + if (!ret && async) { 887 + if (copy_to_user(arg + 888 + offsetof(struct btrfs_ioctl_async_vol_args, 889 + transid), &transid, sizeof(transid))) 890 + return -EFAULT; 891 + } 892 + 932 893 kfree(vol_args); 894 + kfree(async_vol_args); 895 + 933 896 return ret; 934 897 } 935 898 ··· 1198 1073 if (!capable(CAP_SYS_ADMIN)) 1199 1074 return -EPERM; 1200 1075 1201 - args = kmalloc(sizeof(*args), GFP_KERNEL); 1202 - if (!args) 1203 - return -ENOMEM; 1076 + args = memdup_user(argp, sizeof(*args)); 1077 + if (IS_ERR(args)) 1078 + return PTR_ERR(args); 1204 1079 1205 - if (copy_from_user(args, argp, sizeof(*args))) { 1206 - kfree(args); 1207 - return -EFAULT; 1208 - } 1209 1080 inode = fdentry(file)->d_inode; 1210 1081 ret = search_ioctl(inode, args); 1211 1082 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ··· 1309 1188 if (!capable(CAP_SYS_ADMIN)) 1310 1189 return -EPERM; 1311 1190 1312 - args = kmalloc(sizeof(*args), GFP_KERNEL); 1313 - if (!args) 1314 - return -ENOMEM; 1191 + args = memdup_user(argp, sizeof(*args)); 1192 + if (IS_ERR(args)) 1193 + return PTR_ERR(args); 1315 1194 1316 - if (copy_from_user(args, argp, sizeof(*args))) { 1317 - kfree(args); 1318 - return -EFAULT; 1319 - } 1320 1195 inode = fdentry(file)->d_inode; 1321 1196 1322 1197 if (args->treeid == 0) ··· 1344 1227 int ret; 1345 1228 int err = 0; 1346 1229 1347 - if (!capable(CAP_SYS_ADMIN)) 1348 - return -EPERM; 1349 - 1350 1230 vol_args = memdup_user(arg, sizeof(*vol_args)); 1351 1231 if (IS_ERR(vol_args)) 1352 1232 return PTR_ERR(vol_args); ··· 1373 1259 } 1374 1260 1375 1261 inode = dentry->d_inode; 1262 + dest = BTRFS_I(inode)->root; 1263 + if (!capable(CAP_SYS_ADMIN)){ 1264 + /* 1265 + * Regular user. Only allow this with a special mount 1266 + * option, when the user has write+exec access to the 1267 + * subvol root, and when rmdir(2) would have been 1268 + * allowed. 1269 + * 1270 + * Note that this is _not_ check that the subvol is 1271 + * empty or doesn't contain data that we wouldn't 1272 + * otherwise be able to delete. 1273 + * 1274 + * Users who want to delete empty subvols should try 1275 + * rmdir(2). 
1276 + */ 1277 + err = -EPERM; 1278 + if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 1279 + goto out_dput; 1280 + 1281 + /* 1282 + * Do not allow deletion if the parent dir is the same 1283 + * as the dir to be deleted. That means the ioctl 1284 + * must be called on the dentry referencing the root 1285 + * of the subvol, not a random directory contained 1286 + * within it. 1287 + */ 1288 + err = -EINVAL; 1289 + if (root == dest) 1290 + goto out_dput; 1291 + 1292 + err = inode_permission(inode, MAY_WRITE | MAY_EXEC); 1293 + if (err) 1294 + goto out_dput; 1295 + 1296 + /* check if subvolume may be deleted by a non-root user */ 1297 + err = btrfs_may_delete(dir, dentry, 1); 1298 + if (err) 1299 + goto out_dput; 1300 + } 1301 + 1376 1302 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 1377 1303 err = -EINVAL; 1378 1304 goto out_dput; 1379 1305 } 1380 - 1381 - dest = BTRFS_I(inode)->root; 1382 1306 1383 1307 mutex_lock(&inode->i_mutex); 1384 1308 err = d_invalidate(dentry); ··· 1456 1304 BUG_ON(ret); 1457 1305 } 1458 1306 1459 - ret = btrfs_commit_transaction(trans, root); 1307 + ret = btrfs_end_transaction(trans, root); 1460 1308 BUG_ON(ret); 1461 1309 inode->i_flags |= S_DEAD; 1462 1310 out_up_write: ··· 1654 1502 path->reada = 2; 1655 1503 1656 1504 if (inode < src) { 1657 - mutex_lock(&inode->i_mutex); 1658 - mutex_lock(&src->i_mutex); 1505 + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 1506 + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); 1659 1507 } else { 1660 - mutex_lock(&src->i_mutex); 1661 - mutex_lock(&inode->i_mutex); 1508 + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); 1509 + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 1662 1510 } 1663 1511 1664 1512 /* determine range to clone */ ··· 1682 1530 while (1) { 1683 1531 struct btrfs_ordered_extent *ordered; 1684 1532 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1685 - ordered = btrfs_lookup_first_ordered_extent(inode, off+len); 1686 - if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) 1533 + ordered = btrfs_lookup_first_ordered_extent(src, off+len); 1534 + if (!ordered && 1535 + !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, 1536 + EXTENT_DELALLOC, 0, NULL)) 1687 1537 break; 1688 1538 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1689 1539 if (ordered) 1690 1540 btrfs_put_ordered_extent(ordered); 1691 - btrfs_wait_ordered_range(src, off, off+len); 1541 + btrfs_wait_ordered_range(src, off, len); 1692 1542 } 1693 1543 1694 1544 /* clone data */ ··· 1759 1605 } 1760 1606 btrfs_release_path(root, path); 1761 1607 1762 - if (key.offset + datal < off || 1608 + if (key.offset + datal <= off || 1763 1609 key.offset >= off+len) 1764 1610 goto next; 1765 1611 ··· 2033 1879 return 0; 2034 1880 } 2035 1881 1882 + static void get_block_group_info(struct list_head *groups_list, 1883 + struct btrfs_ioctl_space_info *space) 1884 + { 1885 + struct btrfs_block_group_cache *block_group; 1886 + 1887 + space->total_bytes = 0; 1888 + space->used_bytes = 0; 1889 + space->flags = 0; 1890 + list_for_each_entry(block_group, groups_list, list) { 1891 + space->flags = block_group->flags; 1892 + space->total_bytes += block_group->key.offset; 1893 + space->used_bytes += 1894 + btrfs_block_group_used(&block_group->item); 1895 + } 1896 + } 1897 + 2036 1898 long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) 2037 1899 { 2038 1900 struct btrfs_ioctl_space_args space_args; ··· 2057 1887 struct btrfs_ioctl_space_info *dest_orig; 2058 1888 struct btrfs_ioctl_space_info *user_dest; 2059 
1889 struct btrfs_space_info *info; 1890 + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 1891 + BTRFS_BLOCK_GROUP_SYSTEM, 1892 + BTRFS_BLOCK_GROUP_METADATA, 1893 + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; 1894 + int num_types = 4; 2060 1895 int alloc_size; 2061 1896 int ret = 0; 2062 1897 int slot_count = 0; 1898 + int i, c; 2063 1899 2064 1900 if (copy_from_user(&space_args, 2065 1901 (struct btrfs_ioctl_space_args __user *)arg, 2066 1902 sizeof(space_args))) 2067 1903 return -EFAULT; 2068 1904 2069 - /* first we count slots */ 2070 - rcu_read_lock(); 2071 - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) 2072 - slot_count++; 2073 - rcu_read_unlock(); 1905 + for (i = 0; i < num_types; i++) { 1906 + struct btrfs_space_info *tmp; 1907 + 1908 + info = NULL; 1909 + rcu_read_lock(); 1910 + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 1911 + list) { 1912 + if (tmp->flags == types[i]) { 1913 + info = tmp; 1914 + break; 1915 + } 1916 + } 1917 + rcu_read_unlock(); 1918 + 1919 + if (!info) 1920 + continue; 1921 + 1922 + down_read(&info->groups_sem); 1923 + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 1924 + if (!list_empty(&info->block_groups[c])) 1925 + slot_count++; 1926 + } 1927 + up_read(&info->groups_sem); 1928 + } 2074 1929 2075 1930 /* space_slots == 0 means they are asking for a count */ 2076 1931 if (space_args.space_slots == 0) { 2077 1932 space_args.total_spaces = slot_count; 2078 1933 goto out; 2079 1934 } 1935 + 1936 + slot_count = min_t(int, space_args.space_slots, slot_count); 1937 + 2080 1938 alloc_size = sizeof(*dest) * slot_count; 1939 + 2081 1940 /* we generally have at most 6 or so space infos, one for each raid 2082 1941 * level. So, a whole page should be more than enough for everyone 2083 1942 */ ··· 2120 1921 dest_orig = dest; 2121 1922 2122 1923 /* now we have a buffer to copy into */ 2123 - rcu_read_lock(); 2124 - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { 2125 - /* make sure we don't copy more than we allocated 2126 - * in our buffer 2127 - */ 2128 - if (slot_count == 0) 2129 - break; 2130 - slot_count--; 1924 + for (i = 0; i < num_types; i++) { 1925 + struct btrfs_space_info *tmp; 2131 1926 2132 - /* make sure userland has enough room in their buffer */ 2133 - if (space_args.total_spaces >= space_args.space_slots) 2134 - break; 1927 + info = NULL; 1928 + rcu_read_lock(); 1929 + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 1930 + list) { 1931 + if (tmp->flags == types[i]) { 1932 + info = tmp; 1933 + break; 1934 + } 1935 + } 1936 + rcu_read_unlock(); 2135 1937 2136 - space.flags = info->flags; 2137 - space.total_bytes = info->total_bytes; 2138 - space.used_bytes = info->bytes_used; 2139 - memcpy(dest, &space, sizeof(space)); 2140 - dest++; 2141 - space_args.total_spaces++; 1938 + if (!info) 1939 + continue; 1940 + down_read(&info->groups_sem); 1941 + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 1942 + if (!list_empty(&info->block_groups[c])) { 1943 + get_block_group_info(&info->block_groups[c], 1944 + &space); 1945 + memcpy(dest, &space, sizeof(space)); 1946 + dest++; 1947 + space_args.total_spaces++; 1948 + } 1949 + } 1950 + up_read(&info->groups_sem); 2142 1951 } 2143 - rcu_read_unlock(); 2144 1952 2145 1953 user_dest = (struct btrfs_ioctl_space_info *) 2146 1954 (arg + sizeof(struct btrfs_ioctl_space_args)); ··· 2190 1984 return 0; 2191 1985 } 2192 1986 1987 + static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 1988 + { 1989 + struct btrfs_root *root = 
BTRFS_I(file->f_dentry->d_inode)->root; 1990 + struct btrfs_trans_handle *trans; 1991 + u64 transid; 1992 + 1993 + trans = btrfs_start_transaction(root, 0); 1994 + transid = trans->transid; 1995 + btrfs_commit_transaction_async(trans, root, 0); 1996 + 1997 + if (argp) 1998 + if (copy_to_user(argp, &transid, sizeof(transid))) 1999 + return -EFAULT; 2000 + return 0; 2001 + } 2002 + 2003 + static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 2004 + { 2005 + struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; 2006 + u64 transid; 2007 + 2008 + if (argp) { 2009 + if (copy_from_user(&transid, argp, sizeof(transid))) 2010 + return -EFAULT; 2011 + } else { 2012 + transid = 0; /* current trans */ 2013 + } 2014 + return btrfs_wait_for_commit(root, transid); 2015 + } 2016 + 2193 2017 long btrfs_ioctl(struct file *file, unsigned int 2194 2018 cmd, unsigned long arg) 2195 2019 { ··· 2234 1998 case FS_IOC_GETVERSION: 2235 1999 return btrfs_ioctl_getversion(file, argp); 2236 2000 case BTRFS_IOC_SNAP_CREATE: 2237 - return btrfs_ioctl_snap_create(file, argp, 0); 2001 + return btrfs_ioctl_snap_create(file, argp, 0, 0); 2002 + case BTRFS_IOC_SNAP_CREATE_ASYNC: 2003 + return btrfs_ioctl_snap_create(file, argp, 0, 1); 2238 2004 case BTRFS_IOC_SUBVOL_CREATE: 2239 - return btrfs_ioctl_snap_create(file, argp, 1); 2005 + return btrfs_ioctl_snap_create(file, argp, 1, 0); 2240 2006 case BTRFS_IOC_SNAP_DESTROY: 2241 2007 return btrfs_ioctl_snap_destroy(file, argp); 2242 2008 case BTRFS_IOC_DEFAULT_SUBVOL: ··· 2272 2034 case BTRFS_IOC_SYNC: 2273 2035 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2274 2036 return 0; 2037 + case BTRFS_IOC_START_SYNC: 2038 + return btrfs_ioctl_start_sync(file, argp); 2039 + case BTRFS_IOC_WAIT_SYNC: 2040 + return btrfs_ioctl_wait_sync(file, argp); 2275 2041 } 2276 2042 2277 2043 return -ENOTTY;
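For context (not part of the patch): the two new sync ioctls are meant to be used as a pair -- START_SYNC kicks off an async commit and reports its transid, WAIT_SYNC blocks until that transid is fully committed. A minimal userspace sketch, assuming a userspace copy of the ioctl definitions is available as <btrfs/ioctl.h> and that fd is any open file or directory on the btrfs mount; the helper name is made up for illustration:

#include <linux/types.h>
#include <sys/ioctl.h>
#include <btrfs/ioctl.h>	/* assumed userspace copy of fs/btrfs/ioctl.h */

/* Start an asynchronous commit, then block until that transaction is on disk. */
static int btrfs_commit_and_wait(int fd)
{
	__u64 transid;

	if (ioctl(fd, BTRFS_IOC_START_SYNC, &transid) < 0)
		return -1;
	return ioctl(fd, BTRFS_IOC_WAIT_SYNC, &transid);
}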
+12 -1
fs/btrfs/ioctl.h
··· 22 22 23 23 #define BTRFS_IOCTL_MAGIC 0x94 24 24 #define BTRFS_VOL_NAME_MAX 255 25 - #define BTRFS_PATH_NAME_MAX 4087 26 25 27 26 /* this should be 4k */ 27 + #define BTRFS_PATH_NAME_MAX 4087 28 28 struct btrfs_ioctl_vol_args { 29 29 __s64 fd; 30 30 char name[BTRFS_PATH_NAME_MAX + 1]; 31 + }; 32 + 33 + #define BTRFS_SNAPSHOT_NAME_MAX 4079 34 + struct btrfs_ioctl_async_vol_args { 35 + __s64 fd; 36 + __u64 transid; 37 + char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; 31 38 }; 32 39 33 40 #define BTRFS_INO_LOOKUP_PATH_MAX 4080 ··· 185 178 #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) 186 179 #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ 187 180 struct btrfs_ioctl_space_args) 181 + #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) 182 + #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 183 + #define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ 184 + struct btrfs_ioctl_async_vol_args) 188 185 #endif
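The new btrfs_ioctl_async_vol_args struct above pairs naturally with WAIT_SYNC: the handler fills in transid after queueing the snapshot, so a caller can defer the wait. A hedged userspace sketch follows; the struct fields and ioctl names come from this header, while the helper name, the fd roles (destination directory vs. source subvolume, as suggested by the ioctl.c hunk) and the include path are illustrative assumptions:

#include <string.h>
#include <sys/ioctl.h>
#include <btrfs/ioctl.h>	/* assumed userspace copy of this header */

/*
 * Snapshot the subvolume open at src_fd into the directory open at
 * dst_dir_fd under 'name' without waiting for the commit, then wait
 * for the reported transaction explicitly.
 */
static int snapshot_async(int dst_dir_fd, int src_fd, const char *name)
{
	struct btrfs_ioctl_async_vol_args args;

	memset(&args, 0, sizeof(args));
	args.fd = src_fd;
	strncpy(args.name, name, BTRFS_SNAPSHOT_NAME_MAX);

	if (ioctl(dst_dir_fd, BTRFS_IOC_SNAP_CREATE_ASYNC, &args) < 0)
		return -1;

	/* the handler copied the commit's transid back into args.transid */
	return ioctl(dst_dir_fd, BTRFS_IOC_WAIT_SYNC, &args.transid);
}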
-2
fs/btrfs/ordered-data.c
··· 526 526 { 527 527 u64 end; 528 528 u64 orig_end; 529 - u64 wait_end; 530 529 struct btrfs_ordered_extent *ordered; 531 530 int found; 532 531 ··· 536 537 if (orig_end > INT_LIMIT(loff_t)) 537 538 orig_end = INT_LIMIT(loff_t); 538 539 } 539 - wait_end = orig_end; 540 540 again: 541 541 /* start IO across the range first to instantiate any delalloc 542 542 * extents
+93 -16
fs/btrfs/relocation.c
··· 29 29 #include "locking.h" 30 30 #include "btrfs_inode.h" 31 31 #include "async-thread.h" 32 + #include "free-space-cache.h" 32 33 33 34 /* 34 35 * backref_node, mapping_node and tree_block start with this ··· 178 177 179 178 u64 search_start; 180 179 u64 extents_found; 181 - 182 - int block_rsv_retries; 183 180 184 181 unsigned int stage:8; 185 182 unsigned int create_reloc_tree:1; ··· 2132 2133 LIST_HEAD(reloc_roots); 2133 2134 u64 num_bytes = 0; 2134 2135 int ret; 2135 - int retries = 0; 2136 2136 2137 2137 mutex_lock(&root->fs_info->trans_mutex); 2138 2138 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; ··· 2141 2143 if (!err) { 2142 2144 num_bytes = rc->merging_rsv_size; 2143 2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2144 - num_bytes, &retries); 2146 + num_bytes); 2145 2147 if (ret) 2146 2148 err = ret; 2147 2149 } ··· 2153 2155 btrfs_end_transaction(trans, rc->extent_root); 2154 2156 btrfs_block_rsv_release(rc->extent_root, 2155 2157 rc->block_rsv, num_bytes); 2156 - retries = 0; 2157 2158 goto again; 2158 2159 } 2159 2160 } ··· 2402 2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2403 2406 2404 2407 trans->block_rsv = rc->block_rsv; 2405 - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, 2406 - &rc->block_rsv_retries); 2408 + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2407 2409 if (ret) { 2408 2410 if (ret == -EAGAIN) 2409 2411 rc->commit_transaction = 1; 2410 2412 return ret; 2411 2413 } 2412 2414 2413 - rc->block_rsv_retries = 0; 2414 2415 return 0; 2415 2416 } 2416 2417 ··· 3094 3099 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3095 3100 ret = get_ref_objectid_v0(rc, path, extent_key, 3096 3101 &ref_owner, NULL); 3102 + if (ret < 0) 3103 + return ret; 3097 3104 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); 3098 3105 level = (int)ref_owner; 3099 3106 /* FIXME: get real generation */ ··· 3188 3191 return ret; 3189 3192 } 3190 3193 3194 + static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3195 + struct inode *inode, u64 ino) 3196 + { 3197 + struct btrfs_key key; 3198 + struct btrfs_path *path; 3199 + struct btrfs_root *root = fs_info->tree_root; 3200 + struct btrfs_trans_handle *trans; 3201 + unsigned long nr; 3202 + int ret = 0; 3203 + 3204 + if (inode) 3205 + goto truncate; 3206 + 3207 + key.objectid = ino; 3208 + key.type = BTRFS_INODE_ITEM_KEY; 3209 + key.offset = 0; 3210 + 3211 + inode = btrfs_iget(fs_info->sb, &key, root, NULL); 3212 + if (!inode || IS_ERR(inode) || is_bad_inode(inode)) { 3213 + if (inode && !IS_ERR(inode)) 3214 + iput(inode); 3215 + return -ENOENT; 3216 + } 3217 + 3218 + truncate: 3219 + path = btrfs_alloc_path(); 3220 + if (!path) { 3221 + ret = -ENOMEM; 3222 + goto out; 3223 + } 3224 + 3225 + trans = btrfs_join_transaction(root, 0); 3226 + if (IS_ERR(trans)) { 3227 + btrfs_free_path(path); 3228 + goto out; 3229 + } 3230 + 3231 + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3232 + 3233 + btrfs_free_path(path); 3234 + nr = trans->blocks_used; 3235 + btrfs_end_transaction(trans, root); 3236 + btrfs_btree_balance_dirty(root, nr); 3237 + out: 3238 + iput(inode); 3239 + return ret; 3240 + } 3241 + 3191 3242 /* 3192 3243 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY 3193 3244 * this function scans fs tree to find blocks reference the data extent ··· 3262 3217 int counted; 3263 3218 int ret; 3264 3219 3265 - path = btrfs_alloc_path(); 3266 - if (!path) 3267 - return -ENOMEM; 3268 - 3269 3220 ref_root = 
btrfs_extent_data_ref_root(leaf, ref); 3270 3221 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); 3271 3222 ref_offset = btrfs_extent_data_ref_offset(leaf, ref); 3272 3223 ref_count = btrfs_extent_data_ref_count(leaf, ref); 3224 + 3225 + /* 3226 + * This is an extent belonging to the free space cache, lets just delete 3227 + * it and redo the search. 3228 + */ 3229 + if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { 3230 + ret = delete_block_group_cache(rc->extent_root->fs_info, 3231 + NULL, ref_objectid); 3232 + if (ret != -ENOENT) 3233 + return ret; 3234 + ret = 0; 3235 + } 3236 + 3237 + path = btrfs_alloc_path(); 3238 + if (!path) 3239 + return -ENOMEM; 3273 3240 3274 3241 root = read_fs_root(rc->extent_root->fs_info, ref_root); 3275 3242 if (IS_ERR(root)) { ··· 3611 3554 * is no reservation in transaction handle. 3612 3555 */ 3613 3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3614 - rc->extent_root->nodesize * 256, 3615 - &rc->block_rsv_retries); 3557 + rc->extent_root->nodesize * 256); 3616 3558 if (ret) 3617 3559 return ret; 3618 3560 ··· 3623 3567 rc->extents_found = 0; 3624 3568 rc->nodes_relocated = 0; 3625 3569 rc->merging_rsv_size = 0; 3626 - rc->block_rsv_retries = 0; 3627 3570 3628 3571 rc->create_reloc_tree = 1; 3629 3572 set_reloc_control(rc); ··· 3915 3860 { 3916 3861 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3917 3862 struct reloc_control *rc; 3863 + struct inode *inode; 3864 + struct btrfs_path *path; 3918 3865 int ret; 3919 3866 int rw = 0; 3920 3867 int err = 0; ··· 3937 3880 goto out; 3938 3881 } 3939 3882 rw = 1; 3883 + } 3884 + 3885 + path = btrfs_alloc_path(); 3886 + if (!path) { 3887 + err = -ENOMEM; 3888 + goto out; 3889 + } 3890 + 3891 + inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group, 3892 + path); 3893 + btrfs_free_path(path); 3894 + 3895 + if (!IS_ERR(inode)) 3896 + ret = delete_block_group_cache(fs_info, inode, 0); 3897 + else 3898 + ret = PTR_ERR(inode); 3899 + 3900 + if (ret && ret != -ENOENT) { 3901 + err = ret; 3902 + goto out; 3940 3903 } 3941 3904 3942 3905 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); ··· 4220 4143 btrfs_add_ordered_sum(inode, ordered, sums); 4221 4144 } 4222 4145 btrfs_put_ordered_extent(ordered); 4223 - return 0; 4146 + return ret; 4224 4147 } 4225 4148 4226 4149 void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
-2
fs/btrfs/root-tree.c
··· 181 181 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) 182 182 { 183 183 struct btrfs_root *dead_root; 184 - struct btrfs_item *item; 185 184 struct btrfs_root_item *ri; 186 185 struct btrfs_key key; 187 186 struct btrfs_key found_key; ··· 213 214 nritems = btrfs_header_nritems(leaf); 214 215 slot = path->slots[0]; 215 216 } 216 - item = btrfs_item_nr(leaf, slot); 217 217 btrfs_item_key_to_cpu(leaf, &key, slot); 218 218 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) 219 219 goto next;
+30 -12
fs/btrfs/super.c
··· 61 61 62 62 ret = close_ctree(root); 63 63 sb->s_fs_info = NULL; 64 + 65 + (void)ret; /* FIXME: need to fix VFS to return error? */ 64 66 } 65 67 66 68 enum { ··· 70 68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 71 69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 72 70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 73 - Opt_discard, Opt_err, 71 + Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, 72 + Opt_user_subvol_rm_allowed, 74 73 }; 75 74 76 75 static match_table_t tokens = { ··· 95 92 {Opt_flushoncommit, "flushoncommit"}, 96 93 {Opt_ratio, "metadata_ratio=%d"}, 97 94 {Opt_discard, "discard"}, 95 + {Opt_space_cache, "space_cache"}, 96 + {Opt_clear_cache, "clear_cache"}, 97 + {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 98 98 {Opt_err, NULL}, 99 99 }; 100 100 ··· 240 234 break; 241 235 case Opt_discard: 242 236 btrfs_set_opt(info->mount_opt, DISCARD); 237 + break; 238 + case Opt_space_cache: 239 + printk(KERN_INFO "btrfs: enabling disk space caching\n"); 240 + btrfs_set_opt(info->mount_opt, SPACE_CACHE); 241 + case Opt_clear_cache: 242 + printk(KERN_INFO "btrfs: force clearing of disk cache\n"); 243 + btrfs_set_opt(info->mount_opt, CLEAR_CACHE); 244 + break; 245 + case Opt_user_subvol_rm_allowed: 246 + btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); 243 247 break; 244 248 case Opt_err: 245 249 printk(KERN_INFO "btrfs: unrecognized mount option " ··· 396 380 find_root: 397 381 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 398 382 if (IS_ERR(new_root)) 399 - return ERR_PTR(PTR_ERR(new_root)); 383 + return ERR_CAST(new_root); 400 384 401 385 if (btrfs_root_refs(&new_root->root_item) == 0) 402 386 return ERR_PTR(-ENOENT); ··· 452 436 { 453 437 struct inode *inode; 454 438 struct dentry *root_dentry; 455 - struct btrfs_super_block *disk_super; 456 439 struct btrfs_root *tree_root; 457 440 struct btrfs_key key; 458 441 int err; ··· 473 458 return PTR_ERR(tree_root); 474 459 } 475 460 sb->s_fs_info = tree_root; 476 - disk_super = &tree_root->fs_info->super_copy; 477 461 478 462 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 479 463 key.type = BTRFS_INODE_ITEM_KEY; ··· 585 571 char *subvol_name = NULL; 586 572 u64 subvol_objectid = 0; 587 573 int error = 0; 588 - int found = 0; 589 574 590 575 if (!(flags & MS_RDONLY)) 591 576 mode |= FMODE_WRITE; ··· 620 607 goto error_close_devices; 621 608 } 622 609 623 - found = 1; 624 610 btrfs_close_devices(fs_devices); 625 611 } else { 626 612 char b[BDEVNAME_SIZE]; ··· 641 629 if (IS_ERR(root)) { 642 630 error = PTR_ERR(root); 643 631 deactivate_locked_super(s); 644 - goto error; 632 + goto error_free_subvol_name; 645 633 } 646 634 /* if they gave us a subvolume name bind mount into that */ 647 635 if (strcmp(subvol_name, ".")) { ··· 655 643 deactivate_locked_super(s); 656 644 error = PTR_ERR(new_root); 657 645 dput(root); 658 - goto error_close_devices; 646 + goto error_free_subvol_name; 659 647 } 660 648 if (!new_root->d_inode) { 661 649 dput(root); 662 650 dput(new_root); 663 651 deactivate_locked_super(s); 664 652 error = -ENXIO; 665 - goto error_close_devices; 653 + goto error_free_subvol_name; 666 654 } 667 655 dput(root); 668 656 root = new_root; ··· 677 665 btrfs_close_devices(fs_devices); 678 666 error_free_subvol_name: 679 667 kfree(subvol_name); 680 - error: 681 668 return ERR_PTR(error); 682 669 } 683 670 ··· 724 713 struct list_head *head = &root->fs_info->space_info; 725 714 struct btrfs_space_info *found; 726 715 u64 total_used 
= 0; 716 + u64 total_used_data = 0; 727 717 int bits = dentry->d_sb->s_blocksize_bits; 728 718 __be32 *fsid = (__be32 *)root->fs_info->fsid; 729 719 730 720 rcu_read_lock(); 731 - list_for_each_entry_rcu(found, head, list) 721 + list_for_each_entry_rcu(found, head, list) { 722 + if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | 723 + BTRFS_BLOCK_GROUP_SYSTEM)) 724 + total_used_data += found->disk_total; 725 + else 726 + total_used_data += found->disk_used; 732 727 total_used += found->disk_used; 728 + } 733 729 rcu_read_unlock(); 734 730 735 731 buf->f_namelen = BTRFS_NAME_LEN; 736 732 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 737 733 buf->f_bfree = buf->f_blocks - (total_used >> bits); 738 - buf->f_bavail = buf->f_bfree; 734 + buf->f_bavail = buf->f_blocks - (total_used_data >> bits); 739 735 buf->f_bsize = dentry->d_sb->s_blocksize; 740 736 buf->f_type = BTRFS_SUPER_MAGIC; 741 737
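The super.c changes above add three mount options: space_cache, clear_cache and user_subvol_rm_allowed. A minimal sketch of enabling them via mount(2); the device and mountpoint are placeholders, only the option names are taken from the token table above:

#include <sys/mount.h>

/* Illustrative only: device and mountpoint are placeholders. */
static int mount_with_new_options(void)
{
	return mount("/dev/sdX", "/mnt/btrfs", "btrfs", 0,
		     "space_cache,user_subvol_rm_allowed");
}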
+210 -24
fs/btrfs/transaction.c
··· 163 163 TRANS_START, 164 164 TRANS_JOIN, 165 165 TRANS_USERSPACE, 166 + TRANS_JOIN_NOLOCK, 166 167 }; 167 168 168 169 static int may_wait_transaction(struct btrfs_root *root, int type) ··· 180 179 { 181 180 struct btrfs_trans_handle *h; 182 181 struct btrfs_transaction *cur_trans; 183 - int retries = 0; 184 182 int ret; 185 183 again: 186 184 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 187 185 if (!h) 188 186 return ERR_PTR(-ENOMEM); 189 187 190 - mutex_lock(&root->fs_info->trans_mutex); 188 + if (type != TRANS_JOIN_NOLOCK) 189 + mutex_lock(&root->fs_info->trans_mutex); 191 190 if (may_wait_transaction(root, type)) 192 191 wait_current_trans(root); 193 192 ··· 196 195 197 196 cur_trans = root->fs_info->running_transaction; 198 197 cur_trans->use_count++; 199 - mutex_unlock(&root->fs_info->trans_mutex); 198 + if (type != TRANS_JOIN_NOLOCK) 199 + mutex_unlock(&root->fs_info->trans_mutex); 200 200 201 201 h->transid = cur_trans->transid; 202 202 h->transaction = cur_trans; ··· 214 212 } 215 213 216 214 if (num_items > 0) { 217 - ret = btrfs_trans_reserve_metadata(h, root, num_items, 218 - &retries); 215 + ret = btrfs_trans_reserve_metadata(h, root, num_items); 219 216 if (ret == -EAGAIN) { 220 217 btrfs_commit_transaction(h, root); 221 218 goto again; ··· 225 224 } 226 225 } 227 226 228 - mutex_lock(&root->fs_info->trans_mutex); 227 + if (type != TRANS_JOIN_NOLOCK) 228 + mutex_lock(&root->fs_info->trans_mutex); 229 229 record_root_in_trans(h, root); 230 - mutex_unlock(&root->fs_info->trans_mutex); 230 + if (type != TRANS_JOIN_NOLOCK) 231 + mutex_unlock(&root->fs_info->trans_mutex); 231 232 232 233 if (!current->journal_info && type != TRANS_USERSPACE) 233 234 current->journal_info = h; ··· 245 242 int num_blocks) 246 243 { 247 244 return start_transaction(root, 0, TRANS_JOIN); 245 + } 246 + 247 + struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, 248 + int num_blocks) 249 + { 250 + return start_transaction(root, 0, TRANS_JOIN_NOLOCK); 248 251 } 249 252 250 253 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, ··· 277 268 mutex_unlock(&root->fs_info->trans_mutex); 278 269 finish_wait(&commit->commit_wait, &wait); 279 270 return 0; 271 + } 272 + 273 + int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 274 + { 275 + struct btrfs_transaction *cur_trans = NULL, *t; 276 + int ret; 277 + 278 + mutex_lock(&root->fs_info->trans_mutex); 279 + 280 + ret = 0; 281 + if (transid) { 282 + if (transid <= root->fs_info->last_trans_committed) 283 + goto out_unlock; 284 + 285 + /* find specified transaction */ 286 + list_for_each_entry(t, &root->fs_info->trans_list, list) { 287 + if (t->transid == transid) { 288 + cur_trans = t; 289 + break; 290 + } 291 + if (t->transid > transid) 292 + break; 293 + } 294 + ret = -EINVAL; 295 + if (!cur_trans) 296 + goto out_unlock; /* bad transid */ 297 + } else { 298 + /* find newest transaction that is committing | committed */ 299 + list_for_each_entry_reverse(t, &root->fs_info->trans_list, 300 + list) { 301 + if (t->in_commit) { 302 + if (t->commit_done) 303 + goto out_unlock; 304 + cur_trans = t; 305 + break; 306 + } 307 + } 308 + if (!cur_trans) 309 + goto out_unlock; /* nothing committing|committed */ 310 + } 311 + 312 + cur_trans->use_count++; 313 + mutex_unlock(&root->fs_info->trans_mutex); 314 + 315 + wait_for_commit(root, cur_trans); 316 + 317 + mutex_lock(&root->fs_info->trans_mutex); 318 + put_transaction(cur_trans); 319 + ret = 0; 320 + out_unlock: 321 + 
mutex_unlock(&root->fs_info->trans_mutex); 322 + return ret; 280 323 } 281 324 282 325 #if 0 ··· 409 348 } 410 349 411 350 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 412 - struct btrfs_root *root, int throttle) 351 + struct btrfs_root *root, int throttle, int lock) 413 352 { 414 353 struct btrfs_transaction *cur_trans = trans->transaction; 415 354 struct btrfs_fs_info *info = root->fs_info; ··· 437 376 438 377 btrfs_trans_release_metadata(trans, root); 439 378 440 - if (!root->fs_info->open_ioctl_trans && 379 + if (lock && !root->fs_info->open_ioctl_trans && 441 380 should_end_transaction(trans, root)) 442 381 trans->transaction->blocked = 1; 443 382 444 - if (cur_trans->blocked && !cur_trans->in_commit) { 383 + if (lock && cur_trans->blocked && !cur_trans->in_commit) { 445 384 if (throttle) 446 385 return btrfs_commit_transaction(trans, root); 447 386 else 448 387 wake_up_process(info->transaction_kthread); 449 388 } 450 389 451 - mutex_lock(&info->trans_mutex); 390 + if (lock) 391 + mutex_lock(&info->trans_mutex); 452 392 WARN_ON(cur_trans != info->running_transaction); 453 393 WARN_ON(cur_trans->num_writers < 1); 454 394 cur_trans->num_writers--; 455 395 396 + smp_mb(); 456 397 if (waitqueue_active(&cur_trans->writer_wait)) 457 398 wake_up(&cur_trans->writer_wait); 458 399 put_transaction(cur_trans); 459 - mutex_unlock(&info->trans_mutex); 400 + if (lock) 401 + mutex_unlock(&info->trans_mutex); 460 402 461 403 if (current->journal_info == trans) 462 404 current->journal_info = NULL; ··· 475 411 int btrfs_end_transaction(struct btrfs_trans_handle *trans, 476 412 struct btrfs_root *root) 477 413 { 478 - return __btrfs_end_transaction(trans, root, 0); 414 + return __btrfs_end_transaction(trans, root, 0, 1); 479 415 } 480 416 481 417 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 482 418 struct btrfs_root *root) 483 419 { 484 - return __btrfs_end_transaction(trans, root, 1); 420 + return __btrfs_end_transaction(trans, root, 1, 1); 421 + } 422 + 423 + int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, 424 + struct btrfs_root *root) 425 + { 426 + return __btrfs_end_transaction(trans, root, 0, 0); 485 427 } 486 428 487 429 /* ··· 906 836 struct extent_buffer *tmp; 907 837 struct extent_buffer *old; 908 838 int ret; 909 - int retries = 0; 910 839 u64 to_reserve = 0; 911 840 u64 index = 0; 912 841 u64 objectid; ··· 927 858 928 859 if (to_reserve > 0) { 929 860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 930 - to_reserve, &retries); 861 + to_reserve); 931 862 if (ret) { 932 863 pending->error = ret; 933 864 goto fail; ··· 1035 966 super->root = root_item->bytenr; 1036 967 super->generation = root_item->generation; 1037 968 super->root_level = root_item->level; 969 + if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 970 + super->cache_generation = root_item->generation; 1038 971 } 1039 972 1040 973 int btrfs_transaction_in_commit(struct btrfs_fs_info *info) ··· 1059 988 return ret; 1060 989 } 1061 990 991 + /* 992 + * wait for the current transaction commit to start and block subsequent 993 + * transaction joins 994 + */ 995 + static void wait_current_trans_commit_start(struct btrfs_root *root, 996 + struct btrfs_transaction *trans) 997 + { 998 + DEFINE_WAIT(wait); 999 + 1000 + if (trans->in_commit) 1001 + return; 1002 + 1003 + while (1) { 1004 + prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait, 1005 + TASK_UNINTERRUPTIBLE); 1006 + if (trans->in_commit) { 1007 + 
finish_wait(&root->fs_info->transaction_blocked_wait, 1008 + &wait); 1009 + break; 1010 + } 1011 + mutex_unlock(&root->fs_info->trans_mutex); 1012 + schedule(); 1013 + mutex_lock(&root->fs_info->trans_mutex); 1014 + finish_wait(&root->fs_info->transaction_blocked_wait, &wait); 1015 + } 1016 + } 1017 + 1018 + /* 1019 + * wait for the current transaction to start and then become unblocked. 1020 + * caller holds ref. 1021 + */ 1022 + static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, 1023 + struct btrfs_transaction *trans) 1024 + { 1025 + DEFINE_WAIT(wait); 1026 + 1027 + if (trans->commit_done || (trans->in_commit && !trans->blocked)) 1028 + return; 1029 + 1030 + while (1) { 1031 + prepare_to_wait(&root->fs_info->transaction_wait, &wait, 1032 + TASK_UNINTERRUPTIBLE); 1033 + if (trans->commit_done || 1034 + (trans->in_commit && !trans->blocked)) { 1035 + finish_wait(&root->fs_info->transaction_wait, 1036 + &wait); 1037 + break; 1038 + } 1039 + mutex_unlock(&root->fs_info->trans_mutex); 1040 + schedule(); 1041 + mutex_lock(&root->fs_info->trans_mutex); 1042 + finish_wait(&root->fs_info->transaction_wait, 1043 + &wait); 1044 + } 1045 + } 1046 + 1047 + /* 1048 + * commit transactions asynchronously. once btrfs_commit_transaction_async 1049 + * returns, any subsequent transaction will not be allowed to join. 1050 + */ 1051 + struct btrfs_async_commit { 1052 + struct btrfs_trans_handle *newtrans; 1053 + struct btrfs_root *root; 1054 + struct delayed_work work; 1055 + }; 1056 + 1057 + static void do_async_commit(struct work_struct *work) 1058 + { 1059 + struct btrfs_async_commit *ac = 1060 + container_of(work, struct btrfs_async_commit, work.work); 1061 + 1062 + btrfs_commit_transaction(ac->newtrans, ac->root); 1063 + kfree(ac); 1064 + } 1065 + 1066 + int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, 1067 + struct btrfs_root *root, 1068 + int wait_for_unblock) 1069 + { 1070 + struct btrfs_async_commit *ac; 1071 + struct btrfs_transaction *cur_trans; 1072 + 1073 + ac = kmalloc(sizeof(*ac), GFP_NOFS); 1074 + BUG_ON(!ac); 1075 + 1076 + INIT_DELAYED_WORK(&ac->work, do_async_commit); 1077 + ac->root = root; 1078 + ac->newtrans = btrfs_join_transaction(root, 0); 1079 + 1080 + /* take transaction reference */ 1081 + mutex_lock(&root->fs_info->trans_mutex); 1082 + cur_trans = trans->transaction; 1083 + cur_trans->use_count++; 1084 + mutex_unlock(&root->fs_info->trans_mutex); 1085 + 1086 + btrfs_end_transaction(trans, root); 1087 + schedule_delayed_work(&ac->work, 0); 1088 + 1089 + /* wait for transaction to start and unblock */ 1090 + mutex_lock(&root->fs_info->trans_mutex); 1091 + if (wait_for_unblock) 1092 + wait_current_trans_commit_start_and_unblock(root, cur_trans); 1093 + else 1094 + wait_current_trans_commit_start(root, cur_trans); 1095 + put_transaction(cur_trans); 1096 + mutex_unlock(&root->fs_info->trans_mutex); 1097 + 1098 + return 0; 1099 + } 1100 + 1101 + /* 1102 + * btrfs_transaction state sequence: 1103 + * in_commit = 0, blocked = 0 (initial) 1104 + * in_commit = 1, blocked = 1 1105 + * blocked = 0 1106 + * commit_done = 1 1107 + */ 1062 1108 int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1063 1109 struct btrfs_root *root) 1064 1110 { 1065 1111 unsigned long joined = 0; 1066 - unsigned long timeout = 1; 1067 1112 struct btrfs_transaction *cur_trans; 1068 1113 struct btrfs_transaction *prev_trans = NULL; 1069 1114 DEFINE_WAIT(wait); ··· 1226 1039 1227 1040 trans->transaction->in_commit = 1; 1228 1041 trans->transaction->blocked 
= 1; 1042 + wake_up(&root->fs_info->transaction_blocked_wait); 1043 + 1229 1044 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1230 1045 prev_trans = list_entry(cur_trans->list.prev, 1231 1046 struct btrfs_transaction, list); ··· 1252 1063 snap_pending = 1; 1253 1064 1254 1065 WARN_ON(cur_trans != trans->transaction); 1255 - if (cur_trans->num_writers > 1) 1256 - timeout = MAX_SCHEDULE_TIMEOUT; 1257 - else if (should_grow) 1258 - timeout = 1; 1259 - 1260 1066 mutex_unlock(&root->fs_info->trans_mutex); 1261 1067 1262 1068 if (flush_on_commit || snap_pending) { ··· 1273 1089 TASK_UNINTERRUPTIBLE); 1274 1090 1275 1091 smp_mb(); 1276 - if (cur_trans->num_writers > 1 || should_grow) 1277 - schedule_timeout(timeout); 1092 + if (cur_trans->num_writers > 1) 1093 + schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1094 + else if (should_grow) 1095 + schedule_timeout(1); 1278 1096 1279 1097 mutex_lock(&root->fs_info->trans_mutex); 1280 1098 finish_wait(&cur_trans->writer_wait, &wait);
+8
fs/btrfs/transaction.h
··· 87 87 88 88 int btrfs_end_transaction(struct btrfs_trans_handle *trans, 89 89 struct btrfs_root *root); 90 + int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, 91 + struct btrfs_root *root); 90 92 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 91 93 int num_items); 92 94 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 93 95 int num_blocks); 96 + struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, 97 + int num_blocks); 94 98 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 95 99 int num_blocks); 100 + int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 96 101 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 97 102 struct btrfs_root *root); 98 103 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, ··· 109 104 int btrfs_clean_old_snapshots(struct btrfs_root *root); 110 105 int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 111 106 struct btrfs_root *root); 107 + int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, 108 + struct btrfs_root *root, 109 + int wait_for_unblock); 112 110 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 113 111 struct btrfs_root *root); 114 112 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
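To tie the new exports together, a kernel-side sketch of the intended calling pattern, assuming the usual "ctree.h"/"transaction.h" includes; it mirrors what btrfs_ioctl_start_sync() does in the ioctl.c hunk above and is illustrative rather than part of the patch:

/*
 * Start a commit without blocking and remember its transid so a later
 * caller can use btrfs_wait_for_commit() on it.
 */
static u64 example_async_commit(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	u64 transid;

	trans = btrfs_start_transaction(root, 0);
	transid = trans->transid;
	btrfs_commit_transaction_async(trans, root, 0);

	return transid;	/* later: btrfs_wait_for_commit(root, transid) */
}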
-2
fs/btrfs/tree-defrag.c
··· 36 36 int ret = 0; 37 37 int wret; 38 38 int level; 39 - int orig_level; 40 39 int is_extent = 0; 41 40 int next_key_ret = 0; 42 41 u64 last_ret = 0; ··· 63 64 return -ENOMEM; 64 65 65 66 level = btrfs_header_level(root->node); 66 - orig_level = level; 67 67 68 68 if (level == 0) 69 69 goto out;
+1 -16
fs/btrfs/tree-log.c
··· 786 786 { 787 787 struct inode *dir; 788 788 int ret; 789 - struct btrfs_key location; 790 789 struct btrfs_inode_ref *ref; 791 790 struct btrfs_dir_item *di; 792 791 struct inode *inode; ··· 793 794 int namelen; 794 795 unsigned long ref_ptr; 795 796 unsigned long ref_end; 796 - 797 - location.objectid = key->objectid; 798 - location.type = BTRFS_INODE_ITEM_KEY; 799 - location.offset = 0; 800 797 801 798 /* 802 799 * it is possible that we didn't log all the parent directories ··· 1578 1583 struct btrfs_path *path; 1579 1584 struct btrfs_root *root = wc->replay_dest; 1580 1585 struct btrfs_key key; 1581 - u32 item_size; 1582 1586 int level; 1583 1587 int i; 1584 1588 int ret; ··· 1595 1601 nritems = btrfs_header_nritems(eb); 1596 1602 for (i = 0; i < nritems; i++) { 1597 1603 btrfs_item_key_to_cpu(eb, &key, i); 1598 - item_size = btrfs_item_size_nr(eb, i); 1599 1604 1600 1605 /* inode keys are done during the first stage */ 1601 1606 if (key.type == BTRFS_INODE_ITEM_KEY && ··· 1661 1668 struct walk_control *wc) 1662 1669 { 1663 1670 u64 root_owner; 1664 - u64 root_gen; 1665 1671 u64 bytenr; 1666 1672 u64 ptr_gen; 1667 1673 struct extent_buffer *next; ··· 1690 1698 1691 1699 parent = path->nodes[*level]; 1692 1700 root_owner = btrfs_header_owner(parent); 1693 - root_gen = btrfs_header_generation(parent); 1694 1701 1695 1702 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1696 1703 ··· 1740 1749 struct walk_control *wc) 1741 1750 { 1742 1751 u64 root_owner; 1743 - u64 root_gen; 1744 1752 int i; 1745 1753 int slot; 1746 1754 int ret; ··· 1747 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1748 1758 slot = path->slots[i]; 1749 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 1750 - struct extent_buffer *node; 1751 - node = path->nodes[i]; 1752 1760 path->slots[i]++; 1753 1761 *level = i; 1754 1762 WARN_ON(*level == 0); ··· 1759 1771 parent = path->nodes[*level + 1]; 1760 1772 1761 1773 root_owner = btrfs_header_owner(parent); 1762 - root_gen = btrfs_header_generation(parent); 1763 1774 wc->process_func(root, path->nodes[*level], wc, 1764 1775 btrfs_header_generation(path->nodes[*level])); 1765 1776 if (wc->free) { ··· 2260 2273 } 2261 2274 btrfs_end_log_trans(root); 2262 2275 2263 - return 0; 2276 + return err; 2264 2277 } 2265 2278 2266 2279 /* see comments for btrfs_del_dir_entries_in_log */ ··· 2716 2729 struct btrfs_key max_key; 2717 2730 struct btrfs_root *log = root->log_root; 2718 2731 struct extent_buffer *src = NULL; 2719 - u32 size; 2720 2732 int err = 0; 2721 2733 int ret; 2722 2734 int nritems; ··· 2779 2793 break; 2780 2794 2781 2795 src = path->nodes[0]; 2782 - size = btrfs_item_size_nr(src, path->slots[0]); 2783 2796 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2784 2797 ins_nr++; 2785 2798 goto next_slot;
+1 -6
fs/btrfs/volumes.c
··· 1898 1898 u64 size_to_free; 1899 1899 struct btrfs_path *path; 1900 1900 struct btrfs_key key; 1901 - struct btrfs_chunk *chunk; 1902 1901 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 1903 1902 struct btrfs_trans_handle *trans; 1904 1903 struct btrfs_key found_key; ··· 1961 1962 if (found_key.objectid != key.objectid) 1962 1963 break; 1963 1964 1964 - chunk = btrfs_item_ptr(path->nodes[0], 1965 - path->slots[0], 1966 - struct btrfs_chunk); 1967 1965 /* chunk zero is special */ 1968 1966 if (found_key.offset == 0) 1969 1967 break; ··· 3027 3031 } 3028 3032 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3029 3033 dev = multi->stripes[dev_nr].dev; 3030 - BUG_ON(rw == WRITE && !dev->writeable); 3031 - if (dev && dev->bdev) { 3034 + if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3032 3035 bio->bi_bdev = dev->bdev; 3033 3036 if (async_submit) 3034 3037 schedule_bio(root, dev, rw, bio);
-2
fs/btrfs/xattr.c
··· 178 178 struct inode *inode = dentry->d_inode; 179 179 struct btrfs_root *root = BTRFS_I(inode)->root; 180 180 struct btrfs_path *path; 181 - struct btrfs_item *item; 182 181 struct extent_buffer *leaf; 183 182 struct btrfs_dir_item *di; 184 183 int ret = 0, slot, advance; ··· 233 234 } 234 235 advance = 1; 235 236 236 - item = btrfs_item_nr(leaf, slot); 237 237 btrfs_item_key_to_cpu(leaf, &found_key, slot); 238 238 239 239 /* check to make sure this item is what we want */
-5
fs/btrfs/zlib.c
··· 199 199 int nr_pages = 0; 200 200 struct page *in_page = NULL; 201 201 struct page *out_page = NULL; 202 - int out_written = 0; 203 - int in_read = 0; 204 202 unsigned long bytes_left; 205 203 206 204 *out_pages = 0; ··· 230 232 workspace->def_strm.next_out = cpage_out; 231 233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 232 234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 233 - 234 - out_written = 0; 235 - in_read = 0; 236 235 237 236 while (workspace->def_strm.total_in < len) { 238 237 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+40 -7
fs/fs-writeback.c
··· 1081 1081 } 1082 1082 1083 1083 /** 1084 - * writeback_inodes_sb - writeback dirty inodes from given super_block 1084 + * writeback_inodes_sb_nr - writeback dirty inodes from given super_block 1085 1085 * @sb: the superblock 1086 + * @nr: the number of pages to write 1086 1087 * 1087 1088 * Start writeback on some inodes on this super_block. No guarantees are made 1088 1089 * on how many (if any) will be written, and this function does not wait 1089 - * for IO completion of submitted IO. The number of pages submitted is 1090 - * returned. 1090 + * for IO completion of submitted IO. 1091 1091 */ 1092 - void writeback_inodes_sb(struct super_block *sb) 1092 + void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1093 1093 { 1094 1094 DECLARE_COMPLETION_ONSTACK(done); 1095 1095 struct wb_writeback_work work = { 1096 1096 .sb = sb, 1097 1097 .sync_mode = WB_SYNC_NONE, 1098 1098 .done = &done, 1099 + .nr_pages = nr, 1099 1100 }; 1100 1101 1101 1102 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1102 - 1103 - work.nr_pages = get_nr_dirty_pages(); 1104 - 1105 1103 bdi_queue_work(sb->s_bdi, &work); 1106 1104 wait_for_completion(&done); 1105 + } 1106 + EXPORT_SYMBOL(writeback_inodes_sb_nr); 1107 + 1108 + /** 1109 + * writeback_inodes_sb - writeback dirty inodes from given super_block 1110 + * @sb: the superblock 1111 + * 1112 + * Start writeback on some inodes on this super_block. No guarantees are made 1113 + * on how many (if any) will be written, and this function does not wait 1114 + * for IO completion of submitted IO. 1115 + */ 1116 + void writeback_inodes_sb(struct super_block *sb) 1117 + { 1118 + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); 1107 1119 } 1108 1120 EXPORT_SYMBOL(writeback_inodes_sb); 1109 1121 ··· 1137 1125 return 0; 1138 1126 } 1139 1127 EXPORT_SYMBOL(writeback_inodes_sb_if_idle); 1128 + 1129 + /** 1130 + * writeback_inodes_sb_if_idle - start writeback if none underway 1131 + * @sb: the superblock 1132 + * @nr: the number of pages to write 1133 + * 1134 + * Invoke writeback_inodes_sb if no writeback is currently underway. 1135 + * Returns 1 if writeback was started, 0 if not. 1136 + */ 1137 + int writeback_inodes_sb_nr_if_idle(struct super_block *sb, 1138 + unsigned long nr) 1139 + { 1140 + if (!writeback_in_progress(sb->s_bdi)) { 1141 + down_read(&sb->s_umount); 1142 + writeback_inodes_sb_nr(sb, nr); 1143 + up_read(&sb->s_umount); 1144 + return 1; 1145 + } else 1146 + return 0; 1147 + } 1148 + EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); 1140 1149 1141 1150 /** 1142 1151 * sync_inodes_sb - sync sb inode pages
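The nr-bounded helpers above let a filesystem ask the flusher threads for a limited amount of writeback instead of flushing every dirty page, which is presumably what the "use the flusher threads for delalloc throttling" change in this series builds on. A hypothetical caller, with the function name invented for illustration:

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/kernel.h>

/*
 * Ask the flusher threads to write back roughly nr_pages dirty pages for
 * this superblock, but only when no writeback is already in flight on it.
 */
static void nudge_flusher(struct super_block *sb, unsigned long nr_pages)
{
	if (writeback_inodes_sb_nr_if_idle(sb, nr_pages))
		pr_debug("queued writeback of %lu pages\n", nr_pages);
}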
+2
include/linux/writeback.h
··· 58 58 struct bdi_writeback; 59 59 int inode_wait(void *); 60 60 void writeback_inodes_sb(struct super_block *); 61 + void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); 61 62 int writeback_inodes_sb_if_idle(struct super_block *); 63 + int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); 62 64 void sync_inodes_sb(struct super_block *); 63 65 void writeback_inodes_wb(struct bdi_writeback *wb, 64 66 struct writeback_control *wbc);