Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
Btrfs: fix panic when trying to destroy a newly allocated inode
Btrfs: allow more metadata chunk preallocation
Btrfs: fallback on uncompressed io if compressed io fails
Btrfs: find ideal block group for caching
Btrfs: avoid null deref in unpin_extent_cache()
Btrfs: skip btrfs_release_path in btrfs_update_root and btrfs_del_root
Btrfs: fix some metadata enospc issues
Btrfs: fix how we set max_size for free space clusters
Btrfs: cleanup transaction starting and fix journal_info usage
Btrfs: fix data allocation hint start

Linus Torvalds 16 years ago aa021baa 404291ac

+183 -50

6 changed files

expand all

btrfs

extent-tree.c

extent_map.c

free-space-cache.c

inode.c

root-tree.c

transaction.c

+88 -25

fs/btrfs/extent-tree.c

··· 2977 2977 2978 2978 free_space = btrfs_super_total_bytes(disk_super); 2979 2979 /* 2980 - * we allow the metadata to grow to a max of either 5gb or 5% of the 2980 + * we allow the metadata to grow to a max of either 10gb or 5% of the 2981 2981 * space in the volume. 2982 2982 */ 2983 - min_metadata = min((u64)5 * 1024 * 1024 * 1024, 2983 + min_metadata = min((u64)10 * 1024 * 1024 * 1024, 2984 2984 div64_u64(free_space * 5, 100)); 2985 2985 if (info->total_bytes >= min_metadata) { 2986 2986 spin_unlock(&info->lock); ··· 4102 4102 } 4103 4103 4104 4104 enum btrfs_loop_type { 4105 - LOOP_CACHED_ONLY = 0, 4105 + LOOP_FIND_IDEAL = 0, 4106 4106 LOOP_CACHING_NOWAIT = 1, 4107 4107 LOOP_CACHING_WAIT = 2, 4108 4108 LOOP_ALLOC_CHUNK = 3, ··· 4131 4131 struct btrfs_block_group_cache *block_group = NULL; 4132 4132 int empty_cluster = 2 * 1024 * 1024; 4133 4133 int allowed_chunk_alloc = 0; 4134 + int done_chunk_alloc = 0; 4134 4135 struct btrfs_space_info *space_info; 4135 4136 int last_ptr_loop = 0; 4136 4137 int loop = 0; 4137 4138 bool found_uncached_bg = false; 4138 4139 bool failed_cluster_refill = false; 4139 4140 bool failed_alloc = false; 4141 + u64 ideal_cache_percent = 0; 4142 + u64 ideal_cache_offset = 0; 4140 4143 4141 4144 WARN_ON(num_bytes < root->sectorsize); 4142 4145 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); ··· 4175 4172 empty_cluster = 0; 4176 4173 4177 4174 if (search_start == hint_byte) { 4175 + ideal_cache: 4178 4176 block_group = btrfs_lookup_block_group(root->fs_info, 4179 4177 search_start); 4180 4178 /* 4181 4179 * we don't want to use the block group if it doesn't match our 4182 4180 * allocation bits, or if its not cached. 4181 + * 4182 + * However if we are re-searching with an ideal block group 4183 + * picked out then we don't care that the block group is cached. 
4183 4184 */ 4184 4185 if (block_group && block_group_bits(block_group, data) && 4185 - block_group_cache_done(block_group)) { 4186 + (block_group->cached != BTRFS_CACHE_NO || 4187 + search_start == ideal_cache_offset)) { 4186 4188 down_read(&space_info->groups_sem); 4187 4189 if (list_empty(&block_group->list) || 4188 4190 block_group->ro) { ··· 4199 4191 */ 4200 4192 btrfs_put_block_group(block_group); 4201 4193 up_read(&space_info->groups_sem); 4202 - } else 4194 + } else { 4203 4195 goto have_block_group; 4196 + } 4204 4197 } else if (block_group) { 4205 4198 btrfs_put_block_group(block_group); 4206 4199 } 4207 4200 } 4208 - 4209 4201 search: 4210 4202 down_read(&space_info->groups_sem); 4211 4203 list_for_each_entry(block_group, &space_info->block_groups, list) { ··· 4217 4209 4218 4210 have_block_group: 4219 4211 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4212 + u64 free_percent; 4213 + 4214 + free_percent = btrfs_block_group_used(&block_group->item); 4215 + free_percent *= 100; 4216 + free_percent = div64_u64(free_percent, 4217 + block_group->key.offset); 4218 + free_percent = 100 - free_percent; 4219 + if (free_percent > ideal_cache_percent && 4220 + likely(!block_group->ro)) { 4221 + ideal_cache_offset = block_group->key.objectid; 4222 + ideal_cache_percent = free_percent; 4223 + } 4224 + 4220 4225 /* 4221 - * we want to start caching kthreads, but not too many 4222 - * right off the bat so we don't overwhelm the system, 4223 - * so only start them if there are less than 2 and we're 4224 - * in the initial allocation phase. 4226 + * We only want to start kthread caching if we are at 4227 + * the point where we will wait for caching to make 4228 + * progress, or if our ideal search is over and we've 4229 + * found somebody to start caching. 
4225 4230 */ 4226 4231 if (loop > LOOP_CACHING_NOWAIT || 4227 - atomic_read(&space_info->caching_threads) < 2) { 4232 + (loop > LOOP_FIND_IDEAL && 4233 + atomic_read(&space_info->caching_threads) < 2)) { 4228 4234 ret = cache_block_group(block_group); 4229 4235 BUG_ON(ret); 4230 4236 } 4237 + found_uncached_bg = true; 4238 + 4239 + /* 4240 + * If loop is set for cached only, try the next block 4241 + * group. 4242 + */ 4243 + if (loop == LOOP_FIND_IDEAL) 4244 + goto loop; 4231 4245 } 4232 4246 4233 4247 cached = block_group_cache_done(block_group); 4234 - if (unlikely(!cached)) { 4248 + if (unlikely(!cached)) 4235 4249 found_uncached_bg = true; 4236 - 4237 - /* if we only want cached bgs, loop */ 4238 - if (loop == LOOP_CACHED_ONLY) 4239 - goto loop; 4240 - } 4241 4250 4242 4251 if (unlikely(block_group->ro)) 4243 4252 goto loop; ··· 4435 4410 } 4436 4411 up_read(&space_info->groups_sem); 4437 4412 4438 - /* LOOP_CACHED_ONLY, only search fully cached block groups 4439 - * LOOP_CACHING_NOWAIT, search partially cached block groups, but 4440 - * dont wait foR them to finish caching 4413 + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4414 + * for them to make caching progress. 
Also 4415 + * determine the best possible bg to cache 4416 + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 4417 + * caching kthreads as we move along 4441 4418 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 4442 4419 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 4443 4420 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try ··· 4448 4421 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4449 4422 (found_uncached_bg || empty_size || empty_cluster || 4450 4423 allowed_chunk_alloc)) { 4451 - if (found_uncached_bg) { 4424 + if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4452 4425 found_uncached_bg = false; 4453 - if (loop < LOOP_CACHING_WAIT) { 4454 - loop++; 4426 + loop++; 4427 + if (!ideal_cache_percent && 4428 + atomic_read(&space_info->caching_threads)) 4455 4429 goto search; 4456 - } 4430 + 4431 + /* 4432 + * 1 of the following 2 things have happened so far 4433 + * 4434 + * 1) We found an ideal block group for caching that 4435 + * is mostly full and will cache quickly, so we might 4436 + * as well wait for it. 4437 + * 4438 + * 2) We searched for cached only and we didn't find 4439 + * anything, and we didn't start any caching kthreads 4440 + * either, so chances are we will loop through and 4441 + * start a couple caching kthreads, and then come back 4442 + * around and just wait for them. This will be slower 4443 + * because we will have 2 caching kthreads reading at 4444 + * the same time when we could have just started one 4445 + * and waited for it to get far enough to give us an 4446 + * allocation, so go ahead and go to the wait caching 4447 + * loop. 4448 + */ 4449 + loop = LOOP_CACHING_WAIT; 4450 + search_start = ideal_cache_offset; 4451 + ideal_cache_percent = 0; 4452 + goto ideal_cache; 4453 + } else if (loop == LOOP_FIND_IDEAL) { 4454 + /* 4455 + * Didn't find a uncached bg, wait on anything we find 4456 + * next. 
4457 + */ 4458 + loop = LOOP_CACHING_WAIT; 4459 + goto search; 4460 + } 4461 + 4462 + if (loop < LOOP_CACHING_WAIT) { 4463 + loop++; 4464 + goto search; 4457 4465 } 4458 4466 4459 4467 if (loop == LOOP_ALLOC_CHUNK) { ··· 4500 4438 ret = do_chunk_alloc(trans, root, num_bytes + 4501 4439 2 * 1024 * 1024, data, 1); 4502 4440 allowed_chunk_alloc = 0; 4503 - } else { 4441 + done_chunk_alloc = 1; 4442 + } else if (!done_chunk_alloc) { 4504 4443 space_info->force_alloc = 1; 4505 4444 } 4506 4445

+1 -1

fs/btrfs/extent_map.c

··· 208 208 write_lock(&tree->lock); 209 209 em = lookup_extent_mapping(tree, start, len); 210 210 211 - WARN_ON(em->start != start || !em); 211 + WARN_ON(!em || em->start != start); 212 212 213 213 if (!em) 214 214 goto out;

+1 -1

fs/btrfs/free-space-cache.c

··· 1296 1296 window_start = entry->offset; 1297 1297 window_free = entry->bytes; 1298 1298 last = entry; 1299 - max_extent = 0; 1299 + max_extent = entry->bytes; 1300 1300 } else { 1301 1301 last = next; 1302 1302 window_free += next->bytes;

+80 -15

fs/btrfs/inode.c

··· 538 538 struct btrfs_root *root = BTRFS_I(inode)->root; 539 539 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 540 540 struct extent_io_tree *io_tree; 541 - int ret; 541 + int ret = 0; 542 542 543 543 if (list_empty(&async_cow->extents)) 544 544 return 0; ··· 552 552 553 553 io_tree = &BTRFS_I(inode)->io_tree; 554 554 555 + retry: 555 556 /* did the compression code fall back to uncompressed IO? */ 556 557 if (!async_extent->pages) { 557 558 int page_started = 0; ··· 563 562 async_extent->ram_size - 1, GFP_NOFS); 564 563 565 564 /* allocate blocks */ 566 - cow_file_range(inode, async_cow->locked_page, 567 - async_extent->start, 568 - async_extent->start + 569 - async_extent->ram_size - 1, 570 - &page_started, &nr_written, 0); 565 + ret = cow_file_range(inode, async_cow->locked_page, 566 + async_extent->start, 567 + async_extent->start + 568 + async_extent->ram_size - 1, 569 + &page_started, &nr_written, 0); 571 570 572 571 /* 573 572 * if page_started, cow_file_range inserted an ··· 575 574 * and IO for us. Otherwise, we need to submit 576 575 * all those pages down to the drive. 
577 576 */ 578 - if (!page_started) 577 + if (!page_started && !ret) 579 578 extent_write_locked_range(io_tree, 580 579 inode, async_extent->start, 581 580 async_extent->start + ··· 603 602 async_extent->compressed_size, 604 603 0, alloc_hint, 605 604 (u64)-1, &ins, 1); 606 - BUG_ON(ret); 605 + if (ret) { 606 + int i; 607 + for (i = 0; i < async_extent->nr_pages; i++) { 608 + WARN_ON(async_extent->pages[i]->mapping); 609 + page_cache_release(async_extent->pages[i]); 610 + } 611 + kfree(async_extent->pages); 612 + async_extent->nr_pages = 0; 613 + async_extent->pages = NULL; 614 + unlock_extent(io_tree, async_extent->start, 615 + async_extent->start + 616 + async_extent->ram_size - 1, GFP_NOFS); 617 + goto retry; 618 + } 619 + 607 620 em = alloc_extent_map(GFP_NOFS); 608 621 em->start = async_extent->start; 609 622 em->len = async_extent->ram_size; ··· 758 743 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, 759 744 start, num_bytes); 760 745 if (em) { 761 - alloc_hint = em->block_start; 762 - free_extent_map(em); 746 + /* 747 + * if block start isn't an actual block number then find the 748 + * first block in this inode and use that as a hint. If that 749 + * block is also bogus then just don't worry about it. 
750 + */ 751 + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { 752 + free_extent_map(em); 753 + em = search_extent_mapping(em_tree, 0, 0); 754 + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) 755 + alloc_hint = em->block_start; 756 + if (em) 757 + free_extent_map(em); 758 + } else { 759 + alloc_hint = em->block_start; 760 + free_extent_map(em); 761 + } 763 762 } 764 763 read_unlock(&BTRFS_I(inode)->extent_tree.lock); 765 764 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); ··· 2503 2474 2504 2475 root = BTRFS_I(dir)->root; 2505 2476 2477 + /* 2478 + * 5 items for unlink inode 2479 + * 1 for orphan 2480 + */ 2481 + ret = btrfs_reserve_metadata_space(root, 6); 2482 + if (ret) 2483 + return ret; 2484 + 2506 2485 trans = btrfs_start_transaction(root, 1); 2486 + if (IS_ERR(trans)) { 2487 + btrfs_unreserve_metadata_space(root, 6); 2488 + return PTR_ERR(trans); 2489 + } 2507 2490 2508 2491 btrfs_set_trans_block_group(trans, dir); 2509 2492 ··· 2530 2489 nr = trans->blocks_used; 2531 2490 2532 2491 btrfs_end_transaction_throttle(trans, root); 2492 + btrfs_unreserve_metadata_space(root, 6); 2533 2493 btrfs_btree_balance_dirty(root, nr); 2534 2494 return ret; 2535 2495 } ··· 2611 2569 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2612 2570 return -ENOTEMPTY; 2613 2571 2572 + ret = btrfs_reserve_metadata_space(root, 5); 2573 + if (ret) 2574 + return ret; 2575 + 2614 2576 trans = btrfs_start_transaction(root, 1); 2577 + if (IS_ERR(trans)) { 2578 + btrfs_unreserve_metadata_space(root, 5); 2579 + return PTR_ERR(trans); 2580 + } 2581 + 2615 2582 btrfs_set_trans_block_group(trans, dir); 2616 2583 2617 2584 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ··· 2643 2592 out: 2644 2593 nr = trans->blocks_used; 2645 2594 ret = btrfs_end_transaction_throttle(trans, root); 2595 + btrfs_unreserve_metadata_space(root, 5); 2646 2596 btrfs_btree_balance_dirty(root, nr); 2647 2597 2648 2598 if (ret && !err) ··· 5180 5128 ei->logged_trans = 0; 5181 5129 
ei->outstanding_extents = 0; 5182 5130 ei->reserved_extents = 0; 5131 + ei->root = NULL; 5183 5132 spin_lock_init(&ei->accounting_lock); 5184 5133 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5185 5134 INIT_LIST_HEAD(&ei->i_orphan); ··· 5195 5142 5196 5143 WARN_ON(!list_empty(&inode->i_dentry)); 5197 5144 WARN_ON(inode->i_data.nrpages); 5145 + 5146 + /* 5147 + * This can happen where we create an inode, but somebody else also 5148 + * created the same inode and we need to destroy the one we already 5149 + * created. 5150 + */ 5151 + if (!root) 5152 + goto free; 5198 5153 5199 5154 /* 5200 5155 * Make sure we're properly removed from the ordered operation ··· 5239 5178 } 5240 5179 inode_tree_del(inode); 5241 5180 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 5181 + free: 5242 5182 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 5243 5183 } 5244 5184 ··· 5345 5283 return -ENOTEMPTY; 5346 5284 5347 5285 /* 5348 - * 2 items for dir items 5349 - * 1 item for orphan entry 5350 - * 1 item for ref 5286 + * We want to reserve the absolute worst case amount of items. So if 5287 + * both inodes are subvols and we need to unlink them then that would 5288 + * require 4 item modifications, but if they are both normal inodes it 5289 + * would require 5 item modifications, so we'll assume their normal 5290 + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 5291 + * should cover the worst case number of items we'll modify. 5351 5292 */ 5352 - ret = btrfs_reserve_metadata_space(root, 4); 5293 + ret = btrfs_reserve_metadata_space(root, 11); 5353 5294 if (ret) 5354 5295 return ret; 5355 5296 ··· 5468 5403 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5469 5404 up_read(&root->fs_info->subvol_sem); 5470 5405 5471 - btrfs_unreserve_metadata_space(root, 4); 5406 + btrfs_unreserve_metadata_space(root, 11); 5472 5407 return ret; 5473 5408 } 5474 5409

-2

fs/btrfs/root-tree.c

··· 159 159 write_extent_buffer(l, item, ptr, sizeof(*item)); 160 160 btrfs_mark_buffer_dirty(path->nodes[0]); 161 161 out: 162 - btrfs_release_path(root, path); 163 162 btrfs_free_path(path); 164 163 return ret; 165 164 } ··· 331 332 BUG_ON(refs != 0); 332 333 ret = btrfs_del_item(trans, root, path); 333 334 out: 334 - btrfs_release_path(root, path); 335 335 btrfs_free_path(path); 336 336 return ret; 337 337 }

+13 -6

fs/btrfs/transaction.c

··· 163 163 } 164 164 } 165 165 166 + enum btrfs_trans_type { 167 + TRANS_START, 168 + TRANS_JOIN, 169 + TRANS_USERSPACE, 170 + }; 171 + 166 172 static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 167 - int num_blocks, int wait) 173 + int num_blocks, int type) 168 174 { 169 175 struct btrfs_trans_handle *h = 170 176 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); ··· 178 172 179 173 mutex_lock(&root->fs_info->trans_mutex); 180 174 if (!root->fs_info->log_root_recovering && 181 - ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) 175 + ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || 176 + type == TRANS_USERSPACE)) 182 177 wait_current_trans(root); 183 178 ret = join_transaction(root); 184 179 BUG_ON(ret); ··· 193 186 h->alloc_exclude_start = 0; 194 187 h->delayed_ref_updates = 0; 195 188 196 - if (!current->journal_info) 189 + if (!current->journal_info && type != TRANS_USERSPACE) 197 190 current->journal_info = h; 198 191 199 192 root->fs_info->running_transaction->use_count++; ··· 205 198 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 206 199 int num_blocks) 207 200 { 208 - return start_transaction(root, num_blocks, 1); 201 + return start_transaction(root, num_blocks, TRANS_START); 209 202 } 210 203 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 211 204 int num_blocks) 212 205 { 213 - return start_transaction(root, num_blocks, 0); 206 + return start_transaction(root, num_blocks, TRANS_JOIN); 214 207 } 215 208 216 209 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 217 210 int num_blocks) 218 211 { 219 - return start_transaction(r, num_blocks, 2); 212 + return start_transaction(r, num_blocks, TRANS_USERSPACE); 220 213 } 221 214 222 215 /* wait for a transaction commit to be fully complete */