Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: allow block group cache writeout outside critical section in commit

We loop through all of the dirty block groups during commit and write
the free space cache. In order to make sure the cache is correct, we do
this while no other writers are allowed in the commit.

If a large number of block groups are dirty, this can introduce long
stalls during the final stages of the commit, which can block new procs
trying to change the filesystem.

This commit changes the block group cache writeout to take appropriate
locks and allow it to run earlier in the commit. We'll still have to
redo some of the block groups, but it means we can get most of the work
out of the way without blocking the entire FS.

Signed-off-by: Chris Mason <clm@fb.com>

+341 -37
+8
fs/btrfs/ctree.h
··· 1491 1491 struct mutex chunk_mutex; 1492 1492 struct mutex volume_mutex; 1493 1493 1494 + /* 1495 + * this is taken to make sure we don't set block groups ro after 1496 + * the free space cache has been allocated on them 1497 + */ 1498 + struct mutex ro_block_group_mutex; 1499 + 1494 1500 /* this is used during read/modify/write to make sure 1495 1501 * no two ios are trying to mod the same stripe at the same 1496 1502 * time ··· 3413 3407 u64 bytenr, u64 num_bytes, u64 parent, 3414 3408 u64 root_objectid, u64 owner, u64 offset, int no_quota); 3415 3409 3410 + int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3411 + struct btrfs_root *root); 3416 3412 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3417 3413 struct btrfs_root *root); 3418 3414 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+1
fs/btrfs/disk-io.c
··· 2572 2572 mutex_init(&fs_info->transaction_kthread_mutex); 2573 2573 mutex_init(&fs_info->cleaner_mutex); 2574 2574 mutex_init(&fs_info->volume_mutex); 2575 + mutex_init(&fs_info->ro_block_group_mutex); 2575 2576 init_rwsem(&fs_info->commit_root_sem); 2576 2577 init_rwsem(&fs_info->cleanup_work_sem); 2577 2578 init_rwsem(&fs_info->subvol_sem);
+216 -25
fs/btrfs/extent-tree.c
··· 3298 3298 if (ret) 3299 3299 goto out_put; 3300 3300 3301 - ret = btrfs_truncate_free_space_cache(root, trans, inode); 3301 + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); 3302 3302 if (ret) 3303 3303 goto out_put; 3304 3304 } ··· 3382 3382 return 0; 3383 3383 } 3384 3384 3385 + /* 3386 + * transaction commit does final block group cache writeback during a 3387 + * critical section where nothing is allowed to change the FS. This is 3388 + * required in order for the cache to actually match the block group, 3389 + * but can introduce a lot of latency into the commit. 3390 + * 3391 + * So, btrfs_start_dirty_block_groups is here to kick off block group 3392 + * cache IO. There's a chance we'll have to redo some of it if the 3393 + * block group changes again during the commit, but it greatly reduces 3394 + * the commit latency by getting rid of the easy block groups while 3395 + * we're still allowing others to join the commit. 3396 + */ 3397 + int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3398 + struct btrfs_root *root) 3399 + { 3400 + struct btrfs_block_group_cache *cache; 3401 + struct btrfs_transaction *cur_trans = trans->transaction; 3402 + int ret = 0; 3403 + int should_put; 3404 + struct btrfs_path *path = NULL; 3405 + LIST_HEAD(dirty); 3406 + struct list_head *io = &cur_trans->io_bgs; 3407 + int num_started = 0; 3408 + int loops = 0; 3409 + 3410 + spin_lock(&cur_trans->dirty_bgs_lock); 3411 + if (!list_empty(&cur_trans->dirty_bgs)) { 3412 + list_splice_init(&cur_trans->dirty_bgs, &dirty); 3413 + } 3414 + spin_unlock(&cur_trans->dirty_bgs_lock); 3415 + 3416 + again: 3417 + if (list_empty(&dirty)) { 3418 + btrfs_free_path(path); 3419 + return 0; 3420 + } 3421 + 3422 + /* 3423 + * make sure all the block groups on our dirty list actually 3424 + * exist 3425 + */ 3426 + btrfs_create_pending_block_groups(trans, root); 3427 + 3428 + if (!path) { 3429 + path = btrfs_alloc_path(); 3430 + if (!path) 3431 + return -ENOMEM; 
3432 + } 3433 + 3434 + while (!list_empty(&dirty)) { 3435 + cache = list_first_entry(&dirty, 3436 + struct btrfs_block_group_cache, 3437 + dirty_list); 3438 + 3439 + /* 3440 + * cache_write_mutex is here only to save us from balance 3441 + * deleting this block group while we are writing out the 3442 + * cache 3443 + */ 3444 + mutex_lock(&trans->transaction->cache_write_mutex); 3445 + 3446 + /* 3447 + * this can happen if something re-dirties a block 3448 + * group that is already under IO. Just wait for it to 3449 + * finish and then do it all again 3450 + */ 3451 + if (!list_empty(&cache->io_list)) { 3452 + list_del_init(&cache->io_list); 3453 + btrfs_wait_cache_io(root, trans, cache, 3454 + &cache->io_ctl, path, 3455 + cache->key.objectid); 3456 + btrfs_put_block_group(cache); 3457 + } 3458 + 3459 + 3460 + /* 3461 + * btrfs_wait_cache_io uses the cache->dirty_list to decide 3462 + * if it should update the cache_state. Don't delete 3463 + * until after we wait. 3464 + * 3465 + * Since we're not running in the commit critical section 3466 + * we need the dirty_bgs_lock to protect from update_block_group 3467 + */ 3468 + spin_lock(&cur_trans->dirty_bgs_lock); 3469 + list_del_init(&cache->dirty_list); 3470 + spin_unlock(&cur_trans->dirty_bgs_lock); 3471 + 3472 + should_put = 1; 3473 + 3474 + cache_save_setup(cache, trans, path); 3475 + 3476 + if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3477 + cache->io_ctl.inode = NULL; 3478 + ret = btrfs_write_out_cache(root, trans, cache, path); 3479 + if (ret == 0 && cache->io_ctl.inode) { 3480 + num_started++; 3481 + should_put = 0; 3482 + 3483 + /* 3484 + * the cache_write_mutex is protecting 3485 + * the io_list 3486 + */ 3487 + list_add_tail(&cache->io_list, io); 3488 + } else { 3489 + /* 3490 + * if we failed to write the cache, the 3491 + * generation will be bad and life goes on 3492 + */ 3493 + ret = 0; 3494 + } 3495 + } 3496 + if (!ret) 3497 + ret = write_one_cache_group(trans, root, path, cache); 3498 + 
mutex_unlock(&trans->transaction->cache_write_mutex); 3499 + 3500 + /* if its not on the io list, we need to put the block group */ 3501 + if (should_put) 3502 + btrfs_put_block_group(cache); 3503 + 3504 + if (ret) 3505 + break; 3506 + } 3507 + 3508 + /* 3509 + * go through delayed refs for all the stuff we've just kicked off 3510 + * and then loop back (just once) 3511 + */ 3512 + ret = btrfs_run_delayed_refs(trans, root, 0); 3513 + if (!ret && loops == 0) { 3514 + loops++; 3515 + spin_lock(&cur_trans->dirty_bgs_lock); 3516 + list_splice_init(&cur_trans->dirty_bgs, &dirty); 3517 + spin_unlock(&cur_trans->dirty_bgs_lock); 3518 + goto again; 3519 + } 3520 + 3521 + btrfs_free_path(path); 3522 + return ret; 3523 + } 3524 + 3385 3525 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3386 3526 struct btrfs_root *root) 3387 3527 { ··· 3530 3390 int ret = 0; 3531 3391 int should_put; 3532 3392 struct btrfs_path *path; 3533 - LIST_HEAD(io); 3393 + struct list_head *io = &cur_trans->io_bgs; 3534 3394 int num_started = 0; 3535 - int num_waited = 0; 3536 - 3537 - if (list_empty(&cur_trans->dirty_bgs)) 3538 - return 0; 3539 3395 3540 3396 path = btrfs_alloc_path(); 3541 3397 if (!path) ··· 3559 3423 &cache->io_ctl, path, 3560 3424 cache->key.objectid); 3561 3425 btrfs_put_block_group(cache); 3562 - num_waited++; 3563 3426 } 3564 3427 3428 + /* 3429 + * don't remove from the dirty list until after we've waited 3430 + * on any pending IO 3431 + */ 3565 3432 list_del_init(&cache->dirty_list); 3566 3433 should_put = 1; 3567 3434 3568 - if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3569 - cache_save_setup(cache, trans, path); 3435 + cache_save_setup(cache, trans, path); 3570 3436 3571 3437 if (!ret) 3572 3438 ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1); ··· 3579 3441 if (ret == 0 && cache->io_ctl.inode) { 3580 3442 num_started++; 3581 3443 should_put = 0; 3582 - list_add_tail(&cache->io_list, &io); 3444 + list_add_tail(&cache->io_list, io); 
3583 3445 } else { 3584 3446 /* 3585 3447 * if we failed to write the cache, the ··· 3596 3458 btrfs_put_block_group(cache); 3597 3459 } 3598 3460 3599 - while (!list_empty(&io)) { 3600 - cache = list_first_entry(&io, struct btrfs_block_group_cache, 3461 + while (!list_empty(io)) { 3462 + cache = list_first_entry(io, struct btrfs_block_group_cache, 3601 3463 io_list); 3602 3464 list_del_init(&cache->io_list); 3603 - num_waited++; 3604 3465 btrfs_wait_cache_io(root, trans, cache, 3605 3466 &cache->io_ctl, path, cache->key.objectid); 3606 3467 btrfs_put_block_group(cache); ··· 5596 5459 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5597 5460 cache_block_group(cache, 1); 5598 5461 5599 - spin_lock(&trans->transaction->dirty_bgs_lock); 5600 - if (list_empty(&cache->dirty_list)) { 5601 - list_add_tail(&cache->dirty_list, 5602 - &trans->transaction->dirty_bgs); 5603 - trans->transaction->num_dirty_bgs++; 5604 - btrfs_get_block_group(cache); 5605 - } 5606 - spin_unlock(&trans->transaction->dirty_bgs_lock); 5607 - 5608 5462 byte_in_group = bytenr - cache->key.objectid; 5609 5463 WARN_ON(byte_in_group > cache->key.offset); 5610 5464 ··· 5644 5516 spin_unlock(&info->unused_bgs_lock); 5645 5517 } 5646 5518 } 5519 + 5520 + spin_lock(&trans->transaction->dirty_bgs_lock); 5521 + if (list_empty(&cache->dirty_list)) { 5522 + list_add_tail(&cache->dirty_list, 5523 + &trans->transaction->dirty_bgs); 5524 + trans->transaction->num_dirty_bgs++; 5525 + btrfs_get_block_group(cache); 5526 + } 5527 + spin_unlock(&trans->transaction->dirty_bgs_lock); 5528 + 5647 5529 btrfs_put_block_group(cache); 5648 5530 total -= num_bytes; 5649 5531 bytenr += num_bytes; ··· 8740 8602 8741 8603 BUG_ON(cache->ro); 8742 8604 8605 + again: 8743 8606 trans = btrfs_join_transaction(root); 8744 8607 if (IS_ERR(trans)) 8745 8608 return PTR_ERR(trans); 8609 + 8610 + /* 8611 + * we're not allowed to set block groups readonly after the dirty 8612 + * block groups cache has started writing. 
If it already started, 8613 + * back off and let this transaction commit 8614 + */ 8615 + mutex_lock(&root->fs_info->ro_block_group_mutex); 8616 + if (trans->transaction->dirty_bg_run) { 8617 + u64 transid = trans->transid; 8618 + 8619 + mutex_unlock(&root->fs_info->ro_block_group_mutex); 8620 + btrfs_end_transaction(trans, root); 8621 + 8622 + ret = btrfs_wait_for_commit(root, transid); 8623 + if (ret) 8624 + return ret; 8625 + goto again; 8626 + } 8627 + 8746 8628 8747 8629 ret = set_block_group_ro(cache, 0); 8748 8630 if (!ret) ··· 8778 8620 alloc_flags = update_block_group_flags(root, cache->flags); 8779 8621 check_system_chunk(trans, root, alloc_flags); 8780 8622 } 8623 + mutex_unlock(&root->fs_info->ro_block_group_mutex); 8781 8624 8782 8625 btrfs_end_transaction(trans, root); 8783 8626 return ret; ··· 9584 9425 goto out; 9585 9426 } 9586 9427 9428 + /* 9429 + * get the inode first so any iput calls done for the io_list 9430 + * aren't the final iput (no unlinks allowed now) 9431 + */ 9587 9432 inode = lookup_free_space_inode(tree_root, block_group, path); 9433 + 9434 + mutex_lock(&trans->transaction->cache_write_mutex); 9435 + /* 9436 + * make sure our free spache cache IO is done before remove the 9437 + * free space inode 9438 + */ 9439 + spin_lock(&trans->transaction->dirty_bgs_lock); 9440 + if (!list_empty(&block_group->io_list)) { 9441 + list_del_init(&block_group->io_list); 9442 + 9443 + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 9444 + 9445 + spin_unlock(&trans->transaction->dirty_bgs_lock); 9446 + btrfs_wait_cache_io(root, trans, block_group, 9447 + &block_group->io_ctl, path, 9448 + block_group->key.objectid); 9449 + btrfs_put_block_group(block_group); 9450 + spin_lock(&trans->transaction->dirty_bgs_lock); 9451 + } 9452 + 9453 + if (!list_empty(&block_group->dirty_list)) { 9454 + list_del_init(&block_group->dirty_list); 9455 + btrfs_put_block_group(block_group); 9456 + } 9457 + spin_unlock(&trans->transaction->dirty_bgs_lock); 
9458 + mutex_unlock(&trans->transaction->cache_write_mutex); 9459 + 9588 9460 if (!IS_ERR(inode)) { 9589 9461 ret = btrfs_orphan_add(trans, inode); 9590 9462 if (ret) { ··· 9708 9518 9709 9519 spin_lock(&trans->transaction->dirty_bgs_lock); 9710 9520 if (!list_empty(&block_group->dirty_list)) { 9711 - list_del_init(&block_group->dirty_list); 9712 - btrfs_put_block_group(block_group); 9521 + WARN_ON(1); 9522 + } 9523 + if (!list_empty(&block_group->io_list)) { 9524 + WARN_ON(1); 9713 9525 } 9714 9526 spin_unlock(&trans->transaction->dirty_bgs_lock); 9715 - 9716 9527 btrfs_remove_free_space_cache(block_group); 9717 9528 9718 9529 spin_lock(&block_group->space_info->lock);
+62 -7
fs/btrfs/free-space-cache.c
··· 226 226 227 227 int btrfs_truncate_free_space_cache(struct btrfs_root *root, 228 228 struct btrfs_trans_handle *trans, 229 + struct btrfs_block_group_cache *block_group, 229 230 struct inode *inode) 230 231 { 231 232 int ret = 0; 233 + struct btrfs_path *path = btrfs_alloc_path(); 234 + 235 + if (!path) { 236 + ret = -ENOMEM; 237 + goto fail; 238 + } 239 + 240 + if (block_group) { 241 + mutex_lock(&trans->transaction->cache_write_mutex); 242 + if (!list_empty(&block_group->io_list)) { 243 + list_del_init(&block_group->io_list); 244 + 245 + btrfs_wait_cache_io(root, trans, block_group, 246 + &block_group->io_ctl, path, 247 + block_group->key.objectid); 248 + btrfs_put_block_group(block_group); 249 + } 250 + 251 + /* 252 + * now that we've truncated the cache away, its no longer 253 + * setup or written 254 + */ 255 + spin_lock(&block_group->lock); 256 + block_group->disk_cache_state = BTRFS_DC_CLEAR; 257 + spin_unlock(&block_group->lock); 258 + } 259 + btrfs_free_path(path); 232 260 233 261 btrfs_i_size_write(inode, 0); 234 262 truncate_pagecache(inode, 0); ··· 270 242 ret = btrfs_truncate_inode_items(trans, root, inode, 271 243 0, BTRFS_EXTENT_DATA_KEY); 272 244 if (ret) { 245 + mutex_unlock(&trans->transaction->cache_write_mutex); 273 246 btrfs_abort_transaction(trans, root, ret); 274 247 return ret; 275 248 } 276 249 277 250 ret = btrfs_update_inode(trans, root, inode); 251 + 252 + if (block_group) 253 + mutex_unlock(&trans->transaction->cache_write_mutex); 254 + 255 + fail: 278 256 if (ret) 279 257 btrfs_abort_transaction(trans, root, ret); 280 258 ··· 910 876 { 911 877 int ret; 912 878 struct btrfs_free_cluster *cluster = NULL; 879 + struct btrfs_free_cluster *cluster_locked = NULL; 913 880 struct rb_node *node = rb_first(&ctl->free_space_offset); 914 881 struct btrfs_trim_range *trim_entry; 915 882 ··· 922 887 } 923 888 924 889 if (!node && cluster) { 890 + cluster_locked = cluster; 891 + spin_lock(&cluster_locked->lock); 925 892 node = 
rb_first(&cluster->root); 926 893 cluster = NULL; 927 894 } ··· 947 910 node = rb_next(node); 948 911 if (!node && cluster) { 949 912 node = rb_first(&cluster->root); 913 + cluster_locked = cluster; 914 + spin_lock(&cluster_locked->lock); 950 915 cluster = NULL; 951 916 } 917 + } 918 + if (cluster_locked) { 919 + spin_unlock(&cluster_locked->lock); 920 + cluster_locked = NULL; 952 921 } 953 922 954 923 /* ··· 973 930 974 931 return 0; 975 932 fail: 933 + if (cluster_locked) 934 + spin_unlock(&cluster_locked->lock); 976 935 return -ENOSPC; 977 936 } 978 937 ··· 1146 1101 int ret; 1147 1102 struct inode *inode = io_ctl->inode; 1148 1103 1104 + if (!inode) 1105 + return 0; 1106 + 1149 1107 root = root->fs_info->tree_root; 1150 1108 1151 1109 /* Flush the dirty pages in the cache file. */ ··· 1175 1127 btrfs_update_inode(trans, root, inode); 1176 1128 1177 1129 if (block_group) { 1130 + /* the dirty list is protected by the dirty_bgs_lock */ 1131 + spin_lock(&trans->transaction->dirty_bgs_lock); 1132 + 1133 + /* the disk_cache_state is protected by the block group lock */ 1178 1134 spin_lock(&block_group->lock); 1179 1135 1180 1136 /* 1181 1137 * only mark this as written if we didn't get put back on 1182 - * the dirty list while waiting for IO. 1138 + * the dirty list while waiting for IO. 
Otherwise our 1139 + * cache state won't be right, and we won't get written again 1183 1140 */ 1184 1141 if (!ret && list_empty(&block_group->dirty_list)) 1185 1142 block_group->disk_cache_state = BTRFS_DC_WRITTEN; ··· 1192 1139 block_group->disk_cache_state = BTRFS_DC_ERROR; 1193 1140 1194 1141 spin_unlock(&block_group->lock); 1142 + spin_unlock(&trans->transaction->dirty_bgs_lock); 1195 1143 io_ctl->inode = NULL; 1196 1144 iput(inode); 1197 1145 } ··· 1261 1207 1262 1208 mutex_lock(&ctl->cache_writeout_mutex); 1263 1209 /* Write out the extent entries in the free space cache */ 1210 + spin_lock(&ctl->tree_lock); 1264 1211 ret = write_cache_extent_entries(io_ctl, ctl, 1265 1212 block_group, &entries, &bitmaps, 1266 1213 &bitmap_list); 1214 + spin_unlock(&ctl->tree_lock); 1267 1215 if (ret) { 1268 1216 mutex_unlock(&ctl->cache_writeout_mutex); 1269 1217 goto out_nospc; ··· 1275 1219 * Some spaces that are freed in the current transaction are pinned, 1276 1220 * they will be added into free space cache after the transaction is 1277 1221 * committed, we shouldn't lose them. 1222 + * 1223 + * If this changes while we are working we'll get added back to 1224 + * the dirty list and redo it. No locking needed 1278 1225 */ 1279 1226 ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries); 1280 1227 if (ret) { ··· 1290 1231 * locked while doing it because a concurrent trim can be manipulating 1291 1232 * or freeing the bitmap. 
1292 1233 */ 1234 + spin_lock(&ctl->tree_lock); 1293 1235 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1236 + spin_unlock(&ctl->tree_lock); 1294 1237 mutex_unlock(&ctl->cache_writeout_mutex); 1295 1238 if (ret) 1296 1239 goto out_nospc; ··· 1365 1304 1366 1305 spin_lock(&block_group->lock); 1367 1306 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1368 - spin_unlock(&block_group->lock); 1369 - return 0; 1370 - } 1371 - 1372 - if (block_group->delalloc_bytes) { 1373 - block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1374 1307 spin_unlock(&block_group->lock); 1375 1308 return 0; 1376 1309 }
+1
fs/btrfs/free-space-cache.h
··· 62 62 struct btrfs_block_rsv *rsv); 63 63 int btrfs_truncate_free_space_cache(struct btrfs_root *root, 64 64 struct btrfs_trans_handle *trans, 65 + struct btrfs_block_group_cache *block_group, 65 66 struct inode *inode); 66 67 int load_free_space_cache(struct btrfs_fs_info *fs_info, 67 68 struct btrfs_block_group_cache *block_group);
+1 -1
fs/btrfs/inode-map.c
··· 456 456 } 457 457 458 458 if (i_size_read(inode) > 0) { 459 - ret = btrfs_truncate_free_space_cache(root, trans, inode); 459 + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); 460 460 if (ret) { 461 461 if (ret != -ENOSPC) 462 462 btrfs_abort_transaction(trans, root, ret);
+6 -3
fs/btrfs/relocation.c
··· 3430 3430 } 3431 3431 3432 3432 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3433 - struct inode *inode, u64 ino) 3433 + struct btrfs_block_group_cache *block_group, 3434 + struct inode *inode, 3435 + u64 ino) 3434 3436 { 3435 3437 struct btrfs_key key; 3436 3438 struct btrfs_root *root = fs_info->tree_root; ··· 3465 3463 goto out; 3466 3464 } 3467 3465 3468 - ret = btrfs_truncate_free_space_cache(root, trans, inode); 3466 + ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode); 3469 3467 3470 3468 btrfs_end_transaction(trans, root); 3471 3469 btrfs_btree_balance_dirty(root); ··· 3511 3509 */ 3512 3510 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { 3513 3511 ret = delete_block_group_cache(rc->extent_root->fs_info, 3512 + rc->block_group, 3514 3513 NULL, ref_objectid); 3515 3514 if (ret != -ENOENT) 3516 3515 return ret; ··· 4226 4223 btrfs_free_path(path); 4227 4224 4228 4225 if (!IS_ERR(inode)) 4229 - ret = delete_block_group_cache(fs_info, inode, 0); 4226 + ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); 4230 4227 else 4231 4228 ret = PTR_ERR(inode); 4232 4229
+37 -1
fs/btrfs/transaction.c
··· 222 222 atomic_set(&cur_trans->use_count, 2); 223 223 cur_trans->have_free_bgs = 0; 224 224 cur_trans->start_time = get_seconds(); 225 + cur_trans->dirty_bg_run = 0; 225 226 226 227 cur_trans->delayed_refs.href_root = RB_ROOT; 227 228 atomic_set(&cur_trans->delayed_refs.num_entries, 0); ··· 252 251 INIT_LIST_HEAD(&cur_trans->switch_commits); 253 252 INIT_LIST_HEAD(&cur_trans->pending_ordered); 254 253 INIT_LIST_HEAD(&cur_trans->dirty_bgs); 254 + INIT_LIST_HEAD(&cur_trans->io_bgs); 255 + mutex_init(&cur_trans->cache_write_mutex); 255 256 cur_trans->num_dirty_bgs = 0; 256 257 spin_lock_init(&cur_trans->dirty_bgs_lock); 257 258 list_add_tail(&cur_trans->list, &fs_info->trans_list); ··· 1062 1059 { 1063 1060 struct btrfs_fs_info *fs_info = root->fs_info; 1064 1061 struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; 1062 + struct list_head *io_bgs = &trans->transaction->io_bgs; 1065 1063 struct list_head *next; 1066 1064 struct extent_buffer *eb; 1067 1065 int ret; ··· 1116 1112 return ret; 1117 1113 } 1118 1114 1119 - while (!list_empty(dirty_bgs)) { 1115 + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { 1120 1116 ret = btrfs_write_dirty_block_groups(trans, root); 1121 1117 if (ret) 1122 1118 return ret; ··· 1816 1812 return ret; 1817 1813 } 1818 1814 1815 + if (!cur_trans->dirty_bg_run) { 1816 + int run_it = 0; 1817 + 1818 + /* this mutex is also taken before trying to set 1819 + * block groups readonly. We need to make sure 1820 + * that nobody has set a block group readonly 1821 + * after a extents from that block group have been 1822 + * allocated for cache files. btrfs_set_block_group_ro 1823 + * will wait for the transaction to commit if it 1824 + * finds dirty_bg_run = 1 1825 + * 1826 + * The dirty_bg_run flag is also used to make sure only 1827 + * one process starts all the block group IO. It wouldn't 1828 + * hurt to have more than one go through, but there's no 1829 + * real advantage to it either. 
1830 + */ 1831 + mutex_lock(&root->fs_info->ro_block_group_mutex); 1832 + if (!cur_trans->dirty_bg_run) { 1833 + run_it = 1; 1834 + cur_trans->dirty_bg_run = 1; 1835 + } 1836 + mutex_unlock(&root->fs_info->ro_block_group_mutex); 1837 + 1838 + if (run_it) 1839 + ret = btrfs_start_dirty_block_groups(trans, root); 1840 + } 1841 + if (ret) { 1842 + btrfs_end_transaction(trans, root); 1843 + return ret; 1844 + } 1845 + 1819 1846 spin_lock(&root->fs_info->trans_lock); 1820 1847 list_splice(&trans->ordered, &cur_trans->pending_ordered); 1821 1848 if (cur_trans->state >= TRANS_STATE_COMMIT_START) { ··· 2040 2005 2041 2006 assert_qgroups_uptodate(trans); 2042 2007 ASSERT(list_empty(&cur_trans->dirty_bgs)); 2008 + ASSERT(list_empty(&cur_trans->io_bgs)); 2043 2009 update_super_roots(root); 2044 2010 2045 2011 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+9
fs/btrfs/transaction.h
··· 64 64 struct list_head pending_ordered; 65 65 struct list_head switch_commits; 66 66 struct list_head dirty_bgs; 67 + struct list_head io_bgs; 67 68 u64 num_dirty_bgs; 69 + 70 + /* 71 + * we need to make sure block group deletion doesn't race with 72 + * free space cache writeout. This mutex keeps them from stomping 73 + * on each other 74 + */ 75 + struct mutex cache_write_mutex; 68 76 spinlock_t dirty_bgs_lock; 69 77 struct btrfs_delayed_ref_root delayed_refs; 70 78 int aborted; 79 + int dirty_bg_run; 71 80 }; 72 81 73 82 #define __TRANS_FREEZABLE (1U << 0)