Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: allow block group cache writeout outside critical section in commit

We loop through all of the dirty block groups during commit and write
the free space cache. In order to make sure the cache is correct, we do
this while no other writers are allowed in the commit.

If a large number of block groups are dirty, this can introduce long
stalls during the final stages of the commit, which can block new procs
trying to change the filesystem.

This commit changes the block group cache writeout to take appropriate
locks and allow it to run earlier in the commit. We'll still have to
redo some of the block groups, but it means we can get most of the work
out of the way without blocking the entire FS.

Signed-off-by: Chris Mason <clm@fb.com>

+341 -37
+8
fs/btrfs/ctree.h
··· 1491 1491 struct mutex chunk_mutex; 1492 1492 struct mutex volume_mutex; 1493 1493 1494 + /* 1495 + * this is taken to make sure we don't set block groups ro after 1496 + * the free space cache has been allocated on them 1497 + */ 1498 + struct mutex ro_block_group_mutex; 1499 + 1494 1500 /* this is used during read/modify/write to make sure 1495 1501 * no two ios are trying to mod the same stripe at the same 1496 1502 * time ··· 3413 3407 u64 bytenr, u64 num_bytes, u64 parent, 3414 3408 u64 root_objectid, u64 owner, u64 offset, int no_quota); 3415 3409 3410 + int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3411 + struct btrfs_root *root); 3416 3412 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3417 3413 struct btrfs_root *root); 3418 3414 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+1
fs/btrfs/disk-io.c
··· 2572 2572 mutex_init(&fs_info->transaction_kthread_mutex); 2573 2573 mutex_init(&fs_info->cleaner_mutex); 2574 2574 mutex_init(&fs_info->volume_mutex); 2575 + mutex_init(&fs_info->ro_block_group_mutex); 2575 2576 init_rwsem(&fs_info->commit_root_sem); 2576 2577 init_rwsem(&fs_info->cleanup_work_sem); 2577 2578 init_rwsem(&fs_info->subvol_sem);
+216 -25
fs/btrfs/extent-tree.c
··· 3298 3298 if (ret) 3299 3299 goto out_put; 3300 3300 3301 - ret = btrfs_truncate_free_space_cache(root, trans, inode); 3301 + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); 3302 3302 if (ret) 3303 3303 goto out_put; 3304 3304 } ··· 3382 3382 return 0; 3383 3383 } 3384 3384 3385 + /* 3386 + * transaction commit does final block group cache writeback during a 3387 + * critical section where nothing is allowed to change the FS. This is 3388 + * required in order for the cache to actually match the block group, 3389 + * but can introduce a lot of latency into the commit. 3390 + * 3391 + * So, btrfs_start_dirty_block_groups is here to kick off block group 3392 + * cache IO. There's a chance we'll have to redo some of it if the 3393 + * block group changes again during the commit, but it greatly reduces 3394 + * the commit latency by getting rid of the easy block groups while 3395 + * we're still allowing others to join the commit. 3396 + */ 3397 + int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3398 + struct btrfs_root *root) 3399 + { 3400 + struct btrfs_block_group_cache *cache; 3401 + struct btrfs_transaction *cur_trans = trans->transaction; 3402 + int ret = 0; 3403 + int should_put; 3404 + struct btrfs_path *path = NULL; 3405 + LIST_HEAD(dirty); 3406 + struct list_head *io = &cur_trans->io_bgs; 3407 + int num_started = 0; 3408 + int loops = 0; 3409 + 3410 + spin_lock(&cur_trans->dirty_bgs_lock); 3411 + if (!list_empty(&cur_trans->dirty_bgs)) { 3412 + list_splice_init(&cur_trans->dirty_bgs, &dirty); 3413 + } 3414 + spin_unlock(&cur_trans->dirty_bgs_lock); 3415 + 3416 + again: 3417 + if (list_empty(&dirty)) { 3418 + btrfs_free_path(path); 3419 + return 0; 3420 + } 3421 + 3422 + /* 3423 + * make sure all the block groups on our dirty list actually 3424 + * exist 3425 + */ 3426 + btrfs_create_pending_block_groups(trans, root); 3427 + 3428 + if (!path) { 3429 + path = btrfs_alloc_path(); 3430 + if (!path) 3431 + return -ENOMEM; 
3432 + } 3433 + 3434 + while (!list_empty(&dirty)) { 3435 + cache = list_first_entry(&dirty, 3436 + struct btrfs_block_group_cache, 3437 + dirty_list); 3438 + 3439 + /* 3440 + * cache_write_mutex is here only to save us from balance 3441 + * deleting this block group while we are writing out the 3442 + * cache 3443 + */ 3444 + mutex_lock(&trans->transaction->cache_write_mutex); 3445 + 3446 + /* 3447 + * this can happen if something re-dirties a block 3448 + * group that is already under IO. Just wait for it to 3449 + * finish and then do it all again 3450 + */ 3451 + if (!list_empty(&cache->io_list)) { 3452 + list_del_init(&cache->io_list); 3453 + btrfs_wait_cache_io(root, trans, cache, 3454 + &cache->io_ctl, path, 3455 + cache->key.objectid); 3456 + btrfs_put_block_group(cache); 3457 + } 3458 + 3459 + 3460 + /* 3461 + * btrfs_wait_cache_io uses the cache->dirty_list to decide 3462 + * if it should update the cache_state. Don't delete 3463 + * until after we wait. 3464 + * 3465 + * Since we're not running in the commit critical section 3466 + * we need the dirty_bgs_lock to protect from update_block_group 3467 + */ 3468 + spin_lock(&cur_trans->dirty_bgs_lock); 3469 + list_del_init(&cache->dirty_list); 3470 + spin_unlock(&cur_trans->dirty_bgs_lock); 3471 + 3472 + should_put = 1; 3473 + 3474 + cache_save_setup(cache, trans, path); 3475 + 3476 + if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3477 + cache->io_ctl.inode = NULL; 3478 + ret = btrfs_write_out_cache(root, trans, cache, path); 3479 + if (ret == 0 && cache->io_ctl.inode) { 3480 + num_started++; 3481 + should_put = 0; 3482 + 3483 + /* 3484 + * the cache_write_mutex is protecting 3485 + * the io_list 3486 + */ 3487 + list_add_tail(&cache->io_list, io); 3488 + } else { 3489 + /* 3490 + * if we failed to write the cache, the 3491 + * generation will be bad and life goes on 3492 + */ 3493 + ret = 0; 3494 + } 3495 + } 3496 + if (!ret) 3497 + ret = write_one_cache_group(trans, root, path, cache); 3498 + 
mutex_unlock(&trans->transaction->cache_write_mutex); 3499 + 3500 + /* if its not on the io list, we need to put the block group */ 3501 + if (should_put) 3502 + btrfs_put_block_group(cache); 3503 + 3504 + if (ret) 3505 + break; 3506 + } 3507 + 3508 + /* 3509 + * go through delayed refs for all the stuff we've just kicked off 3510 + * and then loop back (just once) 3511 + */ 3512 + ret = btrfs_run_delayed_refs(trans, root, 0); 3513 + if (!ret && loops == 0) { 3514 + loops++; 3515 + spin_lock(&cur_trans->dirty_bgs_lock); 3516 + list_splice_init(&cur_trans->dirty_bgs, &dirty); 3517 + spin_unlock(&cur_trans->dirty_bgs_lock); 3518 + goto again; 3519 + } 3520 + 3521 + btrfs_free_path(path); 3522 + return ret; 3523 + } 3524 + 3385 3525 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3386 3526 struct btrfs_root *root) 3387 3527 { ··· 3530 3390 int ret = 0; 3531 3391 int should_put; 3532 3392 struct btrfs_path *path; 3533 - LIST_HEAD(io); 3393 + struct list_head *io = &cur_trans->io_bgs; 3534 3394 int num_started = 0; 3535 - int num_waited = 0; 3536 - 3537 - if (list_empty(&cur_trans->dirty_bgs)) 3538 - return 0; 3539 3395 3540 3396 path = btrfs_alloc_path(); 3541 3397 if (!path) ··· 3559 3423 &cache->io_ctl, path, 3560 3424 cache->key.objectid); 3561 3425 btrfs_put_block_group(cache); 3562 - num_waited++; 3563 3426 } 3564 3427 3428 + /* 3429 + * don't remove from the dirty list until after we've waited 3430 + * on any pending IO 3431 + */ 3565 3432 list_del_init(&cache->dirty_list); 3566 3433 should_put = 1; 3567 3434 3568 - if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3569 - cache_save_setup(cache, trans, path); 3435 + cache_save_setup(cache, trans, path); 3570 3436 3571 3437 if (!ret) 3572 3438 ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1); ··· 3579 3441 if (ret == 0 && cache->io_ctl.inode) { 3580 3442 num_started++; 3581 3443 should_put = 0; 3582 - list_add_tail(&cache->io_list, &io); 3444 + list_add_tail(&cache->io_list, io); 
3583 3445 } else { 3584 3446 /* 3585 3447 * if we failed to write the cache, the ··· 3596 3458 btrfs_put_block_group(cache); 3597 3459 } 3598 3460 3599 - while (!list_empty(&io)) { 3600 - cache = list_first_entry(&io, struct btrfs_block_group_cache, 3461 + while (!list_empty(io)) { 3462 + cache = list_first_entry(io, struct btrfs_block_group_cache, 3601 3463 io_list); 3602 3464 list_del_init(&cache->io_list); 3603 - num_waited++; 3604 3465 btrfs_wait_cache_io(root, trans, cache, 3605 3466 &cache->io_ctl, path, cache->key.objectid); 3606 3467 btrfs_put_block_group(cache); ··· 5596 5459 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5597 5460 cache_block_group(cache, 1); 5598 5461 5599 - spin_lock(&trans->transaction->dirty_bgs_lock); 5600 - if (list_empty(&cache->dirty_list)) { 5601 - list_add_tail(&cache->dirty_list, 5602 - &trans->transaction->dirty_bgs); 5603 - trans->transaction->num_dirty_bgs++; 5604 - btrfs_get_block_group(cache); 5605 - } 5606 - spin_unlock(&trans->transaction->dirty_bgs_lock); 5607 - 5608 5462 byte_in_group = bytenr - cache->key.objectid; 5609 5463 WARN_ON(byte_in_group > cache->key.offset); 5610 5464 ··· 5644 5516 spin_unlock(&info->unused_bgs_lock); 5645 5517 } 5646 5518 } 5519 + 5520 + spin_lock(&trans->transaction->dirty_bgs_lock); 5521 + if (list_empty(&cache->dirty_list)) { 5522 + list_add_tail(&cache->dirty_list, 5523 + &trans->transaction->dirty_bgs); 5524 + trans->transaction->num_dirty_bgs++; 5525 + btrfs_get_block_group(cache); 5526 + } 5527 + spin_unlock(&trans->transaction->dirty_bgs_lock); 5528 + 5647 5529 btrfs_put_block_group(cache); 5648 5530 total -= num_bytes; 5649 5531 bytenr += num_bytes; ··· 8740 8602 8741 8603 BUG_ON(cache->ro); 8742 8604 8605 + again: 8743 8606 trans = btrfs_join_transaction(root); 8744 8607 if (IS_ERR(trans)) 8745 8608 return PTR_ERR(trans); 8609 + 8610 + /* 8611 + * we're not allowed to set block groups readonly after the dirty 8612 + * block groups cache has started writing. 
If it already started, 8613 + * back off and let this transaction commit 8614 + */ 8615 + mutex_lock(&root->fs_info->ro_block_group_mutex); 8616 + if (trans->transaction->dirty_bg_run) { 8617 + u64 transid = trans->transid; 8618 + 8619 + mutex_unlock(&root->fs_info->ro_block_group_mutex); 8620 + btrfs_end_transaction(trans, root); 8621 + 8622 + ret = btrfs_wait_for_commit(root, transid); 8623 + if (ret) 8624 + return ret; 8625 + goto again; 8626 + } 8627 + 8746 8628 8747 8629 ret = set_block_group_ro(cache, 0); 8748 8630 if (!ret) ··· 8778 8620 alloc_flags = update_block_group_flags(root, cache->flags); 8779 8621 check_system_chunk(trans, root, alloc_flags); 8780 8622 } 8623 + mutex_unlock(&root->fs_info->ro_block_group_mutex); 8781 8624 8782 8625 btrfs_end_transaction(trans, root); 8783 8626 return ret; ··· 9584 9425 goto out; 9585 9426 } 9586 9427 9428 + /* 9429 + * get the inode first so any iput calls done for the io_list 9430 + * aren't the final iput (no unlinks allowed now) 9431 + */ 9587 9432 inode = lookup_free_space_inode(tree_root, block_group, path); 9433 + 9434 + mutex_lock(&trans->transaction->cache_write_mutex); 9435 + /* 9436 + * make sure our free spache cache IO is done before remove the 9437 + * free space inode 9438 + */ 9439 + spin_lock(&trans->transaction->dirty_bgs_lock); 9440 + if (!list_empty(&block_group->io_list)) { 9441 + list_del_init(&block_group->io_list); 9442 + 9443 + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 9444 + 9445 + spin_unlock(&trans->transaction->dirty_bgs_lock); 9446 + btrfs_wait_cache_io(root, trans, block_group, 9447 + &block_group->io_ctl, path, 9448 + block_group->key.objectid); 9449 + btrfs_put_block_group(block_group); 9450 + spin_lock(&trans->transaction->dirty_bgs_lock); 9451 + } 9452 + 9453 + if (!list_empty(&block_group->dirty_list)) { 9454 + list_del_init(&block_group->dirty_list); 9455 + btrfs_put_block_group(block_group); 9456 + } 9457 + spin_unlock(&trans->transaction->dirty_bgs_lock); 
9458 + mutex_unlock(&trans->transaction->cache_write_mutex); 9459 + 9588 9460 if (!IS_ERR(inode)) { 9589 9461 ret = btrfs_orphan_add(trans, inode); 9590 9462 if (ret) { ··· 9708 9518 9709 9519 spin_lock(&trans->transaction->dirty_bgs_lock); 9710 9520 if (!list_empty(&block_group->dirty_list)) { 9711 - list_del_init(&block_group->dirty_list); 9712 - btrfs_put_block_group(block_group); 9521 + WARN_ON(1); 9522 + } 9523 + if (!list_empty(&block_group->io_list)) { 9524 + WARN_ON(1); 9713 9525 } 9714 9526 spin_unlock(&trans->transaction->dirty_bgs_lock); 9715 - 9716 9527 btrfs_remove_free_space_cache(block_group); 9717 9528 9718 9529 spin_lock(&block_group->space_info->lock);
+62 -7
fs/btrfs/free-space-cache.c
··· 226 226 227 227 int btrfs_truncate_free_space_cache(struct btrfs_root *root, 228 228 struct btrfs_trans_handle *trans, 229 + struct btrfs_block_group_cache *block_group, 229 230 struct inode *inode) 230 231 { 231 232 int ret = 0; 233 + struct btrfs_path *path = btrfs_alloc_path(); 234 + 235 + if (!path) { 236 + ret = -ENOMEM; 237 + goto fail; 238 + } 239 + 240 + if (block_group) { 241 + mutex_lock(&trans->transaction->cache_write_mutex); 242 + if (!list_empty(&block_group->io_list)) { 243 + list_del_init(&block_group->io_list); 244 + 245 + btrfs_wait_cache_io(root, trans, block_group, 246 + &block_group->io_ctl, path, 247 + block_group->key.objectid); 248 + btrfs_put_block_group(block_group); 249 + } 250 + 251 + /* 252 + * now that we've truncated the cache away, its no longer 253 + * setup or written 254 + */ 255 + spin_lock(&block_group->lock); 256 + block_group->disk_cache_state = BTRFS_DC_CLEAR; 257 + spin_unlock(&block_group->lock); 258 + } 259 + btrfs_free_path(path); 232 260 233 261 btrfs_i_size_write(inode, 0); 234 262 truncate_pagecache(inode, 0); ··· 270 242 ret = btrfs_truncate_inode_items(trans, root, inode, 271 243 0, BTRFS_EXTENT_DATA_KEY); 272 244 if (ret) { 245 + mutex_unlock(&trans->transaction->cache_write_mutex); 273 246 btrfs_abort_transaction(trans, root, ret); 274 247 return ret; 275 248 } 276 249 277 250 ret = btrfs_update_inode(trans, root, inode); 251 + 252 + if (block_group) 253 + mutex_unlock(&trans->transaction->cache_write_mutex); 254 + 255 + fail: 278 256 if (ret) 279 257 btrfs_abort_transaction(trans, root, ret); 280 258 ··· 910 876 { 911 877 int ret; 912 878 struct btrfs_free_cluster *cluster = NULL; 879 + struct btrfs_free_cluster *cluster_locked = NULL; 913 880 struct rb_node *node = rb_first(&ctl->free_space_offset); 914 881 struct btrfs_trim_range *trim_entry; 915 882 ··· 922 887 } 923 888 924 889 if (!node && cluster) { 890 + cluster_locked = cluster; 891 + spin_lock(&cluster_locked->lock); 925 892 node = 
rb_first(&cluster->root); 926 893 cluster = NULL; 927 894 } ··· 947 910 node = rb_next(node); 948 911 if (!node && cluster) { 949 912 node = rb_first(&cluster->root); 913 + cluster_locked = cluster; 914 + spin_lock(&cluster_locked->lock); 950 915 cluster = NULL; 951 916 } 917 + } 918 + if (cluster_locked) { 919 + spin_unlock(&cluster_locked->lock); 920 + cluster_locked = NULL; 952 921 } 953 922 954 923 /* ··· 973 930 974 931 return 0; 975 932 fail: 933 + if (cluster_locked) 934 + spin_unlock(&cluster_locked->lock); 976 935 return -ENOSPC; 977 936 } 978 937 ··· 1146 1101 int ret; 1147 1102 struct inode *inode = io_ctl->inode; 1148 1103 1104 + if (!inode) 1105 + return 0; 1106 + 1149 1107 root = root->fs_info->tree_root; 1150 1108 1151 1109 /* Flush the dirty pages in the cache file. */ ··· 1175 1127 btrfs_update_inode(trans, root, inode); 1176 1128 1177 1129 if (block_group) { 1130 + /* the dirty list is protected by the dirty_bgs_lock */ 1131 + spin_lock(&trans->transaction->dirty_bgs_lock); 1132 + 1133 + /* the disk_cache_state is protected by the block group lock */ 1178 1134 spin_lock(&block_group->lock); 1179 1135 1180 1136 /* 1181 1137 * only mark this as written if we didn't get put back on 1182 - * the dirty list while waiting for IO. 1138 + * the dirty list while waiting for IO. 
Otherwise our 1139 + * cache state won't be right, and we won't get written again 1183 1140 */ 1184 1141 if (!ret && list_empty(&block_group->dirty_list)) 1185 1142 block_group->disk_cache_state = BTRFS_DC_WRITTEN; ··· 1192 1139 block_group->disk_cache_state = BTRFS_DC_ERROR; 1193 1140 1194 1141 spin_unlock(&block_group->lock); 1142 + spin_unlock(&trans->transaction->dirty_bgs_lock); 1195 1143 io_ctl->inode = NULL; 1196 1144 iput(inode); 1197 1145 } ··· 1261 1207 1262 1208 mutex_lock(&ctl->cache_writeout_mutex); 1263 1209 /* Write out the extent entries in the free space cache */ 1210 + spin_lock(&ctl->tree_lock); 1264 1211 ret = write_cache_extent_entries(io_ctl, ctl, 1265 1212 block_group, &entries, &bitmaps, 1266 1213 &bitmap_list); 1214 + spin_unlock(&ctl->tree_lock); 1267 1215 if (ret) { 1268 1216 mutex_unlock(&ctl->cache_writeout_mutex); 1269 1217 goto out_nospc; ··· 1275 1219 * Some spaces that are freed in the current transaction are pinned, 1276 1220 * they will be added into free space cache after the transaction is 1277 1221 * committed, we shouldn't lose them. 1222 + * 1223 + * If this changes while we are working we'll get added back to 1224 + * the dirty list and redo it. No locking needed 1278 1225 */ 1279 1226 ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries); 1280 1227 if (ret) { ··· 1290 1231 * locked while doing it because a concurrent trim can be manipulating 1291 1232 * or freeing the bitmap. 
1292 1233 */ 1234 + spin_lock(&ctl->tree_lock); 1293 1235 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1236 + spin_unlock(&ctl->tree_lock); 1294 1237 mutex_unlock(&ctl->cache_writeout_mutex); 1295 1238 if (ret) 1296 1239 goto out_nospc; ··· 1365 1304 1366 1305 spin_lock(&block_group->lock); 1367 1306 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1368 - spin_unlock(&block_group->lock); 1369 - return 0; 1370 - } 1371 - 1372 - if (block_group->delalloc_bytes) { 1373 - block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1374 1307 spin_unlock(&block_group->lock); 1375 1308 return 0; 1376 1309 }
+1
fs/btrfs/free-space-cache.h
··· 62 62 struct btrfs_block_rsv *rsv); 63 63 int btrfs_truncate_free_space_cache(struct btrfs_root *root, 64 64 struct btrfs_trans_handle *trans, 65 + struct btrfs_block_group_cache *block_group, 65 66 struct inode *inode); 66 67 int load_free_space_cache(struct btrfs_fs_info *fs_info, 67 68 struct btrfs_block_group_cache *block_group);
+1 -1
fs/btrfs/inode-map.c
··· 456 456 } 457 457 458 458 if (i_size_read(inode) > 0) { 459 - ret = btrfs_truncate_free_space_cache(root, trans, inode); 459 + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); 460 460 if (ret) { 461 461 if (ret != -ENOSPC) 462 462 btrfs_abort_transaction(trans, root, ret);
+6 -3
fs/btrfs/relocation.c
··· 3430 3430 } 3431 3431 3432 3432 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3433 - struct inode *inode, u64 ino) 3433 + struct btrfs_block_group_cache *block_group, 3434 + struct inode *inode, 3435 + u64 ino) 3434 3436 { 3435 3437 struct btrfs_key key; 3436 3438 struct btrfs_root *root = fs_info->tree_root; ··· 3465 3463 goto out; 3466 3464 } 3467 3465 3468 - ret = btrfs_truncate_free_space_cache(root, trans, inode); 3466 + ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode); 3469 3467 3470 3468 btrfs_end_transaction(trans, root); 3471 3469 btrfs_btree_balance_dirty(root); ··· 3511 3509 */ 3512 3510 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { 3513 3511 ret = delete_block_group_cache(rc->extent_root->fs_info, 3512 + rc->block_group, 3514 3513 NULL, ref_objectid); 3515 3514 if (ret != -ENOENT) 3516 3515 return ret; ··· 4226 4223 btrfs_free_path(path); 4227 4224 4228 4225 if (!IS_ERR(inode)) 4229 - ret = delete_block_group_cache(fs_info, inode, 0); 4226 + ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); 4230 4227 else 4231 4228 ret = PTR_ERR(inode); 4232 4229
+37 -1
fs/btrfs/transaction.c
··· 222 222 atomic_set(&cur_trans->use_count, 2); 223 223 cur_trans->have_free_bgs = 0; 224 224 cur_trans->start_time = get_seconds(); 225 + cur_trans->dirty_bg_run = 0; 225 226 226 227 cur_trans->delayed_refs.href_root = RB_ROOT; 227 228 atomic_set(&cur_trans->delayed_refs.num_entries, 0); ··· 252 251 INIT_LIST_HEAD(&cur_trans->switch_commits); 253 252 INIT_LIST_HEAD(&cur_trans->pending_ordered); 254 253 INIT_LIST_HEAD(&cur_trans->dirty_bgs); 254 + INIT_LIST_HEAD(&cur_trans->io_bgs); 255 + mutex_init(&cur_trans->cache_write_mutex); 255 256 cur_trans->num_dirty_bgs = 0; 256 257 spin_lock_init(&cur_trans->dirty_bgs_lock); 257 258 list_add_tail(&cur_trans->list, &fs_info->trans_list); ··· 1062 1059 { 1063 1060 struct btrfs_fs_info *fs_info = root->fs_info; 1064 1061 struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; 1062 + struct list_head *io_bgs = &trans->transaction->io_bgs; 1065 1063 struct list_head *next; 1066 1064 struct extent_buffer *eb; 1067 1065 int ret; ··· 1116 1112 return ret; 1117 1113 } 1118 1114 1119 - while (!list_empty(dirty_bgs)) { 1115 + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { 1120 1116 ret = btrfs_write_dirty_block_groups(trans, root); 1121 1117 if (ret) 1122 1118 return ret; ··· 1816 1812 return ret; 1817 1813 } 1818 1814 1815 + if (!cur_trans->dirty_bg_run) { 1816 + int run_it = 0; 1817 + 1818 + /* this mutex is also taken before trying to set 1819 + * block groups readonly. We need to make sure 1820 + * that nobody has set a block group readonly 1821 + * after a extents from that block group have been 1822 + * allocated for cache files. btrfs_set_block_group_ro 1823 + * will wait for the transaction to commit if it 1824 + * finds dirty_bg_run = 1 1825 + * 1826 + * The dirty_bg_run flag is also used to make sure only 1827 + * one process starts all the block group IO. It wouldn't 1828 + * hurt to have more than one go through, but there's no 1829 + * real advantage to it either. 
1830 + */ 1831 + mutex_lock(&root->fs_info->ro_block_group_mutex); 1832 + if (!cur_trans->dirty_bg_run) { 1833 + run_it = 1; 1834 + cur_trans->dirty_bg_run = 1; 1835 + } 1836 + mutex_unlock(&root->fs_info->ro_block_group_mutex); 1837 + 1838 + if (run_it) 1839 + ret = btrfs_start_dirty_block_groups(trans, root); 1840 + } 1841 + if (ret) { 1842 + btrfs_end_transaction(trans, root); 1843 + return ret; 1844 + } 1845 + 1819 1846 spin_lock(&root->fs_info->trans_lock); 1820 1847 list_splice(&trans->ordered, &cur_trans->pending_ordered); 1821 1848 if (cur_trans->state >= TRANS_STATE_COMMIT_START) { ··· 2040 2005 2041 2006 assert_qgroups_uptodate(trans); 2042 2007 ASSERT(list_empty(&cur_trans->dirty_bgs)); 2008 + ASSERT(list_empty(&cur_trans->io_bgs)); 2043 2009 update_super_roots(root); 2044 2010 2045 2011 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+9
fs/btrfs/transaction.h
··· 64 64 struct list_head pending_ordered; 65 65 struct list_head switch_commits; 66 66 struct list_head dirty_bgs; 67 + struct list_head io_bgs; 67 68 u64 num_dirty_bgs; 69 + 70 + /* 71 + * we need to make sure block group deletion doesn't race with 72 + * free space cache writeout. This mutex keeps them from stomping 73 + * on each other 74 + */ 75 + struct mutex cache_write_mutex; 68 76 spinlock_t dirty_bgs_lock; 69 77 struct btrfs_delayed_ref_root delayed_refs; 70 78 int aborted; 79 + int dirty_bg_run; 71 80 }; 72 81 73 82 #define __TRANS_FREEZABLE (1U << 0)