Btrfs: Change btree locking to use explicit blocking points

Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.

So far, btrfs has been using a mutex along with a trylock loop; most of
the time it is able to avoid going for the full mutex, so the trylock
loop is a big performance gain.

This commit is step one toward getting rid of the blocking locks entirely.
btrfs_tree_lock now takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.

We'll be able to get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.

The basic idea is:

btrfs_tree_lock() returns with the spin lock held

btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock. The buffer is
still considered locked by all of the btrfs code.

If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.

The code that needs to set the blocking bit often ends up finishing
without actually blocking at all. So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.
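
To make that concrete, the waiter side of btrfs_tree_lock() in the
locking.c hunk below boils down to the following condensed sketch.  The
real patch also uses spin_lock_nested() for lockdep and installs a custom
wake function, both omitted here; btrfs_spin_on_block() is the short,
bounded spin that returns 1 if the blocking bit cleared while spinning:

    int btrfs_tree_lock(struct extent_buffer *eb)
    {
            DEFINE_WAIT(wait);

            while (1) {
                    spin_lock(&eb->lock);

                    /* nobody is blocking, return with the spinlock held */
                    if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
                            return 0;
                    spin_unlock(&eb->lock);

                    /* adaptive spin: the holder usually clears the bit soon */
                    if (btrfs_spin_on_block(eb))
                            continue;

                    /* it really is a long block, sleep on the wait queue */
                    prepare_to_wait_exclusive(&eb->lock_wq, &wait,
                                              TASK_UNINTERRUPTIBLE);
                    if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
                            schedule();
                    finish_wait(&eb->lock_wq, &wait);
            }
    }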

btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.

btrfs_tree_unlock() can be called on either blocking or spinning locks;
it does the right thing based on the blocking bit.

ctree.c has helper functions to set all the locked buffers in a path as
blocking, and to clear them back to spinning.
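
From a caller's point of view, the expected pattern is the following
(illustration only; do_something_that_can_schedule() is a placeholder,
not a function in this patch):

    btrfs_tree_lock(eb);              /* spinlock held from here on */

    /* about to COW, read from disk, or otherwise possibly schedule */
    btrfs_set_lock_blocking(eb);      /* drops the spinlock, sets the bit;
                                         the buffer is still "locked" */
    do_something_that_can_schedule(eb);

    btrfs_clear_lock_blocking(eb);    /* takes the spinlock back */
    /* ... short, non-sleeping work ... */
    btrfs_tree_unlock(eb);            /* handles either state */

btrfs_set_path_blocking(p) / btrfs_clear_path_blocking(p) do the same
thing for every locked node in a path.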

Signed-off-by: Chris Mason <chris.mason@oracle.com>

+471 -40
+226 -8
fs/btrfs/ctree.c
··· 54 54 return path; 55 55 } 56 56 57 + /* 58 + * set all locked nodes in the path to blocking locks. This should 59 + * be done before scheduling 60 + */ 61 + noinline void btrfs_set_path_blocking(struct btrfs_path *p) 62 + { 63 + int i; 64 + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 65 + if (p->nodes[i] && p->locks[i]) 66 + btrfs_set_lock_blocking(p->nodes[i]); 67 + } 68 + } 69 + 70 + /* 71 + * reset all the locked nodes in the patch to spinning locks. 72 + */ 73 + noinline void btrfs_clear_path_blocking(struct btrfs_path *p) 74 + { 75 + int i; 76 + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 77 + if (p->nodes[i] && p->locks[i]) 78 + btrfs_clear_lock_blocking(p->nodes[i]); 79 + } 80 + } 81 + 57 82 /* this also releases the path */ 58 83 void btrfs_free_path(struct btrfs_path *p) 59 84 { ··· 297 272 if (IS_ERR(cow)) 298 273 return PTR_ERR(cow); 299 274 275 + /* cow is set to blocking by btrfs_init_new_buffer */ 276 + 300 277 copy_extent_buffer(cow, buf, 0, 0, cow->len); 301 278 btrfs_set_header_bytenr(cow, cow->start); 302 279 btrfs_set_header_generation(cow, trans->transid); ··· 424 397 } 425 398 426 399 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); 400 + 401 + if (parent) 402 + btrfs_set_lock_blocking(parent); 403 + btrfs_set_lock_blocking(buf); 404 + 427 405 ret = __btrfs_cow_block(trans, root, buf, parent, 428 406 parent_slot, cow_ret, search_start, 0, 429 407 prealloc_dest); ··· 534 502 if (parent_nritems == 1) 535 503 return 0; 536 504 505 + btrfs_set_lock_blocking(parent); 506 + 537 507 for (i = start_slot; i < end_slot; i++) { 538 508 int close = 1; 539 509 ··· 596 562 search_start = last_block; 597 563 598 564 btrfs_tree_lock(cur); 565 + btrfs_set_lock_blocking(cur); 599 566 err = __btrfs_cow_block(trans, root, cur, parent, i, 600 567 &cur, search_start, 601 568 min(16 * blocksize, ··· 895 860 return 0; 896 861 897 862 mid = path->nodes[level]; 863 + 898 864 WARN_ON(!path->locks[level]); 899 865 WARN_ON(btrfs_header_generation(mid) != trans->transid); 900 866 ··· 918 882 /* promote the child to a root */ 919 883 child = read_node_slot(root, mid, 0); 920 884 btrfs_tree_lock(child); 885 + btrfs_set_lock_blocking(child); 921 886 BUG_ON(!child); 922 887 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 923 888 BUG_ON(ret); ··· 935 898 936 899 add_root_to_dirty_list(root); 937 900 btrfs_tree_unlock(child); 901 + 938 902 path->locks[level] = 0; 939 903 path->nodes[level] = NULL; 940 904 clean_tree_block(trans, root, mid); ··· 960 922 left = read_node_slot(root, parent, pslot - 1); 961 923 if (left) { 962 924 btrfs_tree_lock(left); 925 + btrfs_set_lock_blocking(left); 963 926 wret = btrfs_cow_block(trans, root, left, 964 927 parent, pslot - 1, &left, 0); 965 928 if (wret) { ··· 971 932 right = read_node_slot(root, parent, pslot + 1); 972 933 if (right) { 973 934 btrfs_tree_lock(right); 935 + btrfs_set_lock_blocking(right); 974 936 wret = btrfs_cow_block(trans, root, right, 975 937 parent, pslot + 1, &right, 0); 976 938 if (wret) { ··· 1147 1107 u32 left_nr; 1148 1108 1149 1109 btrfs_tree_lock(left); 1110 + btrfs_set_lock_blocking(left); 1111 + 1150 1112 left_nr = btrfs_header_nritems(left); 1151 1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1152 1114 wret = 1; ··· 1195 1153 */ 1196 1154 if (right) { 1197 1155 u32 right_nr; 1156 + 1198 1157 btrfs_tree_lock(right); 1158 + btrfs_set_lock_blocking(right); 1159 + 1199 1160 right_nr = btrfs_header_nritems(right); 1200 1161 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1201 1162 wret = 1; ··· 1310 
1265 } 1311 1266 1312 1267 /* 1268 + * returns -EAGAIN if it had to drop the path, or zero if everything was in 1269 + * cache 1270 + */ 1271 + static noinline int reada_for_balance(struct btrfs_root *root, 1272 + struct btrfs_path *path, int level) 1273 + { 1274 + int slot; 1275 + int nritems; 1276 + struct extent_buffer *parent; 1277 + struct extent_buffer *eb; 1278 + u64 gen; 1279 + u64 block1 = 0; 1280 + u64 block2 = 0; 1281 + int ret = 0; 1282 + int blocksize; 1283 + 1284 + parent = path->nodes[level - 1]; 1285 + if (!parent) 1286 + return 0; 1287 + 1288 + nritems = btrfs_header_nritems(parent); 1289 + slot = path->slots[level]; 1290 + blocksize = btrfs_level_size(root, level); 1291 + 1292 + if (slot > 0) { 1293 + block1 = btrfs_node_blockptr(parent, slot - 1); 1294 + gen = btrfs_node_ptr_generation(parent, slot - 1); 1295 + eb = btrfs_find_tree_block(root, block1, blocksize); 1296 + if (eb && btrfs_buffer_uptodate(eb, gen)) 1297 + block1 = 0; 1298 + free_extent_buffer(eb); 1299 + } 1300 + if (slot < nritems) { 1301 + block2 = btrfs_node_blockptr(parent, slot + 1); 1302 + gen = btrfs_node_ptr_generation(parent, slot + 1); 1303 + eb = btrfs_find_tree_block(root, block2, blocksize); 1304 + if (eb && btrfs_buffer_uptodate(eb, gen)) 1305 + block2 = 0; 1306 + free_extent_buffer(eb); 1307 + } 1308 + if (block1 || block2) { 1309 + ret = -EAGAIN; 1310 + btrfs_release_path(root, path); 1311 + if (block1) 1312 + readahead_tree_block(root, block1, blocksize, 0); 1313 + if (block2) 1314 + readahead_tree_block(root, block2, blocksize, 0); 1315 + 1316 + if (block1) { 1317 + eb = read_tree_block(root, block1, blocksize, 0); 1318 + free_extent_buffer(eb); 1319 + } 1320 + if (block1) { 1321 + eb = read_tree_block(root, block2, blocksize, 0); 1322 + free_extent_buffer(eb); 1323 + } 1324 + } 1325 + return ret; 1326 + } 1327 + 1328 + 1329 + /* 1313 1330 * when we walk down the tree, it is usually safe to unlock the higher layers 1314 1331 * in the tree. The exceptions are when our path goes through slot 0, because 1315 1332 * operations on the tree might require changing key pointers higher up in the ··· 1418 1311 btrfs_tree_unlock(t); 1419 1312 path->locks[i] = 0; 1420 1313 } 1314 + } 1315 + } 1316 + 1317 + /* 1318 + * This releases any locks held in the path starting at level and 1319 + * going all the way up to the root. 1320 + * 1321 + * btrfs_search_slot will keep the lock held on higher nodes in a few 1322 + * corner cases, such as COW of the block at slot zero in the node. This 1323 + * ignores those rules, and it should only be called when there are no 1324 + * more updates to be done higher up in the tree. 
1325 + */ 1326 + noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) 1327 + { 1328 + int i; 1329 + 1330 + if (path->keep_locks || path->lowest_level) 1331 + return; 1332 + 1333 + for (i = level; i < BTRFS_MAX_LEVEL; i++) { 1334 + if (!path->nodes[i]) 1335 + break; 1336 + if (!path->locks[i]) 1337 + break; 1338 + btrfs_tree_unlock(path->nodes[i]); 1339 + path->locks[i] = 0; 1421 1340 } 1422 1341 } 1423 1342 ··· 1518 1385 */ 1519 1386 if (prealloc_block.objectid && 1520 1387 prealloc_block.offset != b->len) { 1388 + btrfs_set_path_blocking(p); 1521 1389 btrfs_free_reserved_extent(root, 1522 1390 prealloc_block.objectid, 1523 1391 prealloc_block.offset); ··· 1543 1409 goto again; 1544 1410 } 1545 1411 1412 + btrfs_set_path_blocking(p); 1413 + 1546 1414 wret = btrfs_cow_block(trans, root, b, 1547 1415 p->nodes[level + 1], 1548 1416 p->slots[level + 1], ··· 1566 1430 if (!p->skip_locking) 1567 1431 p->locks[level] = 1; 1568 1432 1433 + btrfs_clear_path_blocking(p); 1434 + 1435 + /* 1436 + * we have a lock on b and as long as we aren't changing 1437 + * the tree, there is no way to for the items in b to change. 1438 + * It is safe to drop the lock on our parent before we 1439 + * go through the expensive btree search on b. 1440 + * 1441 + * If cow is true, then we might be changing slot zero, 1442 + * which may require changing the parent. So, we can't 1443 + * drop the lock until after we know which slot we're 1444 + * operating on. 1445 + */ 1446 + if (!cow) 1447 + btrfs_unlock_up_safe(p, level + 1); 1448 + 1569 1449 ret = check_block(root, p, level); 1570 1450 if (ret) { 1571 1451 ret = -1; ··· 1589 1437 } 1590 1438 1591 1439 ret = bin_search(b, key, level, &slot); 1440 + 1592 1441 if (level != 0) { 1593 1442 if (ret && slot > 0) 1594 1443 slot -= 1; ··· 1597 1444 if ((p->search_for_split || ins_len > 0) && 1598 1445 btrfs_header_nritems(b) >= 1599 1446 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1600 - int sret = split_node(trans, root, p, level); 1447 + int sret; 1448 + 1449 + sret = reada_for_balance(root, p, level); 1450 + if (sret) 1451 + goto again; 1452 + 1453 + btrfs_set_path_blocking(p); 1454 + sret = split_node(trans, root, p, level); 1455 + btrfs_clear_path_blocking(p); 1456 + 1601 1457 BUG_ON(sret > 0); 1602 1458 if (sret) { 1603 1459 ret = sret; ··· 1615 1453 b = p->nodes[level]; 1616 1454 slot = p->slots[level]; 1617 1455 } else if (ins_len < 0) { 1618 - int sret = balance_level(trans, root, p, 1619 - level); 1456 + int sret; 1457 + 1458 + sret = reada_for_balance(root, p, level); 1459 + if (sret) 1460 + goto again; 1461 + 1462 + btrfs_set_path_blocking(p); 1463 + sret = balance_level(trans, root, p, level); 1464 + btrfs_clear_path_blocking(p); 1465 + 1620 1466 if (sret) { 1621 1467 ret = sret; 1622 1468 goto done; ··· 1658 1488 * of the btree by dropping locks before 1659 1489 * we read. 
1660 1490 */ 1661 - if (level > 1) { 1491 + if (level > 0) { 1662 1492 btrfs_release_path(NULL, p); 1663 1493 if (tmp) 1664 1494 free_extent_buffer(tmp); ··· 1673 1503 free_extent_buffer(tmp); 1674 1504 goto again; 1675 1505 } else { 1506 + btrfs_set_path_blocking(p); 1676 1507 if (tmp) 1677 1508 free_extent_buffer(tmp); 1678 1509 if (should_reada) ··· 1683 1512 b = read_node_slot(root, b, slot); 1684 1513 } 1685 1514 } 1686 - if (!p->skip_locking) 1687 - btrfs_tree_lock(b); 1515 + if (!p->skip_locking) { 1516 + int lret; 1517 + 1518 + btrfs_clear_path_blocking(p); 1519 + lret = btrfs_try_spin_lock(b); 1520 + 1521 + if (!lret) { 1522 + btrfs_set_path_blocking(p); 1523 + btrfs_tree_lock(b); 1524 + btrfs_clear_path_blocking(p); 1525 + } 1526 + } 1688 1527 } else { 1689 1528 p->slots[level] = slot; 1690 1529 if (ins_len > 0 && 1691 1530 btrfs_leaf_free_space(root, b) < ins_len) { 1692 - int sret = split_leaf(trans, root, key, 1531 + int sret; 1532 + 1533 + btrfs_set_path_blocking(p); 1534 + sret = split_leaf(trans, root, key, 1693 1535 p, ins_len, ret == 0); 1536 + btrfs_clear_path_blocking(p); 1537 + 1694 1538 BUG_ON(sret > 0); 1695 1539 if (sret) { 1696 1540 ret = sret; ··· 1719 1533 } 1720 1534 ret = 1; 1721 1535 done: 1536 + /* 1537 + * we don't really know what they plan on doing with the path 1538 + * from here on, so for now just mark it as blocking 1539 + */ 1540 + btrfs_set_path_blocking(p); 1722 1541 if (prealloc_block.objectid) { 1723 1542 btrfs_free_reserved_extent(root, 1724 1543 prealloc_block.objectid, 1725 1544 prealloc_block.offset); 1726 1545 } 1727 - 1728 1546 return ret; 1729 1547 } 1730 1548 ··· 1751 1561 eb = btrfs_lock_root_node(root); 1752 1562 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1753 1563 BUG_ON(ret); 1564 + 1565 + btrfs_set_lock_blocking(eb); 1754 1566 1755 1567 parent = eb; 1756 1568 while (1) { ··· 1778 1586 eb = read_tree_block(root, bytenr, blocksize, 1779 1587 generation); 1780 1588 btrfs_tree_lock(eb); 1589 + btrfs_set_lock_blocking(eb); 1781 1590 } 1782 1591 1783 1592 /* ··· 1803 1610 eb = read_tree_block(root, bytenr, blocksize, 1804 1611 generation); 1805 1612 btrfs_tree_lock(eb); 1613 + btrfs_set_lock_blocking(eb); 1806 1614 } 1807 1615 1808 1616 ret = btrfs_cow_block(trans, root, eb, parent, slot, ··· 2350 2156 2351 2157 right = read_node_slot(root, upper, slot + 1); 2352 2158 btrfs_tree_lock(right); 2159 + btrfs_set_lock_blocking(right); 2160 + 2353 2161 free_space = btrfs_leaf_free_space(root, right); 2354 2162 if (free_space < data_size) 2355 2163 goto out_unlock; ··· 2547 2351 2548 2352 left = read_node_slot(root, path->nodes[1], slot - 1); 2549 2353 btrfs_tree_lock(left); 2354 + btrfs_set_lock_blocking(left); 2355 + 2550 2356 free_space = btrfs_leaf_free_space(root, left); 2551 2357 if (free_space < data_size) { 2552 2358 ret = 1; ··· 3006 2808 sizeof(struct btrfs_item), 1); 3007 2809 path->keep_locks = 0; 3008 2810 BUG_ON(ret); 2811 + 2812 + /* 2813 + * make sure any changes to the path from split_leaf leave it 2814 + * in a blocking state 2815 + */ 2816 + btrfs_set_path_blocking(path); 3009 2817 3010 2818 leaf = path->nodes[0]; 3011 2819 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); ··· 3542 3338 BUG(); 3543 3339 } 3544 3340 out: 3341 + btrfs_unlock_up_safe(path, 1); 3545 3342 return ret; 3546 3343 } 3547 3344 ··· 3910 3705 */ 3911 3706 if (slot >= nritems) { 3912 3707 path->slots[level] = slot; 3708 + btrfs_set_path_blocking(path); 3913 3709 sret = btrfs_find_next_key(root, path, min_key, level, 3914 
3710 cache_only, min_trans); 3915 3711 if (sret == 0) { 3916 3712 btrfs_release_path(root, path); 3917 3713 goto again; 3918 3714 } else { 3715 + btrfs_clear_path_blocking(path); 3919 3716 goto out; 3920 3717 } 3921 3718 } ··· 3929 3722 unlock_up(path, level, 1); 3930 3723 goto out; 3931 3724 } 3725 + btrfs_set_path_blocking(path); 3932 3726 cur = read_node_slot(root, cur, slot); 3933 3727 3934 3728 btrfs_tree_lock(cur); 3729 + 3935 3730 path->locks[level - 1] = 1; 3936 3731 path->nodes[level - 1] = cur; 3937 3732 unlock_up(path, level, 1); 3733 + btrfs_clear_path_blocking(path); 3938 3734 } 3939 3735 out: 3940 3736 if (ret == 0) 3941 3737 memcpy(min_key, &found_key, sizeof(found_key)); 3738 + btrfs_set_path_blocking(path); 3942 3739 return ret; 3943 3740 } 3944 3741 ··· 4038 3827 if (ret < 0) 4039 3828 return ret; 4040 3829 3830 + btrfs_set_path_blocking(path); 4041 3831 nritems = btrfs_header_nritems(path->nodes[0]); 4042 3832 /* 4043 3833 * by releasing the path above we dropped all our locks. A balance ··· 4069 3857 free_extent_buffer(next); 4070 3858 } 4071 3859 3860 + /* the path was set to blocking above */ 4072 3861 if (level == 1 && (path->locks[1] || path->skip_locking) && 4073 3862 path->reada) 4074 3863 reada_for_search(root, path, level, slot, 0); ··· 4078 3865 if (!path->skip_locking) { 4079 3866 WARN_ON(!btrfs_tree_locked(c)); 4080 3867 btrfs_tree_lock(next); 3868 + btrfs_set_lock_blocking(next); 4081 3869 } 4082 3870 break; 4083 3871 } ··· 4095 3881 path->locks[level] = 1; 4096 3882 if (!level) 4097 3883 break; 3884 + 3885 + btrfs_set_path_blocking(path); 4098 3886 if (level == 1 && path->locks[1] && path->reada) 4099 3887 reada_for_search(root, path, level, slot, 0); 4100 3888 next = read_node_slot(root, next, 0); 4101 3889 if (!path->skip_locking) { 4102 3890 WARN_ON(!btrfs_tree_locked(path->nodes[level])); 4103 3891 btrfs_tree_lock(next); 3892 + btrfs_set_lock_blocking(next); 4104 3893 } 4105 3894 } 4106 3895 done: ··· 4128 3911 4129 3912 while (1) { 4130 3913 if (path->slots[0] == 0) { 3914 + btrfs_set_path_blocking(path); 4131 3915 ret = btrfs_prev_leaf(root, path); 4132 3916 if (ret != 0) 4133 3917 return ret;
+4
fs/btrfs/ctree.h
··· 1835 1835 struct btrfs_path *btrfs_alloc_path(void); 1836 1836 void btrfs_free_path(struct btrfs_path *p); 1837 1837 void btrfs_init_path(struct btrfs_path *p); 1838 + void btrfs_set_path_blocking(struct btrfs_path *p); 1839 + void btrfs_clear_path_blocking(struct btrfs_path *p); 1840 + void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 1841 + 1838 1842 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1839 1843 struct btrfs_path *path, int slot, int nr); 1840 1844 int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+8 -2
fs/btrfs/disk-io.c
··· 799 799 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 800 800 801 801 if (ret == 0) 802 - buf->flags |= EXTENT_UPTODATE; 802 + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); 803 803 else 804 804 WARN_ON(1); 805 805 return buf; ··· 813 813 if (btrfs_header_generation(buf) == 814 814 root->fs_info->running_transaction->transid) { 815 815 WARN_ON(!btrfs_tree_locked(buf)); 816 + 817 + /* ugh, clear_extent_buffer_dirty can be expensive */ 818 + btrfs_set_lock_blocking(buf); 819 + 816 820 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 817 821 buf); 818 822 } ··· 2315 2311 u64 transid = btrfs_header_generation(buf); 2316 2312 struct inode *btree_inode = root->fs_info->btree_inode; 2317 2313 2314 + btrfs_set_lock_blocking(buf); 2315 + 2318 2316 WARN_ON(!btrfs_tree_locked(buf)); 2319 2317 if (transid != root->fs_info->generation) { 2320 2318 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " ··· 2359 2353 int ret; 2360 2354 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 2361 2355 if (ret == 0) 2362 - buf->flags |= EXTENT_UPTODATE; 2356 + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); 2363 2357 return ret; 2364 2358 } 2365 2359
+5
fs/btrfs/extent-tree.c
··· 3407 3407 btrfs_set_header_generation(buf, trans->transid); 3408 3408 btrfs_tree_lock(buf); 3409 3409 clean_tree_block(trans, root, buf); 3410 + 3411 + btrfs_set_lock_blocking(buf); 3410 3412 btrfs_set_buffer_uptodate(buf); 3413 + 3411 3414 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3412 3415 set_extent_dirty(&root->dirty_log_pages, buf->start, 3413 3416 buf->start + buf->len - 1, GFP_NOFS); ··· 3419 3416 buf->start + buf->len - 1, GFP_NOFS); 3420 3417 } 3421 3418 trans->blocks_used++; 3419 + /* this returns a buffer locked for blocking */ 3422 3420 return buf; 3423 3421 } 3424 3422 ··· 3756 3752 3757 3753 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 3758 3754 btrfs_tree_lock(next); 3755 + btrfs_set_lock_blocking(next); 3759 3756 3760 3757 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 3761 3758 &refs);
+9 -9
fs/btrfs/extent_io.c
··· 2990 2990 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2991 2991 eb->start = start; 2992 2992 eb->len = len; 2993 - mutex_init(&eb->mutex); 2993 + spin_lock_init(&eb->lock); 2994 + init_waitqueue_head(&eb->lock_wq); 2995 + 2994 2996 #if LEAK_DEBUG 2995 2997 spin_lock_irqsave(&leak_lock, flags); 2996 2998 list_add(&eb->leak_list, &buffers); ··· 3073 3071 unlock_page(p); 3074 3072 } 3075 3073 if (uptodate) 3076 - eb->flags |= EXTENT_UPTODATE; 3077 - eb->flags |= EXTENT_BUFFER_FILLED; 3074 + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3078 3075 3079 3076 spin_lock(&tree->buffer_lock); 3080 3077 exists = buffer_tree_insert(tree, start, &eb->rb_node); ··· 3227 3226 unsigned long num_pages; 3228 3227 3229 3228 num_pages = num_extent_pages(eb->start, eb->len); 3230 - eb->flags &= ~EXTENT_UPTODATE; 3229 + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3231 3230 3232 3231 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3233 3232 GFP_NOFS); ··· 3298 3297 struct page *page; 3299 3298 int pg_uptodate = 1; 3300 3299 3301 - if (eb->flags & EXTENT_UPTODATE) 3300 + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3302 3301 return 1; 3303 3302 3304 3303 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, ··· 3334 3333 struct bio *bio = NULL; 3335 3334 unsigned long bio_flags = 0; 3336 3335 3337 - if (eb->flags & EXTENT_UPTODATE) 3336 + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3338 3337 return 0; 3339 3338 3340 3339 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, ··· 3365 3364 } 3366 3365 if (all_uptodate) { 3367 3366 if (start_i == 0) 3368 - eb->flags |= EXTENT_UPTODATE; 3367 + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3369 3368 goto unlock_exit; 3370 3369 } 3371 3370 ··· 3401 3400 } 3402 3401 3403 3402 if (!ret) 3404 - eb->flags |= EXTENT_UPTODATE; 3403 + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3405 3404 return ret; 3406 3405 3407 3406 unlock_exit: ··· 3498 3497 unmap_extent_buffer(eb, eb->map_token, km); 3499 3498 eb->map_token = NULL; 3500 3499 save = 1; 3501 - WARN_ON(!mutex_is_locked(&eb->mutex)); 3502 3500 } 3503 3501 err = map_private_extent_buffer(eb, start, min_len, token, map, 3504 3502 map_start, map_len, km);
+14 -2
fs/btrfs/extent_io.h
··· 22 22 /* flags for bio submission */ 23 23 #define EXTENT_BIO_COMPRESSED 1 24 24 25 + /* these are bit numbers for test/set bit */ 26 + #define EXTENT_BUFFER_UPTODATE 0 27 + #define EXTENT_BUFFER_BLOCKING 1 28 + 25 29 /* 26 30 * page->private values. Every page that is controlled by the extent 27 31 * map has page->private set to one. ··· 99 95 unsigned long map_start; 100 96 unsigned long map_len; 101 97 struct page *first_page; 98 + unsigned long bflags; 102 99 atomic_t refs; 103 - int flags; 104 100 struct list_head leak_list; 105 101 struct rb_node rb_node; 106 - struct mutex mutex; 102 + 103 + /* the spinlock is used to protect most operations */ 104 + spinlock_t lock; 105 + 106 + /* 107 + * when we keep the lock held while blocking, waiters go onto 108 + * the wq 109 + */ 110 + wait_queue_head_t lock_wq; 107 111 }; 108 112 109 113 struct extent_map_tree;
+3
fs/btrfs/inode.c
··· 50 50 #include "tree-log.h" 51 51 #include "ref-cache.h" 52 52 #include "compression.h" 53 + #include "locking.h" 53 54 54 55 struct btrfs_iget_args { 55 56 u64 ino; ··· 2022 2021 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2023 2022 2024 2023 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2024 + 2025 2025 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2026 2026 alloc_group_block, 0); 2027 2027 btrfs_free_path(path); ··· 2119 2117 goto failed; 2120 2118 } 2121 2119 2120 + btrfs_unlock_up_safe(path, 1); 2122 2121 leaf = path->nodes[0]; 2123 2122 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2124 2123 struct btrfs_inode_item);
+191 -19
fs/btrfs/locking.c
··· 26 26 #include "locking.h" 27 27 28 28 /* 29 - * locks the per buffer mutex in an extent buffer. This uses adaptive locks 30 - * and the spin is not tuned very extensively. The spinning does make a big 31 - * difference in almost every workload, but spinning for the right amount of 32 - * time needs some help. 33 - * 34 - * In general, we want to spin as long as the lock holder is doing btree 35 - * searches, and we should give up if they are in more expensive code. 29 + * btrfs_header_level() isn't free, so don't call it when lockdep isn't 30 + * on 36 31 */ 32 + #ifdef CONFIG_DEBUG_LOCK_ALLOC 33 + static inline void spin_nested(struct extent_buffer *eb) 34 + { 35 + spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); 36 + } 37 + #else 38 + static inline void spin_nested(struct extent_buffer *eb) 39 + { 40 + spin_lock(&eb->lock); 41 + } 42 + #endif 37 43 38 - int btrfs_tree_lock(struct extent_buffer *eb) 44 + /* 45 + * Setting a lock to blocking will drop the spinlock and set the 46 + * flag that forces other procs who want the lock to wait. After 47 + * this you can safely schedule with the lock held. 48 + */ 49 + void btrfs_set_lock_blocking(struct extent_buffer *eb) 50 + { 51 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 52 + set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 53 + spin_unlock(&eb->lock); 54 + } 55 + /* exit with the spin lock released and the bit set */ 56 + } 57 + 58 + /* 59 + * clearing the blocking flag will take the spinlock again. 60 + * After this you can't safely schedule 61 + */ 62 + void btrfs_clear_lock_blocking(struct extent_buffer *eb) 63 + { 64 + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 65 + spin_nested(eb); 66 + clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 67 + smp_mb__after_clear_bit(); 68 + } 69 + /* exit with the spin lock held */ 70 + } 71 + 72 + /* 73 + * unfortunately, many of the places that currently set a lock to blocking 74 + * don't end up blocking for every long, and often they don't block 75 + * at all. For a dbench 50 run, if we don't spin one the blocking bit 76 + * at all, the context switch rate can jump up to 400,000/sec or more. 77 + * 78 + * So, we're still stuck with this crummy spin on the blocking bit, 79 + * at least until the most common causes of the short blocks 80 + * can be dealt with. 81 + */ 82 + static int btrfs_spin_on_block(struct extent_buffer *eb) 39 83 { 40 84 int i; 41 - 42 - if (mutex_trylock(&eb->mutex)) 43 - return 0; 44 85 for (i = 0; i < 512; i++) { 45 86 cpu_relax(); 46 - if (mutex_trylock(&eb->mutex)) 47 - return 0; 87 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 88 + return 1; 89 + if (need_resched()) 90 + break; 48 91 } 49 - cpu_relax(); 50 - mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); 51 92 return 0; 52 93 } 53 94 95 + /* 96 + * This is somewhat different from trylock. It will take the 97 + * spinlock but if it finds the lock is set to blocking, it will 98 + * return without the lock held. 
99 + * 100 + * returns 1 if it was able to take the lock and zero otherwise 101 + * 102 + * After this call, scheduling is not safe without first calling 103 + * btrfs_set_lock_blocking() 104 + */ 105 + int btrfs_try_spin_lock(struct extent_buffer *eb) 106 + { 107 + int i; 108 + 109 + spin_nested(eb); 110 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 111 + return 1; 112 + spin_unlock(&eb->lock); 113 + 114 + /* spin for a bit on the BLOCKING flag */ 115 + for (i = 0; i < 2; i++) { 116 + if (!btrfs_spin_on_block(eb)) 117 + break; 118 + 119 + spin_nested(eb); 120 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 121 + return 1; 122 + spin_unlock(&eb->lock); 123 + } 124 + return 0; 125 + } 126 + 127 + /* 128 + * the autoremove wake function will return 0 if it tried to wake up 129 + * a process that was already awake, which means that process won't 130 + * count as an exclusive wakeup. The waitq code will continue waking 131 + * procs until it finds one that was actually sleeping. 132 + * 133 + * For btrfs, this isn't quite what we want. We want a single proc 134 + * to be notified that the lock is ready for taking. If that proc 135 + * already happen to be awake, great, it will loop around and try for 136 + * the lock. 137 + * 138 + * So, btrfs_wake_function always returns 1, even when the proc that we 139 + * tried to wake up was already awake. 140 + */ 141 + static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, 142 + int sync, void *key) 143 + { 144 + autoremove_wake_function(wait, mode, sync, key); 145 + return 1; 146 + } 147 + 148 + /* 149 + * returns with the extent buffer spinlocked. 150 + * 151 + * This will spin and/or wait as required to take the lock, and then 152 + * return with the spinlock held. 153 + * 154 + * After this call, scheduling is not safe without first calling 155 + * btrfs_set_lock_blocking() 156 + */ 157 + int btrfs_tree_lock(struct extent_buffer *eb) 158 + { 159 + DEFINE_WAIT(wait); 160 + wait.func = btrfs_wake_function; 161 + 162 + while(1) { 163 + spin_nested(eb); 164 + 165 + /* nobody is blocking, exit with the spinlock held */ 166 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 167 + return 0; 168 + 169 + /* 170 + * we have the spinlock, but the real owner is blocking. 171 + * wait for them 172 + */ 173 + spin_unlock(&eb->lock); 174 + 175 + /* 176 + * spin for a bit, and if the blocking flag goes away, 177 + * loop around 178 + */ 179 + if (btrfs_spin_on_block(eb)) 180 + continue; 181 + 182 + prepare_to_wait_exclusive(&eb->lock_wq, &wait, 183 + TASK_UNINTERRUPTIBLE); 184 + 185 + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 186 + schedule(); 187 + 188 + finish_wait(&eb->lock_wq, &wait); 189 + } 190 + return 0; 191 + } 192 + 193 + /* 194 + * Very quick trylock, this does not spin or schedule. It returns 195 + * 1 with the spinlock held if it was able to take the lock, or it 196 + * returns zero if it was unable to take the lock. 197 + * 198 + * After this call, scheduling is not safe without first calling 199 + * btrfs_set_lock_blocking() 200 + */ 54 201 int btrfs_try_tree_lock(struct extent_buffer *eb) 55 202 { 56 - return mutex_trylock(&eb->mutex); 203 + if (spin_trylock(&eb->lock)) { 204 + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 205 + /* 206 + * we've got the spinlock, but the real owner is 207 + * blocking. 
Drop the spinlock and return failure 208 + */ 209 + spin_unlock(&eb->lock); 210 + return 0; 211 + } 212 + return 1; 213 + } 214 + /* someone else has the spinlock giveup */ 215 + return 0; 57 216 } 58 217 59 218 int btrfs_tree_unlock(struct extent_buffer *eb) 60 219 { 61 - mutex_unlock(&eb->mutex); 220 + /* 221 + * if we were a blocking owner, we don't have the spinlock held 222 + * just clear the bit and look for waiters 223 + */ 224 + if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 225 + smp_mb__after_clear_bit(); 226 + else 227 + spin_unlock(&eb->lock); 228 + 229 + if (waitqueue_active(&eb->lock_wq)) 230 + wake_up(&eb->lock_wq); 62 231 return 0; 63 232 } 64 233 65 234 int btrfs_tree_locked(struct extent_buffer *eb) 66 235 { 67 - return mutex_is_locked(&eb->mutex); 236 + return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || 237 + spin_is_locked(&eb->lock); 68 238 } 69 239 70 240 /* ··· 245 75 { 246 76 int i; 247 77 struct extent_buffer *eb; 78 + 248 79 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { 249 80 eb = path->nodes[i]; 250 81 if (!eb) 251 82 break; 252 83 smp_mb(); 253 - if (!list_empty(&eb->mutex.wait_list)) 84 + if (spin_is_contended(&eb->lock) || 85 + waitqueue_active(&eb->lock_wq)) 254 86 return 1; 255 87 } 256 88 return 0;
+6
fs/btrfs/locking.h
··· 22 22 int btrfs_tree_lock(struct extent_buffer *eb); 23 23 int btrfs_tree_unlock(struct extent_buffer *eb); 24 24 int btrfs_tree_locked(struct extent_buffer *eb); 25 + 25 26 int btrfs_try_tree_lock(struct extent_buffer *eb); 27 + int btrfs_try_spin_lock(struct extent_buffer *eb); 28 + 26 29 int btrfs_path_lock_waiting(struct btrfs_path *path, int level); 30 + 31 + void btrfs_set_lock_blocking(struct extent_buffer *eb); 32 + void btrfs_clear_lock_blocking(struct extent_buffer *eb); 27 33 #endif
+1
fs/btrfs/tree-defrag.c
··· 74 74 u32 nritems; 75 75 76 76 root_node = btrfs_lock_root_node(root); 77 + btrfs_set_lock_blocking(root_node); 77 78 nritems = btrfs_header_nritems(root_node); 78 79 root->defrag_max.objectid = 0; 79 80 /* from above we know this is not a leaf */
+4
fs/btrfs/tree-log.c
··· 1615 1615 1616 1616 btrfs_tree_lock(next); 1617 1617 clean_tree_block(trans, root, next); 1618 + btrfs_set_lock_blocking(next); 1618 1619 btrfs_wait_tree_block_writeback(next); 1619 1620 btrfs_tree_unlock(next); 1620 1621 ··· 1662 1661 next = path->nodes[*level]; 1663 1662 btrfs_tree_lock(next); 1664 1663 clean_tree_block(trans, root, next); 1664 + btrfs_set_lock_blocking(next); 1665 1665 btrfs_wait_tree_block_writeback(next); 1666 1666 btrfs_tree_unlock(next); 1667 1667 ··· 1720 1718 1721 1719 btrfs_tree_lock(next); 1722 1720 clean_tree_block(trans, root, next); 1721 + btrfs_set_lock_blocking(next); 1723 1722 btrfs_wait_tree_block_writeback(next); 1724 1723 btrfs_tree_unlock(next); 1725 1724 ··· 1793 1790 1794 1791 btrfs_tree_lock(next); 1795 1792 clean_tree_block(trans, log, next); 1793 + btrfs_set_lock_blocking(next); 1796 1794 btrfs_wait_tree_block_writeback(next); 1797 1795 btrfs_tree_unlock(next); 1798 1796