Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: switch the btrfs tree locks to reader/writer

The btrfs metadata btree is the source of significant
lock contention, especially in the root node. This
commit changes our locking to use a reader/writer
lock.

The lock is built on top of rw spinlocks, and it
extends the lock tracking to remember whether we hold a
read lock or a write lock when we go blocking. Atomics
count the number of blocking readers or writers at any
given time.

It removes all of the adaptive spinning from the old code
and uses only the spinning/blocking hints inside of btrfs
to decide when it should continue spinning.

In read heavy workloads this is dramatically faster. In write
heavy workloads we're still faster because of less contention
on the root node lock.

We suffer slightly in dbench because we schedule more often
during write locks, but all other benchmarks so far are improved.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

+439 -226
+209 -59
fs/btrfs/ctree.c
··· 54 54 { 55 55 int i; 56 56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 57 - if (p->nodes[i] && p->locks[i]) 58 - btrfs_set_lock_blocking(p->nodes[i]); 57 + if (!p->nodes[i] || !p->locks[i]) 58 + continue; 59 + btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); 60 + if (p->locks[i] == BTRFS_READ_LOCK) 61 + p->locks[i] = BTRFS_READ_LOCK_BLOCKING; 62 + else if (p->locks[i] == BTRFS_WRITE_LOCK) 63 + p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; 59 64 } 60 65 } 61 66 ··· 73 68 * for held 74 69 */ 75 70 noinline void btrfs_clear_path_blocking(struct btrfs_path *p, 76 - struct extent_buffer *held) 71 + struct extent_buffer *held, int held_rw) 77 72 { 78 73 int i; 79 74 ··· 84 79 * really sure by forcing the path to blocking before we clear 85 80 * the path blocking. 86 81 */ 87 - if (held) 88 - btrfs_set_lock_blocking(held); 82 + if (held) { 83 + btrfs_set_lock_blocking_rw(held, held_rw); 84 + if (held_rw == BTRFS_WRITE_LOCK) 85 + held_rw = BTRFS_WRITE_LOCK_BLOCKING; 86 + else if (held_rw == BTRFS_READ_LOCK) 87 + held_rw = BTRFS_READ_LOCK_BLOCKING; 88 + } 89 89 btrfs_set_path_blocking(p); 90 90 #endif 91 91 92 92 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { 93 - if (p->nodes[i] && p->locks[i]) 94 - btrfs_clear_lock_blocking(p->nodes[i]); 93 + if (p->nodes[i] && p->locks[i]) { 94 + btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); 95 + if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) 96 + p->locks[i] = BTRFS_WRITE_LOCK; 97 + else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) 98 + p->locks[i] = BTRFS_READ_LOCK; 99 + } 95 100 } 96 101 97 102 #ifdef CONFIG_DEBUG_LOCK_ALLOC 98 103 if (held) 99 - btrfs_clear_lock_blocking(held); 104 + btrfs_clear_lock_blocking_rw(held, held_rw); 100 105 #endif 101 106 } 102 107 ··· 134 119 if (!p->nodes[i]) 135 120 continue; 136 121 if (p->locks[i]) { 137 - btrfs_tree_unlock(p->nodes[i]); 122 + btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); 138 123 p->locks[i] = 0; 139 124 } 140 125 free_extent_buffer(p->nodes[i]); ··· 177 162 if (eb == 
root->node) 178 163 break; 179 164 btrfs_tree_unlock(eb); 165 + free_extent_buffer(eb); 166 + } 167 + return eb; 168 + } 169 + 170 + /* loop around taking references on and locking the root node of the 171 + * tree until you end up with a lock on the root. A locked buffer 172 + * is returned, with a reference held. 173 + */ 174 + struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) 175 + { 176 + struct extent_buffer *eb; 177 + 178 + while (1) { 179 + eb = btrfs_root_node(root); 180 + btrfs_tree_read_lock(eb); 181 + if (eb == root->node) 182 + break; 183 + btrfs_tree_read_unlock(eb); 180 184 free_extent_buffer(eb); 181 185 } 182 186 return eb; ··· 896 862 897 863 mid = path->nodes[level]; 898 864 899 - WARN_ON(!path->locks[level]); 865 + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && 866 + path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); 900 867 WARN_ON(btrfs_header_generation(mid) != trans->transid); 901 868 902 869 orig_ptr = btrfs_node_blockptr(mid, orig_slot); ··· 1395 1360 1396 1361 t = path->nodes[i]; 1397 1362 if (i >= lowest_unlock && i > skip_level && path->locks[i]) { 1398 - btrfs_tree_unlock(t); 1363 + btrfs_tree_unlock_rw(t, path->locks[i]); 1399 1364 path->locks[i] = 0; 1400 1365 } 1401 1366 } ··· 1422 1387 continue; 1423 1388 if (!path->locks[i]) 1424 1389 continue; 1425 - btrfs_tree_unlock(path->nodes[i]); 1390 + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 1426 1391 path->locks[i] = 0; 1427 1392 } 1428 1393 } ··· 1471 1436 * we can trust our generation number 1472 1437 */ 1473 1438 free_extent_buffer(tmp); 1439 + btrfs_set_path_blocking(p); 1440 + 1474 1441 tmp = read_tree_block(root, blocknr, blocksize, gen); 1475 1442 if (tmp && btrfs_buffer_uptodate(tmp, gen)) { 1476 1443 *eb_ret = tmp; ··· 1528 1491 static int 1529 1492 setup_nodes_for_search(struct btrfs_trans_handle *trans, 1530 1493 struct btrfs_root *root, struct btrfs_path *p, 1531 - struct extent_buffer *b, int level, int ins_len) 1494 + struct extent_buffer 
*b, int level, int ins_len, 1495 + int *write_lock_level) 1532 1496 { 1533 1497 int ret; 1534 1498 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1535 1499 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1536 1500 int sret; 1501 + 1502 + if (*write_lock_level < level + 1) { 1503 + *write_lock_level = level + 1; 1504 + btrfs_release_path(p); 1505 + goto again; 1506 + } 1537 1507 1538 1508 sret = reada_for_balance(root, p, level); 1539 1509 if (sret) ··· 1548 1504 1549 1505 btrfs_set_path_blocking(p); 1550 1506 sret = split_node(trans, root, p, level); 1551 - btrfs_clear_path_blocking(p, NULL); 1507 + btrfs_clear_path_blocking(p, NULL, 0); 1552 1508 1553 1509 BUG_ON(sret > 0); 1554 1510 if (sret) { ··· 1560 1516 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { 1561 1517 int sret; 1562 1518 1519 + if (*write_lock_level < level + 1) { 1520 + *write_lock_level = level + 1; 1521 + btrfs_release_path(p); 1522 + goto again; 1523 + } 1524 + 1563 1525 sret = reada_for_balance(root, p, level); 1564 1526 if (sret) 1565 1527 goto again; 1566 1528 1567 1529 btrfs_set_path_blocking(p); 1568 1530 sret = balance_level(trans, root, p, level); 1569 - btrfs_clear_path_blocking(p, NULL); 1531 + btrfs_clear_path_blocking(p, NULL, 0); 1570 1532 1571 1533 if (sret) { 1572 1534 ret = sret; ··· 1616 1566 int err; 1617 1567 int level; 1618 1568 int lowest_unlock = 1; 1569 + int root_lock; 1570 + /* everything at write_lock_level or lower must be write locked */ 1571 + int write_lock_level = 0; 1619 1572 u8 lowest_level = 0; 1620 1573 1621 1574 lowest_level = p->lowest_level; 1622 1575 WARN_ON(lowest_level && ins_len > 0); 1623 1576 WARN_ON(p->nodes[0] != NULL); 1624 1577 1625 - if (ins_len < 0) 1578 + if (ins_len < 0) { 1626 1579 lowest_unlock = 2; 1627 1580 1581 + /* when we are removing items, we might have to go up to level 1582 + * two as we update tree pointers Make sure we keep write 1583 + * for those levels as well 1584 + */ 1585 + write_lock_level = 2; 1586 + } else if (ins_len > 
0) { 1587 + /* 1588 + * for inserting items, make sure we have a write lock on 1589 + * level 1 so we can update keys 1590 + */ 1591 + write_lock_level = 1; 1592 + } 1593 + 1594 + if (!cow) 1595 + write_lock_level = -1; 1596 + 1597 + if (cow && (p->keep_locks || p->lowest_level)) 1598 + write_lock_level = BTRFS_MAX_LEVEL; 1599 + 1628 1600 again: 1601 + /* 1602 + * we try very hard to do read locks on the root 1603 + */ 1604 + root_lock = BTRFS_READ_LOCK; 1605 + level = 0; 1629 1606 if (p->search_commit_root) { 1607 + /* 1608 + * the commit roots are read only 1609 + * so we always do read locks 1610 + */ 1630 1611 b = root->commit_root; 1631 1612 extent_buffer_get(b); 1613 + level = btrfs_header_level(b); 1632 1614 if (!p->skip_locking) 1633 - btrfs_tree_lock(b); 1615 + btrfs_tree_read_lock(b); 1634 1616 } else { 1635 - if (p->skip_locking) 1617 + if (p->skip_locking) { 1636 1618 b = btrfs_root_node(root); 1637 - else 1638 - b = btrfs_lock_root_node(root); 1619 + level = btrfs_header_level(b); 1620 + } else { 1621 + /* we don't know the level of the root node 1622 + * until we actually have it read locked 1623 + */ 1624 + b = btrfs_read_lock_root_node(root); 1625 + level = btrfs_header_level(b); 1626 + if (level <= write_lock_level) { 1627 + /* whoops, must trade for write lock */ 1628 + btrfs_tree_read_unlock(b); 1629 + free_extent_buffer(b); 1630 + b = btrfs_lock_root_node(root); 1631 + root_lock = BTRFS_WRITE_LOCK; 1632 + 1633 + /* the level might have changed, check again */ 1634 + level = btrfs_header_level(b); 1635 + } 1636 + } 1639 1637 } 1638 + p->nodes[level] = b; 1639 + if (!p->skip_locking) 1640 + p->locks[level] = root_lock; 1640 1641 1641 1642 while (b) { 1642 1643 level = btrfs_header_level(b); ··· 1696 1595 * setup the path here so we can release it under lock 1697 1596 * contention with the cow code 1698 1597 */ 1699 - p->nodes[level] = b; 1700 - if (!p->skip_locking) 1701 - p->locks[level] = 1; 1702 - 1703 1598 if (cow) { 1704 1599 /* 1705 1600 * 
if we don't really need to cow this block ··· 1706 1609 goto cow_done; 1707 1610 1708 1611 btrfs_set_path_blocking(p); 1612 + 1613 + /* 1614 + * must have write locks on this node and the 1615 + * parent 1616 + */ 1617 + if (level + 1 > write_lock_level) { 1618 + write_lock_level = level + 1; 1619 + btrfs_release_path(p); 1620 + goto again; 1621 + } 1709 1622 1710 1623 err = btrfs_cow_block(trans, root, b, 1711 1624 p->nodes[level + 1], ··· 1729 1622 BUG_ON(!cow && ins_len); 1730 1623 1731 1624 p->nodes[level] = b; 1732 - if (!p->skip_locking) 1733 - p->locks[level] = 1; 1734 - 1735 - btrfs_clear_path_blocking(p, NULL); 1625 + btrfs_clear_path_blocking(p, NULL, 0); 1736 1626 1737 1627 /* 1738 1628 * we have a lock on b and as long as we aren't changing ··· 1755 1651 } 1756 1652 p->slots[level] = slot; 1757 1653 err = setup_nodes_for_search(trans, root, p, b, level, 1758 - ins_len); 1654 + ins_len, &write_lock_level); 1759 1655 if (err == -EAGAIN) 1760 1656 goto again; 1761 1657 if (err) { ··· 1764 1660 } 1765 1661 b = p->nodes[level]; 1766 1662 slot = p->slots[level]; 1663 + 1664 + /* 1665 + * slot 0 is special, if we change the key 1666 + * we have to update the parent pointer 1667 + * which means we must have a write lock 1668 + * on the parent 1669 + */ 1670 + if (slot == 0 && cow && 1671 + write_lock_level < level + 1) { 1672 + write_lock_level = level + 1; 1673 + btrfs_release_path(p); 1674 + goto again; 1675 + } 1767 1676 1768 1677 unlock_up(p, level, lowest_unlock); 1769 1678 ··· 1796 1679 } 1797 1680 1798 1681 if (!p->skip_locking) { 1799 - btrfs_clear_path_blocking(p, NULL); 1800 - err = btrfs_try_spin_lock(b); 1801 - 1802 - if (!err) { 1803 - btrfs_set_path_blocking(p); 1804 - btrfs_tree_lock(b); 1805 - btrfs_clear_path_blocking(p, b); 1682 + level = btrfs_header_level(b); 1683 + if (level <= write_lock_level) { 1684 + err = btrfs_try_tree_write_lock(b); 1685 + if (!err) { 1686 + btrfs_set_path_blocking(p); 1687 + btrfs_tree_lock(b); 1688 + 
btrfs_clear_path_blocking(p, b, 1689 + BTRFS_WRITE_LOCK); 1690 + } 1691 + p->locks[level] = BTRFS_WRITE_LOCK; 1692 + } else { 1693 + err = btrfs_try_tree_read_lock(b); 1694 + if (!err) { 1695 + btrfs_set_path_blocking(p); 1696 + btrfs_tree_read_lock(b); 1697 + btrfs_clear_path_blocking(p, b, 1698 + BTRFS_READ_LOCK); 1699 + } 1700 + p->locks[level] = BTRFS_READ_LOCK; 1806 1701 } 1702 + p->nodes[level] = b; 1807 1703 } 1808 1704 } else { 1809 1705 p->slots[level] = slot; 1810 1706 if (ins_len > 0 && 1811 1707 btrfs_leaf_free_space(root, b) < ins_len) { 1708 + if (write_lock_level < 1) { 1709 + write_lock_level = 1; 1710 + btrfs_release_path(p); 1711 + goto again; 1712 + } 1713 + 1812 1714 btrfs_set_path_blocking(p); 1813 1715 err = split_leaf(trans, root, key, 1814 1716 p, ins_len, ret == 0); 1815 - btrfs_clear_path_blocking(p, NULL); 1717 + btrfs_clear_path_blocking(p, NULL, 0); 1816 1718 1817 1719 BUG_ON(err > 0); 1818 1720 if (err) { ··· 2112 1976 add_root_to_dirty_list(root); 2113 1977 extent_buffer_get(c); 2114 1978 path->nodes[level] = c; 2115 - path->locks[level] = 1; 1979 + path->locks[level] = BTRFS_WRITE_LOCK; 2116 1980 path->slots[level] = 0; 2117 1981 return 0; 2118 1982 } ··· 3955 3819 3956 3820 WARN_ON(!path->keep_locks); 3957 3821 again: 3958 - cur = btrfs_lock_root_node(root); 3822 + cur = btrfs_read_lock_root_node(root); 3959 3823 level = btrfs_header_level(cur); 3960 3824 WARN_ON(path->nodes[level]); 3961 3825 path->nodes[level] = cur; 3962 - path->locks[level] = 1; 3826 + path->locks[level] = BTRFS_READ_LOCK; 3963 3827 3964 3828 if (btrfs_header_generation(cur) < min_trans) { 3965 3829 ret = 1; ··· 4049 3913 cur = read_node_slot(root, cur, slot); 4050 3914 BUG_ON(!cur); 4051 3915 4052 - btrfs_tree_lock(cur); 3916 + btrfs_tree_read_lock(cur); 4053 3917 4054 - path->locks[level - 1] = 1; 3918 + path->locks[level - 1] = BTRFS_READ_LOCK; 4055 3919 path->nodes[level - 1] = cur; 4056 3920 unlock_up(path, level, 1); 4057 - btrfs_clear_path_blocking(path, 
NULL); 3921 + btrfs_clear_path_blocking(path, NULL, 0); 4058 3922 } 4059 3923 out: 4060 3924 if (ret == 0) ··· 4170 4034 int ret; 4171 4035 int old_spinning = path->leave_spinning; 4172 4036 int force_blocking = 0; 4037 + int next_rw_lock = 0; 4173 4038 4174 4039 nritems = btrfs_header_nritems(path->nodes[0]); 4175 4040 if (nritems == 0) ··· 4188 4051 again: 4189 4052 level = 1; 4190 4053 next = NULL; 4054 + next_rw_lock = 0; 4191 4055 btrfs_release_path(path); 4192 4056 4193 4057 path->keep_locks = 1; ··· 4234 4096 } 4235 4097 4236 4098 if (next) { 4237 - btrfs_tree_unlock(next); 4099 + btrfs_tree_unlock_rw(next, next_rw_lock); 4238 4100 free_extent_buffer(next); 4239 4101 } 4240 4102 4241 4103 next = c; 4104 + next_rw_lock = path->locks[level]; 4242 4105 ret = read_block_for_search(NULL, root, path, &next, level, 4243 4106 slot, &key); 4244 4107 if (ret == -EAGAIN) ··· 4251 4112 } 4252 4113 4253 4114 if (!path->skip_locking) { 4254 - ret = btrfs_try_spin_lock(next); 4115 + ret = btrfs_try_tree_read_lock(next); 4255 4116 if (!ret) { 4256 4117 btrfs_set_path_blocking(path); 4257 - btrfs_tree_lock(next); 4258 - if (!force_blocking) 4259 - btrfs_clear_path_blocking(path, next); 4118 + btrfs_tree_read_lock(next); 4119 + if (!force_blocking) { 4120 + btrfs_clear_path_blocking(path, next, 4121 + BTRFS_READ_LOCK); 4122 + } 4260 4123 } 4261 - if (force_blocking) 4262 - btrfs_set_lock_blocking(next); 4124 + if (force_blocking) { 4125 + btrfs_set_lock_blocking_rw(next, 4126 + BTRFS_READ_LOCK); 4127 + next_rw_lock = BTRFS_READ_LOCK_BLOCKING; 4128 + } else { 4129 + next_rw_lock = BTRFS_READ_LOCK; 4130 + } 4263 4131 } 4264 4132 break; 4265 4133 } ··· 4275 4129 level--; 4276 4130 c = path->nodes[level]; 4277 4131 if (path->locks[level]) 4278 - btrfs_tree_unlock(c); 4132 + btrfs_tree_unlock_rw(c, path->locks[level]); 4279 4133 4280 4134 free_extent_buffer(c); 4281 4135 path->nodes[level] = next; 4282 4136 path->slots[level] = 0; 4283 4137 if (!path->skip_locking) 4284 - 
path->locks[level] = 1; 4285 - 4138 + path->locks[level] = next_rw_lock; 4286 4139 if (!level) 4287 4140 break; 4288 4141 ··· 4296 4151 } 4297 4152 4298 4153 if (!path->skip_locking) { 4299 - btrfs_assert_tree_locked(path->nodes[level]); 4300 - ret = btrfs_try_spin_lock(next); 4154 + ret = btrfs_try_tree_read_lock(next); 4301 4155 if (!ret) { 4302 4156 btrfs_set_path_blocking(path); 4303 - btrfs_tree_lock(next); 4157 + btrfs_tree_read_lock(next); 4304 4158 if (!force_blocking) 4305 - btrfs_clear_path_blocking(path, next); 4159 + btrfs_clear_path_blocking(path, next, 4160 + BTRFS_READ_LOCK); 4306 4161 } 4307 - if (force_blocking) 4308 - btrfs_set_lock_blocking(next); 4162 + if (force_blocking) { 4163 + btrfs_set_lock_blocking_rw(next, 4164 + BTRFS_READ_LOCK); 4165 + next_rw_lock = BTRFS_READ_LOCK_BLOCKING; 4166 + } else { 4167 + next_rw_lock = BTRFS_READ_LOCK; 4168 + } 4309 4169 } 4310 4170 } 4311 4171 ret = 0;
+1 -1
fs/btrfs/ctree.h
··· 2333 2333 void btrfs_free_path(struct btrfs_path *p); 2334 2334 void btrfs_set_path_blocking(struct btrfs_path *p); 2335 2335 void btrfs_clear_path_blocking(struct btrfs_path *p, 2336 - struct extent_buffer *held); 2336 + struct extent_buffer *held, int held_rw); 2337 2337 void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 2338 2338 2339 2339 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+1 -1
fs/btrfs/delayed-inode.c
··· 735 735 } 736 736 737 737 /* reset all the locked nodes in the patch to spinning locks. */ 738 - btrfs_clear_path_blocking(path, NULL); 738 + btrfs_clear_path_blocking(path, NULL, 0); 739 739 740 740 /* insert the keys of the items */ 741 741 ret = setup_items_for_insert(trans, root, path, keys, data_size,
+10 -10
fs/btrfs/extent-tree.c
··· 5912 5912 return 1; 5913 5913 5914 5914 if (path->locks[level] && !wc->keep_locks) { 5915 - btrfs_tree_unlock(eb); 5915 + btrfs_tree_unlock_rw(eb, path->locks[level]); 5916 5916 path->locks[level] = 0; 5917 5917 } 5918 5918 return 0; ··· 5936 5936 * keep the tree lock 5937 5937 */ 5938 5938 if (path->locks[level] && level > 0) { 5939 - btrfs_tree_unlock(eb); 5939 + btrfs_tree_unlock_rw(eb, path->locks[level]); 5940 5940 path->locks[level] = 0; 5941 5941 } 5942 5942 return 0; ··· 6049 6049 BUG_ON(level != btrfs_header_level(next)); 6050 6050 path->nodes[level] = next; 6051 6051 path->slots[level] = 0; 6052 - path->locks[level] = 1; 6052 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 6053 6053 wc->level = level; 6054 6054 if (wc->level == 1) 6055 6055 wc->reada_slot = 0; ··· 6120 6120 BUG_ON(level == 0); 6121 6121 btrfs_tree_lock(eb); 6122 6122 btrfs_set_lock_blocking(eb); 6123 - path->locks[level] = 1; 6123 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 6124 6124 6125 6125 ret = btrfs_lookup_extent_info(trans, root, 6126 6126 eb->start, eb->len, ··· 6129 6129 BUG_ON(ret); 6130 6130 BUG_ON(wc->refs[level] == 0); 6131 6131 if (wc->refs[level] == 1) { 6132 - btrfs_tree_unlock(eb); 6133 - path->locks[level] = 0; 6132 + btrfs_tree_unlock_rw(eb, path->locks[level]); 6134 6133 return 1; 6135 6134 } 6136 6135 } ··· 6151 6152 btrfs_header_generation(eb) == trans->transid) { 6152 6153 btrfs_tree_lock(eb); 6153 6154 btrfs_set_lock_blocking(eb); 6154 - path->locks[level] = 1; 6155 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 6155 6156 } 6156 6157 clean_tree_block(trans, root, eb); 6157 6158 } ··· 6230 6231 return 0; 6231 6232 6232 6233 if (path->locks[level]) { 6233 - btrfs_tree_unlock(path->nodes[level]); 6234 + btrfs_tree_unlock_rw(path->nodes[level], 6235 + path->locks[level]); 6234 6236 path->locks[level] = 0; 6235 6237 } 6236 6238 free_extent_buffer(path->nodes[level]); ··· 6283 6283 path->nodes[level] = btrfs_lock_root_node(root); 6284 6284 
btrfs_set_lock_blocking(path->nodes[level]); 6285 6285 path->slots[level] = 0; 6286 - path->locks[level] = 1; 6286 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 6287 6287 memset(&wc->update_progress, 0, 6288 6288 sizeof(wc->update_progress)); 6289 6289 } else { ··· 6451 6451 level = btrfs_header_level(node); 6452 6452 path->nodes[level] = node; 6453 6453 path->slots[level] = 0; 6454 - path->locks[level] = 1; 6454 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 6455 6455 6456 6456 wc->refs[parent_level] = 1; 6457 6457 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+9 -2
fs/btrfs/extent_io.c
··· 3017 3017 return NULL; 3018 3018 eb->start = start; 3019 3019 eb->len = len; 3020 - spin_lock_init(&eb->lock); 3021 - init_waitqueue_head(&eb->lock_wq); 3020 + rwlock_init(&eb->lock); 3021 + atomic_set(&eb->write_locks, 0); 3022 + atomic_set(&eb->read_locks, 0); 3023 + atomic_set(&eb->blocking_readers, 0); 3024 + atomic_set(&eb->blocking_writers, 0); 3025 + atomic_set(&eb->spinning_readers, 0); 3026 + atomic_set(&eb->spinning_writers, 0); 3027 + init_waitqueue_head(&eb->write_lock_wq); 3028 + init_waitqueue_head(&eb->read_lock_wq); 3022 3029 3023 3030 #if LEAK_DEBUG 3024 3031 spin_lock_irqsave(&leak_lock, flags);
+18 -6
fs/btrfs/extent_io.h
··· 128 128 struct rcu_head rcu_head; 129 129 atomic_t refs; 130 130 131 - /* the spinlock is used to protect most operations */ 132 - spinlock_t lock; 131 + /* count of read lock holders on the extent buffer */ 132 + atomic_t write_locks; 133 + atomic_t read_locks; 134 + atomic_t blocking_writers; 135 + atomic_t blocking_readers; 136 + atomic_t spinning_readers; 137 + atomic_t spinning_writers; 133 138 134 - /* 135 - * when we keep the lock held while blocking, waiters go onto 136 - * the wq 139 + /* protects write locks */ 140 + rwlock_t lock; 141 + 142 + /* readers use lock_wq while they wait for the write 143 + * lock holders to unlock 137 144 */ 138 - wait_queue_head_t lock_wq; 145 + wait_queue_head_t write_lock_wq; 146 + 147 + /* writers use read_lock_wq while they wait for readers 148 + * to unlock 149 + */ 150 + wait_queue_head_t read_lock_wq; 139 151 }; 140 152 141 153 static inline void extent_set_compress_type(unsigned long *bio_flags,
+154 -142
fs/btrfs/locking.c
··· 24 24 #include "extent_io.h" 25 25 #include "locking.h" 26 26 27 - static inline void spin_nested(struct extent_buffer *eb) 27 + void btrfs_assert_tree_read_locked(struct extent_buffer *eb); 28 + 29 + /* 30 + * if we currently have a spinning reader or writer lock 31 + * (indicated by the rw flag) this will bump the count 32 + * of blocking holders and drop the spinlock. 33 + */ 34 + void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) 28 35 { 29 - spin_lock(&eb->lock); 36 + if (rw == BTRFS_WRITE_LOCK) { 37 + if (atomic_read(&eb->blocking_writers) == 0) { 38 + WARN_ON(atomic_read(&eb->spinning_writers) != 1); 39 + atomic_dec(&eb->spinning_writers); 40 + btrfs_assert_tree_locked(eb); 41 + atomic_inc(&eb->blocking_writers); 42 + write_unlock(&eb->lock); 43 + } 44 + } else if (rw == BTRFS_READ_LOCK) { 45 + btrfs_assert_tree_read_locked(eb); 46 + atomic_inc(&eb->blocking_readers); 47 + WARN_ON(atomic_read(&eb->spinning_readers) == 0); 48 + atomic_dec(&eb->spinning_readers); 49 + read_unlock(&eb->lock); 50 + } 51 + return; 30 52 } 31 53 32 54 /* 33 - * Setting a lock to blocking will drop the spinlock and set the 34 - * flag that forces other procs who want the lock to wait. After 35 - * this you can safely schedule with the lock held. 
55 + * if we currently have a blocking lock, take the spinlock 56 + * and drop our blocking count 36 57 */ 37 - void btrfs_set_lock_blocking(struct extent_buffer *eb) 58 + void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) 38 59 { 39 - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 40 - set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 41 - spin_unlock(&eb->lock); 60 + if (rw == BTRFS_WRITE_LOCK_BLOCKING) { 61 + BUG_ON(atomic_read(&eb->blocking_writers) != 1); 62 + write_lock(&eb->lock); 63 + WARN_ON(atomic_read(&eb->spinning_writers)); 64 + atomic_inc(&eb->spinning_writers); 65 + if (atomic_dec_and_test(&eb->blocking_writers)) 66 + wake_up(&eb->write_lock_wq); 67 + } else if (rw == BTRFS_READ_LOCK_BLOCKING) { 68 + BUG_ON(atomic_read(&eb->blocking_readers) == 0); 69 + read_lock(&eb->lock); 70 + atomic_inc(&eb->spinning_readers); 71 + if (atomic_dec_and_test(&eb->blocking_readers)) 72 + wake_up(&eb->read_lock_wq); 42 73 } 43 - /* exit with the spin lock released and the bit set */ 74 + return; 44 75 } 45 76 46 77 /* 47 - * clearing the blocking flag will take the spinlock again. 48 - * After this you can't safely schedule 78 + * take a spinning read lock. 
This will wait for any blocking 79 + * writers 49 80 */ 50 - void btrfs_clear_lock_blocking(struct extent_buffer *eb) 81 + void btrfs_tree_read_lock(struct extent_buffer *eb) 51 82 { 52 - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 53 - spin_nested(eb); 54 - clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 55 - smp_mb__after_clear_bit(); 83 + again: 84 + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 85 + read_lock(&eb->lock); 86 + if (atomic_read(&eb->blocking_writers)) { 87 + read_unlock(&eb->lock); 88 + wait_event(eb->write_lock_wq, 89 + atomic_read(&eb->blocking_writers) == 0); 90 + goto again; 56 91 } 57 - /* exit with the spin lock held */ 92 + atomic_inc(&eb->read_locks); 93 + atomic_inc(&eb->spinning_readers); 58 94 } 59 95 60 96 /* 61 - * unfortunately, many of the places that currently set a lock to blocking 62 - * don't end up blocking for very long, and often they don't block 63 - * at all. For a dbench 50 run, if we don't spin on the blocking bit 64 - * at all, the context switch rate can jump up to 400,000/sec or more. 65 - * 66 - * So, we're still stuck with this crummy spin on the blocking bit, 67 - * at least until the most common causes of the short blocks 68 - * can be dealt with. 97 + * returns 1 if we get the read lock and 0 if we don't 98 + * this won't wait for blocking writers 69 99 */ 70 - static int btrfs_spin_on_block(struct extent_buffer *eb) 100 + int btrfs_try_tree_read_lock(struct extent_buffer *eb) 71 101 { 72 - int i; 102 + if (atomic_read(&eb->blocking_writers)) 103 + return 0; 73 104 74 - for (i = 0; i < 512; i++) { 75 - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 76 - return 1; 77 - if (need_resched()) 78 - break; 79 - cpu_relax(); 105 + read_lock(&eb->lock); 106 + if (atomic_read(&eb->blocking_writers)) { 107 + read_unlock(&eb->lock); 108 + return 0; 80 109 } 81 - return 0; 82 - } 83 - 84 - /* 85 - * This is somewhat different from trylock. 
It will take the 86 - * spinlock but if it finds the lock is set to blocking, it will 87 - * return without the lock held. 88 - * 89 - * returns 1 if it was able to take the lock and zero otherwise 90 - * 91 - * After this call, scheduling is not safe without first calling 92 - * btrfs_set_lock_blocking() 93 - */ 94 - int btrfs_try_spin_lock(struct extent_buffer *eb) 95 - { 96 - int i; 97 - 98 - if (btrfs_spin_on_block(eb)) { 99 - spin_nested(eb); 100 - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 101 - return 1; 102 - spin_unlock(&eb->lock); 103 - } 104 - /* spin for a bit on the BLOCKING flag */ 105 - for (i = 0; i < 2; i++) { 106 - cpu_relax(); 107 - if (!btrfs_spin_on_block(eb)) 108 - break; 109 - 110 - spin_nested(eb); 111 - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 112 - return 1; 113 - spin_unlock(&eb->lock); 114 - } 115 - return 0; 116 - } 117 - 118 - /* 119 - * the autoremove wake function will return 0 if it tried to wake up 120 - * a process that was already awake, which means that process won't 121 - * count as an exclusive wakeup. The waitq code will continue waking 122 - * procs until it finds one that was actually sleeping. 123 - * 124 - * For btrfs, this isn't quite what we want. We want a single proc 125 - * to be notified that the lock is ready for taking. If that proc 126 - * already happen to be awake, great, it will loop around and try for 127 - * the lock. 128 - * 129 - * So, btrfs_wake_function always returns 1, even when the proc that we 130 - * tried to wake up was already awake. 131 - */ 132 - static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, 133 - int sync, void *key) 134 - { 135 - autoremove_wake_function(wait, mode, sync, key); 110 + atomic_inc(&eb->read_locks); 111 + atomic_inc(&eb->spinning_readers); 136 112 return 1; 137 113 } 138 114 139 115 /* 140 - * returns with the extent buffer spinlocked. 
141 - * 142 - * This will spin and/or wait as required to take the lock, and then 143 - * return with the spinlock held. 144 - * 145 - * After this call, scheduling is not safe without first calling 146 - * btrfs_set_lock_blocking() 116 + * returns 1 if we get the read lock and 0 if we don't 117 + * this won't wait for blocking writers or readers 118 + */ 119 + int btrfs_try_tree_write_lock(struct extent_buffer *eb) 120 + { 121 + if (atomic_read(&eb->blocking_writers) || 122 + atomic_read(&eb->blocking_readers)) 123 + return 0; 124 + write_lock(&eb->lock); 125 + if (atomic_read(&eb->blocking_writers) || 126 + atomic_read(&eb->blocking_readers)) { 127 + write_unlock(&eb->lock); 128 + return 0; 129 + } 130 + atomic_inc(&eb->write_locks); 131 + atomic_inc(&eb->spinning_writers); 132 + return 1; 133 + } 134 + 135 + /* 136 + * drop a spinning read lock 137 + */ 138 + void btrfs_tree_read_unlock(struct extent_buffer *eb) 139 + { 140 + btrfs_assert_tree_read_locked(eb); 141 + WARN_ON(atomic_read(&eb->spinning_readers) == 0); 142 + atomic_dec(&eb->spinning_readers); 143 + atomic_dec(&eb->read_locks); 144 + read_unlock(&eb->lock); 145 + } 146 + 147 + /* 148 + * drop a blocking read lock 149 + */ 150 + void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) 151 + { 152 + btrfs_assert_tree_read_locked(eb); 153 + WARN_ON(atomic_read(&eb->blocking_readers) == 0); 154 + if (atomic_dec_and_test(&eb->blocking_readers)) 155 + wake_up(&eb->read_lock_wq); 156 + atomic_dec(&eb->read_locks); 157 + } 158 + 159 + /* 160 + * take a spinning write lock. 
This will wait for both 161 + * blocking readers or writers 147 162 */ 148 163 int btrfs_tree_lock(struct extent_buffer *eb) 149 164 { 150 - DEFINE_WAIT(wait); 151 - wait.func = btrfs_wake_function; 152 - 153 - if (!btrfs_spin_on_block(eb)) 154 - goto sleep; 155 - 156 - while(1) { 157 - spin_nested(eb); 158 - 159 - /* nobody is blocking, exit with the spinlock held */ 160 - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 161 - return 0; 162 - 163 - /* 164 - * we have the spinlock, but the real owner is blocking. 165 - * wait for them 166 - */ 167 - spin_unlock(&eb->lock); 168 - 169 - /* 170 - * spin for a bit, and if the blocking flag goes away, 171 - * loop around 172 - */ 173 - cpu_relax(); 174 - if (btrfs_spin_on_block(eb)) 175 - continue; 176 - sleep: 177 - prepare_to_wait_exclusive(&eb->lock_wq, &wait, 178 - TASK_UNINTERRUPTIBLE); 179 - 180 - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 181 - schedule(); 182 - 183 - finish_wait(&eb->lock_wq, &wait); 165 + again: 166 + wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); 167 + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 168 + write_lock(&eb->lock); 169 + if (atomic_read(&eb->blocking_readers)) { 170 + write_unlock(&eb->lock); 171 + wait_event(eb->read_lock_wq, 172 + atomic_read(&eb->blocking_readers) == 0); 173 + goto again; 184 174 } 175 + if (atomic_read(&eb->blocking_writers)) { 176 + write_unlock(&eb->lock); 177 + wait_event(eb->write_lock_wq, 178 + atomic_read(&eb->blocking_writers) == 0); 179 + goto again; 180 + } 181 + WARN_ON(atomic_read(&eb->spinning_writers)); 182 + atomic_inc(&eb->spinning_writers); 183 + atomic_inc(&eb->write_locks); 185 184 return 0; 186 185 } 187 186 187 + /* 188 + * drop a spinning or a blocking write lock. 
189 + */ 188 190 int btrfs_tree_unlock(struct extent_buffer *eb) 189 191 { 190 - /* 191 - * if we were a blocking owner, we don't have the spinlock held 192 - * just clear the bit and look for waiters 193 - */ 194 - if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 195 - smp_mb__after_clear_bit(); 196 - else 197 - spin_unlock(&eb->lock); 192 + int blockers = atomic_read(&eb->blocking_writers); 198 193 199 - if (waitqueue_active(&eb->lock_wq)) 200 - wake_up(&eb->lock_wq); 194 + BUG_ON(blockers > 1); 195 + 196 + btrfs_assert_tree_locked(eb); 197 + atomic_dec(&eb->write_locks); 198 + 199 + if (blockers) { 200 + WARN_ON(atomic_read(&eb->spinning_writers)); 201 + atomic_dec(&eb->blocking_writers); 202 + smp_wmb(); 203 + wake_up(&eb->write_lock_wq); 204 + } else { 205 + WARN_ON(atomic_read(&eb->spinning_writers) != 1); 206 + atomic_dec(&eb->spinning_writers); 207 + write_unlock(&eb->lock); 208 + } 201 209 return 0; 202 210 } 203 211 204 212 void btrfs_assert_tree_locked(struct extent_buffer *eb) 205 213 { 206 - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 207 - assert_spin_locked(&eb->lock); 214 + BUG_ON(!atomic_read(&eb->write_locks)); 215 + } 216 + 217 + void btrfs_assert_tree_read_locked(struct extent_buffer *eb) 218 + { 219 + BUG_ON(!atomic_read(&eb->read_locks)); 208 220 }
+34 -2
fs/btrfs/locking.h
··· 19 19 #ifndef __BTRFS_LOCKING_ 20 20 #define __BTRFS_LOCKING_ 21 21 22 + #define BTRFS_WRITE_LOCK 1 23 + #define BTRFS_READ_LOCK 2 24 + #define BTRFS_WRITE_LOCK_BLOCKING 3 25 + #define BTRFS_READ_LOCK_BLOCKING 4 26 + 22 27 int btrfs_tree_lock(struct extent_buffer *eb); 23 28 int btrfs_tree_unlock(struct extent_buffer *eb); 24 29 int btrfs_try_spin_lock(struct extent_buffer *eb); 25 30 26 - void btrfs_set_lock_blocking(struct extent_buffer *eb); 27 - void btrfs_clear_lock_blocking(struct extent_buffer *eb); 31 + void btrfs_tree_read_lock(struct extent_buffer *eb); 32 + void btrfs_tree_read_unlock(struct extent_buffer *eb); 33 + void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); 34 + void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); 35 + void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); 28 36 void btrfs_assert_tree_locked(struct extent_buffer *eb); 37 + int btrfs_try_tree_read_lock(struct extent_buffer *eb); 38 + int btrfs_try_tree_write_lock(struct extent_buffer *eb); 39 + 40 + static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) 41 + { 42 + if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) 43 + btrfs_tree_unlock(eb); 44 + else if (rw == BTRFS_READ_LOCK_BLOCKING) 45 + btrfs_tree_read_unlock_blocking(eb); 46 + else if (rw == BTRFS_READ_LOCK) 47 + btrfs_tree_read_unlock(eb); 48 + else 49 + BUG(); 50 + } 51 + 52 + static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) 53 + { 54 + btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); 55 + } 56 + 57 + static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) 58 + { 59 + btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); 60 + } 29 61 #endif
+3 -3
fs/btrfs/tree-log.c
··· 1730 1730 btrfs_read_buffer(next, ptr_gen); 1731 1731 1732 1732 btrfs_tree_lock(next); 1733 - clean_tree_block(trans, root, next); 1734 1733 btrfs_set_lock_blocking(next); 1734 + clean_tree_block(trans, root, next); 1735 1735 btrfs_wait_tree_block_writeback(next); 1736 1736 btrfs_tree_unlock(next); 1737 1737 ··· 1796 1796 next = path->nodes[*level]; 1797 1797 1798 1798 btrfs_tree_lock(next); 1799 - clean_tree_block(trans, root, next); 1800 1799 btrfs_set_lock_blocking(next); 1800 + clean_tree_block(trans, root, next); 1801 1801 btrfs_wait_tree_block_writeback(next); 1802 1802 btrfs_tree_unlock(next); 1803 1803 ··· 1864 1864 next = path->nodes[orig_level]; 1865 1865 1866 1866 btrfs_tree_lock(next); 1867 - clean_tree_block(trans, log, next); 1868 1867 btrfs_set_lock_blocking(next); 1868 + clean_tree_block(trans, log, next); 1869 1869 btrfs_wait_tree_block_writeback(next); 1870 1870 btrfs_tree_unlock(next); 1871 1871