Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
Btrfs: BUG to BUG_ON changes
Btrfs: remove dead code
Btrfs: remove dead code
Btrfs: fix typos in comments
Btrfs: remove unused ftrace include
Btrfs: fix __ucmpdi2 compile bug on 32 bit builds
Btrfs: free inode struct when btrfs_new_inode fails
Btrfs: fix race in worker_loop
Btrfs: add flushoncommit mount option
Btrfs: notreelog mount option
Btrfs: introduce btrfs_show_options
Btrfs: rework allocation clustering
Btrfs: Optimize locking in btrfs_next_leaf()
Btrfs: break up btrfs_search_slot into smaller pieces
Btrfs: kill the pinned_mutex
Btrfs: kill the block group alloc mutex
Btrfs: clean up find_free_extent
Btrfs: free space cache cleanups
Btrfs: unplug in the async bio submission threads
Btrfs: keep processing bios for a given bdev if our proc is batching

+994 -556
+5 -2
fs/btrfs/async-thread.c
··· 20 20 #include <linux/list.h> 21 21 #include <linux/spinlock.h> 22 22 #include <linux/freezer.h> 23 - #include <linux/ftrace.h> 24 23 #include "async-thread.h" 25 24 26 25 #define WORK_QUEUED_BIT 0 ··· 194 195 if (!list_empty(&worker->pending)) 195 196 continue; 196 197 198 + if (kthread_should_stop()) 199 + break; 200 + 197 201 /* still no more work?, sleep for real */ 198 202 spin_lock_irq(&worker->lock); 199 203 set_current_state(TASK_INTERRUPTIBLE); ··· 210 208 worker->working = 0; 211 209 spin_unlock_irq(&worker->lock); 212 210 213 - schedule(); 211 + if (!kthread_should_stop()) 212 + schedule(); 214 213 } 215 214 __set_current_state(TASK_RUNNING); 216 215 }
+197 -115
fs/btrfs/ctree.c
··· 1244 1244 * readahead one full node of leaves, finding things that are close 1245 1245 * to the block in 'slot', and triggering ra on them. 1246 1246 */ 1247 - static noinline void reada_for_search(struct btrfs_root *root, 1248 - struct btrfs_path *path, 1249 - int level, int slot, u64 objectid) 1247 + static void reada_for_search(struct btrfs_root *root, 1248 + struct btrfs_path *path, 1249 + int level, int slot, u64 objectid) 1250 1250 { 1251 1251 struct extent_buffer *node; 1252 1252 struct btrfs_disk_key disk_key; ··· 1447 1447 } 1448 1448 1449 1449 /* 1450 + * helper function for btrfs_search_slot. The goal is to find a block 1451 + * in cache without setting the path to blocking. If we find the block 1452 + * we return zero and the path is unchanged. 1453 + * 1454 + * If we can't find the block, we set the path blocking and do some 1455 + * reada. -EAGAIN is returned and the search must be repeated. 1456 + */ 1457 + static int 1458 + read_block_for_search(struct btrfs_trans_handle *trans, 1459 + struct btrfs_root *root, struct btrfs_path *p, 1460 + struct extent_buffer **eb_ret, int level, int slot, 1461 + struct btrfs_key *key) 1462 + { 1463 + u64 blocknr; 1464 + u64 gen; 1465 + u32 blocksize; 1466 + struct extent_buffer *b = *eb_ret; 1467 + struct extent_buffer *tmp; 1468 + 1469 + blocknr = btrfs_node_blockptr(b, slot); 1470 + gen = btrfs_node_ptr_generation(b, slot); 1471 + blocksize = btrfs_level_size(root, level - 1); 1472 + 1473 + tmp = btrfs_find_tree_block(root, blocknr, blocksize); 1474 + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { 1475 + *eb_ret = tmp; 1476 + return 0; 1477 + } 1478 + 1479 + /* 1480 + * reduce lock contention at high levels 1481 + * of the btree by dropping locks before 1482 + * we read. 
1483 + */ 1484 + btrfs_release_path(NULL, p); 1485 + if (tmp) 1486 + free_extent_buffer(tmp); 1487 + if (p->reada) 1488 + reada_for_search(root, p, level, slot, key->objectid); 1489 + 1490 + tmp = read_tree_block(root, blocknr, blocksize, gen); 1491 + if (tmp) 1492 + free_extent_buffer(tmp); 1493 + return -EAGAIN; 1494 + } 1495 + 1496 + /* 1497 + * helper function for btrfs_search_slot. This does all of the checks 1498 + * for node-level blocks and does any balancing required based on 1499 + * the ins_len. 1500 + * 1501 + * If no extra work was required, zero is returned. If we had to 1502 + * drop the path, -EAGAIN is returned and btrfs_search_slot must 1503 + * start over 1504 + */ 1505 + static int 1506 + setup_nodes_for_search(struct btrfs_trans_handle *trans, 1507 + struct btrfs_root *root, struct btrfs_path *p, 1508 + struct extent_buffer *b, int level, int ins_len) 1509 + { 1510 + int ret; 1511 + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1512 + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1513 + int sret; 1514 + 1515 + sret = reada_for_balance(root, p, level); 1516 + if (sret) 1517 + goto again; 1518 + 1519 + btrfs_set_path_blocking(p); 1520 + sret = split_node(trans, root, p, level); 1521 + btrfs_clear_path_blocking(p, NULL); 1522 + 1523 + BUG_ON(sret > 0); 1524 + if (sret) { 1525 + ret = sret; 1526 + goto done; 1527 + } 1528 + b = p->nodes[level]; 1529 + } else if (ins_len < 0 && btrfs_header_nritems(b) < 1530 + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { 1531 + int sret; 1532 + 1533 + sret = reada_for_balance(root, p, level); 1534 + if (sret) 1535 + goto again; 1536 + 1537 + btrfs_set_path_blocking(p); 1538 + sret = balance_level(trans, root, p, level); 1539 + btrfs_clear_path_blocking(p, NULL); 1540 + 1541 + if (sret) { 1542 + ret = sret; 1543 + goto done; 1544 + } 1545 + b = p->nodes[level]; 1546 + if (!b) { 1547 + btrfs_release_path(NULL, p); 1548 + goto again; 1549 + } 1550 + BUG_ON(btrfs_header_nritems(b) == 1); 1551 + } 1552 + return 
0; 1553 + 1554 + again: 1555 + ret = -EAGAIN; 1556 + done: 1557 + return ret; 1558 + } 1559 + 1560 + /* 1450 1561 * look for key in the tree. path is filled in with nodes along the way 1451 1562 * if key is found, we return zero and you can find the item in the leaf 1452 1563 * level of the path (level 0) ··· 1575 1464 ins_len, int cow) 1576 1465 { 1577 1466 struct extent_buffer *b; 1578 - struct extent_buffer *tmp; 1579 1467 int slot; 1580 1468 int ret; 1581 1469 int level; 1582 - int should_reada = p->reada; 1583 1470 int lowest_unlock = 1; 1584 - int blocksize; 1585 1471 u8 lowest_level = 0; 1586 - u64 blocknr; 1587 - u64 gen; 1588 1472 1589 1473 lowest_level = p->lowest_level; 1590 1474 WARN_ON(lowest_level && ins_len > 0); ··· 1608 1502 if (cow) { 1609 1503 int wret; 1610 1504 1611 - /* is a cow on this block not required */ 1505 + /* 1506 + * if we don't really need to cow this block 1507 + * then we don't want to set the path blocking, 1508 + * so we test it here 1509 + */ 1612 1510 if (btrfs_header_generation(b) == trans->transid && 1613 1511 btrfs_header_owner(b) == root->root_key.objectid && 1614 1512 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { ··· 1667 1557 if (ret && slot > 0) 1668 1558 slot -= 1; 1669 1559 p->slots[level] = slot; 1670 - if ((p->search_for_split || ins_len > 0) && 1671 - btrfs_header_nritems(b) >= 1672 - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1673 - int sret; 1560 + ret = setup_nodes_for_search(trans, root, p, b, level, 1561 + ins_len); 1562 + if (ret == -EAGAIN) 1563 + goto again; 1564 + else if (ret) 1565 + goto done; 1566 + b = p->nodes[level]; 1567 + slot = p->slots[level]; 1674 1568 1675 - sret = reada_for_balance(root, p, level); 1676 - if (sret) 1677 - goto again; 1678 - 1679 - btrfs_set_path_blocking(p); 1680 - sret = split_node(trans, root, p, level); 1681 - btrfs_clear_path_blocking(p, NULL); 1682 - 1683 - BUG_ON(sret > 0); 1684 - if (sret) { 1685 - ret = sret; 1686 - goto done; 1687 - } 1688 - b = p->nodes[level]; 1689 
- slot = p->slots[level]; 1690 - } else if (ins_len < 0 && 1691 - btrfs_header_nritems(b) < 1692 - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { 1693 - int sret; 1694 - 1695 - sret = reada_for_balance(root, p, level); 1696 - if (sret) 1697 - goto again; 1698 - 1699 - btrfs_set_path_blocking(p); 1700 - sret = balance_level(trans, root, p, level); 1701 - btrfs_clear_path_blocking(p, NULL); 1702 - 1703 - if (sret) { 1704 - ret = sret; 1705 - goto done; 1706 - } 1707 - b = p->nodes[level]; 1708 - if (!b) { 1709 - btrfs_release_path(NULL, p); 1710 - goto again; 1711 - } 1712 - slot = p->slots[level]; 1713 - BUG_ON(btrfs_header_nritems(b) == 1); 1714 - } 1715 1569 unlock_up(p, level, lowest_unlock); 1716 1570 1717 1571 /* this is only true while dropping a snapshot */ ··· 1684 1610 goto done; 1685 1611 } 1686 1612 1687 - blocknr = btrfs_node_blockptr(b, slot); 1688 - gen = btrfs_node_ptr_generation(b, slot); 1689 - blocksize = btrfs_level_size(root, level - 1); 1613 + ret = read_block_for_search(trans, root, p, 1614 + &b, level, slot, key); 1615 + if (ret == -EAGAIN) 1616 + goto again; 1690 1617 1691 - tmp = btrfs_find_tree_block(root, blocknr, blocksize); 1692 - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { 1693 - b = tmp; 1694 - } else { 1695 - /* 1696 - * reduce lock contention at high levels 1697 - * of the btree by dropping locks before 1698 - * we read. 
1699 - */ 1700 - if (level > 0) { 1701 - btrfs_release_path(NULL, p); 1702 - if (tmp) 1703 - free_extent_buffer(tmp); 1704 - if (should_reada) 1705 - reada_for_search(root, p, 1706 - level, slot, 1707 - key->objectid); 1708 - 1709 - tmp = read_tree_block(root, blocknr, 1710 - blocksize, gen); 1711 - if (tmp) 1712 - free_extent_buffer(tmp); 1713 - goto again; 1714 - } else { 1715 - btrfs_set_path_blocking(p); 1716 - if (tmp) 1717 - free_extent_buffer(tmp); 1718 - if (should_reada) 1719 - reada_for_search(root, p, 1720 - level, slot, 1721 - key->objectid); 1722 - b = read_node_slot(root, b, slot); 1723 - } 1724 - } 1725 1618 if (!p->skip_locking) { 1726 1619 int lret; 1727 1620 ··· 2157 2116 BUG_ON(!path->nodes[level]); 2158 2117 lower = path->nodes[level]; 2159 2118 nritems = btrfs_header_nritems(lower); 2160 - if (slot > nritems) 2161 - BUG(); 2119 + BUG_ON(slot > nritems); 2162 2120 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) 2163 2121 BUG(); 2164 2122 if (slot != nritems) { ··· 4126 4086 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 4127 4087 { 4128 4088 int slot; 4129 - int level = 1; 4089 + int level; 4130 4090 struct extent_buffer *c; 4131 - struct extent_buffer *next = NULL; 4091 + struct extent_buffer *next; 4132 4092 struct btrfs_key key; 4133 4093 u32 nritems; 4134 4094 int ret; 4095 + int old_spinning = path->leave_spinning; 4096 + int force_blocking = 0; 4135 4097 4136 4098 nritems = btrfs_header_nritems(path->nodes[0]); 4137 4099 if (nritems == 0) 4138 4100 return 1; 4139 4101 4140 - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4102 + /* 4103 + * we take the blocks in an order that upsets lockdep. Using 4104 + * blocking mode is the only way around it. 
4105 + */ 4106 + #ifdef CONFIG_DEBUG_LOCK_ALLOC 4107 + force_blocking = 1; 4108 + #endif 4141 4109 4110 + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4111 + again: 4112 + level = 1; 4113 + next = NULL; 4142 4114 btrfs_release_path(root, path); 4115 + 4143 4116 path->keep_locks = 1; 4117 + 4118 + if (!force_blocking) 4119 + path->leave_spinning = 1; 4120 + 4144 4121 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4145 4122 path->keep_locks = 0; 4146 4123 4147 4124 if (ret < 0) 4148 4125 return ret; 4149 4126 4150 - btrfs_set_path_blocking(path); 4151 4127 nritems = btrfs_header_nritems(path->nodes[0]); 4152 4128 /* 4153 4129 * by releasing the path above we dropped all our locks. A balance ··· 4173 4117 */ 4174 4118 if (nritems > 0 && path->slots[0] < nritems - 1) { 4175 4119 path->slots[0]++; 4120 + ret = 0; 4176 4121 goto done; 4177 4122 } 4178 4123 4179 4124 while (level < BTRFS_MAX_LEVEL) { 4180 - if (!path->nodes[level]) 4181 - return 1; 4125 + if (!path->nodes[level]) { 4126 + ret = 1; 4127 + goto done; 4128 + } 4182 4129 4183 4130 slot = path->slots[level] + 1; 4184 4131 c = path->nodes[level]; 4185 4132 if (slot >= btrfs_header_nritems(c)) { 4186 4133 level++; 4187 - if (level == BTRFS_MAX_LEVEL) 4188 - return 1; 4134 + if (level == BTRFS_MAX_LEVEL) { 4135 + ret = 1; 4136 + goto done; 4137 + } 4189 4138 continue; 4190 4139 } 4191 4140 ··· 4199 4138 free_extent_buffer(next); 4200 4139 } 4201 4140 4202 - /* the path was set to blocking above */ 4203 - if (level == 1 && (path->locks[1] || path->skip_locking) && 4204 - path->reada) 4205 - reada_for_search(root, path, level, slot, 0); 4141 + next = c; 4142 + ret = read_block_for_search(NULL, root, path, &next, level, 4143 + slot, &key); 4144 + if (ret == -EAGAIN) 4145 + goto again; 4206 4146 4207 - next = read_node_slot(root, c, slot); 4208 4147 if (!path->skip_locking) { 4209 - btrfs_assert_tree_locked(c); 4210 - btrfs_tree_lock(next); 4211 - btrfs_set_lock_blocking(next); 4148 + ret = 
btrfs_try_spin_lock(next); 4149 + if (!ret) { 4150 + btrfs_set_path_blocking(path); 4151 + btrfs_tree_lock(next); 4152 + if (!force_blocking) 4153 + btrfs_clear_path_blocking(path, next); 4154 + } 4155 + if (force_blocking) 4156 + btrfs_set_lock_blocking(next); 4212 4157 } 4213 4158 break; 4214 4159 } ··· 4224 4157 c = path->nodes[level]; 4225 4158 if (path->locks[level]) 4226 4159 btrfs_tree_unlock(c); 4160 + 4227 4161 free_extent_buffer(c); 4228 4162 path->nodes[level] = next; 4229 4163 path->slots[level] = 0; 4230 4164 if (!path->skip_locking) 4231 4165 path->locks[level] = 1; 4166 + 4232 4167 if (!level) 4233 4168 break; 4234 4169 4235 - btrfs_set_path_blocking(path); 4236 - if (level == 1 && path->locks[1] && path->reada) 4237 - reada_for_search(root, path, level, slot, 0); 4238 - next = read_node_slot(root, next, 0); 4170 + ret = read_block_for_search(NULL, root, path, &next, level, 4171 + 0, &key); 4172 + if (ret == -EAGAIN) 4173 + goto again; 4174 + 4239 4175 if (!path->skip_locking) { 4240 4176 btrfs_assert_tree_locked(path->nodes[level]); 4241 - btrfs_tree_lock(next); 4242 - btrfs_set_lock_blocking(next); 4177 + ret = btrfs_try_spin_lock(next); 4178 + if (!ret) { 4179 + btrfs_set_path_blocking(path); 4180 + btrfs_tree_lock(next); 4181 + if (!force_blocking) 4182 + btrfs_clear_path_blocking(path, next); 4183 + } 4184 + if (force_blocking) 4185 + btrfs_set_lock_blocking(next); 4243 4186 } 4244 4187 } 4188 + ret = 0; 4245 4189 done: 4246 4190 unlock_up(path, 0, 1); 4247 - return 0; 4191 + path->leave_spinning = old_spinning; 4192 + if (!old_spinning) 4193 + btrfs_set_path_blocking(path); 4194 + 4195 + return ret; 4248 4196 } 4249 4197 4250 4198 /*
+49 -35
fs/btrfs/ctree.h
··· 143 143 #define BTRFS_FT_MAX 9 144 144 145 145 /* 146 - * the key defines the order in the tree, and so it also defines (optimal) 147 - * block layout. objectid corresonds to the inode number. The flags 148 - * tells us things about the object, and is a kind of stream selector. 149 - * so for a given inode, keys with flags of 1 might refer to the inode 150 - * data, flags of 2 may point to file data in the btree and flags == 3 151 - * may point to extents. 146 + * The key defines the order in the tree, and so it also defines (optimal) 147 + * block layout. 148 + * 149 + * objectid corresponds to the inode number. 150 + * 151 + * type tells us things about the object, and is a kind of stream selector. 152 + * so for a given inode, keys with type of 1 might refer to the inode data, 153 + * type of 2 may point to file data in the btree and type == 3 may point to 154 + * extents. 152 155 * 153 156 * offset is the starting byte offset for this key in the stream. 154 157 * ··· 203 200 204 201 /* 205 202 * starting byte of this partition on the device, 206 - * to allowr for stripe alignment in the future 203 + * to allow for stripe alignment in the future 207 204 */ 208 205 __le64 start_offset; 209 206 ··· 636 633 struct rw_semaphore groups_sem; 637 634 }; 638 635 639 - struct btrfs_free_space { 640 - struct rb_node bytes_index; 641 - struct rb_node offset_index; 642 - u64 offset; 643 - u64 bytes; 636 + /* 637 + * free clusters are used to claim free space in relatively large chunks, 638 + * allowing us to do less seeky writes. They are used for all metadata 639 + * allocations and data allocations in ssd mode. 
640 + */ 641 + struct btrfs_free_cluster { 642 + spinlock_t lock; 643 + spinlock_t refill_lock; 644 + struct rb_root root; 645 + 646 + /* largest extent in this cluster */ 647 + u64 max_size; 648 + 649 + /* first extent starting offset */ 650 + u64 window_start; 651 + 652 + struct btrfs_block_group_cache *block_group; 653 + /* 654 + * when a cluster is allocated from a block group, we put the 655 + * cluster onto a list in the block group so that it can 656 + * be freed before the block group is freed. 657 + */ 658 + struct list_head block_group_list; 644 659 }; 645 660 646 661 struct btrfs_block_group_cache { 647 662 struct btrfs_key key; 648 663 struct btrfs_block_group_item item; 649 664 spinlock_t lock; 650 - struct mutex alloc_mutex; 651 665 struct mutex cache_mutex; 652 666 u64 pinned; 653 667 u64 reserved; ··· 676 656 struct btrfs_space_info *space_info; 677 657 678 658 /* free space cache stuff */ 659 + spinlock_t tree_lock; 679 660 struct rb_root free_space_bytes; 680 661 struct rb_root free_space_offset; 681 662 ··· 688 667 689 668 /* usage count */ 690 669 atomic_t count; 670 + 671 + /* List of struct btrfs_free_clusters for this block group. 
672 + * Today it will only have one thing on it, but that may change 673 + */ 674 + struct list_head cluster_list; 691 675 }; 692 676 693 677 struct btrfs_leaf_ref_tree { ··· 754 728 struct mutex tree_log_mutex; 755 729 struct mutex transaction_kthread_mutex; 756 730 struct mutex cleaner_mutex; 757 - struct mutex pinned_mutex; 758 731 struct mutex chunk_mutex; 759 732 struct mutex drop_mutex; 760 733 struct mutex volume_mutex; ··· 864 839 spinlock_t delalloc_lock; 865 840 spinlock_t new_trans_lock; 866 841 u64 delalloc_bytes; 867 - u64 last_alloc; 868 - u64 last_data_alloc; 842 + 843 + /* data_alloc_cluster is only used in ssd mode */ 844 + struct btrfs_free_cluster data_alloc_cluster; 845 + 846 + /* all metadata allocations go through this cluster */ 847 + struct btrfs_free_cluster meta_alloc_cluster; 869 848 870 849 spinlock_t ref_cache_lock; 871 850 u64 total_ref_cache_size; ··· 961 932 }; 962 933 963 934 /* 964 - 965 935 * inode items have the data typically returned from stat and store other 966 936 * info about object characteristics. There is one for every file and dir in 967 937 * the FS ··· 991 963 #define BTRFS_EXTENT_CSUM_KEY 128 992 964 993 965 /* 994 - * root items point to tree roots. There are typically in the root 966 + * root items point to tree roots. 
They are typically in the root 995 967 * tree used by the super block to find all the other trees 996 968 */ 997 969 #define BTRFS_ROOT_ITEM_KEY 132 ··· 1038 1010 #define BTRFS_MOUNT_SSD (1 << 3) 1039 1011 #define BTRFS_MOUNT_DEGRADED (1 << 4) 1040 1012 #define BTRFS_MOUNT_COMPRESS (1 << 5) 1013 + #define BTRFS_MOUNT_NOTREELOG (1 << 6) 1014 + #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) 1041 1015 1042 1016 #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1043 1017 #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) ··· 1778 1748 } 1779 1749 1780 1750 /* extent-tree.c */ 1751 + void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 1781 1752 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1782 1753 struct btrfs_root *root, unsigned long count); 1783 1754 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); ··· 2205 2174 int btrfs_init_acl(struct inode *inode, struct inode *dir); 2206 2175 int btrfs_acl_chmod(struct inode *inode); 2207 2176 2208 - /* free-space-cache.c */ 2209 - int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 2210 - u64 bytenr, u64 size); 2211 - int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, 2212 - u64 offset, u64 bytes); 2213 - int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 2214 - u64 bytenr, u64 size); 2215 - int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, 2216 - u64 offset, u64 bytes); 2217 - void btrfs_remove_free_space_cache(struct btrfs_block_group_cache 2218 - *block_group); 2219 - struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 2220 - *block_group, u64 offset, 2221 - u64 bytes); 2222 - void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 2223 - u64 bytes); 2224 - u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); 2225 2177 #endif
-1
fs/btrfs/delayed-ref.c
··· 18 18 19 19 #include <linux/sched.h> 20 20 #include <linux/sort.h> 21 - #include <linux/ftrace.h> 22 21 #include "ctree.h" 23 22 #include "delayed-ref.h" 24 23 #include "transaction.h"
+5 -3
fs/btrfs/disk-io.c
··· 38 38 #include "locking.h" 39 39 #include "ref-cache.h" 40 40 #include "tree-log.h" 41 + #include "free-space-cache.h" 41 42 42 43 static struct extent_io_ops btree_extent_io_ops; 43 44 static void end_workqueue_fn(struct btrfs_work *work); ··· 1413 1412 1414 1413 ret = extent_range_uptodate(io_tree, start + length, 1415 1414 start + buf_len - 1); 1416 - if (ret == 1) 1417 - return ret; 1418 1415 return ret; 1419 1416 } 1420 1417 ··· 1646 1647 mutex_init(&fs_info->ordered_operations_mutex); 1647 1648 mutex_init(&fs_info->tree_log_mutex); 1648 1649 mutex_init(&fs_info->drop_mutex); 1649 - mutex_init(&fs_info->pinned_mutex); 1650 1650 mutex_init(&fs_info->chunk_mutex); 1651 1651 mutex_init(&fs_info->transaction_kthread_mutex); 1652 1652 mutex_init(&fs_info->cleaner_mutex); 1653 1653 mutex_init(&fs_info->volume_mutex); 1654 1654 mutex_init(&fs_info->tree_reloc_mutex); 1655 + 1656 + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 1657 + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); 1658 + 1655 1659 init_waitqueue_head(&fs_info->transaction_throttle); 1656 1660 init_waitqueue_head(&fs_info->transaction_wait); 1657 1661 init_waitqueue_head(&fs_info->async_submit_wait);
+203 -213
fs/btrfs/extent-tree.c
··· 31 31 #include "volumes.h" 32 32 #include "locking.h" 33 33 #include "ref-cache.h" 34 + #include "free-space-cache.h" 34 35 35 36 #define PENDING_EXTENT_INSERT 0 36 37 #define PENDING_EXTENT_DELETE 1 ··· 167 166 u64 extent_start, extent_end, size; 168 167 int ret; 169 168 170 - mutex_lock(&info->pinned_mutex); 171 169 while (start < end) { 172 170 ret = find_first_extent_bit(&info->pinned_extents, start, 173 171 &extent_start, &extent_end, ··· 192 192 ret = btrfs_add_free_space(block_group, start, size); 193 193 BUG_ON(ret); 194 194 } 195 - mutex_unlock(&info->pinned_mutex); 196 195 197 196 return 0; 198 197 } ··· 290 291 block_group->key.objectid + 291 292 block_group->key.offset); 292 293 293 - remove_sb_from_cache(root, block_group); 294 294 block_group->cached = 1; 295 + remove_sb_from_cache(root, block_group); 295 296 ret = 0; 296 297 err: 297 298 btrfs_free_path(path); ··· 325 326 return cache; 326 327 } 327 328 328 - static inline void put_block_group(struct btrfs_block_group_cache *cache) 329 + void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 329 330 { 330 331 if (atomic_dec_and_test(&cache->count)) 331 332 kfree(cache); ··· 398 399 div_factor(cache->key.offset, factor)) { 399 400 group_start = cache->key.objectid; 400 401 spin_unlock(&cache->lock); 401 - put_block_group(cache); 402 + btrfs_put_block_group(cache); 402 403 goto found; 403 404 } 404 405 } 405 406 spin_unlock(&cache->lock); 406 - put_block_group(cache); 407 + btrfs_put_block_group(cache); 407 408 cond_resched(); 408 409 } 409 410 if (!wrapped) { ··· 1593 1594 if (!block_group || block_group->ro) 1594 1595 readonly = 1; 1595 1596 if (block_group) 1596 - put_block_group(block_group); 1597 + btrfs_put_block_group(block_group); 1597 1598 return readonly; 1598 1599 } 1599 1600 ··· 2017 2018 WARN_ON(ret); 2018 2019 } 2019 2020 } 2020 - put_block_group(cache); 2021 + btrfs_put_block_group(cache); 2021 2022 total -= num_bytes; 2022 2023 bytenr += num_bytes; 2023 2024 } ··· 2034 
2035 return 0; 2035 2036 2036 2037 bytenr = cache->key.objectid; 2037 - put_block_group(cache); 2038 + btrfs_put_block_group(cache); 2038 2039 2039 2040 return bytenr; 2040 2041 } ··· 2046 2047 struct btrfs_block_group_cache *cache; 2047 2048 struct btrfs_fs_info *fs_info = root->fs_info; 2048 2049 2049 - WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); 2050 2050 if (pin) { 2051 2051 set_extent_dirty(&fs_info->pinned_extents, 2052 2052 bytenr, bytenr + num - 1, GFP_NOFS); ··· 2053 2055 clear_extent_dirty(&fs_info->pinned_extents, 2054 2056 bytenr, bytenr + num - 1, GFP_NOFS); 2055 2057 } 2056 - mutex_unlock(&root->fs_info->pinned_mutex); 2057 2058 2058 2059 while (num > 0) { 2059 2060 cache = btrfs_lookup_block_group(fs_info, bytenr); ··· 2078 2081 if (cache->cached) 2079 2082 btrfs_add_free_space(cache, bytenr, len); 2080 2083 } 2081 - put_block_group(cache); 2084 + btrfs_put_block_group(cache); 2082 2085 bytenr += len; 2083 2086 num -= len; 2084 2087 } ··· 2109 2112 } 2110 2113 spin_unlock(&cache->lock); 2111 2114 spin_unlock(&cache->space_info->lock); 2112 - put_block_group(cache); 2115 + btrfs_put_block_group(cache); 2113 2116 bytenr += len; 2114 2117 num -= len; 2115 2118 } ··· 2124 2127 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; 2125 2128 int ret; 2126 2129 2127 - mutex_lock(&root->fs_info->pinned_mutex); 2128 2130 while (1) { 2129 2131 ret = find_first_extent_bit(pinned_extents, last, 2130 2132 &start, &end, EXTENT_DIRTY); ··· 2132 2136 set_extent_dirty(copy, start, end, GFP_NOFS); 2133 2137 last = end + 1; 2134 2138 } 2135 - mutex_unlock(&root->fs_info->pinned_mutex); 2136 2139 return 0; 2137 2140 } 2138 2141 ··· 2144 2149 int ret; 2145 2150 2146 2151 while (1) { 2147 - mutex_lock(&root->fs_info->pinned_mutex); 2148 2152 ret = find_first_extent_bit(unpin, 0, &start, &end, 2149 2153 EXTENT_DIRTY); 2150 2154 if (ret) ··· 2157 2163 2158 2164 cond_resched(); 2159 2165 } 2160 - mutex_unlock(&root->fs_info->pinned_mutex); 2161 
2166 return ret; 2162 2167 } 2163 2168 ··· 2198 2205 free_extent_buffer(buf); 2199 2206 pinit: 2200 2207 btrfs_set_path_blocking(path); 2201 - mutex_lock(&root->fs_info->pinned_mutex); 2202 2208 /* unlocks the pinned mutex */ 2203 2209 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2204 2210 ··· 2503 2511 */ 2504 2512 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && 2505 2513 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 2506 - mutex_lock(&root->fs_info->pinned_mutex); 2507 - 2508 2514 /* unlocks the pinned mutex */ 2509 2515 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2510 2516 update_reserved_extents(root, bytenr, num_bytes, 0); ··· 2544 2554 { 2545 2555 int ret = 0; 2546 2556 struct btrfs_root *root = orig_root->fs_info->extent_root; 2547 - u64 total_needed = num_bytes; 2548 - u64 *last_ptr = NULL; 2549 - u64 last_wanted = 0; 2557 + struct btrfs_free_cluster *last_ptr = NULL; 2550 2558 struct btrfs_block_group_cache *block_group = NULL; 2551 - int chunk_alloc_done = 0; 2552 2559 int empty_cluster = 2 * 1024 * 1024; 2553 2560 int allowed_chunk_alloc = 0; 2554 - struct list_head *head = NULL, *cur = NULL; 2555 - int loop = 0; 2556 - int extra_loop = 0; 2557 2561 struct btrfs_space_info *space_info; 2562 + int last_ptr_loop = 0; 2563 + int loop = 0; 2558 2564 2559 2565 WARN_ON(num_bytes < root->sectorsize); 2560 2566 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2561 2567 ins->objectid = 0; 2562 2568 ins->offset = 0; 2563 2569 2570 + space_info = __find_space_info(root->fs_info, data); 2571 + 2564 2572 if (orig_root->ref_cows || empty_size) 2565 2573 allowed_chunk_alloc = 1; 2566 2574 2567 2575 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2568 - last_ptr = &root->fs_info->last_alloc; 2576 + last_ptr = &root->fs_info->meta_alloc_cluster; 2569 2577 if (!btrfs_test_opt(root, SSD)) 2570 2578 empty_cluster = 64 * 1024; 2571 2579 } 2572 2580 2573 - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2574 - last_ptr = 
&root->fs_info->last_data_alloc; 2581 + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { 2582 + last_ptr = &root->fs_info->data_alloc_cluster; 2583 + } 2575 2584 2576 2585 if (last_ptr) { 2577 - if (*last_ptr) { 2578 - hint_byte = *last_ptr; 2579 - last_wanted = *last_ptr; 2580 - } else 2581 - empty_size += empty_cluster; 2582 - } else { 2583 - empty_cluster = 0; 2586 + spin_lock(&last_ptr->lock); 2587 + if (last_ptr->block_group) 2588 + hint_byte = last_ptr->window_start; 2589 + spin_unlock(&last_ptr->lock); 2584 2590 } 2591 + 2585 2592 search_start = max(search_start, first_logical_byte(root, 0)); 2586 2593 search_start = max(search_start, hint_byte); 2587 2594 2588 - if (last_wanted && search_start != last_wanted) { 2589 - last_wanted = 0; 2590 - empty_size += empty_cluster; 2595 + if (!last_ptr) { 2596 + empty_cluster = 0; 2597 + loop = 1; 2591 2598 } 2592 2599 2593 - total_needed += empty_size; 2594 - block_group = btrfs_lookup_block_group(root->fs_info, search_start); 2595 - if (!block_group) 2596 - block_group = btrfs_lookup_first_block_group(root->fs_info, 2597 - search_start); 2598 - space_info = __find_space_info(root->fs_info, data); 2600 + if (search_start == hint_byte) { 2601 + block_group = btrfs_lookup_block_group(root->fs_info, 2602 + search_start); 2603 + if (block_group && block_group_bits(block_group, data)) { 2604 + down_read(&space_info->groups_sem); 2605 + goto have_block_group; 2606 + } else if (block_group) { 2607 + btrfs_put_block_group(block_group); 2608 + } 2609 + } 2599 2610 2611 + search: 2600 2612 down_read(&space_info->groups_sem); 2601 - while (1) { 2602 - struct btrfs_free_space *free_space; 2603 - /* 2604 - * the only way this happens if our hint points to a block 2605 - * group thats not of the proper type, while looping this 2606 - * should never happen 2607 - */ 2608 - if (empty_size) 2609 - extra_loop = 1; 2613 + list_for_each_entry(block_group, &space_info->block_groups, list) { 2614 + u64 offset; 2610 2615 
2611 - if (!block_group) 2612 - goto new_group_no_lock; 2616 + atomic_inc(&block_group->count); 2617 + search_start = block_group->key.objectid; 2613 2618 2619 + have_block_group: 2614 2620 if (unlikely(!block_group->cached)) { 2615 2621 mutex_lock(&block_group->cache_mutex); 2616 2622 ret = cache_block_group(root, block_group); 2617 2623 mutex_unlock(&block_group->cache_mutex); 2618 - if (ret) 2624 + if (ret) { 2625 + btrfs_put_block_group(block_group); 2619 2626 break; 2627 + } 2620 2628 } 2621 - 2622 - mutex_lock(&block_group->alloc_mutex); 2623 - if (unlikely(!block_group_bits(block_group, data))) 2624 - goto new_group; 2625 2629 2626 2630 if (unlikely(block_group->ro)) 2627 - goto new_group; 2631 + goto loop; 2628 2632 2629 - free_space = btrfs_find_free_space(block_group, search_start, 2630 - total_needed); 2631 - if (free_space) { 2632 - u64 start = block_group->key.objectid; 2633 - u64 end = block_group->key.objectid + 2634 - block_group->key.offset; 2635 - 2636 - search_start = stripe_align(root, free_space->offset); 2637 - 2638 - /* move on to the next group */ 2639 - if (search_start + num_bytes >= search_end) 2640 - goto new_group; 2641 - 2642 - /* move on to the next group */ 2643 - if (search_start + num_bytes > end) 2644 - goto new_group; 2645 - 2646 - if (last_wanted && search_start != last_wanted) { 2647 - total_needed += empty_cluster; 2648 - empty_size += empty_cluster; 2649 - last_wanted = 0; 2650 - /* 2651 - * if search_start is still in this block group 2652 - * then we just re-search this block group 2653 - */ 2654 - if (search_start >= start && 2655 - search_start < end) { 2656 - mutex_unlock(&block_group->alloc_mutex); 2657 - continue; 2658 - } 2659 - 2660 - /* else we go to the next block group */ 2661 - goto new_group; 2662 - } 2663 - 2664 - if (exclude_nr > 0 && 2665 - (search_start + num_bytes > exclude_start && 2666 - search_start < exclude_start + exclude_nr)) { 2667 - search_start = exclude_start + exclude_nr; 2668 - /* 2669 - * if 
search_start is still in this block group 2670 - * then we just re-search this block group 2671 - */ 2672 - if (search_start >= start && 2673 - search_start < end) { 2674 - mutex_unlock(&block_group->alloc_mutex); 2675 - last_wanted = 0; 2676 - continue; 2677 - } 2678 - 2679 - /* else we go to the next block group */ 2680 - goto new_group; 2681 - } 2682 - 2683 - ins->objectid = search_start; 2684 - ins->offset = num_bytes; 2685 - 2686 - btrfs_remove_free_space_lock(block_group, search_start, 2687 - num_bytes); 2688 - /* we are all good, lets return */ 2689 - mutex_unlock(&block_group->alloc_mutex); 2690 - break; 2691 - } 2692 - new_group: 2693 - mutex_unlock(&block_group->alloc_mutex); 2694 - put_block_group(block_group); 2695 - block_group = NULL; 2696 - new_group_no_lock: 2697 - /* don't try to compare new allocations against the 2698 - * last allocation any more 2699 - */ 2700 - last_wanted = 0; 2701 - 2702 - /* 2703 - * Here's how this works. 2704 - * loop == 0: we were searching a block group via a hint 2705 - * and didn't find anything, so we start at 2706 - * the head of the block groups and keep searching 2707 - * loop == 1: we're searching through all of the block groups 2708 - * if we hit the head again we have searched 2709 - * all of the block groups for this space and we 2710 - * need to try and allocate, if we cant error out. 2711 - * loop == 2: we allocated more space and are looping through 2712 - * all of the block groups again. 2713 - */ 2714 - if (loop == 0) { 2715 - head = &space_info->block_groups; 2716 - cur = head->next; 2717 - loop++; 2718 - } else if (loop == 1 && cur == head) { 2719 - int keep_going; 2720 - 2721 - /* at this point we give up on the empty_size 2722 - * allocations and just try to allocate the min 2723 - * space. 2724 - * 2725 - * The extra_loop field was set if an empty_size 2726 - * allocation was attempted above, and if this 2727 - * is try we need to try the loop again without 2728 - * the additional empty_size. 
2633 + if (last_ptr) { 2634 + /* 2635 + * the refill lock keeps out other 2636 + * people trying to start a new cluster 2729 2637 */ 2730 - total_needed -= empty_size; 2731 - empty_size = 0; 2732 - keep_going = extra_loop; 2733 - loop++; 2638 + spin_lock(&last_ptr->refill_lock); 2639 + offset = btrfs_alloc_from_cluster(block_group, last_ptr, 2640 + num_bytes, search_start); 2641 + if (offset) { 2642 + /* we have a block, we're done */ 2643 + spin_unlock(&last_ptr->refill_lock); 2644 + goto checks; 2645 + } 2734 2646 2735 - if (allowed_chunk_alloc && !chunk_alloc_done) { 2736 - up_read(&space_info->groups_sem); 2737 - ret = do_chunk_alloc(trans, root, num_bytes + 2738 - 2 * 1024 * 1024, data, 1); 2739 - down_read(&space_info->groups_sem); 2740 - if (ret < 0) 2741 - goto loop_check; 2742 - head = &space_info->block_groups; 2647 + spin_lock(&last_ptr->lock); 2648 + /* 2649 + * whoops, this cluster doesn't actually point to 2650 + * this block group. Get a ref on the block 2651 + * group is does point to and try again 2652 + */ 2653 + if (!last_ptr_loop && last_ptr->block_group && 2654 + last_ptr->block_group != block_group) { 2655 + 2656 + btrfs_put_block_group(block_group); 2657 + block_group = last_ptr->block_group; 2658 + atomic_inc(&block_group->count); 2659 + spin_unlock(&last_ptr->lock); 2660 + spin_unlock(&last_ptr->refill_lock); 2661 + 2662 + last_ptr_loop = 1; 2663 + search_start = block_group->key.objectid; 2664 + goto have_block_group; 2665 + } 2666 + spin_unlock(&last_ptr->lock); 2667 + 2668 + /* 2669 + * this cluster didn't work out, free it and 2670 + * start over 2671 + */ 2672 + btrfs_return_cluster_to_free_space(NULL, last_ptr); 2673 + 2674 + last_ptr_loop = 0; 2675 + 2676 + /* allocate a cluster in this block group */ 2677 + ret = btrfs_find_space_cluster(trans, 2678 + block_group, last_ptr, 2679 + offset, num_bytes, 2680 + empty_cluster + empty_size); 2681 + if (ret == 0) { 2743 2682 /* 2744 - * we've allocated a new chunk, keep 2745 - * trying 2683 
+ * now pull our allocation out of this 2684 + * cluster 2746 2685 */ 2747 - keep_going = 1; 2748 - chunk_alloc_done = 1; 2749 - } else if (!allowed_chunk_alloc) { 2750 - space_info->force_alloc = 1; 2686 + offset = btrfs_alloc_from_cluster(block_group, 2687 + last_ptr, num_bytes, 2688 + search_start); 2689 + if (offset) { 2690 + /* we found one, proceed */ 2691 + spin_unlock(&last_ptr->refill_lock); 2692 + goto checks; 2693 + } 2751 2694 } 2752 - loop_check: 2753 - if (keep_going) { 2754 - cur = head->next; 2755 - extra_loop = 0; 2756 - } else { 2757 - break; 2695 + /* 2696 + * at this point we either didn't find a cluster 2697 + * or we weren't able to allocate a block from our 2698 + * cluster. Free the cluster we've been trying 2699 + * to use, and go to the next block group 2700 + */ 2701 + if (loop < 2) { 2702 + btrfs_return_cluster_to_free_space(NULL, 2703 + last_ptr); 2704 + spin_unlock(&last_ptr->refill_lock); 2705 + goto loop; 2758 2706 } 2759 - } else if (cur == head) { 2760 - break; 2707 + spin_unlock(&last_ptr->refill_lock); 2761 2708 } 2762 2709 2763 - block_group = list_entry(cur, struct btrfs_block_group_cache, 2764 - list); 2765 - atomic_inc(&block_group->count); 2710 + offset = btrfs_find_space_for_alloc(block_group, search_start, 2711 + num_bytes, empty_size); 2712 + if (!offset) 2713 + goto loop; 2714 + checks: 2715 + search_start = stripe_align(root, offset); 2766 2716 2767 - search_start = block_group->key.objectid; 2768 - cur = cur->next; 2717 + /* move on to the next group */ 2718 + if (search_start + num_bytes >= search_end) { 2719 + btrfs_add_free_space(block_group, offset, num_bytes); 2720 + goto loop; 2721 + } 2722 + 2723 + /* move on to the next group */ 2724 + if (search_start + num_bytes > 2725 + block_group->key.objectid + block_group->key.offset) { 2726 + btrfs_add_free_space(block_group, offset, num_bytes); 2727 + goto loop; 2728 + } 2729 + 2730 + if (exclude_nr > 0 && 2731 + (search_start + num_bytes > exclude_start && 2732 + 
search_start < exclude_start + exclude_nr)) { 2733 + search_start = exclude_start + exclude_nr; 2734 + 2735 + btrfs_add_free_space(block_group, offset, num_bytes); 2736 + /* 2737 + * if search_start is still in this block group 2738 + * then we just re-search this block group 2739 + */ 2740 + if (search_start >= block_group->key.objectid && 2741 + search_start < (block_group->key.objectid + 2742 + block_group->key.offset)) 2743 + goto have_block_group; 2744 + goto loop; 2745 + } 2746 + 2747 + ins->objectid = search_start; 2748 + ins->offset = num_bytes; 2749 + 2750 + if (offset < search_start) 2751 + btrfs_add_free_space(block_group, offset, 2752 + search_start - offset); 2753 + BUG_ON(offset > search_start); 2754 + 2755 + /* we are all good, lets return */ 2756 + break; 2757 + loop: 2758 + btrfs_put_block_group(block_group); 2759 + } 2760 + up_read(&space_info->groups_sem); 2761 + 2762 + /* loop == 0, try to find a clustered alloc in every block group 2763 + * loop == 1, try again after forcing a chunk allocation 2764 + * loop == 2, set empty_size and empty_cluster to 0 and try again 2765 + */ 2766 + if (!ins->objectid && loop < 3 && 2767 + (empty_size || empty_cluster || allowed_chunk_alloc)) { 2768 + if (loop >= 2) { 2769 + empty_size = 0; 2770 + empty_cluster = 0; 2771 + } 2772 + 2773 + if (allowed_chunk_alloc) { 2774 + ret = do_chunk_alloc(trans, root, num_bytes + 2775 + 2 * 1024 * 1024, data, 1); 2776 + allowed_chunk_alloc = 0; 2777 + } else { 2778 + space_info->force_alloc = 1; 2779 + } 2780 + 2781 + if (loop < 3) { 2782 + loop++; 2783 + goto search; 2784 + } 2785 + ret = -ENOSPC; 2786 + } else if (!ins->objectid) { 2787 + ret = -ENOSPC; 2769 2788 } 2770 2789 2771 2790 /* we found what we needed */ ··· 2782 2783 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2783 2784 trans->block_group = block_group->key.objectid; 2784 2785 2785 - if (last_ptr) 2786 - *last_ptr = ins->objectid + ins->offset; 2786 + btrfs_put_block_group(block_group); 2787 2787 ret = 0; 2788 - } else 
if (!ret) { 2789 - printk(KERN_ERR "btrfs searching for %llu bytes, " 2790 - "num_bytes %llu, loop %d, allowed_alloc %d\n", 2791 - (unsigned long long)total_needed, 2792 - (unsigned long long)num_bytes, 2793 - loop, allowed_chunk_alloc); 2794 - ret = -ENOSPC; 2795 2788 } 2796 - if (block_group) 2797 - put_block_group(block_group); 2798 2789 2799 - up_read(&space_info->groups_sem); 2800 2790 return ret; 2801 2791 } 2802 2792 ··· 2890 2902 ret = btrfs_discard_extent(root, start, len); 2891 2903 2892 2904 btrfs_add_free_space(cache, start, len); 2893 - put_block_group(cache); 2905 + btrfs_put_block_group(cache); 2894 2906 update_reserved_extents(root, start, len, 0); 2895 2907 2896 2908 return ret; ··· 3028 3040 ret = btrfs_remove_free_space(block_group, ins->objectid, 3029 3041 ins->offset); 3030 3042 BUG_ON(ret); 3031 - put_block_group(block_group); 3043 + btrfs_put_block_group(block_group); 3032 3044 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3033 3045 ref_generation, owner, ins, 1); 3034 3046 return ret; ··· 5717 5729 WARN_ON(block_group->reserved > 0); 5718 5730 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5719 5731 spin_unlock(&block_group->lock); 5720 - put_block_group(block_group); 5732 + btrfs_put_block_group(block_group); 5721 5733 ret = 0; 5722 5734 out: 5723 5735 btrfs_free_path(path); ··· 5844 5856 5845 5857 atomic_set(&cache->count, 1); 5846 5858 spin_lock_init(&cache->lock); 5847 - mutex_init(&cache->alloc_mutex); 5859 + spin_lock_init(&cache->tree_lock); 5848 5860 mutex_init(&cache->cache_mutex); 5849 5861 INIT_LIST_HEAD(&cache->list); 5862 + INIT_LIST_HEAD(&cache->cluster_list); 5850 5863 read_extent_buffer(leaf, &cache->item, 5851 5864 btrfs_item_ptr_offset(leaf, path->slots[0]), 5852 5865 sizeof(cache->item)); ··· 5901 5912 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 5902 5913 atomic_set(&cache->count, 1); 5903 5914 spin_lock_init(&cache->lock); 5904 - mutex_init(&cache->alloc_mutex); 5915 + 
spin_lock_init(&cache->tree_lock); 5905 5916 mutex_init(&cache->cache_mutex); 5906 5917 INIT_LIST_HEAD(&cache->list); 5918 + INIT_LIST_HEAD(&cache->cluster_list); 5907 5919 5908 5920 btrfs_set_block_group_used(&cache->item, bytes_used); 5909 5921 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); ··· 5964 5974 spin_unlock(&block_group->space_info->lock); 5965 5975 block_group->space_info->full = 0; 5966 5976 5967 - put_block_group(block_group); 5968 - put_block_group(block_group); 5977 + btrfs_put_block_group(block_group); 5978 + btrfs_put_block_group(block_group); 5969 5979 5970 5980 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 5971 5981 if (ret > 0)
+5 -11
fs/btrfs/extent_io.c
··· 2884 2884 disko = 0; 2885 2885 flags = 0; 2886 2886 2887 - switch (em->block_start) { 2888 - case EXTENT_MAP_LAST_BYTE: 2887 + if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2889 2888 end = 1; 2890 2889 flags |= FIEMAP_EXTENT_LAST; 2891 - break; 2892 - case EXTENT_MAP_HOLE: 2890 + } else if (em->block_start == EXTENT_MAP_HOLE) { 2893 2891 flags |= FIEMAP_EXTENT_UNWRITTEN; 2894 - break; 2895 - case EXTENT_MAP_INLINE: 2892 + } else if (em->block_start == EXTENT_MAP_INLINE) { 2896 2893 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2897 2894 FIEMAP_EXTENT_NOT_ALIGNED); 2898 - break; 2899 - case EXTENT_MAP_DELALLOC: 2895 + } else if (em->block_start == EXTENT_MAP_DELALLOC) { 2900 2896 flags |= (FIEMAP_EXTENT_DELALLOC | 2901 2897 FIEMAP_EXTENT_UNKNOWN); 2902 - break; 2903 - default: 2898 + } else { 2904 2899 disko = em->block_start; 2905 - break; 2906 2900 } 2907 2901 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2908 2902 flags |= FIEMAP_EXTENT_ENCODED;
-1
fs/btrfs/extent_map.c
··· 234 234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 235 235 if (rb) { 236 236 ret = -EEXIST; 237 - free_extent_map(merge); 238 237 goto out; 239 238 } 240 239 atomic_inc(&em->refs);
+378 -158
fs/btrfs/free-space-cache.c
··· 18 18 19 19 #include <linux/sched.h> 20 20 #include "ctree.h" 21 + #include "free-space-cache.h" 22 + #include "transaction.h" 23 + 24 + struct btrfs_free_space { 25 + struct rb_node bytes_index; 26 + struct rb_node offset_index; 27 + u64 offset; 28 + u64 bytes; 29 + }; 21 30 22 31 static int tree_insert_offset(struct rb_root *root, u64 offset, 23 32 struct rb_node *node) ··· 77 68 } 78 69 79 70 /* 80 - * searches the tree for the given offset. If contains is set we will return 81 - * the free space that contains the given offset. If contains is not set we 82 - * will return the free space that starts at or after the given offset and is 83 - * at least bytes long. 71 + * searches the tree for the given offset. 72 + * 73 + * fuzzy == 1: this is used for allocations where we are given a hint of where 74 + * to look for free space. Because the hint may not be completely on an offset 75 + * mark, or the hint may no longer point to free space we need to fudge our 76 + * results a bit. So we look for free space starting at or after offset with at 77 + * least bytes size. We prefer to find as close to the given offset as we can. 78 + * Also if the offset is within a free space range, then we will return the free 79 + * space that contains the given offset, which means we can return a free space 80 + * chunk with an offset before the provided offset. 81 + * 82 + * fuzzy == 0: this is just a normal tree search. Give us the free space that 83 + * starts at the given offset which is at least bytes size, and if its not there 84 + * return NULL. 
84 85 */ 85 86 static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 86 87 u64 offset, u64 bytes, 87 - int contains) 88 + int fuzzy) 88 89 { 89 90 struct rb_node *n = root->rb_node; 90 91 struct btrfs_free_space *entry, *ret = NULL; ··· 103 84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 104 85 105 86 if (offset < entry->offset) { 106 - if (!contains && 87 + if (fuzzy && 107 88 (!ret || entry->offset < ret->offset) && 108 89 (bytes <= entry->bytes)) 109 90 ret = entry; 110 91 n = n->rb_left; 111 92 } else if (offset > entry->offset) { 112 - if ((entry->offset + entry->bytes - 1) >= offset && 93 + if (fuzzy && 94 + (entry->offset + entry->bytes - 1) >= offset && 113 95 bytes <= entry->bytes) { 114 96 ret = entry; 115 97 break; ··· 191 171 int ret = 0; 192 172 193 173 174 + BUG_ON(!info->bytes); 194 175 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 176 &info->offset_index); 196 177 if (ret) ··· 205 184 return ret; 206 185 } 207 186 208 - static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 209 - u64 offset, u64 bytes) 187 + int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 188 + u64 offset, u64 bytes) 210 189 { 211 190 struct btrfs_free_space *right_info; 212 191 struct btrfs_free_space *left_info; 213 192 struct btrfs_free_space *info = NULL; 214 - struct btrfs_free_space *alloc_info; 215 193 int ret = 0; 216 194 217 - alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 218 - if (!alloc_info) 195 + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 196 + if (!info) 219 197 return -ENOMEM; 198 + 199 + info->offset = offset; 200 + info->bytes = bytes; 201 + 202 + spin_lock(&block_group->tree_lock); 220 203 221 204 /* 222 205 * first we want to see if there is free space adjacent to the range we ··· 228 203 * cover the entire range 229 204 */ 230 205 right_info = tree_search_offset(&block_group->free_space_offset, 231 - offset+bytes, 0, 1); 
206 + offset+bytes, 0, 0); 232 207 left_info = tree_search_offset(&block_group->free_space_offset, 233 208 offset-1, 0, 1); 234 209 235 - if (right_info && right_info->offset == offset+bytes) { 210 + if (right_info) { 236 211 unlink_free_space(block_group, right_info); 237 - info = right_info; 238 - info->offset = offset; 239 - info->bytes += bytes; 240 - } else if (right_info && right_info->offset != offset+bytes) { 241 - printk(KERN_ERR "btrfs adding space in the middle of an " 242 - "existing free space area. existing: " 243 - "offset=%llu, bytes=%llu. new: offset=%llu, " 244 - "bytes=%llu\n", (unsigned long long)right_info->offset, 245 - (unsigned long long)right_info->bytes, 246 - (unsigned long long)offset, 247 - (unsigned long long)bytes); 248 - BUG(); 212 + info->bytes += right_info->bytes; 213 + kfree(right_info); 249 214 } 250 215 251 - if (left_info) { 216 + if (left_info && left_info->offset + left_info->bytes == offset) { 252 217 unlink_free_space(block_group, left_info); 253 - 254 - if (unlikely((left_info->offset + left_info->bytes) != 255 - offset)) { 256 - printk(KERN_ERR "btrfs free space to the left " 257 - "of new free space isn't " 258 - "quite right. existing: offset=%llu, " 259 - "bytes=%llu. 
new: offset=%llu, bytes=%llu\n", 260 - (unsigned long long)left_info->offset, 261 - (unsigned long long)left_info->bytes, 262 - (unsigned long long)offset, 263 - (unsigned long long)bytes); 264 - BUG(); 265 - } 266 - 267 - if (info) { 268 - info->offset = left_info->offset; 269 - info->bytes += left_info->bytes; 270 - kfree(left_info); 271 - } else { 272 - info = left_info; 273 - info->bytes += bytes; 274 - } 218 + info->offset = left_info->offset; 219 + info->bytes += left_info->bytes; 220 + kfree(left_info); 275 221 } 276 - 277 - if (info) { 278 - ret = link_free_space(block_group, info); 279 - if (!ret) 280 - info = NULL; 281 - goto out; 282 - } 283 - 284 - info = alloc_info; 285 - alloc_info = NULL; 286 - info->offset = offset; 287 - info->bytes = bytes; 288 222 289 223 ret = link_free_space(block_group, info); 290 224 if (ret) 291 225 kfree(info); 292 - out: 226 + 227 + spin_unlock(&block_group->tree_lock); 228 + 293 229 if (ret) { 294 230 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 295 - if (ret == -EEXIST) 296 - BUG(); 231 + BUG_ON(ret == -EEXIST); 297 232 } 298 - 299 - kfree(alloc_info); 300 233 301 234 return ret; 302 235 } 303 236 304 - static int 305 - __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 306 - u64 offset, u64 bytes) 237 + int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 238 + u64 offset, u64 bytes) 307 239 { 308 240 struct btrfs_free_space *info; 309 241 int ret = 0; 310 242 243 + spin_lock(&block_group->tree_lock); 244 + 311 245 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 312 246 1); 313 - 314 247 if (info && info->offset == offset) { 315 248 if (info->bytes < bytes) { 316 249 printk(KERN_ERR "Found free space at %llu, size %llu," ··· 278 295 (unsigned long long)bytes); 279 296 WARN_ON(1); 280 297 ret = -EINVAL; 298 + spin_unlock(&block_group->tree_lock); 281 299 goto out; 282 300 } 283 301 unlink_free_space(block_group, info); 284 302 285 303 if 
(info->bytes == bytes) { 286 304 kfree(info); 305 + spin_unlock(&block_group->tree_lock); 287 306 goto out; 288 307 } 289 308 ··· 293 308 info->bytes -= bytes; 294 309 295 310 ret = link_free_space(block_group, info); 311 + spin_unlock(&block_group->tree_lock); 296 312 BUG_ON(ret); 297 313 } else if (info && info->offset < offset && 298 314 info->offset + info->bytes >= offset + bytes) { ··· 319 333 */ 320 334 kfree(info); 321 335 } 322 - 336 + spin_unlock(&block_group->tree_lock); 323 337 /* step two, insert a new info struct to cover anything 324 338 * before the hole 325 339 */ 326 - ret = __btrfs_add_free_space(block_group, old_start, 327 - offset - old_start); 340 + ret = btrfs_add_free_space(block_group, old_start, 341 + offset - old_start); 328 342 BUG_ON(ret); 329 343 } else { 344 + spin_unlock(&block_group->tree_lock); 345 + if (!info) { 346 + printk(KERN_ERR "couldn't find space %llu to free\n", 347 + (unsigned long long)offset); 348 + printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", 349 + block_group->cached, block_group->key.objectid, 350 + block_group->key.offset); 351 + btrfs_dump_free_space(block_group, bytes); 352 + } else if (info) { 353 + printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " 354 + "but wanted offset=%llu bytes=%llu\n", 355 + info->offset, info->bytes, offset, bytes); 356 + } 330 357 WARN_ON(1); 331 358 } 332 359 out: 333 - return ret; 334 - } 335 - 336 - int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 337 - u64 offset, u64 bytes) 338 - { 339 - int ret; 340 - struct btrfs_free_space *sp; 341 - 342 - mutex_lock(&block_group->alloc_mutex); 343 - ret = __btrfs_add_free_space(block_group, offset, bytes); 344 - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); 345 - BUG_ON(!sp); 346 - mutex_unlock(&block_group->alloc_mutex); 347 - 348 - return ret; 349 - } 350 - 351 - int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, 352 - u64 offset, u64 bytes) 353 - { 354 - 
int ret; 355 - struct btrfs_free_space *sp; 356 - 357 - ret = __btrfs_add_free_space(block_group, offset, bytes); 358 - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); 359 - BUG_ON(!sp); 360 - 361 - return ret; 362 - } 363 - 364 - int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 365 - u64 offset, u64 bytes) 366 - { 367 - int ret = 0; 368 - 369 - mutex_lock(&block_group->alloc_mutex); 370 - ret = __btrfs_remove_free_space(block_group, offset, bytes); 371 - mutex_unlock(&block_group->alloc_mutex); 372 - 373 - return ret; 374 - } 375 - 376 - int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, 377 - u64 offset, u64 bytes) 378 - { 379 - int ret; 380 - 381 - ret = __btrfs_remove_free_space(block_group, offset, bytes); 382 - 383 360 return ret; 384 361 } 385 362 ··· 357 408 info = rb_entry(n, struct btrfs_free_space, offset_index); 358 409 if (info->bytes >= bytes) 359 410 count++; 411 + printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, 412 + info->bytes); 360 413 } 361 414 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 362 415 "\n", count); ··· 379 428 return ret; 380 429 } 381 430 431 + /* 432 + * for a given cluster, put all of its extents back into the free 433 + * space cache. If the block group passed doesn't match the block group 434 + * pointed to by the cluster, someone else raced in and freed the 435 + * cluster already. 
In that case, we just return without changing anything 436 + */ 437 + static int 438 + __btrfs_return_cluster_to_free_space( 439 + struct btrfs_block_group_cache *block_group, 440 + struct btrfs_free_cluster *cluster) 441 + { 442 + struct btrfs_free_space *entry; 443 + struct rb_node *node; 444 + 445 + spin_lock(&cluster->lock); 446 + if (cluster->block_group != block_group) 447 + goto out; 448 + 449 + cluster->window_start = 0; 450 + node = rb_first(&cluster->root); 451 + while(node) { 452 + entry = rb_entry(node, struct btrfs_free_space, offset_index); 453 + node = rb_next(&entry->offset_index); 454 + rb_erase(&entry->offset_index, &cluster->root); 455 + link_free_space(block_group, entry); 456 + } 457 + list_del_init(&cluster->block_group_list); 458 + 459 + btrfs_put_block_group(cluster->block_group); 460 + cluster->block_group = NULL; 461 + cluster->root.rb_node = NULL; 462 + out: 463 + spin_unlock(&cluster->lock); 464 + return 0; 465 + } 466 + 382 467 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 383 468 { 384 469 struct btrfs_free_space *info; 385 470 struct rb_node *node; 471 + struct btrfs_free_cluster *cluster; 472 + struct btrfs_free_cluster *safe; 386 473 387 - mutex_lock(&block_group->alloc_mutex); 474 + spin_lock(&block_group->tree_lock); 475 + 476 + list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, 477 + block_group_list) { 478 + 479 + WARN_ON(cluster->block_group != block_group); 480 + __btrfs_return_cluster_to_free_space(block_group, cluster); 481 + } 482 + 388 483 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 389 484 info = rb_entry(node, struct btrfs_free_space, bytes_index); 390 485 unlink_free_space(block_group, info); 391 486 kfree(info); 392 487 if (need_resched()) { 393 - mutex_unlock(&block_group->alloc_mutex); 488 + spin_unlock(&block_group->tree_lock); 394 489 cond_resched(); 395 - mutex_lock(&block_group->alloc_mutex); 490 + spin_lock(&block_group->tree_lock); 396 
491 } 397 492 } 398 - mutex_unlock(&block_group->alloc_mutex); 493 + spin_unlock(&block_group->tree_lock); 399 494 } 400 495 401 - #if 0 402 - static struct btrfs_free_space *btrfs_find_free_space_offset(struct 403 - btrfs_block_group_cache 404 - *block_group, u64 offset, 405 - u64 bytes) 496 + u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, 497 + u64 offset, u64 bytes, u64 empty_size) 406 498 { 407 - struct btrfs_free_space *ret; 499 + struct btrfs_free_space *entry = NULL; 500 + u64 ret = 0; 408 501 409 - mutex_lock(&block_group->alloc_mutex); 410 - ret = tree_search_offset(&block_group->free_space_offset, offset, 411 - bytes, 0); 412 - mutex_unlock(&block_group->alloc_mutex); 502 + spin_lock(&block_group->tree_lock); 503 + entry = tree_search_offset(&block_group->free_space_offset, offset, 504 + bytes + empty_size, 1); 505 + if (!entry) 506 + entry = tree_search_bytes(&block_group->free_space_bytes, 507 + offset, bytes + empty_size); 508 + if (entry) { 509 + unlink_free_space(block_group, entry); 510 + ret = entry->offset; 511 + entry->offset += bytes; 512 + entry->bytes -= bytes; 513 + 514 + if (!entry->bytes) 515 + kfree(entry); 516 + else 517 + link_free_space(block_group, entry); 518 + } 519 + spin_unlock(&block_group->tree_lock); 413 520 414 521 return ret; 415 522 } 416 523 417 - static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 418 - btrfs_block_group_cache 419 - *block_group, u64 offset, 420 - u64 bytes) 524 + /* 525 + * given a cluster, put all of its extents back into the free space 526 + * cache. If a block group is passed, this function will only free 527 + * a cluster that belongs to the passed block group. 528 + * 529 + * Otherwise, it'll get a reference on the block group pointed to by the 530 + * cluster and remove the cluster from it. 
531 + */ 532 + int btrfs_return_cluster_to_free_space( 533 + struct btrfs_block_group_cache *block_group, 534 + struct btrfs_free_cluster *cluster) 421 535 { 422 - struct btrfs_free_space *ret; 536 + int ret; 423 537 424 - mutex_lock(&block_group->alloc_mutex); 538 + /* first, get a safe pointer to the block group */ 539 + spin_lock(&cluster->lock); 540 + if (!block_group) { 541 + block_group = cluster->block_group; 542 + if (!block_group) { 543 + spin_unlock(&cluster->lock); 544 + return 0; 545 + } 546 + } else if (cluster->block_group != block_group) { 547 + /* someone else has already freed it don't redo their work */ 548 + spin_unlock(&cluster->lock); 549 + return 0; 550 + } 551 + atomic_inc(&block_group->count); 552 + spin_unlock(&cluster->lock); 425 553 426 - ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 427 - mutex_unlock(&block_group->alloc_mutex); 554 + /* now return any extents the cluster had on it */ 555 + spin_lock(&block_group->tree_lock); 556 + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); 557 + spin_unlock(&block_group->tree_lock); 558 + 559 + /* finally drop our ref */ 560 + btrfs_put_block_group(block_group); 561 + return ret; 562 + } 563 + 564 + /* 565 + * given a cluster, try to allocate 'bytes' from it, returns 0 566 + * if it couldn't find anything suitably large, or a logical disk offset 567 + * if things worked out 568 + */ 569 + u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, 570 + struct btrfs_free_cluster *cluster, u64 bytes, 571 + u64 min_start) 572 + { 573 + struct btrfs_free_space *entry = NULL; 574 + struct rb_node *node; 575 + u64 ret = 0; 576 + 577 + spin_lock(&cluster->lock); 578 + if (bytes > cluster->max_size) 579 + goto out; 580 + 581 + if (cluster->block_group != block_group) 582 + goto out; 583 + 584 + node = rb_first(&cluster->root); 585 + if (!node) 586 + goto out; 587 + 588 + entry = rb_entry(node, struct btrfs_free_space, offset_index); 589 + 590 + 
while(1) { 591 + if (entry->bytes < bytes || entry->offset < min_start) { 592 + struct rb_node *node; 593 + 594 + node = rb_next(&entry->offset_index); 595 + if (!node) 596 + break; 597 + entry = rb_entry(node, struct btrfs_free_space, 598 + offset_index); 599 + continue; 600 + } 601 + ret = entry->offset; 602 + 603 + entry->offset += bytes; 604 + entry->bytes -= bytes; 605 + 606 + if (entry->bytes == 0) { 607 + rb_erase(&entry->offset_index, &cluster->root); 608 + kfree(entry); 609 + } 610 + break; 611 + } 612 + out: 613 + spin_unlock(&cluster->lock); 614 + return ret; 615 + } 616 + 617 + /* 618 + * here we try to find a cluster of blocks in a block group. The goal 619 + * is to find at least bytes free and up to empty_size + bytes free. 620 + * We might not find them all in one contiguous area. 621 + * 622 + * returns zero and sets up cluster if things worked out, otherwise 623 + * it returns -enospc 624 + */ 625 + int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 626 + struct btrfs_block_group_cache *block_group, 627 + struct btrfs_free_cluster *cluster, 628 + u64 offset, u64 bytes, u64 empty_size) 629 + { 630 + struct btrfs_free_space *entry = NULL; 631 + struct rb_node *node; 632 + struct btrfs_free_space *next; 633 + struct btrfs_free_space *last; 634 + u64 min_bytes; 635 + u64 window_start; 636 + u64 window_free; 637 + u64 max_extent = 0; 638 + int total_retries = 0; 639 + int ret; 640 + 641 + /* for metadata, allow allocates with more holes */ 642 + if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 643 + /* 644 + * we want to do larger allocations when we are 645 + * flushing out the delayed refs, it helps prevent 646 + * making more work as we go along. 
647 + */ 648 + if (trans->transaction->delayed_refs.flushing) 649 + min_bytes = max(bytes, (bytes + empty_size) >> 1); 650 + else 651 + min_bytes = max(bytes, (bytes + empty_size) >> 4); 652 + } else 653 + min_bytes = max(bytes, (bytes + empty_size) >> 2); 654 + 655 + spin_lock(&block_group->tree_lock); 656 + spin_lock(&cluster->lock); 657 + 658 + /* someone already found a cluster, hooray */ 659 + if (cluster->block_group) { 660 + ret = 0; 661 + goto out; 662 + } 663 + again: 664 + min_bytes = min(min_bytes, bytes + empty_size); 665 + entry = tree_search_bytes(&block_group->free_space_bytes, 666 + offset, min_bytes); 667 + if (!entry) { 668 + ret = -ENOSPC; 669 + goto out; 670 + } 671 + window_start = entry->offset; 672 + window_free = entry->bytes; 673 + last = entry; 674 + max_extent = entry->bytes; 675 + 676 + while(1) { 677 + /* out window is just right, lets fill it */ 678 + if (window_free >= bytes + empty_size) 679 + break; 680 + 681 + node = rb_next(&last->offset_index); 682 + if (!node) { 683 + ret = -ENOSPC; 684 + goto out; 685 + } 686 + next = rb_entry(node, struct btrfs_free_space, offset_index); 687 + 688 + /* 689 + * we haven't filled the empty size and the window is 690 + * very large. 
reset and try again 691 + */ 692 + if (next->offset - window_start > (bytes + empty_size) * 2) { 693 + entry = next; 694 + window_start = entry->offset; 695 + window_free = entry->bytes; 696 + last = entry; 697 + max_extent = 0; 698 + total_retries++; 699 + if (total_retries % 256 == 0) { 700 + if (min_bytes >= (bytes + empty_size)) { 701 + ret = -ENOSPC; 702 + goto out; 703 + } 704 + /* 705 + * grow our allocation a bit, we're not having 706 + * much luck 707 + */ 708 + min_bytes *= 2; 709 + goto again; 710 + } 711 + } else { 712 + last = next; 713 + window_free += next->bytes; 714 + if (entry->bytes > max_extent) 715 + max_extent = entry->bytes; 716 + } 717 + } 718 + 719 + cluster->window_start = entry->offset; 720 + 721 + /* 722 + * now we've found our entries, pull them out of the free space 723 + * cache and put them into the cluster rbtree 724 + * 725 + * The cluster includes an rbtree, but only uses the offset index 726 + * of each free space cache entry. 727 + */ 728 + while(1) { 729 + node = rb_next(&entry->offset_index); 730 + unlink_free_space(block_group, entry); 731 + ret = tree_insert_offset(&cluster->root, entry->offset, 732 + &entry->offset_index); 733 + BUG_ON(ret); 734 + 735 + if (!node || entry == last) 736 + break; 737 + 738 + entry = rb_entry(node, struct btrfs_free_space, offset_index); 739 + } 740 + ret = 0; 741 + cluster->max_size = max_extent; 742 + atomic_inc(&block_group->count); 743 + list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 744 + cluster->block_group = block_group; 745 + out: 746 + spin_unlock(&cluster->lock); 747 + spin_unlock(&block_group->tree_lock); 428 748 429 749 return ret; 430 750 } 431 - #endif 432 751 433 - struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 434 - *block_group, u64 offset, 435 - u64 bytes) 752 + /* 753 + * simple code to zero out a cluster 754 + */ 755 + void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 436 756 { 437 - struct 
btrfs_free_space *ret = NULL; 438 - 439 - ret = tree_search_offset(&block_group->free_space_offset, offset, 440 - bytes, 0); 441 - if (!ret) 442 - ret = tree_search_bytes(&block_group->free_space_bytes, 443 - offset, bytes); 444 - 445 - return ret; 757 + spin_lock_init(&cluster->lock); 758 + spin_lock_init(&cluster->refill_lock); 759 + cluster->root.rb_node = NULL; 760 + cluster->max_size = 0; 761 + INIT_LIST_HEAD(&cluster->block_group_list); 762 + cluster->block_group = NULL; 446 763 } 764 +
+44
fs/btrfs/free-space-cache.h
··· 1 + /* 2 + * Copyright (C) 2009 Oracle. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public 6 + * License v2 as published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public 14 + * License along with this program; if not, write to the 15 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 + * Boston, MA 021110-1307, USA. 17 + */ 18 + 19 + #ifndef __BTRFS_FREE_SPACE_CACHE 20 + #define __BTRFS_FREE_SPACE_CACHE 21 + 22 + int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 23 + u64 bytenr, u64 size); 24 + int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 25 + u64 bytenr, u64 size); 26 + void btrfs_remove_free_space_cache(struct btrfs_block_group_cache 27 + *block_group); 28 + u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, 29 + u64 offset, u64 bytes, u64 empty_size); 30 + void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 31 + u64 bytes); 32 + u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); 33 + int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 34 + struct btrfs_block_group_cache *block_group, 35 + struct btrfs_free_cluster *cluster, 36 + u64 offset, u64 bytes, u64 empty_size); 37 + void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); 38 + u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, 39 + struct btrfs_free_cluster *cluster, u64 bytes, 40 + u64 min_start); 41 + int btrfs_return_cluster_to_free_space( 42 + struct btrfs_block_group_cache *block_group, 43 + struct btrfs_free_cluster *cluster);
44 + #endif
+4 -1
fs/btrfs/inode.c
··· 3481 3481 3482 3482 if (dir) { 3483 3483 ret = btrfs_set_inode_index(dir, index); 3484 - if (ret) 3484 + if (ret) { 3485 + iput(inode); 3485 3486 return ERR_PTR(ret); 3487 + } 3486 3488 } 3487 3489 /* 3488 3490 * index_cnt is ignored for everything but a dir, ··· 3567 3565 if (dir) 3568 3566 BTRFS_I(dir)->index_cnt--; 3569 3567 btrfs_free_path(path); 3568 + iput(inode); 3570 3569 return ERR_PTR(ret); 3571 3570 } 3572 3571
+2 -2
fs/btrfs/locking.c
··· 60 60 61 61 /* 62 62 * unfortunately, many of the places that currently set a lock to blocking 63 - * don't end up blocking for every long, and often they don't block 64 - * at all. For a dbench 50 run, if we don't spin one the blocking bit 63 + * don't end up blocking for very long, and often they don't block 64 + * at all. For a dbench 50 run, if we don't spin on the blocking bit 65 65 * at all, the context switch rate can jump up to 400,000/sec or more. 66 66 * 67 67 * So, we're still stuck with this crummy spin on the blocking bit,
+50 -4
fs/btrfs/super.c
··· 24 24 #include <linux/highmem.h> 25 25 #include <linux/time.h> 26 26 #include <linux/init.h> 27 + #include <linux/seq_file.h> 27 28 #include <linux/string.h> 28 29 #include <linux/smp_lock.h> 29 30 #include <linux/backing-dev.h> ··· 67 66 enum { 68 67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 69 68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 70 - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 69 + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, 70 + Opt_flushoncommit, Opt_err, 71 71 }; 72 72 73 73 static match_table_t tokens = { ··· 85 83 {Opt_compress, "compress"}, 86 84 {Opt_ssd, "ssd"}, 87 85 {Opt_noacl, "noacl"}, 86 + {Opt_notreelog, "notreelog"}, 87 + {Opt_flushoncommit, "flushoncommit"}, 88 88 {Opt_err, NULL}, 89 89 }; 90 90 ··· 225 221 break; 226 222 case Opt_noacl: 227 223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 224 + break; 225 + case Opt_notreelog: 226 + printk(KERN_INFO "btrfs: disabling tree log\n"); 227 + btrfs_set_opt(info->mount_opt, NOTREELOG); 228 + break; 229 + case Opt_flushoncommit: 230 + printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); 231 + btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); 228 232 break; 229 233 default: 230 234 break; ··· 375 363 int btrfs_sync_fs(struct super_block *sb, int wait) 376 364 { 377 365 struct btrfs_trans_handle *trans; 378 - struct btrfs_root *root; 366 + struct btrfs_root *root = btrfs_sb(sb); 379 367 int ret; 380 - root = btrfs_sb(sb); 381 368 382 369 if (sb->s_flags & MS_RDONLY) 383 370 return 0; ··· 394 383 ret = btrfs_commit_transaction(trans, root); 395 384 sb->s_dirt = 0; 396 385 return ret; 386 + } 387 + 388 + static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 389 + { 390 + struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); 391 + struct btrfs_fs_info *info = root->fs_info; 392 + 393 + if (btrfs_test_opt(root, DEGRADED)) 394 + seq_puts(seq, ",degraded"); 395 + if (btrfs_test_opt(root, NODATASUM)) 
396 + seq_puts(seq, ",nodatasum"); 397 + if (btrfs_test_opt(root, NODATACOW)) 398 + seq_puts(seq, ",nodatacow"); 399 + if (btrfs_test_opt(root, NOBARRIER)) 400 + seq_puts(seq, ",nobarrier"); 401 + if (info->max_extent != (u64)-1) 402 + seq_printf(seq, ",max_extent=%llu", info->max_extent); 403 + if (info->max_inline != 8192 * 1024) 404 + seq_printf(seq, ",max_inline=%llu", info->max_inline); 405 + if (info->alloc_start != 0) 406 + seq_printf(seq, ",alloc_start=%llu", info->alloc_start); 407 + if (info->thread_pool_size != min_t(unsigned long, 408 + num_online_cpus() + 2, 8)) 409 + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 410 + if (btrfs_test_opt(root, COMPRESS)) 411 + seq_puts(seq, ",compress"); 412 + if (btrfs_test_opt(root, SSD)) 413 + seq_puts(seq, ",ssd"); 414 + if (btrfs_test_opt(root, NOTREELOG)) 415 + seq_puts(seq, ",no-treelog"); 416 + if (btrfs_test_opt(root, FLUSHONCOMMIT)) 417 + seq_puts(seq, ",flush-on-commit"); 418 + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) 419 + seq_puts(seq, ",noacl"); 420 + return 0; 397 421 } 398 422 399 423 static void btrfs_write_super(struct super_block *sb) ··· 676 630 .put_super = btrfs_put_super, 677 631 .write_super = btrfs_write_super, 678 632 .sync_fs = btrfs_sync_fs, 679 - .show_options = generic_show_options, 633 + .show_options = btrfs_show_options, 680 634 .write_inode = btrfs_write_inode, 681 635 .dirty_inode = btrfs_dirty_inode, 682 636 .alloc_inode = btrfs_alloc_inode,
+4 -3
fs/btrfs/transaction.c
··· 53 53 GFP_NOFS); 54 54 BUG_ON(!cur_trans); 55 55 root->fs_info->generation++; 56 - root->fs_info->last_alloc = 0; 57 - root->fs_info->last_data_alloc = 0; 58 56 cur_trans->num_writers = 1; 59 57 cur_trans->num_joined = 0; 60 58 cur_trans->transid = root->fs_info->generation; ··· 972 974 int ret; 973 975 int should_grow = 0; 974 976 unsigned long now = get_seconds(); 977 + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); 975 978 976 979 btrfs_run_ordered_operations(root, 0); 977 980 ··· 1052 1053 1053 1054 mutex_unlock(&root->fs_info->trans_mutex); 1054 1055 1055 - if (snap_pending) { 1056 + if (flush_on_commit || snap_pending) { 1057 + if (flush_on_commit) 1058 + btrfs_start_delalloc_inodes(root); 1056 1059 ret = btrfs_wait_ordered_extents(root, 1); 1057 1060 BUG_ON(ret); 1058 1061 }
+7 -5
fs/btrfs/tree-log.c
··· 262 262 struct extent_buffer *eb, 263 263 struct walk_control *wc, u64 gen) 264 264 { 265 - if (wc->pin) { 266 - mutex_lock(&log->fs_info->pinned_mutex); 265 + if (wc->pin) 267 266 btrfs_update_pinned_extents(log->fs_info->extent_root, 268 267 eb->start, eb->len, 1); 269 - } 270 268 271 269 if (btrfs_buffer_uptodate(eb, gen)) { 272 270 if (wc->write) ··· 1222 1224 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1223 1225 name, name_len, log_type, &log_key); 1224 1226 1225 - if (ret && ret != -ENOENT) 1226 - BUG(); 1227 + BUG_ON(ret && ret != -ENOENT); 1227 1228 goto out; 1228 1229 } 1229 1230 ··· 2896 2899 u64 last_committed = root->fs_info->last_trans_committed; 2897 2900 2898 2901 sb = inode->i_sb; 2902 + 2903 + if (btrfs_test_opt(root, NOTREELOG)) { 2904 + ret = 1; 2905 + goto end_no_trans; 2906 + } 2899 2907 2900 2908 if (root->fs_info->last_trans_log_full_commit > 2901 2909 root->fs_info->last_trans_committed) {
+40 -1
fs/btrfs/volumes.c
··· 20 20 #include <linux/buffer_head.h> 21 21 #include <linux/blkdev.h> 22 22 #include <linux/random.h> 23 + #include <linux/iocontext.h> 23 24 #include <asm/div64.h> 24 25 #include "compat.h" 25 26 #include "ctree.h" ··· 146 145 int again = 0; 147 146 unsigned long num_run = 0; 148 147 unsigned long limit; 148 + unsigned long last_waited = 0; 149 149 150 - bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 150 + bdi = blk_get_backing_dev_info(device->bdev); 151 151 fs_info = device->dev_root->fs_info; 152 152 limit = btrfs_async_submit_limit(fs_info); 153 153 limit = limit * 2 / 3; ··· 209 207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 210 208 fs_info->fs_devices->open_devices > 1) { 211 209 struct bio *old_head; 210 + struct io_context *ioc; 212 211 212 + ioc = current->io_context; 213 + 214 + /* 215 + * the main goal here is that we don't want to 216 + * block if we're going to be able to submit 217 + * more requests without blocking. 218 + * 219 + * This code does two great things, it pokes into 220 + * the elevator code from a filesystem _and_ 221 + * it makes assumptions about how batching works. 222 + */ 223 + if (ioc && ioc->nr_batch_requests > 0 && 224 + time_before(jiffies, ioc->last_waited + HZ/50UL) && 225 + (last_waited == 0 || 226 + ioc->last_waited == last_waited)) { 227 + /* 228 + * we want to go through our batch of 229 + * requests and stop. So, we copy out 230 + * the ioc->last_waited time and test 231 + * against it before looping 232 + */ 233 + last_waited = ioc->last_waited; 234 + continue; 235 + } 213 236 spin_lock(&device->io_lock); 214 237 215 238 old_head = device->pending_bios; ··· 258 231 if (device->pending_bios) 259 232 goto loop_lock; 260 233 spin_unlock(&device->io_lock); 234 + 235 + /* 236 + * IO has already been through a long path to get here. Checksumming, 237 + * async helper threads, perhaps compression. We've done a pretty
238 + * good job of collecting a batch of IO and should just unplug 239 + * the device right away. 240 + * 241 + * This will help anyone who is waiting on the IO, they might have 242 + * already unplugged, but managed to do so before the bio they 243 + * cared about found its way down here. 244 + */ 245 + blk_run_backing_dev(bdi, NULL); 261 246 done: 262 247 return 0; 263 248 }
+1 -1
fs/btrfs/volumes.h
··· 76 76 struct btrfs_fs_devices { 77 77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 78 78 79 - /* the device with this id has the most recent coyp of the super */ 79 + /* the device with this id has the most recent copy of the super */ 80 80 u64 latest_devid; 81 81 u64 latest_trans; 82 82 u64 num_devices;