Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (37 commits)
  Btrfs: Make sure dir is non-null before doing S_ISGID checks
  Btrfs: Fix memory leak in cache_drop_leaf_ref
  Btrfs: don't return congestion in write_cache_pages as often
  Btrfs: Only prep for btree deletion balances when nodes are mostly empty
  Btrfs: fix btrfs_unlock_up_safe to walk the entire path
  Btrfs: change btrfs_del_leaf to drop locks earlier
  Btrfs: Change btrfs_truncate_inode_items to stop when it hits the inode
  Btrfs: Don't try to compress pages past i_size
  Btrfs: join the transaction in __btrfs_setxattr
  Btrfs: Handle SGID bit when creating inodes
  Btrfs: Make btrfs_drop_snapshot work in larger and more efficient chunks
  Btrfs: Change btree locking to use explicit blocking points
  Btrfs: hash_lock is no longer needed
  Btrfs: disable leak debugging checks in extent_io.c
  Btrfs: sort references by byte number during btrfs_inc_ref
  Btrfs: async threads should try harder to find work
  Btrfs: selinux support
  Btrfs: make btrfs acls selectable
  Btrfs: Catch missed bios in the async bio submission thread
  Btrfs: fix readdir on 32 bit machines
  ...

+1425 -455

 MAINTAINERS              +8
 fs/btrfs/Kconfig         +13
 fs/btrfs/async-thread.c  +53 -8
 fs/btrfs/compression.c   -1
 fs/btrfs/ctree.c         +246 -34
 fs/btrfs/ctree.h         +13 -15
 fs/btrfs/disk-io.c       +87 -33
 fs/btrfs/disk-io.h       +2
 fs/btrfs/extent-tree.c   +366 -74
 fs/btrfs/extent_io.c     +109 -23
 ...
(test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 2933 + flags |= FIEMAP_EXTENT_LAST; 2934 + end = 1; 2935 + } 2936 + 2937 + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 2938 + em_len, flags); 2939 + if (ret) 2940 + goto out_free; 2941 + } 2942 + out_free: 2943 + free_extent_map(em); 2944 + out: 2945 + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len, 2946 + GFP_NOFS); 2947 + return ret; 2948 + } 2949 + 2852 2950 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2853 2951 unsigned long i) 2854 2952 { ··· 2978 2892 gfp_t mask) 2979 2893 { 2980 2894 struct extent_buffer *eb = NULL; 2981 - #ifdef LEAK_DEBUG 2895 + #if LEAK_DEBUG 2982 2896 unsigned long flags; 2983 2897 #endif 2984 2898 2985 2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2986 2900 eb->start = start; 2987 2901 eb->len = len; 2988 - mutex_init(&eb->mutex); 2989 - #ifdef LEAK_DEBUG 2902 + spin_lock_init(&eb->lock); 2903 + init_waitqueue_head(&eb->lock_wq); 2904 + 2905 + #if LEAK_DEBUG 2990 2906 spin_lock_irqsave(&leak_lock, flags); 2991 2907 list_add(&eb->leak_list, &buffers); 2992 2908 spin_unlock_irqrestore(&leak_lock, flags); ··· 3000 2912 3001 2913 static void __free_extent_buffer(struct extent_buffer *eb) 3002 2914 { 3003 - #ifdef LEAK_DEBUG 2915 + #if LEAK_DEBUG 3004 2916 unsigned long flags; 3005 2917 spin_lock_irqsave(&leak_lock, flags); 3006 2918 list_del(&eb->leak_list); ··· 3068 2980 unlock_page(p); 3069 2981 } 3070 2982 if (uptodate) 3071 - eb->flags |= EXTENT_UPTODATE; 3072 - eb->flags |= EXTENT_BUFFER_FILLED; 2983 + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3073 2984 3074 2985 spin_lock(&tree->buffer_lock); 3075 2986 exists = buffer_tree_insert(tree, start, &eb->rb_node); ··· 3222 3135 unsigned long num_pages; 3223 3136 3224 3137 num_pages = num_extent_pages(eb->start, eb->len); 3225 - eb->flags &= ~EXTENT_UPTODATE; 3138 + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3226 3139 3227 3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3228 3141 GFP_NOFS); ··· 3293 3206 struct page *page; 3294 3207 int pg_uptodate = 1; 3295 3208 3296 - if (eb->flags & EXTENT_UPTODATE) 3209 + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3297 3210 return 1; 3298 3211 3299 3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, ··· 3329 3242 struct bio *bio = NULL; 3330 3243 unsigned long bio_flags = 0; 3331 3244 3332 - if (eb->flags & EXTENT_UPTODATE) 3245 + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3333 3246 return 0; 3334 3247 3335 3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, ··· 3360 3273 } 3361 3274 if (all_uptodate) { 3362 3275 if (start_i == 0) 3363 - eb->flags |= EXTENT_UPTODATE; 3276 + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3364 3277 goto unlock_exit; 3365 3278 } 3366 3279 ··· 3396 3309 } 3397 3310 3398 3311 if (!ret) 3399 - eb->flags |= EXTENT_UPTODATE; 3312 + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3400 3313 return ret; 3401 3314 3402 3315 unlock_exit: ··· 3493 3406 unmap_extent_buffer(eb, eb->map_token, km); 3494 3407 eb->map_token = NULL; 3495 3408 save = 1; 3496 - WARN_ON(!mutex_is_locked(&eb->mutex)); 3497 3409 } 3498 3410 err = map_private_extent_buffer(eb, start, min_len, token, map, 3499 3411 map_start, map_len, km);
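The extent_fiemap() helper added above is what lets the generic FIEMAP ioctl work on btrfs (inode.c further down wires it up as btrfs_fiemap). As a hedged illustration only, and not part of this patch, the standalone userspace sketch below queries a file's extent layout through the standard FS_IOC_FIEMAP interface; the 32-extent buffer size and the minimal error handling are arbitrary choices for brevity.

/* illustrative userspace sketch: dump extent mappings via FS_IOC_FIEMAP */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* header plus room for 32 returned extents */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty data first */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("fiemap");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu len %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	close(fd);
	return 0;
}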
+16 -2
fs/btrfs/extent_io.h
··· 22 22 /* flags for bio submission */ 23 23 #define EXTENT_BIO_COMPRESSED 1 24 24 25 + /* these are bit numbers for test/set bit */ 26 + #define EXTENT_BUFFER_UPTODATE 0 27 + #define EXTENT_BUFFER_BLOCKING 1 28 + 25 29 /* 26 30 * page->private values. Every page that is controlled by the extent 27 31 * map has page->private set to one. ··· 99 95 unsigned long map_start; 100 96 unsigned long map_len; 101 97 struct page *first_page; 98 + unsigned long bflags; 102 99 atomic_t refs; 103 - int flags; 104 100 struct list_head leak_list; 105 101 struct rb_node rb_node; 106 - struct mutex mutex; 102 + 103 + /* the spinlock is used to protect most operations */ 104 + spinlock_t lock; 105 + 106 + /* 107 + * when we keep the lock held while blocking, waiters go onto 108 + * the wq 109 + */ 110 + wait_queue_head_t lock_wq; 107 111 }; 108 112 109 113 struct extent_map_tree; ··· 205 193 unsigned from, unsigned to); 206 194 sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 207 195 get_extent_t *get_extent); 196 + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 197 + __u64 start, __u64 len, get_extent_t *get_extent); 208 198 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); 209 199 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); 210 200 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
-1
fs/btrfs/extent_map.c
··· 3 3 #include <linux/slab.h> 4 4 #include <linux/module.h> 5 5 #include <linux/spinlock.h> 6 - #include <linux/version.h> 7 6 #include <linux/hardirq.h> 8 7 #include "extent_map.h" 9 8
+2 -3
fs/btrfs/file.c
··· 29 29 #include <linux/writeback.h> 30 30 #include <linux/statfs.h> 31 31 #include <linux/compat.h> 32 - #include <linux/version.h> 33 32 #include "ctree.h" 34 33 #include "disk-io.h" 35 34 #include "transaction.h" ··· 1214 1215 } 1215 1216 mutex_unlock(&root->fs_info->trans_mutex); 1216 1217 1217 - root->fs_info->tree_log_batch++; 1218 + root->log_batch++; 1218 1219 filemap_fdatawrite(inode->i_mapping); 1219 1220 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1220 - root->fs_info->tree_log_batch++; 1221 + root->log_batch++; 1221 1222 1222 1223 /* 1223 1224 * ok we haven't committed the transaction yet, lets do a commit
+70 -14
fs/btrfs/inode.c
··· 34 34 #include <linux/statfs.h> 35 35 #include <linux/compat.h> 36 36 #include <linux/bit_spinlock.h> 37 - #include <linux/version.h> 38 37 #include <linux/xattr.h> 39 38 #include <linux/posix_acl.h> 40 39 #include <linux/falloc.h> ··· 50 51 #include "tree-log.h" 51 52 #include "ref-cache.h" 52 53 #include "compression.h" 54 + #include "locking.h" 53 55 54 56 struct btrfs_iget_args { 55 57 u64 ino; ··· 90 90 struct page *locked_page, 91 91 u64 start, u64 end, int *page_started, 92 92 unsigned long *nr_written, int unlock); 93 + 94 + static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) 95 + { 96 + int err; 97 + 98 + err = btrfs_init_acl(inode, dir); 99 + if (!err) 100 + err = btrfs_xattr_security_init(inode, dir); 101 + return err; 102 + } 93 103 94 104 /* 95 105 * a very lame attempt at stopping writes when the FS is 85% full. There ··· 360 350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 361 351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 362 352 353 + /* 354 + * we don't want to send crud past the end of i_size through 355 + * compression, that's just a waste of CPU time. So, if the 356 + * end of the file is before the start of our current 357 + * requested range of bytes, we bail out to the uncompressed 358 + * cleanup code that can deal with all of this. 359 + * 360 + * It isn't really the fastest way to fix things, but this is a 361 + * very uncommon corner. 362 + */ 363 + if (actual_end <= start) 364 + goto cleanup_and_bail_uncompressed; 365 + 363 366 total_compressed = actual_end - start; 364 367 365 368 /* we want to make sure that amount of ram required to uncompress ··· 517 494 goto again; 518 495 } 519 496 } else { 497 + cleanup_and_bail_uncompressed: 520 498 /* 521 499 * No compression, but we still need to write the pages in 522 500 * the file we've been given so far. 
redirty the locked ··· 1348 1324 struct inode *inode, u64 file_offset, 1349 1325 struct list_head *list) 1350 1326 { 1351 - struct list_head *cur; 1352 1327 struct btrfs_ordered_sum *sum; 1353 1328 1354 1329 btrfs_set_trans_block_group(trans, inode); 1355 - list_for_each(cur, list) { 1356 - sum = list_entry(cur, struct btrfs_ordered_sum, list); 1330 + 1331 + list_for_each_entry(sum, list, list) { 1357 1332 btrfs_csum_file_blocks(trans, 1358 1333 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1359 1334 } ··· 2036 2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2037 2014 2038 2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2016 + 2039 2017 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2040 2018 alloc_group_block, 0); 2041 2019 btrfs_free_path(path); ··· 2063 2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2064 2040 break; 2065 2041 default: 2042 + inode->i_op = &btrfs_special_inode_operations; 2066 2043 init_special_inode(inode, inode->i_mode, rdev); 2067 2044 break; 2068 2045 } ··· 2133 2108 goto failed; 2134 2109 } 2135 2110 2111 + btrfs_unlock_up_safe(path, 1); 2136 2112 leaf = path->nodes[0]; 2137 2113 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2138 2114 struct btrfs_inode_item); ··· 2455 2429 ref->generation = leaf_gen; 2456 2430 ref->nritems = 0; 2457 2431 2432 + btrfs_sort_leaf_ref(ref); 2433 + 2458 2434 ret = btrfs_add_leaf_ref(root, ref, 0); 2459 2435 WARN_ON(ret); 2460 2436 btrfs_free_leaf_ref(root, ref); ··· 2504 2476 struct btrfs_path *path; 2505 2477 struct btrfs_key key; 2506 2478 struct btrfs_key found_key; 2507 - u32 found_type; 2479 + u32 found_type = (u8)-1; 2508 2480 struct extent_buffer *leaf; 2509 2481 struct btrfs_file_extent_item *fi; 2510 2482 u64 extent_start = 0; ··· 2691 2663 if (pending_del_nr) 2692 2664 goto del_pending; 2693 2665 btrfs_release_path(root, path); 2666 + if (found_type == BTRFS_INODE_ITEM_KEY) 2667 + break; 2694 2668 goto search_again; 2695 2669 } 2696 2670 ··· 2709 2679 BUG_ON(ret); 2710 2680 pending_del_nr = 0; 2711 2681 btrfs_release_path(root, path); 2682 + if (found_type == BTRFS_INODE_ITEM_KEY) 2683 + break; 2712 2684 goto search_again; 2713 2685 } 2714 2686 } ··· 3297 3265 3298 3266 /* Reached end of directory/root. Bump pos past the last item. 
*/ 3299 3267 if (key_type == BTRFS_DIR_INDEX_KEY) 3300 - filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); 3268 + filp->f_pos = INT_LIMIT(off_t); 3301 3269 else 3302 3270 filp->f_pos++; 3303 3271 nopos: ··· 3490 3458 root->highest_inode = objectid; 3491 3459 3492 3460 inode->i_uid = current_fsuid(); 3493 - inode->i_gid = current_fsgid(); 3461 + 3462 + if (dir && (dir->i_mode & S_ISGID)) { 3463 + inode->i_gid = dir->i_gid; 3464 + if (S_ISDIR(mode)) 3465 + mode |= S_ISGID; 3466 + } else 3467 + inode->i_gid = current_fsgid(); 3468 + 3494 3469 inode->i_mode = mode; 3495 3470 inode->i_ino = objectid; 3496 3471 inode_set_bytes(inode, 0); ··· 3625 3586 if (IS_ERR(inode)) 3626 3587 goto out_unlock; 3627 3588 3628 - err = btrfs_init_acl(inode, dir); 3589 + err = btrfs_init_inode_security(inode, dir); 3629 3590 if (err) { 3630 3591 drop_inode = 1; 3631 3592 goto out_unlock; ··· 3688 3649 if (IS_ERR(inode)) 3689 3650 goto out_unlock; 3690 3651 3691 - err = btrfs_init_acl(inode, dir); 3652 + err = btrfs_init_inode_security(inode, dir); 3692 3653 if (err) { 3693 3654 drop_inode = 1; 3694 3655 goto out_unlock; ··· 3811 3772 3812 3773 drop_on_err = 1; 3813 3774 3814 - err = btrfs_init_acl(inode, dir); 3775 + err = btrfs_init_inode_security(inode, dir); 3815 3776 if (err) 3816 3777 goto out_fail; 3817 3778 ··· 4197 4158 return -EINVAL; 4198 4159 } 4199 4160 4200 - static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) 4161 + static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4162 + __u64 start, __u64 len) 4201 4163 { 4202 - return extent_bmap(mapping, iblock, btrfs_get_extent); 4164 + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); 4203 4165 } 4204 4166 4205 4167 int btrfs_readpage(struct file *file, struct page *page) ··· 4773 4733 if (IS_ERR(inode)) 4774 4734 goto out_unlock; 4775 4735 4776 - err = btrfs_init_acl(inode, dir); 4736 + err = btrfs_init_inode_security(inode, dir); 4777 4737 if (err) { 4778 4738 drop_inode = 1; 4779 4739 goto out_unlock; ··· 5027 4987 .clear_bit_hook = btrfs_clear_bit_hook, 5028 4988 }; 5029 4989 4990 + /* 4991 + * btrfs doesn't support the bmap operation because swapfiles 4992 + * use bmap to make a mapping of extents in the file. They assume 4993 + * these extents won't change over the life of the file and they 4994 + * use the bmap result to do IO directly to the drive. 4995 + * 4996 + * the btrfs bmap call would return logical addresses that aren't 4997 + * suitable for IO and they also will change frequently as COW 4998 + * operations happen. So, swapfile + btrfs == corruption. 4999 + * 5000 + * For now we're avoiding this by dropping bmap. 
5001 + */ 5030 5002 static struct address_space_operations btrfs_aops = { 5031 5003 .readpage = btrfs_readpage, 5032 5004 .writepage = btrfs_writepage, 5033 5005 .writepages = btrfs_writepages, 5034 5006 .readpages = btrfs_readpages, 5035 5007 .sync_page = block_sync_page, 5036 - .bmap = btrfs_bmap, 5037 5008 .direct_IO = btrfs_direct_IO, 5038 5009 .invalidatepage = btrfs_invalidatepage, 5039 5010 .releasepage = btrfs_releasepage, ··· 5068 5017 .removexattr = btrfs_removexattr, 5069 5018 .permission = btrfs_permission, 5070 5019 .fallocate = btrfs_fallocate, 5020 + .fiemap = btrfs_fiemap, 5071 5021 }; 5072 5022 static struct inode_operations btrfs_special_inode_operations = { 5073 5023 .getattr = btrfs_getattr, ··· 5084 5032 .follow_link = page_follow_link_light, 5085 5033 .put_link = page_put_link, 5086 5034 .permission = btrfs_permission, 5035 + .setxattr = btrfs_setxattr, 5036 + .getxattr = btrfs_getxattr, 5037 + .listxattr = btrfs_listxattr, 5038 + .removexattr = btrfs_removexattr, 5087 5039 };
-1
fs/btrfs/ioctl.c
··· 38 38 #include <linux/compat.h> 39 39 #include <linux/bit_spinlock.h> 40 40 #include <linux/security.h> 41 - #include <linux/version.h> 42 41 #include <linux/xattr.h> 43 42 #include <linux/vmalloc.h> 44 43 #include "compat.h"
+191 -19
fs/btrfs/locking.c
··· 26 26 #include "locking.h" 27 27 28 28 /* 29 - * locks the per buffer mutex in an extent buffer. This uses adaptive locks 30 - * and the spin is not tuned very extensively. The spinning does make a big 31 - * difference in almost every workload, but spinning for the right amount of 32 - * time needs some help. 33 - * 34 - * In general, we want to spin as long as the lock holder is doing btree 35 - * searches, and we should give up if they are in more expensive code. 29 + * btrfs_header_level() isn't free, so don't call it when lockdep isn't 30 + * on 36 31 */ 32 + #ifdef CONFIG_DEBUG_LOCK_ALLOC 33 + static inline void spin_nested(struct extent_buffer *eb) 34 + { 35 + spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); 36 + } 37 + #else 38 + static inline void spin_nested(struct extent_buffer *eb) 39 + { 40 + spin_lock(&eb->lock); 41 + } 42 + #endif 37 43 38 - int btrfs_tree_lock(struct extent_buffer *eb) 44 + /* 45 + * Setting a lock to blocking will drop the spinlock and set the 46 + * flag that forces other procs who want the lock to wait. After 47 + * this you can safely schedule with the lock held. 48 + */ 49 + void btrfs_set_lock_blocking(struct extent_buffer *eb) 50 + { 51 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 52 + set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 53 + spin_unlock(&eb->lock); 54 + } 55 + /* exit with the spin lock released and the bit set */ 56 + } 57 + 58 + /* 59 + * clearing the blocking flag will take the spinlock again. 60 + * After this you can't safely schedule 61 + */ 62 + void btrfs_clear_lock_blocking(struct extent_buffer *eb) 63 + { 64 + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 65 + spin_nested(eb); 66 + clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 67 + smp_mb__after_clear_bit(); 68 + } 69 + /* exit with the spin lock held */ 70 + } 71 + 72 + /* 73 + * unfortunately, many of the places that currently set a lock to blocking 74 + * don't end up blocking for every long, and often they don't block 75 + * at all. For a dbench 50 run, if we don't spin one the blocking bit 76 + * at all, the context switch rate can jump up to 400,000/sec or more. 77 + * 78 + * So, we're still stuck with this crummy spin on the blocking bit, 79 + * at least until the most common causes of the short blocks 80 + * can be dealt with. 81 + */ 82 + static int btrfs_spin_on_block(struct extent_buffer *eb) 39 83 { 40 84 int i; 41 - 42 - if (mutex_trylock(&eb->mutex)) 43 - return 0; 44 85 for (i = 0; i < 512; i++) { 45 86 cpu_relax(); 46 - if (mutex_trylock(&eb->mutex)) 47 - return 0; 87 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 88 + return 1; 89 + if (need_resched()) 90 + break; 48 91 } 49 - cpu_relax(); 50 - mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); 51 92 return 0; 52 93 } 53 94 95 + /* 96 + * This is somewhat different from trylock. It will take the 97 + * spinlock but if it finds the lock is set to blocking, it will 98 + * return without the lock held. 
99 + * 100 + * returns 1 if it was able to take the lock and zero otherwise 101 + * 102 + * After this call, scheduling is not safe without first calling 103 + * btrfs_set_lock_blocking() 104 + */ 105 + int btrfs_try_spin_lock(struct extent_buffer *eb) 106 + { 107 + int i; 108 + 109 + spin_nested(eb); 110 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 111 + return 1; 112 + spin_unlock(&eb->lock); 113 + 114 + /* spin for a bit on the BLOCKING flag */ 115 + for (i = 0; i < 2; i++) { 116 + if (!btrfs_spin_on_block(eb)) 117 + break; 118 + 119 + spin_nested(eb); 120 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 121 + return 1; 122 + spin_unlock(&eb->lock); 123 + } 124 + return 0; 125 + } 126 + 127 + /* 128 + * the autoremove wake function will return 0 if it tried to wake up 129 + * a process that was already awake, which means that process won't 130 + * count as an exclusive wakeup. The waitq code will continue waking 131 + * procs until it finds one that was actually sleeping. 132 + * 133 + * For btrfs, this isn't quite what we want. We want a single proc 134 + * to be notified that the lock is ready for taking. If that proc 135 + * already happen to be awake, great, it will loop around and try for 136 + * the lock. 137 + * 138 + * So, btrfs_wake_function always returns 1, even when the proc that we 139 + * tried to wake up was already awake. 140 + */ 141 + static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, 142 + int sync, void *key) 143 + { 144 + autoremove_wake_function(wait, mode, sync, key); 145 + return 1; 146 + } 147 + 148 + /* 149 + * returns with the extent buffer spinlocked. 150 + * 151 + * This will spin and/or wait as required to take the lock, and then 152 + * return with the spinlock held. 153 + * 154 + * After this call, scheduling is not safe without first calling 155 + * btrfs_set_lock_blocking() 156 + */ 157 + int btrfs_tree_lock(struct extent_buffer *eb) 158 + { 159 + DEFINE_WAIT(wait); 160 + wait.func = btrfs_wake_function; 161 + 162 + while(1) { 163 + spin_nested(eb); 164 + 165 + /* nobody is blocking, exit with the spinlock held */ 166 + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 167 + return 0; 168 + 169 + /* 170 + * we have the spinlock, but the real owner is blocking. 171 + * wait for them 172 + */ 173 + spin_unlock(&eb->lock); 174 + 175 + /* 176 + * spin for a bit, and if the blocking flag goes away, 177 + * loop around 178 + */ 179 + if (btrfs_spin_on_block(eb)) 180 + continue; 181 + 182 + prepare_to_wait_exclusive(&eb->lock_wq, &wait, 183 + TASK_UNINTERRUPTIBLE); 184 + 185 + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 186 + schedule(); 187 + 188 + finish_wait(&eb->lock_wq, &wait); 189 + } 190 + return 0; 191 + } 192 + 193 + /* 194 + * Very quick trylock, this does not spin or schedule. It returns 195 + * 1 with the spinlock held if it was able to take the lock, or it 196 + * returns zero if it was unable to take the lock. 197 + * 198 + * After this call, scheduling is not safe without first calling 199 + * btrfs_set_lock_blocking() 200 + */ 54 201 int btrfs_try_tree_lock(struct extent_buffer *eb) 55 202 { 56 - return mutex_trylock(&eb->mutex); 203 + if (spin_trylock(&eb->lock)) { 204 + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 205 + /* 206 + * we've got the spinlock, but the real owner is 207 + * blocking. 
Drop the spinlock and return failure 208 + */ 209 + spin_unlock(&eb->lock); 210 + return 0; 211 + } 212 + return 1; 213 + } 214 + /* someone else has the spinlock giveup */ 215 + return 0; 57 216 } 58 217 59 218 int btrfs_tree_unlock(struct extent_buffer *eb) 60 219 { 61 - mutex_unlock(&eb->mutex); 220 + /* 221 + * if we were a blocking owner, we don't have the spinlock held 222 + * just clear the bit and look for waiters 223 + */ 224 + if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 225 + smp_mb__after_clear_bit(); 226 + else 227 + spin_unlock(&eb->lock); 228 + 229 + if (waitqueue_active(&eb->lock_wq)) 230 + wake_up(&eb->lock_wq); 62 231 return 0; 63 232 } 64 233 65 234 int btrfs_tree_locked(struct extent_buffer *eb) 66 235 { 67 - return mutex_is_locked(&eb->mutex); 236 + return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || 237 + spin_is_locked(&eb->lock); 68 238 } 69 239 70 240 /* ··· 245 75 { 246 76 int i; 247 77 struct extent_buffer *eb; 78 + 248 79 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { 249 80 eb = path->nodes[i]; 250 81 if (!eb) 251 82 break; 252 83 smp_mb(); 253 - if (!list_empty(&eb->mutex.wait_list)) 84 + if (spin_is_contended(&eb->lock) || 85 + waitqueue_active(&eb->lock_wq)) 254 86 return 1; 255 87 } 256 88 return 0;
+6
fs/btrfs/locking.h
··· 22 22 int btrfs_tree_lock(struct extent_buffer *eb); 23 23 int btrfs_tree_unlock(struct extent_buffer *eb); 24 24 int btrfs_tree_locked(struct extent_buffer *eb); 25 + 25 26 int btrfs_try_tree_lock(struct extent_buffer *eb); 27 + int btrfs_try_spin_lock(struct extent_buffer *eb); 28 + 26 29 int btrfs_path_lock_waiting(struct btrfs_path *path, int level); 30 + 31 + void btrfs_set_lock_blocking(struct extent_buffer *eb); 32 + void btrfs_clear_lock_blocking(struct extent_buffer *eb); 27 33 #endif
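Taken together, the comments in locking.c above describe a two-mode lock: btrfs_tree_lock() returns with the extent buffer's spinlock held, btrfs_set_lock_blocking() trades the spinlock for the BLOCKING bit so the holder may safely schedule, and btrfs_clear_lock_blocking() switches back to the spinlock. The sketch below only illustrates that calling convention; example_walk_block() is a made-up helper for this illustration, not a function from the patch.

/* illustrative sketch of the blocking-lock calling convention */
#include "extent_io.h"
#include "locking.h"

static void example_walk_block(struct extent_buffer *eb)
{
	btrfs_tree_lock(eb);		/* returns with eb->lock (spinlock) held */

	/*
	 * about to do work that may sleep: drop the spinlock but keep
	 * logical ownership of the lock by setting the blocking bit
	 */
	btrfs_set_lock_blocking(eb);

	/* ... potentially blocking work on eb ... */

	/* switch back to the cheap spinning mode for short critical sections */
	btrfs_clear_lock_blocking(eb);

	btrfs_tree_unlock(eb);		/* works from either mode, wakes waiters */
}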
+1 -3
fs/btrfs/ordered-data.c
··· 613 613 struct btrfs_sector_sum *sector_sums; 614 614 struct btrfs_ordered_extent *ordered; 615 615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 616 - struct list_head *cur; 617 616 unsigned long num_sectors; 618 617 unsigned long i; 619 618 u32 sectorsize = BTRFS_I(inode)->root->sectorsize; ··· 623 624 return 1; 624 625 625 626 mutex_lock(&tree->mutex); 626 - list_for_each_prev(cur, &ordered->list) { 627 - ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list); 627 + list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 628 628 if (disk_bytenr >= ordered_sum->bytenr) { 629 629 num_sectors = ordered_sum->len / sectorsize; 630 630 sector_sums = ordered_sum->sums;
+1
fs/btrfs/ref-cache.c
··· 17 17 */ 18 18 19 19 #include <linux/sched.h> 20 + #include <linux/sort.h> 20 21 #include "ctree.h" 21 22 #include "ref-cache.h" 22 23 #include "transaction.h"
-1
fs/btrfs/ref-cache.h
··· 73 73 int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, 74 74 int shared); 75 75 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); 76 - 77 76 #endif
+3 -3
fs/btrfs/super.c
··· 37 37 #include <linux/ctype.h> 38 38 #include <linux/namei.h> 39 39 #include <linux/miscdevice.h> 40 - #include <linux/version.h> 41 40 #include <linux/magic.h> 42 41 #include "compat.h" 43 42 #include "ctree.h" ··· 582 583 struct btrfs_ioctl_vol_args *vol; 583 584 struct btrfs_fs_devices *fs_devices; 584 585 int ret = -ENOTTY; 585 - int len; 586 586 587 587 if (!capable(CAP_SYS_ADMIN)) 588 588 return -EPERM; 589 589 590 590 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 591 + if (!vol) 592 + return -ENOMEM; 593 + 591 594 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { 592 595 ret = -EFAULT; 593 596 goto out; 594 597 } 595 - len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); 596 598 597 599 switch (cmd) { 598 600 case BTRFS_IOC_SCAN_DEV:
+1 -3
fs/btrfs/transaction.c
··· 852 852 { 853 853 struct btrfs_pending_snapshot *pending; 854 854 struct list_head *head = &trans->transaction->pending_snapshots; 855 - struct list_head *cur; 856 855 int ret; 857 856 858 - list_for_each(cur, head) { 859 - pending = list_entry(cur, struct btrfs_pending_snapshot, list); 857 + list_for_each_entry(pending, head, list) { 860 858 ret = create_pending_snapshot(trans, fs_info, pending); 861 859 BUG_ON(ret); 862 860 }
+1
fs/btrfs/tree-defrag.c
··· 74 74 u32 nritems; 75 75 76 76 root_node = btrfs_lock_root_node(root); 77 + btrfs_set_lock_blocking(root_node); 77 78 nritems = btrfs_header_nritems(root_node); 78 79 root->defrag_max.objectid = 0; 79 80 /* from above we know this is not a leaf */
+170 -184
fs/btrfs/tree-log.c
··· 78 78 */ 79 79 80 80 /* 81 - * btrfs_add_log_tree adds a new per-subvolume log tree into the 82 - * tree of log tree roots. This must be called with a tree log transaction 83 - * running (see start_log_trans). 84 - */ 85 - static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86 - struct btrfs_root *root) 87 - { 88 - struct btrfs_key key; 89 - struct btrfs_root_item root_item; 90 - struct btrfs_inode_item *inode_item; 91 - struct extent_buffer *leaf; 92 - struct btrfs_root *new_root = root; 93 - int ret; 94 - u64 objectid = root->root_key.objectid; 95 - 96 - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 97 - BTRFS_TREE_LOG_OBJECTID, 98 - trans->transid, 0, 0, 0); 99 - if (IS_ERR(leaf)) { 100 - ret = PTR_ERR(leaf); 101 - return ret; 102 - } 103 - 104 - btrfs_set_header_nritems(leaf, 0); 105 - btrfs_set_header_level(leaf, 0); 106 - btrfs_set_header_bytenr(leaf, leaf->start); 107 - btrfs_set_header_generation(leaf, trans->transid); 108 - btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); 109 - 110 - write_extent_buffer(leaf, root->fs_info->fsid, 111 - (unsigned long)btrfs_header_fsid(leaf), 112 - BTRFS_FSID_SIZE); 113 - btrfs_mark_buffer_dirty(leaf); 114 - 115 - inode_item = &root_item.inode; 116 - memset(inode_item, 0, sizeof(*inode_item)); 117 - inode_item->generation = cpu_to_le64(1); 118 - inode_item->size = cpu_to_le64(3); 119 - inode_item->nlink = cpu_to_le32(1); 120 - inode_item->nbytes = cpu_to_le64(root->leafsize); 121 - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 122 - 123 - btrfs_set_root_bytenr(&root_item, leaf->start); 124 - btrfs_set_root_generation(&root_item, trans->transid); 125 - btrfs_set_root_level(&root_item, 0); 126 - btrfs_set_root_refs(&root_item, 0); 127 - btrfs_set_root_used(&root_item, 0); 128 - 129 - memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); 130 - root_item.drop_level = 0; 131 - 132 - btrfs_tree_unlock(leaf); 133 - free_extent_buffer(leaf); 134 - leaf = NULL; 135 - 136 - btrfs_set_root_dirid(&root_item, 0); 137 - 138 - key.objectid = BTRFS_TREE_LOG_OBJECTID; 139 - key.offset = objectid; 140 - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 141 - ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, 142 - &root_item); 143 - if (ret) 144 - goto fail; 145 - 146 - new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, 147 - &key); 148 - BUG_ON(!new_root); 149 - 150 - WARN_ON(root->log_root); 151 - root->log_root = new_root; 152 - 153 - /* 154 - * log trees do not get reference counted because they go away 155 - * before a real commit is actually done. They do store pointers 156 - * to file data extents, and those reference counts still get 157 - * updated (along with back refs to the log tree). 158 - */ 159 - new_root->ref_cows = 0; 160 - new_root->last_trans = trans->transid; 161 - 162 - /* 163 - * we need to make sure the root block for this new tree 164 - * is marked as dirty in the dirty_log_pages tree. This 165 - * is how it gets flushed down to disk at tree log commit time. 
166 - * 167 - * the tree logging mutex keeps others from coming in and changing 168 - * the new_root->node, so we can safely access it here 169 - */ 170 - set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start, 171 - new_root->node->start + new_root->node->len - 1, 172 - GFP_NOFS); 173 - 174 - fail: 175 - return ret; 176 - } 177 - 178 - /* 179 81 * start a sub transaction and setup the log tree 180 82 * this increments the log tree writer count to make the people 181 83 * syncing the tree wait for us to finish ··· 86 184 struct btrfs_root *root) 87 185 { 88 186 int ret; 187 + 188 + mutex_lock(&root->log_mutex); 189 + if (root->log_root) { 190 + root->log_batch++; 191 + atomic_inc(&root->log_writers); 192 + mutex_unlock(&root->log_mutex); 193 + return 0; 194 + } 89 195 mutex_lock(&root->fs_info->tree_log_mutex); 90 196 if (!root->fs_info->log_root_tree) { 91 197 ret = btrfs_init_log_root_tree(trans, root->fs_info); ··· 103 193 ret = btrfs_add_log_tree(trans, root); 104 194 BUG_ON(ret); 105 195 } 106 - atomic_inc(&root->fs_info->tree_log_writers); 107 - root->fs_info->tree_log_batch++; 108 196 mutex_unlock(&root->fs_info->tree_log_mutex); 197 + root->log_batch++; 198 + atomic_inc(&root->log_writers); 199 + mutex_unlock(&root->log_mutex); 109 200 return 0; 110 201 } 111 202 ··· 123 212 if (!root->log_root) 124 213 return -ENOENT; 125 214 126 - mutex_lock(&root->fs_info->tree_log_mutex); 215 + mutex_lock(&root->log_mutex); 127 216 if (root->log_root) { 128 217 ret = 0; 129 - atomic_inc(&root->fs_info->tree_log_writers); 130 - root->fs_info->tree_log_batch++; 218 + atomic_inc(&root->log_writers); 131 219 } 132 - mutex_unlock(&root->fs_info->tree_log_mutex); 220 + mutex_unlock(&root->log_mutex); 133 221 return ret; 134 222 } 135 223 ··· 138 228 */ 139 229 static int end_log_trans(struct btrfs_root *root) 140 230 { 141 - atomic_dec(&root->fs_info->tree_log_writers); 142 - smp_mb(); 143 - if (waitqueue_active(&root->fs_info->tree_log_wait)) 144 - wake_up(&root->fs_info->tree_log_wait); 231 + if (atomic_dec_and_test(&root->log_writers)) { 232 + smp_mb(); 233 + if (waitqueue_active(&root->log_writer_wait)) 234 + wake_up(&root->log_writer_wait); 235 + } 145 236 return 0; 146 237 } 147 238 ··· 1615 1704 1616 1705 btrfs_tree_lock(next); 1617 1706 clean_tree_block(trans, root, next); 1707 + btrfs_set_lock_blocking(next); 1618 1708 btrfs_wait_tree_block_writeback(next); 1619 1709 btrfs_tree_unlock(next); 1620 1710 ··· 1662 1750 next = path->nodes[*level]; 1663 1751 btrfs_tree_lock(next); 1664 1752 clean_tree_block(trans, root, next); 1753 + btrfs_set_lock_blocking(next); 1665 1754 btrfs_wait_tree_block_writeback(next); 1666 1755 btrfs_tree_unlock(next); 1667 1756 ··· 1720 1807 1721 1808 btrfs_tree_lock(next); 1722 1809 clean_tree_block(trans, root, next); 1810 + btrfs_set_lock_blocking(next); 1723 1811 btrfs_wait_tree_block_writeback(next); 1724 1812 btrfs_tree_unlock(next); 1725 1813 ··· 1793 1879 1794 1880 btrfs_tree_lock(next); 1795 1881 clean_tree_block(trans, log, next); 1882 + btrfs_set_lock_blocking(next); 1796 1883 btrfs_wait_tree_block_writeback(next); 1797 1884 btrfs_tree_unlock(next); 1798 1885 ··· 1817 1902 } 1818 1903 } 1819 1904 btrfs_free_path(path); 1820 - if (wc->free) 1821 - free_extent_buffer(log->node); 1822 1905 return ret; 1823 1906 } 1824 1907 1825 - static int wait_log_commit(struct btrfs_root *log) 1908 + /* 1909 + * helper function to update the item for a given subvolumes log root 1910 + * in the tree of log roots 1911 + */ 1912 + static int update_log_root(struct 
btrfs_trans_handle *trans, 1913 + struct btrfs_root *log) 1914 + { 1915 + int ret; 1916 + 1917 + if (log->log_transid == 1) { 1918 + /* insert root item on the first sync */ 1919 + ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 1920 + &log->root_key, &log->root_item); 1921 + } else { 1922 + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 1923 + &log->root_key, &log->root_item); 1924 + } 1925 + return ret; 1926 + } 1927 + 1928 + static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1826 1929 { 1827 1930 DEFINE_WAIT(wait); 1828 - u64 transid = log->fs_info->tree_log_transid; 1931 + int index = transid % 2; 1829 1932 1933 + /* 1934 + * we only allow two pending log transactions at a time, 1935 + * so we know that if ours is more than 2 older than the 1936 + * current transaction, we're done 1937 + */ 1830 1938 do { 1831 - prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1832 - TASK_UNINTERRUPTIBLE); 1833 - mutex_unlock(&log->fs_info->tree_log_mutex); 1834 - if (atomic_read(&log->fs_info->tree_log_commit)) 1939 + prepare_to_wait(&root->log_commit_wait[index], 1940 + &wait, TASK_UNINTERRUPTIBLE); 1941 + mutex_unlock(&root->log_mutex); 1942 + if (root->log_transid < transid + 2 && 1943 + atomic_read(&root->log_commit[index])) 1835 1944 schedule(); 1836 - finish_wait(&log->fs_info->tree_log_wait, &wait); 1837 - mutex_lock(&log->fs_info->tree_log_mutex); 1838 - } while (transid == log->fs_info->tree_log_transid && 1839 - atomic_read(&log->fs_info->tree_log_commit)); 1945 + finish_wait(&root->log_commit_wait[index], &wait); 1946 + mutex_lock(&root->log_mutex); 1947 + } while (root->log_transid < transid + 2 && 1948 + atomic_read(&root->log_commit[index])); 1949 + return 0; 1950 + } 1951 + 1952 + static int wait_for_writer(struct btrfs_root *root) 1953 + { 1954 + DEFINE_WAIT(wait); 1955 + while (atomic_read(&root->log_writers)) { 1956 + prepare_to_wait(&root->log_writer_wait, 1957 + &wait, TASK_UNINTERRUPTIBLE); 1958 + mutex_unlock(&root->log_mutex); 1959 + if (atomic_read(&root->log_writers)) 1960 + schedule(); 1961 + mutex_lock(&root->log_mutex); 1962 + finish_wait(&root->log_writer_wait, &wait); 1963 + } 1840 1964 return 0; 1841 1965 } 1842 1966 ··· 1887 1933 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1888 1934 struct btrfs_root *root) 1889 1935 { 1936 + int index1; 1937 + int index2; 1890 1938 int ret; 1891 - unsigned long batch; 1892 1939 struct btrfs_root *log = root->log_root; 1940 + struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1893 1941 1894 - mutex_lock(&log->fs_info->tree_log_mutex); 1895 - if (atomic_read(&log->fs_info->tree_log_commit)) { 1896 - wait_log_commit(log); 1897 - goto out; 1942 + mutex_lock(&root->log_mutex); 1943 + index1 = root->log_transid % 2; 1944 + if (atomic_read(&root->log_commit[index1])) { 1945 + wait_log_commit(root, root->log_transid); 1946 + mutex_unlock(&root->log_mutex); 1947 + return 0; 1898 1948 } 1899 - atomic_set(&log->fs_info->tree_log_commit, 1); 1949 + atomic_set(&root->log_commit[index1], 1); 1950 + 1951 + /* wait for previous tree log sync to complete */ 1952 + if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 1953 + wait_log_commit(root, root->log_transid - 1); 1900 1954 1901 1955 while (1) { 1902 - batch = log->fs_info->tree_log_batch; 1903 - mutex_unlock(&log->fs_info->tree_log_mutex); 1956 + unsigned long batch = root->log_batch; 1957 + mutex_unlock(&root->log_mutex); 1904 1958 schedule_timeout_uninterruptible(1); 1905 - mutex_lock(&log->fs_info->tree_log_mutex); 
1906 - 1907 - while (atomic_read(&log->fs_info->tree_log_writers)) { 1908 - DEFINE_WAIT(wait); 1909 - prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1910 - TASK_UNINTERRUPTIBLE); 1911 - mutex_unlock(&log->fs_info->tree_log_mutex); 1912 - if (atomic_read(&log->fs_info->tree_log_writers)) 1913 - schedule(); 1914 - mutex_lock(&log->fs_info->tree_log_mutex); 1915 - finish_wait(&log->fs_info->tree_log_wait, &wait); 1916 - } 1917 - if (batch == log->fs_info->tree_log_batch) 1959 + mutex_lock(&root->log_mutex); 1960 + wait_for_writer(root); 1961 + if (batch == root->log_batch) 1918 1962 break; 1919 1963 } 1920 1964 1921 1965 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1922 1966 BUG_ON(ret); 1923 - ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1924 - &root->fs_info->log_root_tree->dirty_log_pages); 1967 + 1968 + btrfs_set_root_bytenr(&log->root_item, log->node->start); 1969 + btrfs_set_root_generation(&log->root_item, trans->transid); 1970 + btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); 1971 + 1972 + root->log_batch = 0; 1973 + root->log_transid++; 1974 + log->log_transid = root->log_transid; 1975 + smp_mb(); 1976 + /* 1977 + * log tree has been flushed to disk, new modifications of 1978 + * the log will be written to new positions. so it's safe to 1979 + * allow log writers to go in. 1980 + */ 1981 + mutex_unlock(&root->log_mutex); 1982 + 1983 + mutex_lock(&log_root_tree->log_mutex); 1984 + log_root_tree->log_batch++; 1985 + atomic_inc(&log_root_tree->log_writers); 1986 + mutex_unlock(&log_root_tree->log_mutex); 1987 + 1988 + ret = update_log_root(trans, log); 1989 + BUG_ON(ret); 1990 + 1991 + mutex_lock(&log_root_tree->log_mutex); 1992 + if (atomic_dec_and_test(&log_root_tree->log_writers)) { 1993 + smp_mb(); 1994 + if (waitqueue_active(&log_root_tree->log_writer_wait)) 1995 + wake_up(&log_root_tree->log_writer_wait); 1996 + } 1997 + 1998 + index2 = log_root_tree->log_transid % 2; 1999 + if (atomic_read(&log_root_tree->log_commit[index2])) { 2000 + wait_log_commit(log_root_tree, log_root_tree->log_transid); 2001 + mutex_unlock(&log_root_tree->log_mutex); 2002 + goto out; 2003 + } 2004 + atomic_set(&log_root_tree->log_commit[index2], 1); 2005 + 2006 + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2007 + wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2008 + 2009 + wait_for_writer(log_root_tree); 2010 + 2011 + ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2012 + &log_root_tree->dirty_log_pages); 1925 2013 BUG_ON(ret); 1926 2014 1927 2015 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1928 - log->fs_info->log_root_tree->node->start); 2016 + log_root_tree->node->start); 1929 2017 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1930 - btrfs_header_level(log->fs_info->log_root_tree->node)); 2018 + btrfs_header_level(log_root_tree->node)); 1931 2019 1932 - write_ctree_super(trans, log->fs_info->tree_root, 2); 1933 - log->fs_info->tree_log_transid++; 1934 - log->fs_info->tree_log_batch = 0; 1935 - atomic_set(&log->fs_info->tree_log_commit, 0); 2020 + log_root_tree->log_batch = 0; 2021 + log_root_tree->log_transid++; 1936 2022 smp_mb(); 1937 - if (waitqueue_active(&log->fs_info->tree_log_wait)) 1938 - wake_up(&log->fs_info->tree_log_wait); 2023 + 2024 + mutex_unlock(&log_root_tree->log_mutex); 2025 + 2026 + /* 2027 + * nobody else is going to jump in and write the the ctree 2028 + * super here because the log_commit atomic below is protecting 2029 + * 
us. We must be called with a transaction handle pinning 2030 + * the running transaction open, so a full commit can't hop 2031 + * in and cause problems either. 2032 + */ 2033 + write_ctree_super(trans, root->fs_info->tree_root, 2); 2034 + 2035 + atomic_set(&log_root_tree->log_commit[index2], 0); 2036 + smp_mb(); 2037 + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2038 + wake_up(&log_root_tree->log_commit_wait[index2]); 1939 2039 out: 1940 - mutex_unlock(&log->fs_info->tree_log_mutex); 2040 + atomic_set(&root->log_commit[index1], 0); 2041 + smp_mb(); 2042 + if (waitqueue_active(&root->log_commit_wait[index1])) 2043 + wake_up(&root->log_commit_wait[index1]); 1941 2044 return 0; 1942 2045 } 1943 2046 ··· 2030 2019 start, end, GFP_NOFS); 2031 2020 } 2032 2021 2033 - log = root->log_root; 2034 - ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2035 - &log->root_key); 2036 - BUG_ON(ret); 2022 + if (log->log_transid > 0) { 2023 + ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2024 + &log->root_key); 2025 + BUG_ON(ret); 2026 + } 2037 2027 root->log_root = NULL; 2038 - kfree(root->log_root); 2028 + free_extent_buffer(log->node); 2029 + kfree(log); 2039 2030 return 0; 2040 - } 2041 - 2042 - /* 2043 - * helper function to update the item for a given subvolumes log root 2044 - * in the tree of log roots 2045 - */ 2046 - static int update_log_root(struct btrfs_trans_handle *trans, 2047 - struct btrfs_root *log) 2048 - { 2049 - u64 bytenr = btrfs_root_bytenr(&log->root_item); 2050 - int ret; 2051 - 2052 - if (log->node->start == bytenr) 2053 - return 0; 2054 - 2055 - btrfs_set_root_bytenr(&log->root_item, log->node->start); 2056 - btrfs_set_root_generation(&log->root_item, trans->transid); 2057 - btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); 2058 - ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2059 - &log->root_key, &log->root_item); 2060 - BUG_ON(ret); 2061 - return ret; 2062 2031 } 2063 2032 2064 2033 /* ··· 2702 2711 2703 2712 btrfs_free_path(path); 2704 2713 btrfs_free_path(dst_path); 2705 - 2706 - mutex_lock(&root->fs_info->tree_log_mutex); 2707 - ret = update_log_root(trans, log); 2708 - BUG_ON(ret); 2709 - mutex_unlock(&root->fs_info->tree_log_mutex); 2710 2714 out: 2711 2715 return 0; 2712 2716 }
+19 -30
fs/btrfs/volumes.c
··· 20 20 #include <linux/buffer_head.h> 21 21 #include <linux/blkdev.h> 22 22 #include <linux/random.h> 23 - #include <linux/version.h> 24 23 #include <asm/div64.h> 25 24 #include "compat.h" 26 25 #include "ctree.h" ··· 103 104 u64 devid, u8 *uuid) 104 105 { 105 106 struct btrfs_device *dev; 106 - struct list_head *cur; 107 107 108 - list_for_each(cur, head) { 109 - dev = list_entry(cur, struct btrfs_device, dev_list); 108 + list_for_each_entry(dev, head, dev_list) { 110 109 if (dev->devid == devid && 111 110 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 112 111 return dev; ··· 115 118 116 119 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 117 120 { 118 - struct list_head *cur; 119 121 struct btrfs_fs_devices *fs_devices; 120 122 121 - list_for_each(cur, &fs_uuids) { 122 - fs_devices = list_entry(cur, struct btrfs_fs_devices, list); 123 + list_for_each_entry(fs_devices, &fs_uuids, list) { 123 124 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 124 125 return fs_devices; 125 126 } ··· 154 159 loop: 155 160 spin_lock(&device->io_lock); 156 161 162 + loop_lock: 157 163 /* take all the bios off the list at once and process them 158 164 * later on (without the lock held). But, remember the 159 165 * tail and other pointers so the bios can be properly reinserted ··· 204 208 * is now congested. Back off and let other work structs 205 209 * run instead 206 210 */ 207 - if (pending && bdi_write_congested(bdi) && 211 + if (pending && bdi_write_congested(bdi) && num_run > 16 && 208 212 fs_info->fs_devices->open_devices > 1) { 209 213 struct bio *old_head; 210 214 ··· 216 220 tail->bi_next = old_head; 217 221 else 218 222 device->pending_bio_tail = tail; 219 - device->running_pending = 0; 223 + 224 + device->running_pending = 1; 220 225 221 226 spin_unlock(&device->io_lock); 222 227 btrfs_requeue_work(&device->work); ··· 226 229 } 227 230 if (again) 228 231 goto loop; 232 + 233 + spin_lock(&device->io_lock); 234 + if (device->pending_bios) 235 + goto loop_lock; 236 + spin_unlock(&device->io_lock); 229 237 done: 230 238 return 0; 231 239 } ··· 347 345 348 346 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 349 347 { 350 - struct list_head *tmp; 351 - struct list_head *cur; 352 - struct btrfs_device *device; 348 + struct btrfs_device *device, *next; 353 349 354 350 mutex_lock(&uuid_mutex); 355 351 again: 356 - list_for_each_safe(cur, tmp, &fs_devices->devices) { 357 - device = list_entry(cur, struct btrfs_device, dev_list); 352 + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 358 353 if (device->in_fs_metadata) 359 354 continue; 360 355 ··· 382 383 383 384 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 384 385 { 385 - struct list_head *cur; 386 386 struct btrfs_device *device; 387 387 388 388 if (--fs_devices->opened > 0) 389 389 return 0; 390 390 391 - list_for_each(cur, &fs_devices->devices) { 392 - device = list_entry(cur, struct btrfs_device, dev_list); 391 + list_for_each_entry(device, &fs_devices->devices, dev_list) { 393 392 if (device->bdev) { 394 393 close_bdev_exclusive(device->bdev, device->mode); 395 394 fs_devices->open_devices--; ··· 436 439 { 437 440 struct block_device *bdev; 438 441 struct list_head *head = &fs_devices->devices; 439 - struct list_head *cur; 440 442 struct btrfs_device *device; 441 443 struct block_device *latest_bdev = NULL; 442 444 struct buffer_head *bh; ··· 446 450 int seeding = 1; 447 451 int ret = 0; 448 452 449 - list_for_each(cur, head) { 450 - device = 
list_entry(cur, struct btrfs_device, dev_list); 453 + list_for_each_entry(device, head, dev_list) { 451 454 if (device->bdev) 452 455 continue; 453 456 if (!device->name) ··· 573 578 *(unsigned long long *)disk_super->fsid, 574 579 *(unsigned long long *)(disk_super->fsid + 8)); 575 580 } 576 - printk(KERN_INFO "devid %llu transid %llu %s\n", 581 + printk(KERN_CONT "devid %llu transid %llu %s\n", 577 582 (unsigned long long)devid, (unsigned long long)transid, path); 578 583 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 579 584 ··· 1012 1017 } 1013 1018 1014 1019 if (strcmp(device_path, "missing") == 0) { 1015 - struct list_head *cur; 1016 1020 struct list_head *devices; 1017 1021 struct btrfs_device *tmp; 1018 1022 1019 1023 device = NULL; 1020 1024 devices = &root->fs_info->fs_devices->devices; 1021 - list_for_each(cur, devices) { 1022 - tmp = list_entry(cur, struct btrfs_device, dev_list); 1025 + list_for_each_entry(tmp, devices, dev_list) { 1023 1026 if (tmp->in_fs_metadata && !tmp->bdev) { 1024 1027 device = tmp; 1025 1028 break; ··· 1273 1280 struct btrfs_trans_handle *trans; 1274 1281 struct btrfs_device *device; 1275 1282 struct block_device *bdev; 1276 - struct list_head *cur; 1277 1283 struct list_head *devices; 1278 1284 struct super_block *sb = root->fs_info->sb; 1279 1285 u64 total_bytes; ··· 1296 1304 mutex_lock(&root->fs_info->volume_mutex); 1297 1305 1298 1306 devices = &root->fs_info->fs_devices->devices; 1299 - list_for_each(cur, devices) { 1300 - device = list_entry(cur, struct btrfs_device, dev_list); 1307 + list_for_each_entry(device, devices, dev_list) { 1301 1308 if (device->bdev == bdev) { 1302 1309 ret = -EEXIST; 1303 1310 goto error; ··· 1695 1704 int btrfs_balance(struct btrfs_root *dev_root) 1696 1705 { 1697 1706 int ret; 1698 - struct list_head *cur; 1699 1707 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1700 1708 struct btrfs_device *device; 1701 1709 u64 old_size; ··· 1713 1723 dev_root = dev_root->fs_info->dev_root; 1714 1724 1715 1725 /* step one make some room on all the devices */ 1716 - list_for_each(cur, devices) { 1717 - device = list_entry(cur, struct btrfs_device, dev_list); 1726 + list_for_each_entry(device, devices, dev_list) { 1718 1727 old_size = device->total_bytes; 1719 1728 size_to_free = div_factor(old_size, 1); 1720 1729 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+45 -3
fs/btrfs/xattr.c
··· 21 21 #include <linux/slab.h> 22 22 #include <linux/rwsem.h> 23 23 #include <linux/xattr.h> 24 + #include <linux/security.h> 24 25 #include "ctree.h" 25 26 #include "btrfs_inode.h" 26 27 #include "transaction.h" ··· 46 45 /* lookup the xattr by name */ 47 46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, 48 47 strlen(name), 0); 49 - if (!di || IS_ERR(di)) { 48 + if (!di) { 50 49 ret = -ENODATA; 50 + goto out; 51 + } else if (IS_ERR(di)) { 52 + ret = PTR_ERR(di); 51 53 goto out; 52 54 } 53 55 ··· 66 62 ret = -ERANGE; 67 63 goto out; 68 64 } 65 + 66 + /* 67 + * The way things are packed into the leaf is like this 68 + * |struct btrfs_dir_item|name|data| 69 + * where name is the xattr name, so security.foo, and data is the 70 + * content of the xattr. data_ptr points to the location in memory 71 + * where the data starts in the in memory leaf 72 + */ 69 73 data_ptr = (unsigned long)((char *)(di + 1) + 70 74 btrfs_dir_name_len(leaf, di)); 71 75 read_extent_buffer(leaf, buffer, data_ptr, ··· 98 86 if (!path) 99 87 return -ENOMEM; 100 88 101 - trans = btrfs_start_transaction(root, 1); 89 + trans = btrfs_join_transaction(root, 1); 102 90 btrfs_set_trans_block_group(trans, inode); 103 91 104 92 /* first lets see if we already have this xattr */ ··· 188 176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 189 177 if (ret < 0) 190 178 goto err; 191 - ret = 0; 192 179 advance = 0; 193 180 while (1) { 194 181 leaf = path->nodes[0]; ··· 330 319 if (!btrfs_is_valid_xattr(name)) 331 320 return -EOPNOTSUPP; 332 321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 322 + } 323 + 324 + int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) 325 + { 326 + int err; 327 + size_t len; 328 + void *value; 329 + char *suffix; 330 + char *name; 331 + 332 + err = security_inode_init_security(inode, dir, &suffix, &value, &len); 333 + if (err) { 334 + if (err == -EOPNOTSUPP) 335 + return 0; 336 + return err; 337 + } 338 + 339 + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, 340 + GFP_NOFS); 341 + if (!name) { 342 + err = -ENOMEM; 343 + } else { 344 + strcpy(name, XATTR_SECURITY_PREFIX); 345 + strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); 346 + err = __btrfs_setxattr(inode, name, value, len, 0); 347 + kfree(name); 348 + } 349 + 350 + kfree(suffix); 351 + kfree(value); 352 + return err; 333 353 }
+2
fs/btrfs/xattr.h
··· 36 36 const void *value, size_t size, int flags); 37 37 extern int btrfs_removexattr(struct dentry *dentry, const char *name); 38 38 39 + extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir); 40 + 39 41 #endif /* __XATTR__ */
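With btrfs_xattr_security_init() in place, a freshly created inode can carry a security.* attribute when an active LSM such as SELinux supplies one. As a hedged userspace sketch, not part of the patch, the program below lists a file's xattr names with the standard listxattr()/getxattr() calls; the fixed 1024-byte buffers are arbitrary and larger names or values are not handled.

/* illustrative userspace sketch: list xattrs and their value sizes */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	char list[1024];
	char value[1024];
	ssize_t len, vlen;
	char *name;

	if (argc != 2)
		return 1;

	len = listxattr(argv[1], list, sizeof(list));
	if (len < 0) {
		perror("listxattr");
		return 1;
	}
	/* the name list is a run of NUL-terminated strings */
	for (name = list; name < list + len; name += strlen(name) + 1) {
		vlen = getxattr(argv[1], name, value, sizeof(value));
		if (vlen < 0)
			perror(name);
		else
			printf("%s: %zd bytes\n", name, vlen);
	}
	return 0;
}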