// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>

#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do { \
	if (shrinker_counter) \
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
} while (0)

const char * const bch2_btree_node_flags[] = {
	"typebit",
	"typebit",
	"typebit",
#define x(f)	[BTREE_NODE_##f] = #f,
	BTREE_FLAGS()
#undef x
	NULL
};

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.nr_reserve = reserve;
}

static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);

	size_t can_free = list->nr;
	if (!list->idx)
		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
	return can_free;
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));

	if (b->c.lock.readers)
		list_add(&b->list, &bc->freed_pcpu);
	else
		list_add(&b->list, &bc->freed_nonpcpu);
}

static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(!b->data);

	bc->nr_freeable++;
	list_add(&b->list, &bc->freeable);
}

void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	__bch2_btree_node_to_freelist(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));

	/*
	 * This should really be done in slub/vmalloc, but we're using the
	 * kmalloc_large() path, so we're working around a slub bug by doing
	 * this here:
	 */
	if (b->data)
		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
	if (b->aux_data)
		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	btree_node_to_freedlist(bc, b);
}

static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(list_empty(&b->list));
	list_del_init(&b->list);
	--bc->nr_freeable;
	__btree_node_data_free(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset		= offsetof(struct btree, hash),
	.key_offset		= offsetof(struct btree, hash_val),
	.key_len		= sizeof(u64),
	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
	.automatic_shrinking	= true,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);

	__bch2_btree_node_to_freelist(bc, b);
	return b;
}

static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return ((mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}

void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
		set_btree_node_pinned(b);
		list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}

void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *n;

	mutex_lock(&bc->lock);
	c->btree_cache.pinned_nodes_mask[0] = 0;
	c->btree_cache.pinned_nodes_mask[1] = 0;

	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
		clear_btree_node_pinned(b);
		list_move(&b->list, &bc->live[0].list);
		bc->live[0].nr++;
		bc->live[1].nr--;
	}

	mutex_unlock(&bc->lock);
}

/* Btree in memory cache - hash table */

void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	lockdep_assert_held(&bc->lock);

	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	if (b->c.btree_id < BTREE_ID_NR)
		--bc->nr_by_btree[b->c.btree_id];
	--bc->live[btree_node_pinned(b)].nr;
	list_del_init(&b->list);
}

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	__bch2_btree_node_hash_remove(bc, b);
	__bch2_btree_node_to_freelist(bc, b);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(b->hash_val);

	b->hash_val = btree_ptr_hash_val(&b->key);
	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
						bch_btree_cache_params);
	if (ret)
		return ret;

	if (b->c.btree_id < BTREE_ID_NR)
		bc->nr_by_btree[b->c.btree_id]++;

	bool p = __btree_node_pinned(bc, b);
	mod_bit(BTREE_NODE_pinned, &b->flags, p);

	list_add_tail(&b->list, &bc->live[p].list);
	bc->live[p].nr++;
	return 0;
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	b->c.level = level;
	b->c.btree_id = id;

	mutex_lock(&bc->lock);
	int ret = __bch2_btree_node_hash_insert(bc, b);
	mutex_unlock(&bc->lock);

	return ret;
}

void bch2_btree_node_update_key_early(struct btree_trans *trans,
				      enum btree_id btree, unsigned level,
				      struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		__bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_dirty(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			else if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			return -BCH_ERR_ENOMEM_btree_node_reclaim;
		}

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
		return -BCH_ERR_ENOMEM_btree_node_reclaim;
	}

	if (!six_trylock_write(&b->c.lock)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
		goto out_unlock_intent;
	}

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_read_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
			else if (btree_node_write_in_flight(b))
				BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
			goto out_unlock;
		}
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
		goto out_unlock;
	}
	if (btree_node_write_blocked(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
		goto out_unlock;
	}
	if (btree_node_will_make_reachable(b)) {
		BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
		goto out_unlock;
	}

	if (btree_node_dirty(b)) {
		if (!flush) {
			BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
			goto out_unlock;
		}
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
	return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true, false);
}

static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(list);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b, true)) {
			btree_node_data_free(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
			bc->nr_freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &list->list, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
			--touched;
		} else if (!btree_node_reclaim(c, b, true)) {
			__bch2_btree_node_hash_remove(bc, b);
			__btree_node_data_free(bc, b);

			freed++;
			bc->nr_freed++;

			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&list->list, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &list->list)
		list_move_tail(&list->list, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(list);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long flags;

	shrinker_free(bc->live[1].shrink);
	shrinker_free(bc->live[0].shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live[0].list);

	kvfree(c->verify_ondisk);

	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live[0].list);
	}

	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
		bch2_btree_node_hash_remove(bc, b);
	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
		bch2_btree_node_hash_remove(bc, b);

	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(bc, b);
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_long_read(&c->btree_cache.nr_dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		BUG_ON(bc->nr_by_btree[i]);
	BUG_ON(bc->live[0].nr);
	BUG_ON(bc->live[1].nr);
	BUG_ON(bc->nr_freeable);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->nr_reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live[0].list, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->live[0].shrink = shrink;
	shrink->count_objects = bch2_btree_cache_count;
	shrink->scan_objects = bch2_btree_cache_scan;
	shrink->seeks = 2;
	shrink->private_data = &bc->live[0];
	shrinker_register(shrink);

	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
	if (!shrink)
		goto err;
	bc->live[1].shrink = shrink;
	shrink->count_objects = bch2_btree_cache_count;
	shrink->scan_objects = bch2_btree_cache_scan;
	shrink->seeks = 8;
	shrink->private_data = &bc->live[1];
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		bc->live[i].idx = i;
		INIT_LIST_HEAD(&bc->live[i].list);
	}
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
		list_for_each_entry_reverse(b, &bc->live[i].list, list)
			if (!btree_node_reclaim(c, b, false))
				return b;

	while (1) {
		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
			list_for_each_entry_reverse(b, &bc->live[i].list, list)
				if (!btree_node_write_and_reclaim(c, b))
					return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();

	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b, false)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (b) {
		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
	} else {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
		mutex_lock(&bc->lock);
	}

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));

got_node:
	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2, false)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);

			list_del_init(&b2->list);
			--bc->nr_freeable;
			btree_node_to_freedlist(bc, b2);
			mutex_unlock(&bc->lock);

			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

got_mem:
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags = 0;
	b->written = 0;
	b->nsets = 0;
	b->sib_u64s[0] = 0;
	b->sib_u64s[1] = 0;
	b->whiteout_u64s = 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	int ret = bch2_trans_relock(trans);
	if (unlikely(ret)) {
		bch2_btree_node_to_freelist(c, b);
		return ERR_PTR(ret);
	}

	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		__bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
		}

		BUG_ON(!list_empty(&b->list));
		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans,
					BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		__bch2_btree_node_to_freelist(bc, b);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);
	six_unlock_write(&b->c.lock);

	if (path) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Unlock before doing IO: */
		six_unlock_intent(&b->c.lock);
		bch2_trans_unlock_noassert(trans);

		bch2_btree_node_read(trans, b, sync);

		int ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		if (!sync)
			return NULL;

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			b = NULL;
	} else {
		bch2_btree_node_read(trans, b, sync);
		if (lock_type == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr: ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_str(&buf, "\nptr: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\nheader: ");
	bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
	prt_str(&buf, "\nmin ");
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(trans, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(trans);
	return b;
}

int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	BUG_ON(path && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	struct btree *b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		return ret;
	if (b)
		six_unlock_read(&b->c.lock);
	return 0;
}

void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
		goto out;

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	bch2_btree_node_hash_remove(bc, b);
	btree_node_data_free(bc, b);
	mutex_unlock(&bc->lock);
out:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

const char *bch2_btree_id_str(enum btree_id btree)
{
	return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}

void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
{
	if (btree < BTREE_ID_NR)
		prt_str(out, __bch2_btree_ids[btree]);
	else
		prt_printf(out, "(unknown btree %u)", btree);
}

void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
{
	prt_str(out, "btree=");
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level=%u", level);
}

void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
			      enum btree_id btree, unsigned level, struct bkey_s_c k)
{
	bch2_btree_id_to_text(out, btree);
	prt_printf(out, " level %u/", level);
	struct btree_root *r = bch2_btree_id_root(c, btree);
	if (r)
		prt_printf(out, "%u", r->level);
	else
		prt_printf(out, "(unknown)");
	prt_printf(out, "\n ");

	bch2_bkey_val_to_text(out, c, k);
}

void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	__bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   " ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   " format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   " unpack fn len: %u\n"
		   " bytes used %zu/%zu (%zu%% full)\n"
		   " sib u64s: %u, %u (merge threshold %u)\n"
		   " nr packed keys %u\n"
		   " nr unpacked keys %u\n"
		   " floats %zu\n"
		   " failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}

static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
				 const char *label, size_t nr)
{
	prt_printf(out, "%s\t", label);
	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
	prt_printf(out, " (%zu)\n", nr);
}

static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
#define x(n) #n,
	BCH_BTREE_CACHE_NOT_FREED_REASONS()
#undef x
	NULL
};

void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);

	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
	prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
	prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
	prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
	prt_newline(out);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
		bch2_btree_id_to_text(out, i);
		prt_printf(out, "\t");
		prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
		prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
	}

	prt_newline(out);
	prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
	prt_printf(out, "not freed:\n");

	for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
		prt_printf(out, "  %s\t%llu\n",
			   bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}
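
/*
 * Illustrative sketch, not part of the original file: the comment above
 * bch2_btree_cache_cannibalize_unlock() describes the calling convention for
 * the cannibalize lock. A minimal caller, ignoring transaction restarts and
 * the -BCH_ERR_btree_cache_cannibalize_lock_blocked retry path, might look
 * roughly like this (example_get_new_node() is hypothetical):
 *
 *	static int example_get_new_node(struct btree_trans *trans)
 *	{
 *		struct closure cl;
 *		struct btree *b;
 *
 *		closure_init_stack(&cl);
 *
 *		// Serialize cannibalization: if the normal allocation paths
 *		// fail, bch2_btree_node_mem_alloc() may steal memory from
 *		// another cached node, and only one thread may do that at a
 *		// time or we'd deadlock.
 *		int ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
 *		if (ret)
 *			return ret;
 *
 *		b = bch2_btree_node_mem_alloc(trans, false);
 *
 *		// Release before blocking on btree node locks again, so a
 *		// waiter on the cannibalize lock can't deadlock against us.
 *		bch2_btree_cache_cannibalize_unlock(trans);
 *
 *		return PTR_ERR_OR_ZERO(b);
 *	}
 */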