Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'bcachefs-2025-01-29' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:

- second half of a fix for a bug that'd been causing oopses on
filesystems using snapshots with memory pressure (key cache fills for
snapshots btrees are tricky)

- build fix for strange compiler configurations that double stack frame
size

- "journal stuck timeout" now takes into account device latency: this
fixes some spurious warnings, and the main remaining source of SRCU
lock hold time warnings (I'm no longer seeing this in my CI, so any
users still seeing this should definitely ping me)

- fix for slow/hanging unmounts ("Improve journal pin flushing")

- some more tracepoint fixes/improvements, to chase down the "rebalance
isn't making progress" issues

* tag 'bcachefs-2025-01-29' of git://evilpiepirate.org/bcachefs:
bcachefs: Improve trace_move_extent_finish
bcachefs: Fix trace_copygc
bcachefs: Journal writes are now IOPRIO_CLASS_RT
bcachefs: Improve journal pin flushing
bcachefs: fix bch2_btree_node_flags
bcachefs: rebalance, copygc enabled are runtime opts
bcachefs: Improve decompression error messages
bcachefs: bset_blacklisted_journal_seq is now AUTOFIX
bcachefs: "Journal stuck" timeout now takes into account device latency
bcachefs: Reduce stack frame size of __bch2_str_hash_check_key()
bcachefs: Fix btree_trans_peek_key_cache()

+275 -159
+4 -1
fs/bcachefs/btree_cache.c
··· 24 24 } while (0) 25 25 26 26 const char * const bch2_btree_node_flags[] = { 27 - #define x(f) #f, 27 + "typebit", 28 + "typebit", 29 + "typebit", 30 + #define x(f) [BTREE_NODE_##f] = #f, 28 31 BTREE_FLAGS() 29 32 #undef x 30 33 NULL
+1 -2
fs/bcachefs/btree_iter.c
··· 2239 2239 if (unlikely(ret)) 2240 2240 return bkey_s_c_err(ret); 2241 2241 2242 - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); 2243 - 2244 2242 k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); 2245 2243 if (!k.k) 2246 2244 return k; ··· 2249 2251 2250 2252 iter->k = u; 2251 2253 k.k = &iter->k; 2254 + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); 2252 2255 return k; 2253 2256 } 2254 2257
+3 -1
fs/bcachefs/btree_key_cache.c
··· 291 291 struct btree_path *ck_path, 292 292 unsigned flags) 293 293 { 294 - if (flags & BTREE_ITER_cached_nofill) 294 + if (flags & BTREE_ITER_cached_nofill) { 295 + ck_path->l[0].b = NULL; 295 296 return 0; 297 + } 296 298 297 299 struct bch_fs *c = trans->c; 298 300 struct btree_iter iter;
+1 -1
fs/bcachefs/btree_trans_commit.c
··· 348 348 unsigned flags) 349 349 { 350 350 return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, 351 - trans->journal_u64s, flags); 351 + trans->journal_u64s, flags, trans); 352 352 } 353 353 354 354 #define JSET_ENTRY_LOG_U64s 4
+22 -9
fs/bcachefs/compress.c
··· 4 4 #include "compress.h" 5 5 #include "error.h" 6 6 #include "extents.h" 7 + #include "io_write.h" 7 8 #include "opts.h" 8 9 #include "super-io.h" 9 10 ··· 255 254 goto out; 256 255 } 257 256 258 - int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, 259 - struct bch_extent_crc_unpacked *crc) 257 + int bch2_bio_uncompress_inplace(struct bch_write_op *op, 258 + struct bio *bio) 260 259 { 260 + struct bch_fs *c = op->c; 261 + struct bch_extent_crc_unpacked *crc = &op->crc; 261 262 struct bbuf data = { NULL }; 262 263 size_t dst_len = crc->uncompressed_size << 9; 264 + int ret = 0; 263 265 264 266 /* bio must own its pages: */ 265 267 BUG_ON(!bio->bi_vcnt); ··· 270 266 271 267 if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || 272 268 crc->compressed_size << 9 > c->opts.encoded_extent_max) { 273 - bch_err(c, "error rewriting existing data: extent too big"); 269 + struct printbuf buf = PRINTBUF; 270 + bch2_write_op_error(&buf, op); 271 + prt_printf(&buf, "error rewriting existing data: extent too big"); 272 + bch_err_ratelimited(c, "%s", buf.buf); 273 + printbuf_exit(&buf); 274 274 return -EIO; 275 275 } 276 276 277 277 data = __bounce_alloc(c, dst_len, WRITE); 278 278 279 279 if (__bio_uncompress(c, bio, data.b, *crc)) { 280 - if (!c->opts.no_data_io) 281 - bch_err(c, "error rewriting existing data: decompression error"); 282 - bio_unmap_or_unbounce(c, data); 283 - return -EIO; 280 + if (!c->opts.no_data_io) { 281 + struct printbuf buf = PRINTBUF; 282 + bch2_write_op_error(&buf, op); 283 + prt_printf(&buf, "error rewriting existing data: decompression error"); 284 + bch_err_ratelimited(c, "%s", buf.buf); 285 + printbuf_exit(&buf); 286 + } 287 + ret = -EIO; 288 + goto err; 284 289 } 285 290 286 291 /* ··· 306 293 crc->uncompressed_size = crc->live_size; 307 294 crc->offset = 0; 308 295 crc->csum = (struct bch_csum) { 0, 0 }; 309 - 296 + err: 310 297 bio_unmap_or_unbounce(c, data); 311 - return 0; 298 + return ret; 312 299 } 313 300 314 
301 int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
+2 -2
fs/bcachefs/compress.h
··· 47 47 return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; 48 48 } 49 49 50 - int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, 51 - struct bch_extent_crc_unpacked *); 50 + struct bch_write_op; 51 + int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); 52 52 int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, 53 53 struct bvec_iter, struct bch_extent_crc_unpacked); 54 54 unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
+34 -16
fs/bcachefs/data_update.c
··· 91 91 return true; 92 92 } 93 93 94 - static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) 94 + static noinline void trace_move_extent_finish2(struct data_update *u, 95 + struct bkey_i *new, 96 + struct bkey_i *insert) 95 97 { 96 - if (trace_move_extent_finish_enabled()) { 97 - struct printbuf buf = PRINTBUF; 98 + struct bch_fs *c = u->op.c; 99 + struct printbuf buf = PRINTBUF; 98 100 99 - bch2_bkey_val_to_text(&buf, c, k); 100 - trace_move_extent_finish(c, buf.buf); 101 - printbuf_exit(&buf); 102 - } 101 + prt_newline(&buf); 102 + 103 + bch2_data_update_to_text(&buf, u); 104 + prt_newline(&buf); 105 + 106 + prt_str_indented(&buf, "new replicas:\t"); 107 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); 108 + prt_newline(&buf); 109 + 110 + prt_str_indented(&buf, "insert:\t"); 111 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); 112 + prt_newline(&buf); 113 + 114 + trace_move_extent_finish(c, buf.buf); 115 + printbuf_exit(&buf); 103 116 } 104 117 105 118 static void trace_move_extent_fail2(struct data_update *m, ··· 385 372 bch2_btree_iter_set_pos(&iter, next_pos); 386 373 387 374 this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); 388 - trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); 375 + if (trace_move_extent_finish_enabled()) 376 + trace_move_extent_finish2(m, &new->k_i, insert); 389 377 } 390 378 err: 391 379 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ··· 539 525 struct data_update_opts *data_opts) 540 526 { 541 527 printbuf_tabstop_push(out, 20); 542 - prt_str(out, "rewrite ptrs:\t"); 528 + 529 + prt_str_indented(out, "rewrite ptrs:\t"); 543 530 bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); 544 531 prt_newline(out); 545 532 546 - prt_str(out, "kill ptrs:\t"); 533 + prt_str_indented(out, "kill ptrs:\t"); 547 534 bch2_prt_u64_base2(out, data_opts->kill_ptrs); 548 535 prt_newline(out); 549 536 550 - prt_str(out, "target:\t"); 537 + prt_str_indented(out, "target:\t"); 551 538 
bch2_target_to_text(out, c, data_opts->target); 552 539 prt_newline(out); 553 540 554 - prt_str(out, "compression:\t"); 541 + prt_str_indented(out, "compression:\t"); 555 542 bch2_compression_opt_to_text(out, io_opts->background_compression); 556 543 prt_newline(out); 557 544 558 - prt_str(out, "opts.replicas:\t"); 545 + prt_str_indented(out, "opts.replicas:\t"); 559 546 prt_u64(out, io_opts->data_replicas); 547 + prt_newline(out); 560 548 561 - prt_str(out, "extra replicas:\t"); 549 + prt_str_indented(out, "extra replicas:\t"); 562 550 prt_u64(out, data_opts->extra_replicas); 563 551 } 564 552 565 553 void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) 566 554 { 567 - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); 568 - prt_newline(out); 569 555 bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); 556 + prt_newline(out); 557 + 558 + prt_str_indented(out, "old key:\t"); 559 + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); 570 560 } 571 561 572 562 int bch2_extent_drop_ptrs(struct btree_trans *trans,
+1
fs/bcachefs/debug.c
··· 20 20 #include "extents.h" 21 21 #include "fsck.h" 22 22 #include "inode.h" 23 + #include "journal_reclaim.h" 23 24 #include "super.h" 24 25 25 26 #include <linux/console.h>
+2 -2
fs/bcachefs/io_write.c
··· 406 406 op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); 407 407 } 408 408 409 - static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) 409 + void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) 410 410 { 411 411 __bch2_write_op_error(out, op, op->pos.offset); 412 412 } ··· 873 873 if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) 874 874 return PREP_ENCODED_CHECKSUM_ERR; 875 875 876 - if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) 876 + if (bch2_bio_uncompress_inplace(op, bio)) 877 877 return PREP_ENCODED_ERR; 878 878 } 879 879
+2
fs/bcachefs/io_write.h
··· 20 20 void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, 21 21 enum bch_data_type, const struct bkey_i *, bool); 22 22 23 + void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); 24 + 23 25 #define BCH_WRITE_FLAGS() \ 24 26 x(ALLOC_NOWAIT) \ 25 27 x(CACHED) \
+32 -60
fs/bcachefs/journal.c
··· 113 113 114 114 static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) 115 115 { 116 - unsigned i; 117 - 118 - for (i = 0; i < ARRAY_SIZE(p->list); i++) 119 - INIT_LIST_HEAD(&p->list[i]); 120 - INIT_LIST_HEAD(&p->flushed); 116 + for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) 117 + INIT_LIST_HEAD(&p->unflushed[i]); 118 + for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) 119 + INIT_LIST_HEAD(&p->flushed[i]); 121 120 atomic_set(&p->count, count); 122 121 p->devs.nr = 0; 123 122 } ··· 600 601 : -BCH_ERR_journal_res_get_blocked; 601 602 } 602 603 604 + static unsigned max_dev_latency(struct bch_fs *c) 605 + { 606 + u64 nsecs = 0; 607 + 608 + for_each_rw_member(c, ca) 609 + nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); 610 + 611 + return nsecs_to_jiffies(nsecs); 612 + } 613 + 603 614 /* 604 615 * Essentially the entry function to the journaling code. When bcachefs is doing 605 616 * a btree insert, it calls this function to get the current journal write. ··· 621 612 * btree node write locks. 
622 613 */ 623 614 int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, 624 - unsigned flags) 615 + unsigned flags, 616 + struct btree_trans *trans) 625 617 { 626 618 int ret; 627 619 628 620 if (closure_wait_event_timeout(&j->async_wait, 629 621 (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || 630 622 (flags & JOURNAL_RES_GET_NONBLOCK), 631 - HZ * 10)) 623 + HZ)) 632 624 return ret; 633 625 626 + if (trans) 627 + bch2_trans_unlock_long(trans); 628 + 634 629 struct bch_fs *c = container_of(j, struct bch_fs, journal); 630 + int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); 631 + 632 + remaining_wait = max(0, remaining_wait - HZ); 633 + 634 + if (closure_wait_event_timeout(&j->async_wait, 635 + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || 636 + (flags & JOURNAL_RES_GET_NONBLOCK), 637 + remaining_wait)) 638 + return ret; 639 + 635 640 struct printbuf buf = PRINTBUF; 636 641 bch2_journal_debug_to_text(&buf, j); 637 642 bch_err(c, "Journal stuck? 
Waited for 10 seconds...\n%s", ··· 750 727 * livelock: 751 728 */ 752 729 sched_annotate_sleep(); 753 - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); 730 + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); 754 731 if (ret) 755 732 return ret; 756 733 ··· 871 848 static int __bch2_journal_meta(struct journal *j) 872 849 { 873 850 struct journal_res res = {}; 874 - int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); 851 + int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); 875 852 if (ret) 876 853 return ret; 877 854 ··· 1624 1601 spin_lock(&j->lock); 1625 1602 __bch2_journal_debug_to_text(out, j); 1626 1603 spin_unlock(&j->lock); 1627 - } 1628 - 1629 - bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) 1630 - { 1631 - struct journal_entry_pin_list *pin_list; 1632 - struct journal_entry_pin *pin; 1633 - 1634 - spin_lock(&j->lock); 1635 - if (!test_bit(JOURNAL_running, &j->flags)) { 1636 - spin_unlock(&j->lock); 1637 - return true; 1638 - } 1639 - 1640 - *seq = max(*seq, j->pin.front); 1641 - 1642 - if (*seq >= j->pin.back) { 1643 - spin_unlock(&j->lock); 1644 - return true; 1645 - } 1646 - 1647 - out->atomic++; 1648 - 1649 - pin_list = journal_seq_pin(j, *seq); 1650 - 1651 - prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); 1652 - printbuf_indent_add(out, 2); 1653 - 1654 - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) 1655 - list_for_each_entry(pin, &pin_list->list[i], list) 1656 - prt_printf(out, "\t%px %ps\n", pin, pin->flush); 1657 - 1658 - if (!list_empty(&pin_list->flushed)) 1659 - prt_printf(out, "flushed:\n"); 1660 - 1661 - list_for_each_entry(pin, &pin_list->flushed, list) 1662 - prt_printf(out, "\t%px %ps\n", pin, pin->flush); 1663 - 1664 - printbuf_indent_sub(out, 2); 1665 - 1666 - --out->atomic; 1667 - spin_unlock(&j->lock); 1668 - 1669 - return false; 1670 - } 1671 - 1672 - void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) 1673 
- { 1674 - u64 seq = 0; 1675 - 1676 - while (!bch2_journal_seq_pins_to_text(out, j, &seq)) 1677 - seq++; 1678 1604 }
+4 -5
fs/bcachefs/journal.h
··· 312 312 } 313 313 314 314 int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, 315 - unsigned); 315 + unsigned, struct btree_trans *); 316 316 317 317 /* First bits for BCH_WATERMARK: */ 318 318 enum journal_res_flags { ··· 368 368 } 369 369 370 370 static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, 371 - unsigned u64s, unsigned flags) 371 + unsigned u64s, unsigned flags, 372 + struct btree_trans *trans) 372 373 { 373 374 int ret; 374 375 ··· 381 380 if (journal_res_get_fast(j, res, flags)) 382 381 goto out; 383 382 384 - ret = bch2_journal_res_get_slowpath(j, res, flags); 383 + ret = bch2_journal_res_get_slowpath(j, res, flags, trans); 385 384 if (ret) 386 385 return ret; 387 386 out: ··· 430 429 431 430 void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); 432 431 void bch2_journal_debug_to_text(struct printbuf *, struct journal *); 433 - void bch2_journal_pins_to_text(struct printbuf *, struct journal *); 434 - bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); 435 432 436 433 int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, 437 434 unsigned nr);
+2
fs/bcachefs/journal_io.c
··· 17 17 #include "sb-clean.h" 18 18 #include "trace.h" 19 19 20 + #include <linux/ioprio.h> 20 21 #include <linux/string_choices.h> 21 22 22 23 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) ··· 1764 1763 bio->bi_iter.bi_sector = ptr->offset; 1765 1764 bio->bi_end_io = journal_write_endio; 1766 1765 bio->bi_private = ca; 1766 + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); 1767 1767 1768 1768 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1769 1769 ca->prev_journal_sector = bio->bi_iter.bi_sector;
+121 -21
fs/bcachefs/journal_reclaim.c
··· 327 327 popped = true; 328 328 } 329 329 330 - if (popped) 330 + if (popped) { 331 331 bch2_journal_space_available(j); 332 + __closure_wake_up(&j->reclaim_flush_wait); 333 + } 332 334 } 333 335 334 336 bool __bch2_journal_pin_put(struct journal *j, u64 seq) ··· 364 362 pin->seq = 0; 365 363 list_del_init(&pin->list); 366 364 365 + if (j->reclaim_flush_wait.list.first) 366 + __closure_wake_up(&j->reclaim_flush_wait); 367 + 367 368 /* 368 369 * Unpinning a journal entry may make journal_next_bucket() succeed, if 369 370 * writing a new last_seq will now make another bucket available: ··· 388 383 { 389 384 if (fn == bch2_btree_node_flush0 || 390 385 fn == bch2_btree_node_flush1) 391 - return JOURNAL_PIN_btree; 386 + return JOURNAL_PIN_TYPE_btree; 392 387 else if (fn == bch2_btree_key_cache_journal_flush) 393 - return JOURNAL_PIN_key_cache; 388 + return JOURNAL_PIN_TYPE_key_cache; 394 389 else 395 - return JOURNAL_PIN_other; 390 + return JOURNAL_PIN_TYPE_other; 396 391 } 397 392 398 393 static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, ··· 411 406 atomic_inc(&pin_list->count); 412 407 pin->seq = seq; 413 408 pin->flush = flush_fn; 414 - list_add(&pin->list, &pin_list->list[type]); 409 + 410 + if (list_empty(&pin_list->unflushed[type]) && 411 + j->reclaim_flush_wait.list.first) 412 + __closure_wake_up(&j->reclaim_flush_wait); 413 + 414 + list_add(&pin->list, &pin_list->unflushed[type]); 415 415 } 416 416 417 417 void bch2_journal_pin_copy(struct journal *j, ··· 509 499 { 510 500 struct journal_entry_pin_list *pin_list; 511 501 struct journal_entry_pin *ret = NULL; 512 - unsigned i; 513 502 514 503 fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { 515 504 if (*seq > seq_to_flush && !allowed_above_seq) 516 505 break; 517 506 518 - for (i = 0; i < JOURNAL_PIN_NR; i++) 519 - if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || 520 - ((1U << i) & allowed_above_seq)) { 521 - ret = list_first_entry_or_null(&pin_list->list[i], 507 + 
for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) 508 + if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) || 509 + (BIT(i) & allowed_above_seq)) { 510 + ret = list_first_entry_or_null(&pin_list->unflushed[i], 522 511 struct journal_entry_pin, list); 523 512 if (ret) 524 513 return ret; ··· 553 544 } 554 545 555 546 if (min_key_cache) { 556 - allowed_above |= 1U << JOURNAL_PIN_key_cache; 557 - allowed_below |= 1U << JOURNAL_PIN_key_cache; 547 + allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); 548 + allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); 558 549 } 559 550 560 551 cond_resched(); ··· 562 553 j->last_flushed = jiffies; 563 554 564 555 spin_lock(&j->lock); 565 - pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); 556 + pin = journal_get_next_pin(j, seq_to_flush, 557 + allowed_below, 558 + allowed_above, &seq); 566 559 if (pin) { 567 560 BUG_ON(j->flush_in_progress); 568 561 j->flush_in_progress = pin; ··· 587 576 spin_lock(&j->lock); 588 577 /* Pin might have been dropped or rearmed: */ 589 578 if (likely(!err && !j->flush_in_progress_dropped)) 590 - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); 579 + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); 591 580 j->flush_in_progress = NULL; 592 581 j->flush_in_progress_dropped = false; 593 582 spin_unlock(&j->lock); ··· 827 816 return 0; 828 817 } 829 818 819 + static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, 820 + unsigned types) 821 + { 822 + struct journal_entry_pin_list *pin_list; 823 + u64 seq; 824 + 825 + spin_lock(&j->lock); 826 + fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { 827 + if (seq > seq_to_flush) 828 + break; 829 + 830 + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) 831 + if ((BIT(i) & types) && 832 + (!list_empty(&pin_list->unflushed[i]) || 833 + !list_empty(&pin_list->flushed[i]))) { 834 + spin_unlock(&j->lock); 835 + return true; 836 + } 837 + } 838 + 
spin_unlock(&j->lock); 839 + 840 + return false; 841 + } 842 + 843 + static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, 844 + unsigned types) 845 + { 846 + return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || 847 + journal_pins_still_flushing(j, seq_to_flush, types); 848 + } 849 + 830 850 static int journal_flush_done(struct journal *j, u64 seq_to_flush, 831 851 bool *did_work) 832 852 { 833 - int ret; 853 + int ret = 0; 834 854 835 855 ret = bch2_journal_error(j); 836 856 if (ret) ··· 869 827 870 828 mutex_lock(&j->reclaim_lock); 871 829 872 - if (journal_flush_pins(j, seq_to_flush, 873 - (1U << JOURNAL_PIN_key_cache)| 874 - (1U << JOURNAL_PIN_other), 0, 0, 0) || 875 - journal_flush_pins(j, seq_to_flush, 876 - (1U << JOURNAL_PIN_btree), 0, 0, 0)) 830 + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, 831 + BIT(JOURNAL_PIN_TYPE_key_cache)| 832 + BIT(JOURNAL_PIN_TYPE_other))) { 877 833 *did_work = true; 834 + goto unlock; 835 + } 836 + 837 + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, 838 + BIT(JOURNAL_PIN_TYPE_btree))) { 839 + *did_work = true; 840 + goto unlock; 841 + } 878 842 879 843 if (seq_to_flush > journal_cur_seq(j)) 880 844 bch2_journal_entry_close(j); ··· 895 847 !fifo_used(&j->pin); 896 848 897 849 spin_unlock(&j->lock); 850 + unlock: 898 851 mutex_unlock(&j->reclaim_lock); 899 852 900 853 return ret; ··· 909 860 if (!test_bit(JOURNAL_running, &j->flags)) 910 861 return false; 911 862 912 - closure_wait_event(&j->async_wait, 863 + closure_wait_event(&j->reclaim_flush_wait, 913 864 journal_flush_done(j, seq_to_flush, &did_work)); 914 865 915 866 return did_work; ··· 974 925 mutex_unlock(&c->replicas_gc_lock); 975 926 976 927 return ret; 928 + } 929 + 930 + bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) 931 + { 932 + struct journal_entry_pin_list *pin_list; 933 + struct journal_entry_pin *pin; 934 + 935 + spin_lock(&j->lock); 936 + if 
(!test_bit(JOURNAL_running, &j->flags)) { 937 + spin_unlock(&j->lock); 938 + return true; 939 + } 940 + 941 + *seq = max(*seq, j->pin.front); 942 + 943 + if (*seq >= j->pin.back) { 944 + spin_unlock(&j->lock); 945 + return true; 946 + } 947 + 948 + out->atomic++; 949 + 950 + pin_list = journal_seq_pin(j, *seq); 951 + 952 + prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); 953 + printbuf_indent_add(out, 2); 954 + 955 + prt_printf(out, "unflushed:\n"); 956 + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) 957 + list_for_each_entry(pin, &pin_list->unflushed[i], list) 958 + prt_printf(out, "\t%px %ps\n", pin, pin->flush); 959 + 960 + prt_printf(out, "flushed:\n"); 961 + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) 962 + list_for_each_entry(pin, &pin_list->flushed[i], list) 963 + prt_printf(out, "\t%px %ps\n", pin, pin->flush); 964 + 965 + printbuf_indent_sub(out, 2); 966 + 967 + --out->atomic; 968 + spin_unlock(&j->lock); 969 + 970 + return false; 971 + } 972 + 973 + void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) 974 + { 975 + u64 seq = 0; 976 + 977 + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) 978 + seq++; 977 979 }
+3
fs/bcachefs/journal_reclaim.h
··· 78 78 79 79 int bch2_journal_flush_device_pins(struct journal *, int); 80 80 81 + void bch2_journal_pins_to_text(struct printbuf *, struct journal *); 82 + bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); 83 + 81 84 #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
+7 -6
fs/bcachefs/journal_types.h
··· 53 53 */ 54 54 55 55 enum journal_pin_type { 56 - JOURNAL_PIN_btree, 57 - JOURNAL_PIN_key_cache, 58 - JOURNAL_PIN_other, 59 - JOURNAL_PIN_NR, 56 + JOURNAL_PIN_TYPE_btree, 57 + JOURNAL_PIN_TYPE_key_cache, 58 + JOURNAL_PIN_TYPE_other, 59 + JOURNAL_PIN_TYPE_NR, 60 60 }; 61 61 62 62 struct journal_entry_pin_list { 63 - struct list_head list[JOURNAL_PIN_NR]; 64 - struct list_head flushed; 63 + struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; 64 + struct list_head flushed[JOURNAL_PIN_TYPE_NR]; 65 65 atomic_t count; 66 66 struct bch_devs_list devs; 67 67 }; ··· 226 226 /* Used when waiting because the journal was full */ 227 227 wait_queue_head_t wait; 228 228 struct closure_waitlist async_wait; 229 + struct closure_waitlist reclaim_flush_wait; 229 230 230 231 struct delayed_work write_work; 231 232 struct workqueue_struct *wq;
+7 -4
fs/bcachefs/movinggc.c
··· 215 215 }; 216 216 move_buckets buckets = { 0 }; 217 217 struct move_bucket_in_flight *f; 218 - u64 moved = atomic64_read(&ctxt->stats->sectors_moved); 218 + u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); 219 + u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); 219 220 int ret = 0; 220 221 221 222 ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); ··· 246 245 *did_work = true; 247 246 } 248 247 err: 249 - darray_exit(&buckets); 250 248 251 249 /* no entries in LRU btree found, or got to end: */ 252 250 if (bch2_err_matches(ret, ENOENT)) ··· 254 254 if (ret < 0 && !bch2_err_matches(ret, EROFS)) 255 255 bch_err_msg(c, ret, "from bch2_move_data()"); 256 256 257 - moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; 258 - trace_and_count(c, copygc, c, moved, 0, 0, 0); 257 + sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; 258 + sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; 259 + trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); 260 + 261 + darray_exit(&buckets); 259 262 return ret; 260 263 } 261 264
+2 -2
fs/bcachefs/opts.h
··· 476 476 NULL, "Enable nocow mode: enables runtime locking in\n"\ 477 477 "data move path needed if nocow will ever be in use\n")\ 478 478 x(copygc_enabled, u8, \ 479 - OPT_FS|OPT_MOUNT, \ 479 + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ 480 480 OPT_BOOL(), \ 481 481 BCH2_NO_SB_OPT, true, \ 482 482 NULL, "Enable copygc: disable for debugging, or to\n"\ 483 483 "quiet the system when doing performance testing\n")\ 484 484 x(rebalance_enabled, u8, \ 485 - OPT_FS|OPT_MOUNT, \ 485 + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ 486 486 OPT_BOOL(), \ 487 487 BCH2_NO_SB_OPT, true, \ 488 488 NULL, "Enable rebalance: disable for debugging, or to\n"\
+1 -1
fs/bcachefs/sb-errors_format.h
··· 57 57 x(bset_wrong_sector_offset, 44, 0) \ 58 58 x(bset_empty, 45, 0) \ 59 59 x(bset_bad_seq, 46, 0) \ 60 - x(bset_blacklisted_journal_seq, 47, 0) \ 60 + x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ 61 61 x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ 62 62 x(btree_node_bad_btree, 49, 0) \ 63 63 x(btree_node_bad_level, 50, 0) \
+12 -12
fs/bcachefs/str_hash.c
··· 31 31 } 32 32 } 33 33 34 - static int fsck_rename_dirent(struct btree_trans *trans, 35 - struct snapshots_seen *s, 36 - const struct bch_hash_desc desc, 37 - struct bch_hash_info *hash_info, 38 - struct bkey_s_c_dirent old) 34 + static noinline int fsck_rename_dirent(struct btree_trans *trans, 35 + struct snapshots_seen *s, 36 + const struct bch_hash_desc desc, 37 + struct bch_hash_info *hash_info, 38 + struct bkey_s_c_dirent old) 39 39 { 40 40 struct qstr old_name = bch2_dirent_get_name(old); 41 41 struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); ··· 71 71 return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); 72 72 } 73 73 74 - static int hash_pick_winner(struct btree_trans *trans, 75 - const struct bch_hash_desc desc, 76 - struct bch_hash_info *hash_info, 77 - struct bkey_s_c k1, 78 - struct bkey_s_c k2) 74 + static noinline int hash_pick_winner(struct btree_trans *trans, 75 + const struct bch_hash_desc desc, 76 + struct bch_hash_info *hash_info, 77 + struct bkey_s_c k1, 78 + struct bkey_s_c k2) 79 79 { 80 80 if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && 81 81 !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) ··· 142 142 * All versions of the same inode in different snapshots must have the same hash 143 143 * seed/type: verify that the hash info we're using matches the root 144 144 */ 145 - static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, 146 - struct bch_hash_info *hash_info) 145 + static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, 146 + struct bch_hash_info *hash_info) 147 147 { 148 148 struct bch_fs *c = trans->c; 149 149 struct btree_iter iter;
+12 -14
fs/bcachefs/trace.h
··· 902 902 903 903 TRACE_EVENT(copygc, 904 904 TP_PROTO(struct bch_fs *c, 905 - u64 sectors_moved, u64 sectors_not_moved, 906 - u64 buckets_moved, u64 buckets_not_moved), 907 - TP_ARGS(c, 908 - sectors_moved, sectors_not_moved, 909 - buckets_moved, buckets_not_moved), 905 + u64 buckets, 906 + u64 sectors_seen, 907 + u64 sectors_moved), 908 + TP_ARGS(c, buckets, sectors_seen, sectors_moved), 910 909 911 910 TP_STRUCT__entry( 912 911 __field(dev_t, dev ) 912 + __field(u64, buckets ) 913 + __field(u64, sectors_seen ) 913 914 __field(u64, sectors_moved ) 914 - __field(u64, sectors_not_moved ) 915 - __field(u64, buckets_moved ) 916 - __field(u64, buckets_not_moved ) 917 915 ), 918 916 919 917 TP_fast_assign( 920 918 __entry->dev = c->dev; 919 + __entry->buckets = buckets; 920 + __entry->sectors_seen = sectors_seen; 921 921 __entry->sectors_moved = sectors_moved; 922 - __entry->sectors_not_moved = sectors_not_moved; 923 - __entry->buckets_moved = buckets_moved; 924 - __entry->buckets_not_moved = buckets_moved; 925 922 ), 926 923 927 - TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", 924 + TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", 928 925 MAJOR(__entry->dev), MINOR(__entry->dev), 929 - __entry->sectors_moved, __entry->sectors_not_moved, 930 - __entry->buckets_moved, __entry->buckets_not_moved) 926 + __entry->buckets, 927 + __entry->sectors_seen, 928 + __entry->sectors_moved) 931 929 ); 932 930 933 931 TRACE_EVENT(copygc_wait,