// SPDX-License-Identifier: GPL-2.0
/*
 * bcachefs journalling code, for btree insertions
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_sb.h"
#include "journal_seq_blacklist.h"
#include "trace.h"

static const char * const bch2_journal_errors[] = {
#define x(n)	#n,
	JOURNAL_ERRORS()
#undef x
	NULL
};

static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
	return seq > j->seq_ondisk;
}

static bool __journal_entry_is_open(union journal_res_state state)
{
	return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}

static inline unsigned nr_unwritten_journal_entries(struct journal *j)
{
	return atomic64_read(&j->seq) - j->seq_ondisk;
}

static bool journal_entry_is_open(struct journal *j)
{
	return __journal_entry_is_open(j->reservations);
}

static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
{
	union journal_res_state s = READ_ONCE(j->reservations);
	unsigned i = seq & JOURNAL_BUF_MASK;
	struct journal_buf *buf = j->buf + i;

	prt_printf(out, "seq:\t%llu\n", seq);
	printbuf_indent_add(out, 2);

	prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));

	prt_printf(out, "size:\t");
	prt_human_readable_u64(out, vstruct_bytes(buf->data));
	prt_newline(out);

	prt_printf(out, "expires:\t");
	prt_printf(out, "%li jiffies\n", buf->expires - jiffies);

	prt_printf(out, "flags:\t");
	if (buf->noflush)
		prt_str(out, "noflush ");
	if (buf->must_flush)
		prt_str(out, "must_flush ");
	if (buf->separate_flush)
		prt_str(out, "separate_flush ");
	if (buf->need_flush_to_write_buffer)
		prt_str(out, "need_flush_to_write_buffer ");
	if (buf->write_started)
		prt_str(out, "write_started ");
	if (buf->write_allocated)
		prt_str(out, "write_allocated ");
	if (buf->write_done)
		prt_str(out, "write_done");
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}

static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
{
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 24);

	for (u64 seq = journal_last_unwritten_seq(j);
	     seq <= journal_cur_seq(j);
	     seq++)
		bch2_journal_buf_to_text(out, j, seq);
	prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
}

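/*
 * Map a sequence number to its journal buffer: entries cycle through a small
 * ring of bufs indexed by (seq & JOURNAL_BUF_MASK), so only unwritten entries
 * can safely be looked up this way. Illustrative example (assuming
 * JOURNAL_BUF_NR is 4, per journal_types.h at the time of writing): seq 7
 * maps to j->buf[3].
 */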
static inline struct journal_buf *
journal_seq_to_buf(struct journal *j, u64 seq)
{
	struct journal_buf *buf = NULL;

	EBUG_ON(seq > journal_cur_seq(j));

	if (journal_seq_unwritten(j, seq)) {
		buf = j->buf + (seq & JOURNAL_BUF_MASK);
		EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
	}
	return buf;
}

static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(p->list); i++)
		INIT_LIST_HEAD(&p->list[i]);
	INIT_LIST_HEAD(&p->flushed);
	atomic_set(&p->count, count);
	p->devs.nr = 0;
}

/*
 * Detect stuck journal conditions and trigger shutdown. Technically the journal
 * can end up stuck for a variety of reasons, such as a blocked I/O, journal
 * reservation lockup, etc. Since this is a fatal error with potentially
 * unpredictable characteristics, we want to be fairly conservative before we
 * decide to shut things down.
 *
 * Consider the journal stuck when it appears full with no ability to commit
 * btree transactions, to discard journal buckets, nor acquire priority
 * (reserved watermark) reservation.
 */
static inline bool
journal_error_check_stuck(struct journal *j, int error, unsigned flags)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	bool stuck = false;
	struct printbuf buf = PRINTBUF;

	if (!(error == JOURNAL_ERR_journal_full ||
	      error == JOURNAL_ERR_journal_pin_full) ||
	    nr_unwritten_journal_entries(j) ||
	    (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
		return stuck;

	spin_lock(&j->lock);

	if (j->can_discard) {
		spin_unlock(&j->lock);
		return stuck;
	}

	stuck = true;

	/*
	 * The journal shutdown path will set ->err_seq, but do it here first to
	 * serialize against concurrent failures and avoid duplicate error
	 * reports.
	 */
	if (j->err_seq) {
		spin_unlock(&j->lock);
		return stuck;
	}
	j->err_seq = journal_cur_seq(j);
	spin_unlock(&j->lock);
	bch_err(c, "Journal stuck! Have a pre-reservation but journal full (error %s)",
		bch2_journal_errors[error]);
	bch2_journal_debug_to_text(&buf, j);
	bch_err(c, "%s", buf.buf);

	printbuf_reset(&buf);
	bch2_journal_pins_to_text(&buf, j);
	bch_err(c, "Journal pins:\n%s", buf.buf);
	printbuf_exit(&buf);

	bch2_fatal_error(c);
	dump_stack();

	return stuck;
}

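/*
 * Kick off writes for unwritten journal entries, in sequence order: a write is
 * only started once the entry's reservation count has dropped to zero, and we
 * stop at the first entry whose write has started but has not yet been
 * allocated, since writes must be issued in order.
 */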
void bch2_journal_do_writes(struct journal *j)
{
	for (u64 seq = journal_last_unwritten_seq(j);
	     seq <= journal_cur_seq(j);
	     seq++) {
		unsigned idx = seq & JOURNAL_BUF_MASK;
		struct journal_buf *w = j->buf + idx;

		if (w->write_started && !w->write_allocated)
			break;
		if (w->write_started)
			continue;

		if (!journal_state_count(j->reservations, idx)) {
			w->write_started = true;
			closure_call(&w->io, bch2_journal_write, j->wq, NULL);
		}

		break;
	}
}

/*
 * Final processing when the last reference of a journal buffer has been
 * dropped. Drop the pin list reference acquired at journal entry open and write
 * the buffer, if requested.
 */
void bch2_journal_buf_put_final(struct journal *j, u64 seq)
{
	lockdep_assert_held(&j->lock);

	if (__bch2_journal_pin_put(j, seq))
		bch2_journal_reclaim_fast(j);
	bch2_journal_do_writes(j);
}

/*
 * Close the current journal entry, if open:
 *
 * We don't close a journal_buf until the next journal_buf is finished writing,
 * and can be opened again - this also initializes the next journal_buf:
 */
static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_buf *buf = journal_cur_buf(j);
	union journal_res_state old, new;
	u64 v = atomic64_read(&j->reservations.counter);
	unsigned sectors;

	BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
	       closed_val != JOURNAL_ENTRY_ERROR_VAL);

	lockdep_assert_held(&j->lock);

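	/*
	 * Lockless state transition: retry the cmpxchg until we either observe
	 * the entry already closed or errored (in which case there's nothing
	 * to do) or we succeed in storing closed_val into cur_entry_offset.
	 */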
	do {
		old.v = new.v = v;
		new.cur_entry_offset = closed_val;

		if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
		    old.cur_entry_offset == new.cur_entry_offset)
			return;
	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
				       old.v, new.v)) != old.v);

	if (!__journal_entry_is_open(old))
		return;

	/* Close out old buffer: */
	buf->data->u64s = cpu_to_le32(old.cur_entry_offset);

	if (trace_journal_entry_close_enabled() && trace) {
		struct printbuf pbuf = PRINTBUF;
		pbuf.atomic++;

		prt_str(&pbuf, "entry size: ");
		prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
		prt_newline(&pbuf);
		bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
		trace_journal_entry_close(c, pbuf.buf);
		printbuf_exit(&pbuf);
	}

	sectors = vstruct_blocks_plus(buf->data, c->block_bits,
				      buf->u64s_reserved) << c->block_bits;
	BUG_ON(sectors > buf->sectors);
	buf->sectors = sectors;

	/*
	 * We have to set last_seq here, _before_ opening a new journal entry:
	 *
	 * A thread may replace an old pin with a new pin on its current
	 * journal reservation - the expectation being that the journal will
	 * contain either what the old pin protected or what the new pin
	 * protects.
	 *
	 * After the old pin is dropped journal_last_seq() won't include the old
	 * pin, so we can only write the updated last_seq on the entry that
	 * contains whatever the new pin protects.
	 *
	 * Restated, we can _not_ update last_seq for a given entry if there
	 * could be a newer entry open with reservations/pins that have been
	 * taken against it.
	 *
	 * Hence, we want to update/set last_seq on the current journal entry
	 * right before we open a new one:
	 */
	buf->last_seq = journal_last_seq(j);
	buf->data->last_seq = cpu_to_le64(buf->last_seq);
	BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));

	cancel_delayed_work(&j->write_work);

	bch2_journal_space_available(j);

	__bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
}

void bch2_journal_halt(struct journal *j)
{
	spin_lock(&j->lock);
	__journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
	if (!j->err_seq)
		j->err_seq = journal_cur_seq(j);
	journal_wake(j);
	spin_unlock(&j->lock);
}

static bool journal_entry_want_write(struct journal *j)
{
	bool ret = !journal_entry_is_open(j) ||
		journal_cur_seq(j) == journal_last_unwritten_seq(j);

	/* Don't close it yet if we already have a write in flight: */
	if (ret)
		__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
	else if (nr_unwritten_journal_entries(j)) {
		struct journal_buf *buf = journal_cur_buf(j);

		if (!buf->flush_time) {
			buf->flush_time = local_clock() ?: 1;
			buf->expires = jiffies;
		}
	}

	return ret;
}

bool bch2_journal_entry_close(struct journal *j)
{
	bool ret;

	spin_lock(&j->lock);
	ret = journal_entry_want_write(j);
	spin_unlock(&j->lock);

	return ret;
}

/*
 * should _only_ be called from journal_res_get() - when we actually want a
 * journal reservation - journal entry is open means journal is dirty:
 */
static int journal_entry_open(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_buf *buf = j->buf +
		((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
	union journal_res_state old, new;
	int u64s;
	u64 v;

	lockdep_assert_held(&j->lock);
	BUG_ON(journal_entry_is_open(j));
	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));

	if (j->blocked)
		return JOURNAL_ERR_blocked;

	if (j->cur_entry_error)
		return j->cur_entry_error;

	if (bch2_journal_error(j))
		return JOURNAL_ERR_insufficient_devices; /* -EROFS */

	if (!fifo_free(&j->pin))
		return JOURNAL_ERR_journal_pin_full;

	if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
		return JOURNAL_ERR_max_in_flight;

	BUG_ON(!j->cur_entry_sectors);

	buf->expires =
		(journal_cur_seq(j) == j->flushed_seq_ondisk
		 ? jiffies
		 : j->last_flush_write) +
		msecs_to_jiffies(c->opts.journal_flush_delay);

	buf->u64s_reserved = j->entry_u64s_reserved;
	buf->disk_sectors = j->cur_entry_sectors;
	buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);

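	/*
	 * Entry capacity in u64s is the usable byte size divided by
	 * sizeof(u64), minus the jset header overhead. Illustrative example
	 * (numbers not from the source): with buf->sectors == 8, i.e. a 4KiB
	 * entry, (8 << 9) / sizeof(u64) = 512 u64s before subtracting
	 * journal_entry_overhead(j).
	 */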
	u64s = (int) (buf->sectors << 9) / sizeof(u64) -
		journal_entry_overhead(j);
	u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);

	if (u64s <= (ssize_t) j->early_journal_entries.nr)
		return JOURNAL_ERR_journal_full;

	if (fifo_empty(&j->pin) && j->reclaim_thread)
		wake_up_process(j->reclaim_thread);

	/*
	 * The fifo_push() needs to happen at the same time as j->seq is
	 * incremented for journal_last_seq() to be calculated correctly
	 */
	atomic64_inc(&j->seq);
	journal_pin_list_init(fifo_push_ref(&j->pin), 1);

	BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));

	BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);

	bkey_extent_init(&buf->key);
	buf->noflush = false;
	buf->must_flush = false;
	buf->separate_flush = false;
	buf->flush_time = 0;
	buf->need_flush_to_write_buffer = true;
	buf->write_started = false;
	buf->write_allocated = false;
	buf->write_done = false;

	memset(buf->data, 0, sizeof(*buf->data));
	buf->data->seq = cpu_to_le64(journal_cur_seq(j));
	buf->data->u64s = 0;

	if (j->early_journal_entries.nr) {
		memcpy(buf->data->_data, j->early_journal_entries.data,
		       j->early_journal_entries.nr * sizeof(u64));
		le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
	}

	/*
	 * Must be set before marking the journal entry as open:
	 */
	j->cur_entry_u64s = u64s;

	v = atomic64_read(&j->reservations.counter);
	do {
		old.v = new.v = v;

		BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);

		new.idx++;
		BUG_ON(journal_state_count(new, new.idx));
		BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));

		journal_state_inc(&new);

		/* Handle any already added entries */
		new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
				       old.v, new.v)) != old.v);

	if (nr_unwritten_journal_entries(j) == 1)
		mod_delayed_work(j->wq,
				 &j->write_work,
				 msecs_to_jiffies(c->opts.journal_flush_delay));
	journal_wake(j);

	if (j->early_journal_entries.nr)
		darray_exit(&j->early_journal_entries);
	return 0;
}

static bool journal_quiesced(struct journal *j)
{
	bool ret = atomic64_read(&j->seq) == j->seq_ondisk;

	if (!ret)
		bch2_journal_entry_close(j);
	return ret;
}

static void journal_quiesce(struct journal *j)
{
	wait_event(j->wait, journal_quiesced(j));
}

static void journal_write_work(struct work_struct *work)
{
	struct journal *j = container_of(work, struct journal, write_work.work);

	spin_lock(&j->lock);
	if (__journal_entry_is_open(j->reservations)) {
		long delta = journal_cur_buf(j)->expires - jiffies;

		if (delta > 0)
			mod_delayed_work(j->wq, &j->write_work, delta);
		else
			__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
	}
	spin_unlock(&j->lock);
}

static int __journal_res_get(struct journal *j, struct journal_res *res,
			     unsigned flags)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_buf *buf;
	bool can_discard;
	int ret;
retry:
	if (journal_res_get_fast(j, res, flags))
		return 0;

	if (bch2_journal_error(j))
		return -BCH_ERR_erofs_journal_err;

	if (j->blocked)
		return -BCH_ERR_journal_res_get_blocked;

	if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
		ret = JOURNAL_ERR_journal_full;
		can_discard = j->can_discard;
		goto out;
	}

	if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
		ret = JOURNAL_ERR_max_in_flight;
		goto out;
	}

	spin_lock(&j->lock);

	/*
	 * Recheck after taking the lock, so we don't race with another thread
	 * that just did journal_entry_open() and call bch2_journal_entry_close()
	 * unnecessarily
	 */
	if (journal_res_get_fast(j, res, flags)) {
		ret = 0;
		goto unlock;
	}

	/*
	 * If we couldn't get a reservation because the current buf filled up,
	 * and we had room for a bigger entry on disk, signal that we want to
	 * realloc the journal bufs:
	 */
	buf = journal_cur_buf(j);
	if (journal_entry_is_open(j) &&
	    buf->buf_size >> 9 < buf->disk_sectors &&
	    buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
		j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);

	__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
	ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
unlock:
	can_discard = j->can_discard;
	spin_unlock(&j->lock);
out:
	if (ret == JOURNAL_ERR_retry)
		goto retry;
	if (!ret)
		return 0;

	if (journal_error_check_stuck(j, ret, flags))
		ret = -BCH_ERR_journal_res_get_blocked;

	if (ret == JOURNAL_ERR_max_in_flight &&
	    track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {

		struct printbuf buf = PRINTBUF;
		prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
		bch2_journal_bufs_to_text(&buf, j);
		trace_journal_entry_full(c, buf.buf);
		printbuf_exit(&buf);
		count_event(c, journal_entry_full);
	}

	/*
	 * Journal is full - can't rely on reclaim from work item due to
	 * freezing:
	 */
	if ((ret == JOURNAL_ERR_journal_full ||
	     ret == JOURNAL_ERR_journal_pin_full) &&
	    !(flags & JOURNAL_RES_GET_NONBLOCK)) {
		if (can_discard) {
			bch2_journal_do_discards(j);
			goto retry;
		}

		if (mutex_trylock(&j->reclaim_lock)) {
			bch2_journal_reclaim(j);
			mutex_unlock(&j->reclaim_lock);
		}
	}

	return ret == JOURNAL_ERR_insufficient_devices
		? -BCH_ERR_erofs_journal_err
		: -BCH_ERR_journal_res_get_blocked;
}

/*
 * Essentially the entry function to the journaling code. When bcachefs is doing
 * a btree insert, it calls this function to get the current journal write.
 * Journal write is the structure used to set up journal writes. The calling
 * function will then add its keys to the structure, queuing them for the next
 * write.
 *
 * To ensure forward progress, the current task must not be holding any
 * btree node write locks.
 */
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
				  unsigned flags)
{
	int ret;

	closure_wait_event(&j->async_wait,
		(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
		(flags & JOURNAL_RES_GET_NONBLOCK));
	return ret;
}
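
/*
 * Illustrative usage sketch (not a caller in this file; cf. bch2_journal_meta()
 * below for a real one, and "u64s" here is a hypothetical size): reserve
 * space, copy keys into the open entry, then drop the reservation:
 *
 *	struct journal_res res = {};
 *	int ret = bch2_journal_res_get(j, &res, jset_u64s(u64s), 0);
 *	if (ret)
 *		return ret;
 *	// ...copy up to u64s worth of keys into the reserved space...
 *	bch2_journal_res_put(j, &res);
 */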

/* journal_entry_res: */

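/*
 * Resize a standing reservation held against every journal entry: adjusts
 * j->entry_u64s_reserved and, when growing, takes the extra space out of the
 * currently open entry - closing it if the space already consumed no longer
 * fits.
 */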
void bch2_journal_entry_res_resize(struct journal *j,
				   struct journal_entry_res *res,
				   unsigned new_u64s)
{
	union journal_res_state state;
	int d = new_u64s - res->u64s;

	spin_lock(&j->lock);

	j->entry_u64s_reserved += d;
	if (d <= 0)
		goto out;

	j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
	smp_mb();
	state = READ_ONCE(j->reservations);

	if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
	    state.cur_entry_offset > j->cur_entry_u64s) {
		j->cur_entry_u64s += d;
		/*
		 * Not enough room in current journal entry, have to flush it:
		 */
		__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
	} else {
		journal_cur_buf(j)->u64s_reserved += d;
	}
out:
	spin_unlock(&j->lock);
	res->u64s += d;
}

/* journal flushing: */

/**
 * bch2_journal_flush_seq_async - wait for a journal entry to be written
 * @j: journal object
 * @seq: seq to flush
 * @parent: closure object to wait with
 * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
 * -EIO if @seq will never be flushed
 *
 * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
 * necessary
 */
int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
				 struct closure *parent)
{
	struct journal_buf *buf;
	int ret = 0;

	if (seq <= j->flushed_seq_ondisk)
		return 1;

	spin_lock(&j->lock);

	if (WARN_ONCE(seq > journal_cur_seq(j),
		      "requested to flush journal seq %llu, but currently at %llu",
		      seq, journal_cur_seq(j)))
		goto out;

	/* Recheck under lock: */
	if (j->err_seq && seq >= j->err_seq) {
		ret = -EIO;
		goto out;
	}

	if (seq <= j->flushed_seq_ondisk) {
		ret = 1;
		goto out;
	}

	/* if seq was written, but not flushed - flush a newer one instead */
	seq = max(seq, journal_last_unwritten_seq(j));

recheck_need_open:
	if (seq > journal_cur_seq(j)) {
		struct journal_res res = { 0 };

		if (journal_entry_is_open(j))
			__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);

		spin_unlock(&j->lock);

		/*
		 * We're called from bch2_journal_flush_seq() -> wait_event();
		 * but this might block. We won't usually block, so we won't
		 * livelock:
		 */
		sched_annotate_sleep();
		ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
		if (ret)
			return ret;

		seq = res.seq;
		buf = journal_seq_to_buf(j, seq);
		buf->must_flush = true;

		if (!buf->flush_time) {
			buf->flush_time = local_clock() ?: 1;
			buf->expires = jiffies;
		}

		if (parent && !closure_wait(&buf->wait, parent))
			BUG();

		bch2_journal_res_put(j, &res);

		spin_lock(&j->lock);
		goto want_write;
	}

	/*
	 * if write was kicked off without a flush, or if we promised it
	 * wouldn't be a flush, flush the next sequence number instead
	 */
	buf = journal_seq_to_buf(j, seq);
	if (buf->noflush) {
		seq++;
		goto recheck_need_open;
	}

	buf->must_flush = true;

	if (parent && !closure_wait(&buf->wait, parent))
		BUG();
want_write:
	if (seq == journal_cur_seq(j))
		journal_entry_want_write(j);
out:
	spin_unlock(&j->lock);
	return ret;
}

int bch2_journal_flush_seq(struct journal *j, u64 seq)
{
	u64 start_time = local_clock();
	int ret, ret2;

	/*
	 * Don't update time_stats when @seq is already flushed:
	 */
	if (seq <= j->flushed_seq_ondisk)
		return 0;

	ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));

	if (!ret)
		bch2_time_stats_update(j->flush_seq_time, start_time);

	return ret ?: ret2 < 0 ? ret2 : 0;
}

/*
 * bch2_journal_flush_async - if there is an open journal entry, or a journal
 * still being written, write it and wait for the write to complete
 */
void bch2_journal_flush_async(struct journal *j, struct closure *parent)
{
	bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
}

int bch2_journal_flush(struct journal *j)
{
	return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
}

/*
 * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
 * @seq
 */
bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	u64 unwritten_seq;
	bool ret = false;

	if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
		return false;

	if (seq <= c->journal.flushed_seq_ondisk)
		return false;

	spin_lock(&j->lock);
	if (seq <= c->journal.flushed_seq_ondisk)
		goto out;

	for (unwritten_seq = journal_last_unwritten_seq(j);
	     unwritten_seq < seq;
	     unwritten_seq++) {
		struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
		/* journal flush already in flight, or flush requested */
		if (buf->must_flush)
			goto out;

		buf->noflush = true;
	}

	ret = true;
out:
	spin_unlock(&j->lock);
	return ret;
}

int bch2_journal_meta(struct journal *j)
{
	struct journal_buf *buf;
	struct journal_res res;
	int ret;

	memset(&res, 0, sizeof(res));

	ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
	if (ret)
		return ret;

	buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
	buf->must_flush = true;

	if (!buf->flush_time) {
		buf->flush_time = local_clock() ?: 1;
		buf->expires = jiffies;
	}

	bch2_journal_res_put(j, &res);

	return bch2_journal_flush_seq(j, res.seq);
}

/* block/unblock the journal: */

void bch2_journal_unblock(struct journal *j)
{
	spin_lock(&j->lock);
	j->blocked--;
	spin_unlock(&j->lock);

	journal_wake(j);
}

void bch2_journal_block(struct journal *j)
{
	spin_lock(&j->lock);
	j->blocked++;
	spin_unlock(&j->lock);

	journal_quiesce(j);
}

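/*
 * Find the next unwritten journal buf that still needs flushing to the btree
 * write buffer, closing the current entry if necessary. Returns NULL if there
 * is none, ERR_PTR(-EAGAIN) if a candidate still has outstanding reservations,
 * or the buf itself with j->buf_lock held.
 */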
static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
{
	struct journal_buf *ret = NULL;

	/* We're inside wait_event(), but using mutex_lock(): */
	sched_annotate_sleep();
	mutex_lock(&j->buf_lock);
	spin_lock(&j->lock);
	max_seq = min(max_seq, journal_cur_seq(j));

	for (u64 seq = journal_last_unwritten_seq(j);
	     seq <= max_seq;
	     seq++) {
		unsigned idx = seq & JOURNAL_BUF_MASK;
		struct journal_buf *buf = j->buf + idx;

		if (buf->need_flush_to_write_buffer) {
			if (seq == journal_cur_seq(j))
				__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);

			union journal_res_state s;
			s.v = atomic64_read_acquire(&j->reservations.counter);

			ret = journal_state_count(s, idx)
				? ERR_PTR(-EAGAIN)
				: buf;
			break;
		}
	}

	spin_unlock(&j->lock);
	if (IS_ERR_OR_NULL(ret))
		mutex_unlock(&j->buf_lock);
	return ret;
}

struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
{
	struct journal_buf *ret;

	wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
	return ret;
}

/* allocate journal on a device: */

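/*
 * Grow a device's journal to @nr buckets: allocate the new buckets, mark them
 * as journal buckets, splice them into the bucket ring at the discard
 * position, then write out the new superblock journal section. The journal is
 * blocked while the ring is being updated.
 */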
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
					 bool new_fs, struct closure *cl)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	u64 *new_bucket_seq = NULL, *new_buckets = NULL;
	struct open_bucket **ob = NULL;
	long *bu = NULL;
	unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
	int ret = 0;

	BUG_ON(nr <= ja->nr);

	bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
	ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
	new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
	new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
	if (!bu || !ob || !new_buckets || !new_bucket_seq) {
		ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
		goto err_free;
	}

	for (nr_got = 0; nr_got < nr_want; nr_got++) {
		if (new_fs) {
			bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
			if (bu[nr_got] < 0) {
				ret = -BCH_ERR_ENOSPC_bucket_alloc;
				break;
			}
		} else {
			ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal,
						       BCH_DATA_journal, cl);
			ret = PTR_ERR_OR_ZERO(ob[nr_got]);
			if (ret)
				break;

			ret = bch2_trans_run(c,
				bch2_trans_mark_metadata_bucket(trans, ca,
						ob[nr_got]->bucket, BCH_DATA_journal,
						ca->mi.bucket_size, BTREE_TRIGGER_transactional));
			if (ret) {
				bch2_open_bucket_put(c, ob[nr_got]);
				bch_err_msg(c, ret, "marking new journal buckets");
				break;
			}

			bu[nr_got] = ob[nr_got]->bucket;
		}
	}

	if (!nr_got)
		goto err_free;

	/* Don't return an error if we successfully allocated some buckets: */
	ret = 0;

	if (c) {
		bch2_journal_flush_all_pins(&c->journal);
		bch2_journal_block(&c->journal);
		mutex_lock(&c->sb_lock);
	}

	memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
	memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));

	BUG_ON(ja->discard_idx > ja->nr);

	pos = ja->discard_idx ?: ja->nr;

	memmove(new_buckets + pos + nr_got,
		new_buckets + pos,
		sizeof(new_buckets[0]) * (ja->nr - pos));
	memmove(new_bucket_seq + pos + nr_got,
		new_bucket_seq + pos,
		sizeof(new_bucket_seq[0]) * (ja->nr - pos));

	for (i = 0; i < nr_got; i++) {
		new_buckets[pos + i] = bu[i];
		new_bucket_seq[pos + i] = 0;
	}

	nr = ja->nr + nr_got;

	ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
	if (ret)
		goto err_unblock;

	if (!new_fs)
		bch2_write_super(c);

	/* Commit: */
	if (c)
		spin_lock(&c->journal.lock);

	swap(new_buckets, ja->buckets);
	swap(new_bucket_seq, ja->bucket_seq);
	ja->nr = nr;

	if (pos <= ja->discard_idx)
		ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
	if (pos <= ja->dirty_idx_ondisk)
		ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
	if (pos <= ja->dirty_idx)
		ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
	if (pos <= ja->cur_idx)
		ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;

	if (c)
		spin_unlock(&c->journal.lock);
err_unblock:
	if (c) {
		bch2_journal_unblock(&c->journal);
		mutex_unlock(&c->sb_lock);
	}

	if (ret && !new_fs)
		for (i = 0; i < nr_got; i++)
			bch2_trans_run(c,
				bch2_trans_mark_metadata_bucket(trans, ca,
						bu[i], BCH_DATA_free, 0,
						BTREE_TRIGGER_transactional));
err_free:
	if (!new_fs)
		for (i = 0; i < nr_got; i++)
			bch2_open_bucket_put(c, ob[i]);

	kfree(new_bucket_seq);
	kfree(new_buckets);
	kfree(ob);
	kfree(bu);
	return ret;
}

/*
 * Allocate more journal space at runtime - not currently making use of it, but
 * the code works:
 */
int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
				unsigned nr)
{
	struct journal_device *ja = &ca->journal;
	struct closure cl;
	int ret = 0;

	closure_init_stack(&cl);

	down_write(&c->state_lock);

	/* don't handle reducing nr of buckets yet: */
	if (nr < ja->nr)
		goto unlock;

	while (ja->nr < nr) {
		struct disk_reservation disk_res = { 0, 0, 0 };

		/*
		 * note: journal buckets aren't really counted as _sectors_ used yet, so
		 * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
		 * when space used goes up without a reservation - but we do need the
		 * reservation to ensure we'll actually be able to allocate:
		 *
		 * XXX: that's not right, disk reservations only ensure a
		 * filesystem-wide allocation will succeed, this is a device
		 * specific allocation - we can hang here:
		 */

		ret = bch2_disk_reservation_get(c, &disk_res,
						bucket_to_sector(ca, nr - ja->nr), 1, 0);
		if (ret)
			break;

		ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);

		bch2_disk_reservation_put(c, &disk_res);

		closure_sync(&cl);

		if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
			break;
	}

	bch_err_fn(c, ret);
unlock:
	up_write(&c->state_lock);
	return ret;
}

int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
{
	unsigned nr;
	int ret;

	if (dynamic_fault("bcachefs:add:journal_alloc")) {
		ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
		goto err;
	}

	/* 1/128th of the device by default: */
	nr = ca->mi.nbuckets >> 7;

	/*
	 * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
	 * is smaller:
	 */
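	/*
	 * Worked example (illustrative numbers, not from the source): with
	 * 256KiB buckets (512 sectors), (1 << 24) / 512 = 32768, so the
	 * 8192-bucket cap applies; with 4MiB buckets (8192 sectors),
	 * (1 << 24) / 8192 = 2048 buckets, i.e. the 8GB cap.
	 */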
	nr = clamp_t(unsigned, nr,
		     BCH_JOURNAL_BUCKETS_MIN,
		     min(1 << 13,
			 (1 << 24) / ca->mi.bucket_size));

	ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL);
err:
	bch_err_fn(ca, ret);
	return ret;
}

int bch2_fs_journal_alloc(struct bch_fs *c)
{
	for_each_online_member(c, ca) {
		if (ca->journal.nr)
			continue;

		int ret = bch2_dev_journal_alloc(ca, true);
		if (ret) {
			percpu_ref_put(&ca->io_ref);
			return ret;
		}
	}

	return 0;
}

/* startup/shutdown: */

static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
	bool ret = false;
	u64 seq;

	spin_lock(&j->lock);
	for (seq = journal_last_unwritten_seq(j);
	     seq <= journal_cur_seq(j) && !ret;
	     seq++) {
		struct journal_buf *buf = journal_seq_to_buf(j, seq);

		if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
			ret = true;
	}
	spin_unlock(&j->lock);

	return ret;
}

void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
{
	wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
}

void bch2_fs_journal_stop(struct journal *j)
{
	if (!test_bit(JOURNAL_running, &j->flags))
		return;

	bch2_journal_reclaim_stop(j);
	bch2_journal_flush_all_pins(j);

	wait_event(j->wait, bch2_journal_entry_close(j));

	/*
	 * Always write a new journal entry, to make sure the clock hands are up
	 * to date (and match the superblock)
	 */
	bch2_journal_meta(j);

	journal_quiesce(j);
	cancel_delayed_work_sync(&j->write_work);

	WARN(!bch2_journal_error(j) &&
	     test_bit(JOURNAL_replay_done, &j->flags) &&
	     j->last_empty_seq != journal_cur_seq(j),
	     "journal shutdown error: cur seq %llu but last empty seq %llu",
	     journal_cur_seq(j), j->last_empty_seq);

	if (!bch2_journal_error(j))
		clear_bit(JOURNAL_running, &j->flags);
}

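/*
 * Start the journal after recovery: size the pin fifo to cover every sequence
 * number from the oldest entry we must keep (the last_seq of the newest
 * replayed entry) up to @cur_seq, seed the pin lists and device lists from the
 * replayed entries, then mark the journal as running and kick off reclaim.
 */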
int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin_list *p;
	struct journal_replay *i, **_i;
	struct genradix_iter iter;
	bool had_entries = false;
	u64 last_seq = cur_seq, nr, seq;

	genradix_for_each_reverse(&c->journal_entries, iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		last_seq = le64_to_cpu(i->j.last_seq);
		break;
	}

	nr = cur_seq - last_seq;

	if (nr + 1 > j->pin.size) {
		free_fifo(&j->pin);
		init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
		if (!j->pin.data) {
			bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
			return -BCH_ERR_ENOMEM_journal_pin_fifo;
		}
	}

	j->replay_journal_seq = last_seq;
	j->replay_journal_seq_end = cur_seq;
	j->last_seq_ondisk = last_seq;
	j->flushed_seq_ondisk = cur_seq - 1;
	j->seq_ondisk = cur_seq - 1;
	j->pin.front = last_seq;
	j->pin.back = cur_seq;
	atomic64_set(&j->seq, cur_seq - 1);

	fifo_for_each_entry_ptr(p, &j->pin, seq)
		journal_pin_list_init(p, 1);

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		seq = le64_to_cpu(i->j.seq);
		BUG_ON(seq >= cur_seq);

		if (seq < last_seq)
			continue;

		if (journal_entry_empty(&i->j))
			j->last_empty_seq = le64_to_cpu(i->j.seq);

		p = journal_seq_pin(j, seq);

		p->devs.nr = 0;
		darray_for_each(i->ptrs, ptr)
			bch2_dev_list_add_dev(&p->devs, ptr->dev);

		had_entries = true;
	}

	if (!had_entries)
		j->last_empty_seq = cur_seq;

	spin_lock(&j->lock);

	set_bit(JOURNAL_running, &j->flags);
	j->last_flush_write = jiffies;

	j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
	j->reservations.unwritten_idx++;

	c->last_bucket_seq_cleanup = journal_cur_seq(j);

	bch2_journal_space_available(j);
	spin_unlock(&j->lock);

	return bch2_journal_reclaim_start(j);
}

/* init/exit: */

void bch2_dev_journal_exit(struct bch_dev *ca)
{
	struct journal_device *ja = &ca->journal;

	for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
		kfree(ja->bio[i]);
		ja->bio[i] = NULL;
	}

	kfree(ja->buckets);
	kfree(ja->bucket_seq);
	ja->buckets = NULL;
	ja->bucket_seq = NULL;
}

int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
{
	struct journal_device *ja = &ca->journal;
	struct bch_sb_field_journal *journal_buckets =
		bch2_sb_field_get(sb, journal);
	struct bch_sb_field_journal_v2 *journal_buckets_v2 =
		bch2_sb_field_get(sb, journal_v2);

	ja->nr = 0;

	if (journal_buckets_v2) {
		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);

		for (unsigned i = 0; i < nr; i++)
			ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
	} else if (journal_buckets) {
		ja->nr = bch2_nr_journal_buckets(journal_buckets);
	}

	ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
	if (!ja->bucket_seq)
		return -BCH_ERR_ENOMEM_dev_journal_init;

	unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);

	for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
		ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
				     nr_bvecs), GFP_KERNEL);
		if (!ja->bio[i])
			return -BCH_ERR_ENOMEM_dev_journal_init;

		ja->bio[i]->ca = ca;
		ja->bio[i]->buf_idx = i;
		bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
	}

	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
	if (!ja->buckets)
		return -BCH_ERR_ENOMEM_dev_journal_init;

	if (journal_buckets_v2) {
		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
		unsigned dst = 0;

		for (unsigned i = 0; i < nr; i++)
			for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
				ja->buckets[dst++] =
					le64_to_cpu(journal_buckets_v2->d[i].start) + j;
	} else if (journal_buckets) {
		for (unsigned i = 0; i < ja->nr; i++)
			ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
	}

	return 0;
}

void bch2_fs_journal_exit(struct journal *j)
{
	if (j->wq)
		destroy_workqueue(j->wq);

	darray_exit(&j->early_journal_entries);

	for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
		kvfree(j->buf[i].data);
	free_fifo(&j->pin);
}

int bch2_fs_journal_init(struct journal *j)
{
	static struct lock_class_key res_key;

	mutex_init(&j->buf_lock);
	spin_lock_init(&j->lock);
	spin_lock_init(&j->err_lock);
	init_waitqueue_head(&j->wait);
	INIT_DELAYED_WORK(&j->write_work, journal_write_work);
	init_waitqueue_head(&j->reclaim_wait);
	init_waitqueue_head(&j->pin_flush_wait);
	mutex_init(&j->reclaim_lock);
	mutex_init(&j->discard_lock);

	lockdep_init_map(&j->res_map, "journal res", &res_key, 0);

	atomic64_set(&j->reservations.counter,
		((union journal_res_state)
		 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);

	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
		return -BCH_ERR_ENOMEM_journal_pin_fifo;

	for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
		j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
		j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
		if (!j->buf[i].data)
			return -BCH_ERR_ENOMEM_journal_buf;
		j->buf[i].idx = i;
	}

	j->pin.front = j->pin.back = 1;

	j->wq = alloc_workqueue("bcachefs_journal",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
	if (!j->wq)
		return -BCH_ERR_ENOMEM_fs_other_alloc;
	return 0;
}

/* debug: */

static const char * const bch2_journal_flags_strs[] = {
#define x(n)	#n,
	JOURNAL_FLAGS()
#undef x
	NULL
};

void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	union journal_res_state s;
	unsigned long now = jiffies;
	u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;

	printbuf_tabstops_reset(out);
	printbuf_tabstop_push(out, 28);
	out->atomic++;

	rcu_read_lock();
	s = READ_ONCE(j->reservations);

	prt_printf(out, "flags:\t");
	prt_bitflags(out, bch2_journal_flags_strs, j->flags);
	prt_newline(out);
	prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
	prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
	prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);
	prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j));
	prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
	prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
	prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);
	prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
	prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
	prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
	prt_printf(out, "average write size:\t");
	prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
	prt_newline(out);
	prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
	prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
	prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
	prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
		   ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
	prt_printf(out, "blocked:\t%u\n", j->blocked);
	prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
	prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
	prt_printf(out, "current entry:\t");

	switch (s.cur_entry_offset) {
	case JOURNAL_ENTRY_ERROR_VAL:
		prt_printf(out, "error\n");
		break;
	case JOURNAL_ENTRY_CLOSED_VAL:
		prt_printf(out, "closed\n");
		break;
	default:
		prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
		break;
	}

	prt_printf(out, "unwritten entries:\n");
	bch2_journal_bufs_to_text(out, j);

	prt_printf(out, "space:\n");
	printbuf_indent_add(out, 2);
	prt_printf(out, "discarded\t%u:%u\n",
		   j->space[journal_space_discarded].next_entry,
		   j->space[journal_space_discarded].total);
	prt_printf(out, "clean ondisk\t%u:%u\n",
		   j->space[journal_space_clean_ondisk].next_entry,
		   j->space[journal_space_clean_ondisk].total);
	prt_printf(out, "clean\t%u:%u\n",
		   j->space[journal_space_clean].next_entry,
		   j->space[journal_space_clean].total);
	prt_printf(out, "total\t%u:%u\n",
		   j->space[journal_space_total].next_entry,
		   j->space[journal_space_total].total);
	printbuf_indent_sub(out, 2);

	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
		struct journal_device *ja = &ca->journal;

		if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
			continue;

		if (!ja->nr)
			continue;

		prt_printf(out, "dev %u:\n", ca->dev_idx);
		printbuf_indent_add(out, 2);
		prt_printf(out, "nr\t%u\n", ja->nr);
		prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size);
		prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
		prt_printf(out, "discard_idx\t%u\n", ja->discard_idx);
		prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
		prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
		prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
		printbuf_indent_sub(out, 2);
	}

	rcu_read_unlock();

	--out->atomic;
}

void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
	spin_lock(&j->lock);
	__bch2_journal_debug_to_text(out, j);
	spin_unlock(&j->lock);
}

bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *pin;

	spin_lock(&j->lock);
	if (!test_bit(JOURNAL_running, &j->flags)) {
		spin_unlock(&j->lock);
		return true;
	}

	*seq = max(*seq, j->pin.front);

	if (*seq >= j->pin.back) {
		spin_unlock(&j->lock);
		return true;
	}

	out->atomic++;

	pin_list = journal_seq_pin(j, *seq);

	prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
	printbuf_indent_add(out, 2);

	for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
		list_for_each_entry(pin, &pin_list->list[i], list)
			prt_printf(out, "\t%px %ps\n", pin, pin->flush);

	if (!list_empty(&pin_list->flushed))
		prt_printf(out, "flushed:\n");

	list_for_each_entry(pin, &pin_list->flushed, list)
		prt_printf(out, "\t%px %ps\n", pin, pin->flush);

	printbuf_indent_sub(out, 2);

	--out->atomic;
	spin_unlock(&j->lock);

	return false;
}

void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
{
	u64 seq = 0;

	while (!bch2_journal_seq_pins_to_text(out, j, &seq))
		seq++;
}