Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: better journal pipelining

Recently a severe performance regression was discovered, which was
bisected to

a6548c8b5eb5 bcachefs: Avoid flushing the journal in the discard path

It turns out the old behaviour, which issued excessive journal flushes,
worked around a performance issue in which queueing delays prevented the
journal from writing quickly enough, causing it to stall.

The journal flushes masked the issue because they periodically flushed
the device write cache, reducing write latency for non-flush writes.

This patch reworks the journalling code to allow more than one
(non-flush) write to be in flight at a time. With this patch, doing 4k
random writes at an iodepth of 128, we are now able to hit 560k iops on
a Samsung 970 EVO Plus - previously, we were stuck in the ~200k range.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+98 -59
+38 -9
fs/bcachefs/journal.c
··· 74 74 prt_printf(out, "%li jiffies", buf->expires - jiffies); 75 75 prt_newline(out); 76 76 77 + if (buf->write_done) 78 + prt_printf(out, "write done\n"); 79 + else if (buf->write_allocated) 80 + prt_printf(out, "write allocated\n"); 81 + else if (buf->write_started) 82 + prt_printf(out, "write started\n"); 83 + 77 84 printbuf_indent_sub(out, 2); 78 85 } 79 86 ··· 182 175 return stuck; 183 176 } 184 177 178 + void bch2_journal_do_writes(struct journal *j) 179 + { 180 + for (u64 seq = journal_last_unwritten_seq(j); 181 + seq <= journal_cur_seq(j); 182 + seq++) { 183 + unsigned idx = seq & JOURNAL_BUF_MASK; 184 + struct journal_buf *w = j->buf + idx; 185 + 186 + if (w->write_started && !w->write_allocated) 187 + break; 188 + if (w->write_started) 189 + continue; 190 + 191 + if (!journal_state_count(j->reservations, idx)) { 192 + w->write_started = true; 193 + closure_call(&w->io, bch2_journal_write, j->wq, NULL); 194 + } 195 + 196 + break; 197 + } 198 + } 199 + 185 200 /* 186 201 * Final processing when the last reference of a journal buffer has been 187 202 * dropped. Drop the pin list reference acquired at journal entry open and write 188 203 * the buffer, if requested. 
189 204 */ 190 - void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) 205 + void bch2_journal_buf_put_final(struct journal *j, u64 seq) 191 206 { 192 207 lockdep_assert_held(&j->lock); 193 208 194 209 if (__bch2_journal_pin_put(j, seq)) 195 210 bch2_journal_reclaim_fast(j); 196 - if (write) { 197 - struct journal_buf *w = j->buf + (seq & JOURNAL_BUF_MASK); 198 - closure_call(&w->io, bch2_journal_write, j->wq, NULL); 199 - } 211 + bch2_journal_do_writes(j); 200 212 } 201 213 202 214 /* ··· 407 381 BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); 408 382 409 383 bkey_extent_init(&buf->key); 410 - buf->noflush = false; 411 - buf->must_flush = false; 412 - buf->separate_flush = false; 413 - buf->flush_time = 0; 384 + buf->noflush = false; 385 + buf->must_flush = false; 386 + buf->separate_flush = false; 387 + buf->flush_time = 0; 414 388 buf->need_flush_to_write_buffer = true; 389 + buf->write_started = false; 390 + buf->write_allocated = false; 391 + buf->write_done = false; 415 392 416 393 memset(buf->data, 0, sizeof(*buf->data)); 417 394 buf->data->seq = cpu_to_le64(journal_cur_seq(j));
+4 -3
fs/bcachefs/journal.h
··· 264 264 } 265 265 266 266 bool bch2_journal_entry_close(struct journal *); 267 - void bch2_journal_buf_put_final(struct journal *, u64, bool); 267 + void bch2_journal_do_writes(struct journal *); 268 + void bch2_journal_buf_put_final(struct journal *, u64); 268 269 269 270 static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) 270 271 { ··· 273 272 274 273 s = journal_state_buf_put(j, idx); 275 274 if (!journal_state_count(s, idx)) 276 - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); 275 + bch2_journal_buf_put_final(j, seq); 277 276 } 278 277 279 278 static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) ··· 283 282 s = journal_state_buf_put(j, idx); 284 283 if (!journal_state_count(s, idx)) { 285 284 spin_lock(&j->lock); 286 - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); 285 + bch2_journal_buf_put_final(j, seq); 287 286 spin_unlock(&j->lock); 288 287 } 289 288 }
+49 -43
fs/bcachefs/journal_io.c
··· 1602 1602 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1603 1603 struct bch_replicas_padded replicas; 1604 1604 union journal_res_state old, new; 1605 - u64 v, seq; 1605 + u64 v, seq = le64_to_cpu(w->data->seq); 1606 1606 int err = 0; 1607 1607 1608 1608 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) ··· 1622 1622 if (err) 1623 1623 bch2_fatal_error(c); 1624 1624 1625 - spin_lock(&j->lock); 1626 - seq = le64_to_cpu(w->data->seq); 1625 + closure_debug_destroy(cl); 1627 1626 1627 + spin_lock(&j->lock); 1628 1628 if (seq >= j->pin.front) 1629 1629 journal_seq_pin(j, seq)->devs = w->devs_written; 1630 + if (err && (!j->err_seq || seq < j->err_seq)) 1631 + j->err_seq = seq; 1632 + w->write_done = true; 1630 1633 1631 - if (!err) { 1632 - if (!JSET_NO_FLUSH(w->data)) { 1634 + bool completed = false; 1635 + 1636 + for (seq = journal_last_unwritten_seq(j); 1637 + seq <= journal_cur_seq(j); 1638 + seq++) { 1639 + w = j->buf + (seq & JOURNAL_BUF_MASK); 1640 + if (!w->write_done) 1641 + break; 1642 + 1643 + if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { 1633 1644 j->flushed_seq_ondisk = seq; 1634 1645 j->last_seq_ondisk = w->last_seq; 1635 1646 1636 1647 bch2_do_discards(c); 1637 1648 closure_wake_up(&c->freelist_wait); 1638 - 1639 1649 bch2_reset_alloc_cursors(c); 1640 1650 } 1641 - } else if (!j->err_seq || seq < j->err_seq) 1642 - j->err_seq = seq; 1643 1651 1644 - j->seq_ondisk = seq; 1652 + j->seq_ondisk = seq; 1645 1653 1646 - /* 1647 - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1648 - * more buckets: 1649 - * 1650 - * Must come before signaling write completion, for 1651 - * bch2_fs_journal_stop(): 1652 - */ 1653 - if (j->watermark != BCH_WATERMARK_stripe) 1654 - journal_reclaim_kick(&c->journal); 1654 + /* 1655 + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1656 + * more buckets: 1657 + * 1658 + * Must come before signaling write completion, for 1659 + * bch2_fs_journal_stop(): 1660 + */ 1661 + if 
(j->watermark != BCH_WATERMARK_stripe) 1662 + journal_reclaim_kick(&c->journal); 1655 1663 1656 - /* also must come before signalling write completion: */ 1657 - closure_debug_destroy(cl); 1664 + v = atomic64_read(&j->reservations.counter); 1665 + do { 1666 + old.v = new.v = v; 1667 + BUG_ON(journal_state_count(new, new.unwritten_idx)); 1668 + BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); 1658 1669 1659 - v = atomic64_read(&j->reservations.counter); 1660 - do { 1661 - old.v = new.v = v; 1662 - BUG_ON(journal_state_count(new, new.unwritten_idx)); 1670 + new.unwritten_idx++; 1671 + } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); 1663 1672 1664 - new.unwritten_idx++; 1665 - } while ((v = atomic64_cmpxchg(&j->reservations.counter, 1666 - old.v, new.v)) != old.v); 1673 + completed = true; 1674 + } 1667 1675 1668 - bch2_journal_reclaim_fast(j); 1669 - bch2_journal_space_available(j); 1676 + if (completed) { 1677 + bch2_journal_reclaim_fast(j); 1678 + bch2_journal_space_available(j); 1670 1679 1671 - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], 1672 - &j->max_in_flight_start, false); 1680 + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], 1681 + &j->max_in_flight_start, false); 1673 1682 1674 - closure_wake_up(&w->wait); 1675 - journal_wake(j); 1683 + closure_wake_up(&w->wait); 1684 + journal_wake(j); 1685 + } 1676 1686 1677 - if (!journal_state_count(new, new.unwritten_idx) && 1678 - journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { 1679 - struct journal_buf *w = j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1680 - spin_unlock(&j->lock); 1681 - closure_call(&w->io, bch2_journal_write, j->wq, NULL); 1682 - } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1687 + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1683 1688 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1684 1689 struct journal_buf *buf = journal_cur_buf(j); 1685 1690 
long delta = buf->expires - jiffies; ··· 1694 1689 * previous entries still in flight - the current journal entry 1695 1690 * might want to be written now: 1696 1691 */ 1697 - 1698 - spin_unlock(&j->lock); 1699 1692 mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1700 - } else { 1701 - spin_unlock(&j->lock); 1702 1693 } 1694 + 1695 + spin_unlock(&j->lock); 1703 1696 } 1704 1697 1705 1698 static void journal_write_endio(struct bio *bio) ··· 1951 1948 int ret; 1952 1949 1953 1950 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1951 + BUG_ON(w->write_allocated); 1954 1952 1955 1953 j->write_start_time = local_clock(); 1956 1954 ··· 1995 1991 * bch2_journal_space_available(): 1996 1992 */ 1997 1993 w->sectors = 0; 1994 + w->write_allocated = true; 1998 1995 1999 1996 /* 2000 1997 * journal entry has been compacted and allocated, recalculate space 2001 1998 * available: 2002 1999 */ 2003 2000 bch2_journal_space_available(j); 2001 + bch2_journal_do_writes(j); 2004 2002 spin_unlock(&j->lock); 2005 2003 2006 2004 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+7 -4
fs/bcachefs/journal_types.h
··· 34 34 unsigned disk_sectors; /* maximum size entry could have been, if 35 35 buf_size was bigger */ 36 36 unsigned u64s_reserved; 37 - bool noflush; /* write has already been kicked off, and was noflush */ 38 - bool must_flush; /* something wants a flush */ 39 - bool separate_flush; 40 - bool need_flush_to_write_buffer; 37 + bool noflush:1; /* write has already been kicked off, and was noflush */ 38 + bool must_flush:1; /* something wants a flush */ 39 + bool separate_flush:1; 40 + bool need_flush_to_write_buffer:1; 41 + bool write_started:1; 42 + bool write_allocated:1; 43 + bool write_done:1; 41 44 u8 idx; 42 45 }; 43 46