Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: Persist 64 bit io clocks

Originally, bcachefs - going back to bcache - stored, for each bucket, a
16 bit counter corresponding to how long it had been since the bucket
was read from. But, this required periodically rescaling counters on
every bucket to avoid wraparound. That wasn't an issue in bcache, where
we'd periodically rewrite the per bucket metadata all at once, but in
bcachefs we're trying to avoid having to walk every single bucket.

This patch switches to persisting 64 bit io clocks, corresponding to the
64 bit bucket timestamps introduced in the previous patch with
KEY_TYPE_alloc_v2.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

authored by

Kent Overstreet and committed by
Kent Overstreet
2abe5420 7f4e1d5d

+142 -311
+42 -183
fs/bcachefs/alloc_background.c
··· 31 31 #undef x 32 32 }; 33 33 34 - static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); 35 - 36 34 /* Ratelimiting/PD controllers */ 37 35 38 36 static void pd_controllers_update(struct work_struct *work) ··· 338 340 339 341 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) 340 342 { 341 - struct bch_dev *ca; 342 - unsigned i; 343 - int ret = 0; 343 + int ret; 344 344 345 345 down_read(&c->gc_lock); 346 346 ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, ··· 353 357 percpu_down_write(&c->mark_lock); 354 358 bch2_dev_usage_from_buckets(c); 355 359 percpu_up_write(&c->mark_lock); 356 - 357 - mutex_lock(&c->bucket_clock[READ].lock); 358 - for_each_member_device(ca, c, i) { 359 - down_read(&ca->bucket_lock); 360 - bch2_recalc_oldest_io(c, ca, READ); 361 - up_read(&ca->bucket_lock); 362 - } 363 - mutex_unlock(&c->bucket_clock[READ].lock); 364 - 365 - mutex_lock(&c->bucket_clock[WRITE].lock); 366 - for_each_member_device(ca, c, i) { 367 - down_read(&ca->bucket_lock); 368 - bch2_recalc_oldest_io(c, ca, WRITE); 369 - up_read(&ca->bucket_lock); 370 - } 371 - mutex_unlock(&c->bucket_clock[WRITE].lock); 372 360 373 361 return 0; 374 362 } ··· 440 460 441 461 /* Bucket IO clocks: */ 442 462 443 - static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) 444 - { 445 - struct bucket_clock *clock = &c->bucket_clock[rw]; 446 - struct bucket_array *buckets = bucket_array(ca); 447 - struct bucket *g; 448 - u16 max_last_io = 0; 449 - unsigned i; 450 - 451 - lockdep_assert_held(&c->bucket_clock[rw].lock); 452 - 453 - /* Recalculate max_last_io for this device: */ 454 - for_each_bucket(g, buckets) 455 - max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); 456 - 457 - ca->max_last_bucket_io[rw] = max_last_io; 458 - 459 - /* Recalculate global max_last_io: */ 460 - max_last_io = 0; 461 - 462 - for_each_member_device(ca, c, i) 463 - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); 464 
- 465 - clock->max_last_io = max_last_io; 466 - } 467 - 468 - static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) 469 - { 470 - struct bucket_clock *clock = &c->bucket_clock[rw]; 471 - struct bucket_array *buckets; 472 - struct bch_dev *ca; 473 - struct bucket *g; 474 - unsigned i; 475 - 476 - trace_rescale_prios(c); 477 - 478 - for_each_member_device(ca, c, i) { 479 - down_read(&ca->bucket_lock); 480 - buckets = bucket_array(ca); 481 - 482 - for_each_bucket(g, buckets) 483 - g->io_time[rw] = clock->hand - 484 - bucket_last_io(c, g, rw) / 2; 485 - 486 - bch2_recalc_oldest_io(c, ca, rw); 487 - 488 - up_read(&ca->bucket_lock); 489 - } 490 - } 491 - 492 - static inline u64 bucket_clock_freq(u64 capacity) 493 - { 494 - return max(capacity >> 10, 2028ULL); 495 - } 496 - 497 - static void bch2_inc_clock_hand(struct io_timer *timer) 498 - { 499 - struct bucket_clock *clock = container_of(timer, 500 - struct bucket_clock, rescale); 501 - struct bch_fs *c = container_of(clock, 502 - struct bch_fs, bucket_clock[clock->rw]); 503 - struct bch_dev *ca; 504 - u64 capacity; 505 - unsigned i; 506 - 507 - mutex_lock(&clock->lock); 508 - 509 - /* if clock cannot be advanced more, rescale prio */ 510 - if (clock->max_last_io >= U16_MAX - 2) 511 - bch2_rescale_bucket_io_times(c, clock->rw); 512 - 513 - BUG_ON(clock->max_last_io >= U16_MAX - 2); 514 - 515 - for_each_member_device(ca, c, i) 516 - ca->max_last_bucket_io[clock->rw]++; 517 - clock->max_last_io++; 518 - clock->hand++; 519 - 520 - mutex_unlock(&clock->lock); 521 - 522 - capacity = READ_ONCE(c->capacity); 523 - 524 - if (!capacity) 525 - return; 526 - 527 - /* 528 - * we only increment when 0.1% of the filesystem capacity has been read 529 - * or written too, this determines if it's time 530 - * 531 - * XXX: we shouldn't really be going off of the capacity of devices in 532 - * RW mode (that will be 0 when we're RO, yet we can still service 533 - * reads) 534 - */ 535 - timer->expire += 
bucket_clock_freq(capacity); 536 - 537 - bch2_io_timer_add(&c->io_clock[clock->rw], timer); 538 - } 539 - 540 - static void bch2_bucket_clock_init(struct bch_fs *c, int rw) 541 - { 542 - struct bucket_clock *clock = &c->bucket_clock[rw]; 543 - 544 - clock->hand = 1; 545 - clock->rw = rw; 546 - clock->rescale.fn = bch2_inc_clock_hand; 547 - clock->rescale.expire = bucket_clock_freq(c->capacity); 548 - mutex_init(&clock->lock); 549 - } 550 - 551 463 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, 552 464 size_t bucket_nr, int rw) 553 465 { ··· 449 577 struct bucket *g; 450 578 struct bkey_alloc_buf *a; 451 579 struct bkey_alloc_unpacked u; 452 - u64 *time; 580 + u64 *time, now; 453 581 int ret = 0; 454 582 455 583 iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), ··· 471 599 percpu_up_read(&c->mark_lock); 472 600 473 601 time = rw == READ ? &u.read_time : &u.write_time; 474 - if (*time == c->bucket_clock[rw].hand) 602 + now = atomic64_read(&c->io_clock[rw].now); 603 + if (*time == now) 475 604 goto out; 476 605 477 - *time = c->bucket_clock[rw].hand; 606 + *time = now; 478 607 479 608 bch2_alloc_pack(c, a, u); 480 609 ret = bch2_trans_update(trans, iter, &a->k, 0) ?: ··· 547 674 return ret; 548 675 } 549 676 550 - static bool bch2_can_invalidate_bucket(struct bch_dev *ca, 551 - size_t bucket, 552 - struct bucket_mark mark) 677 + static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, 678 + struct bucket_mark m) 553 679 { 554 680 u8 gc_gen; 555 681 556 - if (!is_available_bucket(mark)) 682 + if (!is_available_bucket(m)) 557 683 return false; 558 684 559 - if (mark.owned_by_allocator) 685 + if (m.owned_by_allocator) 560 686 return false; 561 687 562 688 if (ca->buckets_nouse && 563 - test_bit(bucket, ca->buckets_nouse)) 689 + test_bit(b, ca->buckets_nouse)) 564 690 return false; 565 691 566 - gc_gen = bucket_gc_gen(ca, bucket); 692 + gc_gen = bucket_gc_gen(bucket(ca, b)); 567 693 568 694 if (gc_gen >= 
BUCKET_GC_GEN_MAX / 2) 569 695 ca->inc_gen_needs_gc++; ··· 576 704 /* 577 705 * Determines what order we're going to reuse buckets, smallest bucket_key() 578 706 * first. 579 - * 580 - * 581 - * - We take into account the read prio of the bucket, which gives us an 582 - * indication of how hot the data is -- we scale the prio so that the prio 583 - * farthest from the clock is worth 1/8th of the closest. 584 - * 585 - * - The number of sectors of cached data in the bucket, which gives us an 586 - * indication of the cost in cache misses this eviction will cause. 587 - * 588 - * - If hotness * sectors used compares equal, we pick the bucket with the 589 - * smallest bucket_gc_gen() - since incrementing the same bucket's generation 590 - * number repeatedly forces us to run mark and sweep gc to avoid generation 591 - * number wraparound. 592 707 */ 593 708 594 - static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, 595 - size_t b, struct bucket_mark m) 709 + static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, 710 + u64 now, u64 last_seq_ondisk) 596 711 { 597 - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); 598 - unsigned max_last_io = ca->max_last_bucket_io[READ]; 712 + unsigned used = bucket_sectors_used(m); 599 713 600 - /* 601 - * Time since last read, scaled to [0, 8) where larger value indicates 602 - * more recently read data: 603 - */ 604 - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; 714 + if (used) { 715 + /* 716 + * Prefer to keep buckets that have been read more recently, and 717 + * buckets that have more data in them: 718 + */ 719 + u64 last_read = max_t(s64, 0, now - g->io_time[READ]); 720 + u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); 605 721 606 - /* How much we want to keep the data in this bucket: */ 607 - unsigned long data_wantness = 608 - (hotness + 1) * bucket_sectors_used(m); 609 - 610 - unsigned long needs_journal_commit = 611 - 
bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); 612 - 613 - return (data_wantness << 9) | 614 - (needs_journal_commit << 8) | 615 - (bucket_gc_gen(ca, b) / 16); 722 + return -last_read_scaled; 723 + } else { 724 + /* 725 + * Prefer to use buckets with smaller gc_gen so that we don't 726 + * have to walk the btree and recalculate oldest_gen - but shift 727 + * off the low bits so that buckets will still have equal sort 728 + * keys when there's only a small difference, so that we can 729 + * keep sequential buckets together: 730 + */ 731 + return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| 732 + (bucket_gc_gen(g) >> 4); 733 + } 616 734 } 617 735 618 736 static inline int bucket_alloc_cmp(alloc_heap *h, ··· 625 763 { 626 764 struct bucket_array *buckets; 627 765 struct alloc_heap_entry e = { 0 }; 766 + u64 now, last_seq_ondisk; 628 767 size_t b, i, nr = 0; 629 768 630 - ca->alloc_heap.used = 0; 631 - 632 - mutex_lock(&c->bucket_clock[READ].lock); 633 769 down_read(&ca->bucket_lock); 634 770 635 771 buckets = bucket_array(ca); 636 - 637 - bch2_recalc_oldest_io(c, ca, READ); 772 + ca->alloc_heap.used = 0; 773 + now = atomic64_read(&c->io_clock[READ].now); 774 + last_seq_ondisk = c->journal.last_seq_ondisk; 638 775 639 776 /* 640 777 * Find buckets with lowest read priority, by building a maxheap sorted ··· 641 780 * all buckets have been visited. 
642 781 */ 643 782 for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { 644 - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); 645 - unsigned long key = bucket_sort_key(c, ca, b, m); 783 + struct bucket *g = &buckets->b[b]; 784 + struct bucket_mark m = READ_ONCE(g->mark); 785 + unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); 646 786 647 787 if (!bch2_can_invalidate_bucket(ca, b, m)) 648 788 continue; ··· 678 816 } 679 817 680 818 up_read(&ca->bucket_lock); 681 - mutex_unlock(&c->bucket_clock[READ].lock); 682 819 } 683 820 684 821 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) ··· 892 1031 u.data_type = 0; 893 1032 u.dirty_sectors = 0; 894 1033 u.cached_sectors = 0; 895 - u.read_time = c->bucket_clock[READ].hand; 896 - u.write_time = c->bucket_clock[WRITE].hand; 1034 + u.read_time = atomic64_read(&c->io_clock[READ].now); 1035 + u.write_time = atomic64_read(&c->io_clock[WRITE].now); 897 1036 898 1037 bch2_alloc_pack(c, &a, u); 899 1038 bch2_trans_update(trans, iter, &a.k, ··· 1403 1542 void bch2_fs_allocator_background_init(struct bch_fs *c) 1404 1543 { 1405 1544 spin_lock_init(&c->freelist_lock); 1406 - bch2_bucket_clock_init(c, READ); 1407 - bch2_bucket_clock_init(c, WRITE); 1408 1545 1409 1546 c->pd_controllers_update_seconds = 5; 1410 1547 INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
-24
fs/bcachefs/alloc_types.h
··· 10 10 11 11 struct ec_bucket_buf; 12 12 13 - /* There's two of these clocks, one for reads and one for writes: */ 14 - struct bucket_clock { 15 - /* 16 - * "now" in (read/write) IO time - incremented whenever we do X amount 17 - * of reads or writes. 18 - * 19 - * Goes with the bucket read/write prios: when we read or write to a 20 - * bucket we reset the bucket's prio to the current hand; thus hand - 21 - * prio = time since bucket was last read/written. 22 - * 23 - * The units are some amount (bytes/sectors) of data read/written, and 24 - * the units can change on the fly if we need to rescale to fit 25 - * everything in a u16 - your only guarantee is that the units are 26 - * consistent. 27 - */ 28 - u16 hand; 29 - u16 max_last_io; 30 - 31 - int rw; 32 - 33 - struct io_timer rescale; 34 - struct mutex lock; 35 - }; 36 - 37 13 enum alloc_reserve { 38 14 RESERVE_BTREE_MOVINGGC = -2, 39 15 RESERVE_BTREE = -1,
-11
fs/bcachefs/bcachefs.h
··· 451 451 452 452 size_t fifo_last_bucket; 453 453 454 - /* last calculated minimum prio */ 455 - u16 max_last_bucket_io[2]; 456 - 457 454 size_t inc_gen_needs_gc; 458 455 size_t inc_gen_really_needs_gc; 459 456 ··· 689 692 /* single element mempool: */ 690 693 struct mutex usage_scratch_lock; 691 694 struct bch_fs_usage_online *usage_scratch; 692 - 693 - /* 694 - * When we invalidate buckets, we use both the priority and the amount 695 - * of good data to determine which buckets to reuse first - to weight 696 - * those together consistently we keep track of the smallest nonzero 697 - * priority of any bucket. 698 - */ 699 - struct bucket_clock bucket_clock[2]; 700 695 701 696 struct io_clock io_clock[2]; 702 697
+13 -5
fs/bcachefs/bcachefs_format.h
··· 1143 1143 struct bch_sb_field field; 1144 1144 1145 1145 __le32 flags; 1146 - __le16 read_clock; 1147 - __le16 write_clock; 1146 + __le16 _read_clock; /* no longer used */ 1147 + __le16 _write_clock; 1148 1148 __le64 journal_seq; 1149 1149 1150 1150 union { ··· 1511 1511 x(blacklist, 3) \ 1512 1512 x(blacklist_v2, 4) \ 1513 1513 x(usage, 5) \ 1514 - x(data_usage, 6) 1514 + x(data_usage, 6) \ 1515 + x(clock, 7) 1515 1516 1516 1517 enum { 1517 1518 #define x(f, nr) BCH_JSET_ENTRY_##f = nr, ··· 1560 1559 struct bch_replicas_entry r; 1561 1560 } __attribute__((packed)); 1562 1561 1562 + struct jset_entry_clock { 1563 + struct jset_entry entry; 1564 + __u8 rw; 1565 + __u8 pad[7]; 1566 + __le64 time; 1567 + } __attribute__((packed)); 1568 + 1563 1569 /* 1564 1570 * On disk format for a journal entry: 1565 1571 * seq is monotonically increasing; every journal entry has its own unique ··· 1589 1581 1590 1582 __u8 encrypted_start[0]; 1591 1583 1592 - __le16 read_clock; 1593 - __le16 write_clock; 1584 + __le16 _read_clock; /* no longer used */ 1585 + __le16 _write_clock; 1594 1586 1595 1587 /* Sequence number of oldest dirty journal entry */ 1596 1588 __le64 last_seq;
+3 -3
fs/bcachefs/btree_gc.c
··· 1489 1489 { 1490 1490 struct bch_fs *c = arg; 1491 1491 struct io_clock *clock = &c->io_clock[WRITE]; 1492 - unsigned long last = atomic_long_read(&clock->now); 1492 + unsigned long last = atomic64_read(&clock->now); 1493 1493 unsigned last_kick = atomic_read(&c->kick_gc); 1494 1494 int ret; 1495 1495 ··· 1510 1510 if (c->btree_gc_periodic) { 1511 1511 unsigned long next = last + c->capacity / 16; 1512 1512 1513 - if (atomic_long_read(&clock->now) >= next) 1513 + if (atomic64_read(&clock->now) >= next) 1514 1514 break; 1515 1515 1516 1516 bch2_io_clock_schedule_timeout(clock, next); ··· 1522 1522 } 1523 1523 __set_current_state(TASK_RUNNING); 1524 1524 1525 - last = atomic_long_read(&clock->now); 1525 + last = atomic64_read(&clock->now); 1526 1526 last_kick = atomic_read(&c->kick_gc); 1527 1527 1528 1528 /*
+1 -8
fs/bcachefs/buckets.h
··· 58 58 return __bucket(ca, b, false); 59 59 } 60 60 61 - static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) 62 - { 63 - return c->bucket_clock[rw].hand - g->io_time[rw]; 64 - } 65 - 66 61 /* 67 62 * bucket_gc_gen() returns the difference between the bucket's current gen and 68 63 * the oldest gen of any pointer into that bucket in the btree. 69 64 */ 70 65 71 - static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) 66 + static inline u8 bucket_gc_gen(struct bucket *g) 72 67 { 73 - struct bucket *g = bucket(ca, b); 74 - 75 68 return g->mark.gen - g->oldest_gen; 76 69 } 77 70
+1 -1
fs/bcachefs/buckets_types.h
··· 37 37 const struct bucket_mark mark; 38 38 }; 39 39 40 - u16 io_time[2]; 40 + u64 io_time[2]; 41 41 u8 oldest_gen; 42 42 u8 gc_gen; 43 43 unsigned gen_valid:1;
+4 -4
fs/bcachefs/clock.c
··· 19 19 20 20 spin_lock(&clock->timer_lock); 21 21 22 - if (time_after_eq((unsigned long) atomic_long_read(&clock->now), 22 + if (time_after_eq((unsigned long) atomic64_read(&clock->now), 23 23 timer->expire)) { 24 24 spin_unlock(&clock->timer_lock); 25 25 timer->fn(timer); ··· 146 146 void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) 147 147 { 148 148 struct io_timer *timer; 149 - unsigned long now = atomic_long_add_return(sectors, &clock->now); 149 + unsigned long now = atomic64_add_return(sectors, &clock->now); 150 150 151 151 while ((timer = get_expired_timer(clock, now))) 152 152 timer->fn(timer); ··· 158 158 unsigned i; 159 159 160 160 spin_lock(&clock->timer_lock); 161 - now = atomic_long_read(&clock->now); 161 + now = atomic64_read(&clock->now); 162 162 163 163 for (i = 0; i < clock->timers.used; i++) 164 164 pr_buf(out, "%ps:\t%li\n", ··· 175 175 176 176 int bch2_io_clock_init(struct io_clock *clock) 177 177 { 178 - atomic_long_set(&clock->now, 0); 178 + atomic64_set(&clock->now, 0); 179 179 spin_lock_init(&clock->timer_lock); 180 180 181 181 clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
+1 -1
fs/bcachefs/clock_types.h
··· 26 26 typedef HEAP(struct io_timer *) io_timer_heap; 27 27 28 28 struct io_clock { 29 - atomic_long_t now; 29 + atomic64_t now; 30 30 u16 __percpu *pcpu_buf; 31 31 unsigned max_slop; 32 32
+3
fs/bcachefs/journal.c
··· 1123 1123 j->entry_u64s_reserved += 1124 1124 BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); 1125 1125 1126 + j->entry_u64s_reserved += 1127 + 2 * (sizeof(struct jset_entry_clock) / sizeof(u64)); 1128 + 1126 1129 atomic64_set(&j->reservations.counter, 1127 1130 ((union journal_res_state) 1128 1131 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+28 -5
fs/bcachefs/journal_io.c
··· 426 426 return ret; 427 427 } 428 428 429 + static int journal_entry_validate_clock(struct bch_fs *c, 430 + struct jset *jset, 431 + struct jset_entry *entry, 432 + int write) 433 + { 434 + struct jset_entry_clock *clock = 435 + container_of(entry, struct jset_entry_clock, entry); 436 + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 437 + int ret = 0; 438 + 439 + if (journal_entry_err_on(bytes != sizeof(*clock), 440 + c, "invalid journal entry clock: bad size")) { 441 + journal_entry_null_range(entry, vstruct_next(entry)); 442 + return ret; 443 + } 444 + 445 + if (journal_entry_err_on(clock->rw > 1, 446 + c, "invalid journal entry clock: bad rw")) { 447 + journal_entry_null_range(entry, vstruct_next(entry)); 448 + return ret; 449 + } 450 + 451 + fsck_err: 452 + return ret; 453 + } 454 + 429 455 struct jset_entry_ops { 430 456 int (*validate)(struct bch_fs *, struct jset *, 431 457 struct jset_entry *, int); ··· 1387 1361 1388 1362 end = bch2_btree_roots_to_journal_entries(c, jset->start, end); 1389 1363 1390 - end = bch2_journal_super_entries_add_common(c, end, 1391 - le64_to_cpu(jset->seq)); 1364 + bch2_journal_super_entries_add_common(c, &end, 1365 + le64_to_cpu(jset->seq)); 1392 1366 u64s = (u64 *) end - (u64 *) start; 1393 1367 BUG_ON(u64s > j->entry_u64s_reserved); 1394 1368 ··· 1397 1371 1398 1372 journal_write_compact(jset); 1399 1373 1400 - jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); 1401 - jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); 1402 1374 jset->magic = cpu_to_le64(jset_magic(c)); 1403 - 1404 1375 jset->version = c->sb.version < bcachefs_metadata_version_new_versioning 1405 1376 ? cpu_to_le32(BCH_JSET_VERSION_OLD) 1406 1377 : cpu_to_le32(c->sb.version);
+2 -2
fs/bcachefs/movinggc.c
··· 298 298 { 299 299 struct bch_fs *c = arg; 300 300 struct io_clock *clock = &c->io_clock[WRITE]; 301 - unsigned long last, wait; 301 + u64 last, wait; 302 302 303 303 set_freezable(); 304 304 ··· 306 306 if (kthread_wait_freezable(c->copy_gc_enabled)) 307 307 break; 308 308 309 - last = atomic_long_read(&clock->now); 309 + last = atomic64_read(&clock->now); 310 310 wait = bch2_copygc_wait_amount(c); 311 311 312 312 if (wait > clock->max_slop) {
+5 -5
fs/bcachefs/rebalance.c
··· 169 169 unsigned long start, prev_start; 170 170 unsigned long prev_run_time, prev_run_cputime; 171 171 unsigned long cputime, prev_cputime; 172 - unsigned long io_start; 172 + u64 io_start; 173 173 long throttle; 174 174 175 175 set_freezable(); 176 176 177 - io_start = atomic_long_read(&clock->now); 177 + io_start = atomic64_read(&clock->now); 178 178 p = rebalance_work(c); 179 179 prev_start = jiffies; 180 180 prev_cputime = curr_cputime(); ··· 210 210 (20 - w.dev_most_full_percent), 211 211 50); 212 212 213 - if (atomic_long_read(&clock->now) + clock->max_slop < 213 + if (atomic64_read(&clock->now) + clock->max_slop < 214 214 r->throttled_until_iotime) { 215 215 r->throttled_until_cputime = start + throttle; 216 216 r->state = REBALANCE_THROTTLED; ··· 229 229 max(p.dev_most_full_percent, 1U) / 230 230 max(w.dev_most_full_percent, 1U)); 231 231 232 - io_start = atomic_long_read(&clock->now); 232 + io_start = atomic64_read(&clock->now); 233 233 p = w; 234 234 prev_start = start; 235 235 prev_cputime = cputime; ··· 274 274 case REBALANCE_THROTTLED: 275 275 bch2_hprint(&PBUF(h1), 276 276 (r->throttled_until_iotime - 277 - atomic_long_read(&c->io_clock[WRITE].now)) << 9); 277 + atomic64_read(&c->io_clock[WRITE].now)) << 9); 278 278 pr_buf(out, "throttled for %lu sec or %s io\n", 279 279 (r->throttled_until_cputime - jiffies) / HZ, 280 280 h1);
+1 -1
fs/bcachefs/rebalance_types.h
··· 17 17 atomic64_t work_unknown_dev; 18 18 19 19 enum rebalance_state state; 20 - unsigned long throttled_until_iotime; 20 + u64 throttled_until_iotime; 21 21 unsigned long throttled_until_cputime; 22 22 struct bch_move_stats move_stats; 23 23
+6 -13
fs/bcachefs/recovery.c
··· 847 847 le64_to_cpu(bl_entry->end) + 1); 848 848 break; 849 849 } 850 + case BCH_JSET_ENTRY_clock: { 851 + struct jset_entry_clock *clock = 852 + container_of(entry, struct jset_entry_clock, entry); 853 + 854 + atomic64_set(&c->io_clock[clock->rw].now, clock->time); 855 + } 850 856 } 851 857 852 858 return ret; ··· 867 861 int ret; 868 862 869 863 if (clean) { 870 - c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); 871 - c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); 872 - 873 864 for (entry = clean->start; 874 865 entry != vstruct_end(&clean->field); 875 866 entry = vstruct_next(entry)) { ··· 878 875 list_for_each_entry(i, journal, list) { 879 876 if (i->ignore) 880 877 continue; 881 - 882 - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); 883 - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); 884 878 885 879 vstruct_for_each(&i->j, entry) { 886 880 ret = journal_replay_entry_early(c, entry); ··· 941 941 *cleanp = NULL; 942 942 return 0; 943 943 } 944 - 945 - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, 946 - "superblock read clock %u doesn't match journal %u after clean shutdown", 947 - clean->read_clock, j->read_clock); 948 - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, 949 - "superblock write clock %u doesn't match journal %u after clean shutdown", 950 - clean->write_clock, j->write_clock); 951 944 952 945 for (i = 0; i < BTREE_ID_NR; i++) { 953 946 char buf1[200], buf2[200];
+28 -34
fs/bcachefs/super-io.c
··· 966 966 return ret; 967 967 } 968 968 969 - static void 970 - entry_init_u64s(struct jset_entry *entry, unsigned u64s) 969 + static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) 971 970 { 972 - memset(entry, 0, u64s * sizeof(u64)); 971 + struct jset_entry *entry = *end; 972 + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); 973 973 974 + memset(entry, 0, u64s * sizeof(u64)); 974 975 /* 975 976 * The u64s field counts from the start of data, ignoring the shared 976 977 * fields. 977 978 */ 978 979 entry->u64s = u64s - 1; 980 + 981 + *end = vstruct_next(*end); 982 + return entry; 979 983 } 980 984 981 - static void 982 - entry_init_size(struct jset_entry *entry, size_t size) 983 - { 984 - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); 985 - entry_init_u64s(entry, u64s); 986 - } 987 - 988 - struct jset_entry * 989 - bch2_journal_super_entries_add_common(struct bch_fs *c, 990 - struct jset_entry *entry, 991 - u64 journal_seq) 985 + void bch2_journal_super_entries_add_common(struct bch_fs *c, 986 + struct jset_entry **end, 987 + u64 journal_seq) 992 988 { 993 989 unsigned i; 994 990 ··· 999 1003 1000 1004 { 1001 1005 struct jset_entry_usage *u = 1002 - container_of(entry, struct jset_entry_usage, entry); 1006 + container_of(jset_entry_init(end, sizeof(*u)), 1007 + struct jset_entry_usage, entry); 1003 1008 1004 - entry_init_size(entry, sizeof(*u)); 1005 1009 u->entry.type = BCH_JSET_ENTRY_usage; 1006 1010 u->entry.btree_id = FS_USAGE_INODES; 1007 1011 u->v = cpu_to_le64(c->usage_base->nr_inodes); 1008 - 1009 - entry = vstruct_next(entry); 1010 1012 } 1011 1013 1012 1014 { 1013 1015 struct jset_entry_usage *u = 1014 - container_of(entry, struct jset_entry_usage, entry); 1016 + container_of(jset_entry_init(end, sizeof(*u)), 1017 + struct jset_entry_usage, entry); 1015 1018 1016 - entry_init_size(entry, sizeof(*u)); 1017 1019 u->entry.type = BCH_JSET_ENTRY_usage; 1018 1020 u->entry.btree_id = FS_USAGE_KEY_VERSION; 1019 1021 u->v = 
cpu_to_le64(atomic64_read(&c->key_version)); 1020 - 1021 - entry = vstruct_next(entry); 1022 1022 } 1023 1023 1024 1024 for (i = 0; i < BCH_REPLICAS_MAX; i++) { 1025 1025 struct jset_entry_usage *u = 1026 - container_of(entry, struct jset_entry_usage, entry); 1026 + container_of(jset_entry_init(end, sizeof(*u)), 1027 + struct jset_entry_usage, entry); 1027 1028 1028 - entry_init_size(entry, sizeof(*u)); 1029 1029 u->entry.type = BCH_JSET_ENTRY_usage; 1030 1030 u->entry.btree_id = FS_USAGE_RESERVED; 1031 1031 u->entry.level = i; 1032 1032 u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); 1033 - 1034 - entry = vstruct_next(entry); 1035 1033 } 1036 1034 1037 1035 for (i = 0; i < c->replicas.nr; i++) { 1038 1036 struct bch_replicas_entry *e = 1039 1037 cpu_replicas_entry(&c->replicas, i); 1040 1038 struct jset_entry_data_usage *u = 1041 - container_of(entry, struct jset_entry_data_usage, entry); 1039 + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), 1040 + struct jset_entry_data_usage, entry); 1042 1041 1043 - entry_init_size(entry, sizeof(*u) + e->nr_devs); 1044 1042 u->entry.type = BCH_JSET_ENTRY_data_usage; 1045 1043 u->v = cpu_to_le64(c->usage_base->replicas[i]); 1046 1044 unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), 1047 1045 "embedded variable length struct"); 1048 - 1049 - entry = vstruct_next(entry); 1050 1046 } 1051 1047 1052 1048 percpu_up_read(&c->mark_lock); 1053 1049 1054 - return entry; 1050 + for (i = 0; i < 2; i++) { 1051 + struct jset_entry_clock *clock = 1052 + container_of(jset_entry_init(end, sizeof(*clock)), 1053 + struct jset_entry_clock, entry); 1054 + 1055 + clock->entry.type = BCH_JSET_ENTRY_clock; 1056 + clock->rw = i; 1057 + clock->time = atomic64_read(&c->io_clock[i].now); 1058 + } 1055 1059 } 1056 1060 1057 1061 void bch2_fs_mark_clean(struct bch_fs *c) ··· 1080 1084 } 1081 1085 1082 1086 sb_clean->flags = 0; 1083 - sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); 1084 - sb_clean->write_clock = 
cpu_to_le16(c->bucket_clock[WRITE].hand); 1085 1087 sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); 1086 1088 1087 1089 /* Trying to catch outstanding bug: */ 1088 1090 BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); 1089 1091 1090 1092 entry = sb_clean->start; 1091 - entry = bch2_journal_super_entries_add_common(c, entry, 0); 1093 + bch2_journal_super_entries_add_common(c, &entry, 0); 1092 1094 entry = bch2_btree_roots_to_journal_entries(c, entry, entry); 1093 1095 BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); 1094 1096
+2 -3
fs/bcachefs/super-io.h
··· 122 122 123 123 /* BCH_SB_FIELD_clean: */ 124 124 125 - struct jset_entry * 126 - bch2_journal_super_entries_add_common(struct bch_fs *, 127 - struct jset_entry *, u64); 125 + void bch2_journal_super_entries_add_common(struct bch_fs *, 126 + struct jset_entry **, u64); 128 127 129 128 void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); 130 129
-6
fs/bcachefs/super.c
··· 181 181 bch2_copygc_stop(c); 182 182 bch2_gc_thread_stop(c); 183 183 184 - bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); 185 - bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); 186 - 187 184 /* 188 185 * Flush journal before stopping allocators, because flushing journal 189 186 * blacklist entries involves allocating new btree nodes: ··· 402 405 for_each_rw_member(ca, c, i) 403 406 bch2_dev_allocator_add(c, ca); 404 407 bch2_recalc_capacity(c); 405 - 406 - bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); 407 - bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); 408 408 409 409 for_each_rw_member(ca, c, i) { 410 410 ret = bch2_dev_allocator_start(ca);
+2 -2
fs/bcachefs/sysfs.c
··· 705 705 { 706 706 int rw = (private ? 1 : 0); 707 707 708 - return bucket_last_io(c, bucket(ca, b), rw); 708 + return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; 709 709 } 710 710 711 711 static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, ··· 718 718 static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, 719 719 size_t b, void *private) 720 720 { 721 - return bucket_gc_gen(ca, b); 721 + return bucket_gc_gen(bucket(ca, b)); 722 722 } 723 723 724 724 static int unsigned_cmp(const void *_l, const void *_r)