Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: Persist 64 bit io clocks

Originally, bcachefs - going back to bcache - stored, for each bucket, a
16 bit counter corresponding to how long it had been since the bucket
was read from. But, this required periodically rescaling counters on
every bucket to avoid wraparound. That wasn't an issue in bcache, where
we'd periodically rewrite the per bucket metadata all at once, but in
bcachefs we're trying to avoid having to walk every single bucket.

This patch switches to persisting 64 bit io clocks, corresponding to the
64 bit bucket timestamps introduced in the previous patch with
KEY_TYPE_alloc_v2.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

authored by

Kent Overstreet and committed by
Kent Overstreet
2abe5420 7f4e1d5d

+142 -311
+42 -183
fs/bcachefs/alloc_background.c
··· 31 31 #undef x 32 32 }; 33 33 34 - static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); 35 - 36 34 /* Ratelimiting/PD controllers */ 37 35 38 36 static void pd_controllers_update(struct work_struct *work) ··· 338 340 339 341 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) 340 342 { 341 - struct bch_dev *ca; 342 - unsigned i; 343 - int ret = 0; 343 + int ret; 344 344 345 345 down_read(&c->gc_lock); 346 346 ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, ··· 353 357 percpu_down_write(&c->mark_lock); 354 358 bch2_dev_usage_from_buckets(c); 355 359 percpu_up_write(&c->mark_lock); 356 - 357 - mutex_lock(&c->bucket_clock[READ].lock); 358 - for_each_member_device(ca, c, i) { 359 - down_read(&ca->bucket_lock); 360 - bch2_recalc_oldest_io(c, ca, READ); 361 - up_read(&ca->bucket_lock); 362 - } 363 - mutex_unlock(&c->bucket_clock[READ].lock); 364 - 365 - mutex_lock(&c->bucket_clock[WRITE].lock); 366 - for_each_member_device(ca, c, i) { 367 - down_read(&ca->bucket_lock); 368 - bch2_recalc_oldest_io(c, ca, WRITE); 369 - up_read(&ca->bucket_lock); 370 - } 371 - mutex_unlock(&c->bucket_clock[WRITE].lock); 372 360 373 361 return 0; 374 362 } ··· 440 460 441 461 /* Bucket IO clocks: */ 442 462 443 - static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) 444 - { 445 - struct bucket_clock *clock = &c->bucket_clock[rw]; 446 - struct bucket_array *buckets = bucket_array(ca); 447 - struct bucket *g; 448 - u16 max_last_io = 0; 449 - unsigned i; 450 - 451 - lockdep_assert_held(&c->bucket_clock[rw].lock); 452 - 453 - /* Recalculate max_last_io for this device: */ 454 - for_each_bucket(g, buckets) 455 - max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); 456 - 457 - ca->max_last_bucket_io[rw] = max_last_io; 458 - 459 - /* Recalculate global max_last_io: */ 460 - max_last_io = 0; 461 - 462 - for_each_member_device(ca, c, i) 463 - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); 464 
- 465 - clock->max_last_io = max_last_io; 466 - } 467 - 468 - static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) 469 - { 470 - struct bucket_clock *clock = &c->bucket_clock[rw]; 471 - struct bucket_array *buckets; 472 - struct bch_dev *ca; 473 - struct bucket *g; 474 - unsigned i; 475 - 476 - trace_rescale_prios(c); 477 - 478 - for_each_member_device(ca, c, i) { 479 - down_read(&ca->bucket_lock); 480 - buckets = bucket_array(ca); 481 - 482 - for_each_bucket(g, buckets) 483 - g->io_time[rw] = clock->hand - 484 - bucket_last_io(c, g, rw) / 2; 485 - 486 - bch2_recalc_oldest_io(c, ca, rw); 487 - 488 - up_read(&ca->bucket_lock); 489 - } 490 - } 491 - 492 - static inline u64 bucket_clock_freq(u64 capacity) 493 - { 494 - return max(capacity >> 10, 2028ULL); 495 - } 496 - 497 - static void bch2_inc_clock_hand(struct io_timer *timer) 498 - { 499 - struct bucket_clock *clock = container_of(timer, 500 - struct bucket_clock, rescale); 501 - struct bch_fs *c = container_of(clock, 502 - struct bch_fs, bucket_clock[clock->rw]); 503 - struct bch_dev *ca; 504 - u64 capacity; 505 - unsigned i; 506 - 507 - mutex_lock(&clock->lock); 508 - 509 - /* if clock cannot be advanced more, rescale prio */ 510 - if (clock->max_last_io >= U16_MAX - 2) 511 - bch2_rescale_bucket_io_times(c, clock->rw); 512 - 513 - BUG_ON(clock->max_last_io >= U16_MAX - 2); 514 - 515 - for_each_member_device(ca, c, i) 516 - ca->max_last_bucket_io[clock->rw]++; 517 - clock->max_last_io++; 518 - clock->hand++; 519 - 520 - mutex_unlock(&clock->lock); 521 - 522 - capacity = READ_ONCE(c->capacity); 523 - 524 - if (!capacity) 525 - return; 526 - 527 - /* 528 - * we only increment when 0.1% of the filesystem capacity has been read 529 - * or written too, this determines if it's time 530 - * 531 - * XXX: we shouldn't really be going off of the capacity of devices in 532 - * RW mode (that will be 0 when we're RO, yet we can still service 533 - * reads) 534 - */ 535 - timer->expire += 
bucket_clock_freq(capacity); 536 - 537 - bch2_io_timer_add(&c->io_clock[clock->rw], timer); 538 - } 539 - 540 - static void bch2_bucket_clock_init(struct bch_fs *c, int rw) 541 - { 542 - struct bucket_clock *clock = &c->bucket_clock[rw]; 543 - 544 - clock->hand = 1; 545 - clock->rw = rw; 546 - clock->rescale.fn = bch2_inc_clock_hand; 547 - clock->rescale.expire = bucket_clock_freq(c->capacity); 548 - mutex_init(&clock->lock); 549 - } 550 - 551 463 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, 552 464 size_t bucket_nr, int rw) 553 465 { ··· 449 577 struct bucket *g; 450 578 struct bkey_alloc_buf *a; 451 579 struct bkey_alloc_unpacked u; 452 - u64 *time; 580 + u64 *time, now; 453 581 int ret = 0; 454 582 455 583 iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), ··· 471 599 percpu_up_read(&c->mark_lock); 472 600 473 601 time = rw == READ ? &u.read_time : &u.write_time; 474 - if (*time == c->bucket_clock[rw].hand) 602 + now = atomic64_read(&c->io_clock[rw].now); 603 + if (*time == now) 475 604 goto out; 476 605 477 - *time = c->bucket_clock[rw].hand; 606 + *time = now; 478 607 479 608 bch2_alloc_pack(c, a, u); 480 609 ret = bch2_trans_update(trans, iter, &a->k, 0) ?: ··· 547 674 return ret; 548 675 } 549 676 550 - static bool bch2_can_invalidate_bucket(struct bch_dev *ca, 551 - size_t bucket, 552 - struct bucket_mark mark) 677 + static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, 678 + struct bucket_mark m) 553 679 { 554 680 u8 gc_gen; 555 681 556 - if (!is_available_bucket(mark)) 682 + if (!is_available_bucket(m)) 557 683 return false; 558 684 559 - if (mark.owned_by_allocator) 685 + if (m.owned_by_allocator) 560 686 return false; 561 687 562 688 if (ca->buckets_nouse && 563 - test_bit(bucket, ca->buckets_nouse)) 689 + test_bit(b, ca->buckets_nouse)) 564 690 return false; 565 691 566 - gc_gen = bucket_gc_gen(ca, bucket); 692 + gc_gen = bucket_gc_gen(bucket(ca, b)); 567 693 568 694 if (gc_gen >= 
BUCKET_GC_GEN_MAX / 2) 569 695 ca->inc_gen_needs_gc++; ··· 576 704 /* 577 705 * Determines what order we're going to reuse buckets, smallest bucket_key() 578 706 * first. 579 - * 580 - * 581 - * - We take into account the read prio of the bucket, which gives us an 582 - * indication of how hot the data is -- we scale the prio so that the prio 583 - * farthest from the clock is worth 1/8th of the closest. 584 - * 585 - * - The number of sectors of cached data in the bucket, which gives us an 586 - * indication of the cost in cache misses this eviction will cause. 587 - * 588 - * - If hotness * sectors used compares equal, we pick the bucket with the 589 - * smallest bucket_gc_gen() - since incrementing the same bucket's generation 590 - * number repeatedly forces us to run mark and sweep gc to avoid generation 591 - * number wraparound. 592 707 */ 593 708 594 - static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, 595 - size_t b, struct bucket_mark m) 709 + static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, 710 + u64 now, u64 last_seq_ondisk) 596 711 { 597 - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); 598 - unsigned max_last_io = ca->max_last_bucket_io[READ]; 712 + unsigned used = bucket_sectors_used(m); 599 713 600 - /* 601 - * Time since last read, scaled to [0, 8) where larger value indicates 602 - * more recently read data: 603 - */ 604 - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; 714 + if (used) { 715 + /* 716 + * Prefer to keep buckets that have been read more recently, and 717 + * buckets that have more data in them: 718 + */ 719 + u64 last_read = max_t(s64, 0, now - g->io_time[READ]); 720 + u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); 605 721 606 - /* How much we want to keep the data in this bucket: */ 607 - unsigned long data_wantness = 608 - (hotness + 1) * bucket_sectors_used(m); 609 - 610 - unsigned long needs_journal_commit = 611 - 
bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); 612 - 613 - return (data_wantness << 9) | 614 - (needs_journal_commit << 8) | 615 - (bucket_gc_gen(ca, b) / 16); 722 + return -last_read_scaled; 723 + } else { 724 + /* 725 + * Prefer to use buckets with smaller gc_gen so that we don't 726 + * have to walk the btree and recalculate oldest_gen - but shift 727 + * off the low bits so that buckets will still have equal sort 728 + * keys when there's only a small difference, so that we can 729 + * keep sequential buckets together: 730 + */ 731 + return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| 732 + (bucket_gc_gen(g) >> 4); 733 + } 616 734 } 617 735 618 736 static inline int bucket_alloc_cmp(alloc_heap *h, ··· 625 763 { 626 764 struct bucket_array *buckets; 627 765 struct alloc_heap_entry e = { 0 }; 766 + u64 now, last_seq_ondisk; 628 767 size_t b, i, nr = 0; 629 768 630 - ca->alloc_heap.used = 0; 631 - 632 - mutex_lock(&c->bucket_clock[READ].lock); 633 769 down_read(&ca->bucket_lock); 634 770 635 771 buckets = bucket_array(ca); 636 - 637 - bch2_recalc_oldest_io(c, ca, READ); 772 + ca->alloc_heap.used = 0; 773 + now = atomic64_read(&c->io_clock[READ].now); 774 + last_seq_ondisk = c->journal.last_seq_ondisk; 638 775 639 776 /* 640 777 * Find buckets with lowest read priority, by building a maxheap sorted ··· 641 780 * all buckets have been visited. 
642 781 */ 643 782 for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { 644 - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); 645 - unsigned long key = bucket_sort_key(c, ca, b, m); 783 + struct bucket *g = &buckets->b[b]; 784 + struct bucket_mark m = READ_ONCE(g->mark); 785 + unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); 646 786 647 787 if (!bch2_can_invalidate_bucket(ca, b, m)) 648 788 continue; ··· 678 816 } 679 817 680 818 up_read(&ca->bucket_lock); 681 - mutex_unlock(&c->bucket_clock[READ].lock); 682 819 } 683 820 684 821 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) ··· 892 1031 u.data_type = 0; 893 1032 u.dirty_sectors = 0; 894 1033 u.cached_sectors = 0; 895 - u.read_time = c->bucket_clock[READ].hand; 896 - u.write_time = c->bucket_clock[WRITE].hand; 1034 + u.read_time = atomic64_read(&c->io_clock[READ].now); 1035 + u.write_time = atomic64_read(&c->io_clock[WRITE].now); 897 1036 898 1037 bch2_alloc_pack(c, &a, u); 899 1038 bch2_trans_update(trans, iter, &a.k, ··· 1403 1542 void bch2_fs_allocator_background_init(struct bch_fs *c) 1404 1543 { 1405 1544 spin_lock_init(&c->freelist_lock); 1406 - bch2_bucket_clock_init(c, READ); 1407 - bch2_bucket_clock_init(c, WRITE); 1408 1545 1409 1546 c->pd_controllers_update_seconds = 5; 1410 1547 INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
-24
fs/bcachefs/alloc_types.h
··· 10 10 11 11 struct ec_bucket_buf; 12 12 13 - /* There's two of these clocks, one for reads and one for writes: */ 14 - struct bucket_clock { 15 - /* 16 - * "now" in (read/write) IO time - incremented whenever we do X amount 17 - * of reads or writes. 18 - * 19 - * Goes with the bucket read/write prios: when we read or write to a 20 - * bucket we reset the bucket's prio to the current hand; thus hand - 21 - * prio = time since bucket was last read/written. 22 - * 23 - * The units are some amount (bytes/sectors) of data read/written, and 24 - * the units can change on the fly if we need to rescale to fit 25 - * everything in a u16 - your only guarantee is that the units are 26 - * consistent. 27 - */ 28 - u16 hand; 29 - u16 max_last_io; 30 - 31 - int rw; 32 - 33 - struct io_timer rescale; 34 - struct mutex lock; 35 - }; 36 - 37 13 enum alloc_reserve { 38 14 RESERVE_BTREE_MOVINGGC = -2, 39 15 RESERVE_BTREE = -1,
-11
fs/bcachefs/bcachefs.h
··· 451 451 452 452 size_t fifo_last_bucket; 453 453 454 - /* last calculated minimum prio */ 455 - u16 max_last_bucket_io[2]; 456 - 457 454 size_t inc_gen_needs_gc; 458 455 size_t inc_gen_really_needs_gc; 459 456 ··· 689 692 /* single element mempool: */ 690 693 struct mutex usage_scratch_lock; 691 694 struct bch_fs_usage_online *usage_scratch; 692 - 693 - /* 694 - * When we invalidate buckets, we use both the priority and the amount 695 - * of good data to determine which buckets to reuse first - to weight 696 - * those together consistently we keep track of the smallest nonzero 697 - * priority of any bucket. 698 - */ 699 - struct bucket_clock bucket_clock[2]; 700 695 701 696 struct io_clock io_clock[2]; 702 697
+13 -5
fs/bcachefs/bcachefs_format.h
··· 1143 1143 struct bch_sb_field field; 1144 1144 1145 1145 __le32 flags; 1146 - __le16 read_clock; 1147 - __le16 write_clock; 1146 + __le16 _read_clock; /* no longer used */ 1147 + __le16 _write_clock; 1148 1148 __le64 journal_seq; 1149 1149 1150 1150 union { ··· 1511 1511 x(blacklist, 3) \ 1512 1512 x(blacklist_v2, 4) \ 1513 1513 x(usage, 5) \ 1514 - x(data_usage, 6) 1514 + x(data_usage, 6) \ 1515 + x(clock, 7) 1515 1516 1516 1517 enum { 1517 1518 #define x(f, nr) BCH_JSET_ENTRY_##f = nr, ··· 1560 1559 struct bch_replicas_entry r; 1561 1560 } __attribute__((packed)); 1562 1561 1562 + struct jset_entry_clock { 1563 + struct jset_entry entry; 1564 + __u8 rw; 1565 + __u8 pad[7]; 1566 + __le64 time; 1567 + } __attribute__((packed)); 1568 + 1563 1569 /* 1564 1570 * On disk format for a journal entry: 1565 1571 * seq is monotonically increasing; every journal entry has its own unique ··· 1589 1581 1590 1582 __u8 encrypted_start[0]; 1591 1583 1592 - __le16 read_clock; 1593 - __le16 write_clock; 1584 + __le16 _read_clock; /* no longer used */ 1585 + __le16 _write_clock; 1594 1586 1595 1587 /* Sequence number of oldest dirty journal entry */ 1596 1588 __le64 last_seq;
+3 -3
fs/bcachefs/btree_gc.c
··· 1489 1489 { 1490 1490 struct bch_fs *c = arg; 1491 1491 struct io_clock *clock = &c->io_clock[WRITE]; 1492 - unsigned long last = atomic_long_read(&clock->now); 1492 + unsigned long last = atomic64_read(&clock->now); 1493 1493 unsigned last_kick = atomic_read(&c->kick_gc); 1494 1494 int ret; 1495 1495 ··· 1510 1510 if (c->btree_gc_periodic) { 1511 1511 unsigned long next = last + c->capacity / 16; 1512 1512 1513 - if (atomic_long_read(&clock->now) >= next) 1513 + if (atomic64_read(&clock->now) >= next) 1514 1514 break; 1515 1515 1516 1516 bch2_io_clock_schedule_timeout(clock, next); ··· 1522 1522 } 1523 1523 __set_current_state(TASK_RUNNING); 1524 1524 1525 - last = atomic_long_read(&clock->now); 1525 + last = atomic64_read(&clock->now); 1526 1526 last_kick = atomic_read(&c->kick_gc); 1527 1527 1528 1528 /*
+1 -8
fs/bcachefs/buckets.h
··· 58 58 return __bucket(ca, b, false); 59 59 } 60 60 61 - static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) 62 - { 63 - return c->bucket_clock[rw].hand - g->io_time[rw]; 64 - } 65 - 66 61 /* 67 62 * bucket_gc_gen() returns the difference between the bucket's current gen and 68 63 * the oldest gen of any pointer into that bucket in the btree. 69 64 */ 70 65 71 - static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) 66 + static inline u8 bucket_gc_gen(struct bucket *g) 72 67 { 73 - struct bucket *g = bucket(ca, b); 74 - 75 68 return g->mark.gen - g->oldest_gen; 76 69 } 77 70
+1 -1
fs/bcachefs/buckets_types.h
··· 37 37 const struct bucket_mark mark; 38 38 }; 39 39 40 - u16 io_time[2]; 40 + u64 io_time[2]; 41 41 u8 oldest_gen; 42 42 u8 gc_gen; 43 43 unsigned gen_valid:1;
+4 -4
fs/bcachefs/clock.c
··· 19 19 20 20 spin_lock(&clock->timer_lock); 21 21 22 - if (time_after_eq((unsigned long) atomic_long_read(&clock->now), 22 + if (time_after_eq((unsigned long) atomic64_read(&clock->now), 23 23 timer->expire)) { 24 24 spin_unlock(&clock->timer_lock); 25 25 timer->fn(timer); ··· 146 146 void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) 147 147 { 148 148 struct io_timer *timer; 149 - unsigned long now = atomic_long_add_return(sectors, &clock->now); 149 + unsigned long now = atomic64_add_return(sectors, &clock->now); 150 150 151 151 while ((timer = get_expired_timer(clock, now))) 152 152 timer->fn(timer); ··· 158 158 unsigned i; 159 159 160 160 spin_lock(&clock->timer_lock); 161 - now = atomic_long_read(&clock->now); 161 + now = atomic64_read(&clock->now); 162 162 163 163 for (i = 0; i < clock->timers.used; i++) 164 164 pr_buf(out, "%ps:\t%li\n", ··· 175 175 176 176 int bch2_io_clock_init(struct io_clock *clock) 177 177 { 178 - atomic_long_set(&clock->now, 0); 178 + atomic64_set(&clock->now, 0); 179 179 spin_lock_init(&clock->timer_lock); 180 180 181 181 clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
+1 -1
fs/bcachefs/clock_types.h
··· 26 26 typedef HEAP(struct io_timer *) io_timer_heap; 27 27 28 28 struct io_clock { 29 - atomic_long_t now; 29 + atomic64_t now; 30 30 u16 __percpu *pcpu_buf; 31 31 unsigned max_slop; 32 32
+3
fs/bcachefs/journal.c
··· 1123 1123 j->entry_u64s_reserved += 1124 1124 BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); 1125 1125 1126 + j->entry_u64s_reserved += 1127 + 2 * (sizeof(struct jset_entry_clock) / sizeof(u64)); 1128 + 1126 1129 atomic64_set(&j->reservations.counter, 1127 1130 ((union journal_res_state) 1128 1131 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+28 -5
fs/bcachefs/journal_io.c
··· 426 426 return ret; 427 427 } 428 428 429 + static int journal_entry_validate_clock(struct bch_fs *c, 430 + struct jset *jset, 431 + struct jset_entry *entry, 432 + int write) 433 + { 434 + struct jset_entry_clock *clock = 435 + container_of(entry, struct jset_entry_clock, entry); 436 + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 437 + int ret = 0; 438 + 439 + if (journal_entry_err_on(bytes != sizeof(*clock), 440 + c, "invalid journal entry clock: bad size")) { 441 + journal_entry_null_range(entry, vstruct_next(entry)); 442 + return ret; 443 + } 444 + 445 + if (journal_entry_err_on(clock->rw > 1, 446 + c, "invalid journal entry clock: bad rw")) { 447 + journal_entry_null_range(entry, vstruct_next(entry)); 448 + return ret; 449 + } 450 + 451 + fsck_err: 452 + return ret; 453 + } 454 + 429 455 struct jset_entry_ops { 430 456 int (*validate)(struct bch_fs *, struct jset *, 431 457 struct jset_entry *, int); ··· 1387 1361 1388 1362 end = bch2_btree_roots_to_journal_entries(c, jset->start, end); 1389 1363 1390 - end = bch2_journal_super_entries_add_common(c, end, 1391 - le64_to_cpu(jset->seq)); 1364 + bch2_journal_super_entries_add_common(c, &end, 1365 + le64_to_cpu(jset->seq)); 1392 1366 u64s = (u64 *) end - (u64 *) start; 1393 1367 BUG_ON(u64s > j->entry_u64s_reserved); 1394 1368 ··· 1397 1371 1398 1372 journal_write_compact(jset); 1399 1373 1400 - jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); 1401 - jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); 1402 1374 jset->magic = cpu_to_le64(jset_magic(c)); 1403 - 1404 1375 jset->version = c->sb.version < bcachefs_metadata_version_new_versioning 1405 1376 ? cpu_to_le32(BCH_JSET_VERSION_OLD) 1406 1377 : cpu_to_le32(c->sb.version);
+2 -2
fs/bcachefs/movinggc.c
··· 298 298 { 299 299 struct bch_fs *c = arg; 300 300 struct io_clock *clock = &c->io_clock[WRITE]; 301 - unsigned long last, wait; 301 + u64 last, wait; 302 302 303 303 set_freezable(); 304 304 ··· 306 306 if (kthread_wait_freezable(c->copy_gc_enabled)) 307 307 break; 308 308 309 - last = atomic_long_read(&clock->now); 309 + last = atomic64_read(&clock->now); 310 310 wait = bch2_copygc_wait_amount(c); 311 311 312 312 if (wait > clock->max_slop) {
+5 -5
fs/bcachefs/rebalance.c
··· 169 169 unsigned long start, prev_start; 170 170 unsigned long prev_run_time, prev_run_cputime; 171 171 unsigned long cputime, prev_cputime; 172 - unsigned long io_start; 172 + u64 io_start; 173 173 long throttle; 174 174 175 175 set_freezable(); 176 176 177 - io_start = atomic_long_read(&clock->now); 177 + io_start = atomic64_read(&clock->now); 178 178 p = rebalance_work(c); 179 179 prev_start = jiffies; 180 180 prev_cputime = curr_cputime(); ··· 210 210 (20 - w.dev_most_full_percent), 211 211 50); 212 212 213 - if (atomic_long_read(&clock->now) + clock->max_slop < 213 + if (atomic64_read(&clock->now) + clock->max_slop < 214 214 r->throttled_until_iotime) { 215 215 r->throttled_until_cputime = start + throttle; 216 216 r->state = REBALANCE_THROTTLED; ··· 229 229 max(p.dev_most_full_percent, 1U) / 230 230 max(w.dev_most_full_percent, 1U)); 231 231 232 - io_start = atomic_long_read(&clock->now); 232 + io_start = atomic64_read(&clock->now); 233 233 p = w; 234 234 prev_start = start; 235 235 prev_cputime = cputime; ··· 274 274 case REBALANCE_THROTTLED: 275 275 bch2_hprint(&PBUF(h1), 276 276 (r->throttled_until_iotime - 277 - atomic_long_read(&c->io_clock[WRITE].now)) << 9); 277 + atomic64_read(&c->io_clock[WRITE].now)) << 9); 278 278 pr_buf(out, "throttled for %lu sec or %s io\n", 279 279 (r->throttled_until_cputime - jiffies) / HZ, 280 280 h1);
+1 -1
fs/bcachefs/rebalance_types.h
··· 17 17 atomic64_t work_unknown_dev; 18 18 19 19 enum rebalance_state state; 20 - unsigned long throttled_until_iotime; 20 + u64 throttled_until_iotime; 21 21 unsigned long throttled_until_cputime; 22 22 struct bch_move_stats move_stats; 23 23
+6 -13
fs/bcachefs/recovery.c
··· 847 847 le64_to_cpu(bl_entry->end) + 1); 848 848 break; 849 849 } 850 + case BCH_JSET_ENTRY_clock: { 851 + struct jset_entry_clock *clock = 852 + container_of(entry, struct jset_entry_clock, entry); 853 + 854 + atomic64_set(&c->io_clock[clock->rw].now, clock->time); 855 + } 850 856 } 851 857 852 858 return ret; ··· 867 861 int ret; 868 862 869 863 if (clean) { 870 - c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); 871 - c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); 872 - 873 864 for (entry = clean->start; 874 865 entry != vstruct_end(&clean->field); 875 866 entry = vstruct_next(entry)) { ··· 878 875 list_for_each_entry(i, journal, list) { 879 876 if (i->ignore) 880 877 continue; 881 - 882 - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); 883 - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); 884 878 885 879 vstruct_for_each(&i->j, entry) { 886 880 ret = journal_replay_entry_early(c, entry); ··· 941 941 *cleanp = NULL; 942 942 return 0; 943 943 } 944 - 945 - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, 946 - "superblock read clock %u doesn't match journal %u after clean shutdown", 947 - clean->read_clock, j->read_clock); 948 - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, 949 - "superblock write clock %u doesn't match journal %u after clean shutdown", 950 - clean->write_clock, j->write_clock); 951 944 952 945 for (i = 0; i < BTREE_ID_NR; i++) { 953 946 char buf1[200], buf2[200];
+28 -34
fs/bcachefs/super-io.c
··· 966 966 return ret; 967 967 } 968 968 969 - static void 970 - entry_init_u64s(struct jset_entry *entry, unsigned u64s) 969 + static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) 971 970 { 972 - memset(entry, 0, u64s * sizeof(u64)); 971 + struct jset_entry *entry = *end; 972 + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); 973 973 974 + memset(entry, 0, u64s * sizeof(u64)); 974 975 /* 975 976 * The u64s field counts from the start of data, ignoring the shared 976 977 * fields. 977 978 */ 978 979 entry->u64s = u64s - 1; 980 + 981 + *end = vstruct_next(*end); 982 + return entry; 979 983 } 980 984 981 - static void 982 - entry_init_size(struct jset_entry *entry, size_t size) 983 - { 984 - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); 985 - entry_init_u64s(entry, u64s); 986 - } 987 - 988 - struct jset_entry * 989 - bch2_journal_super_entries_add_common(struct bch_fs *c, 990 - struct jset_entry *entry, 991 - u64 journal_seq) 985 + void bch2_journal_super_entries_add_common(struct bch_fs *c, 986 + struct jset_entry **end, 987 + u64 journal_seq) 992 988 { 993 989 unsigned i; 994 990 ··· 999 1003 1000 1004 { 1001 1005 struct jset_entry_usage *u = 1002 - container_of(entry, struct jset_entry_usage, entry); 1006 + container_of(jset_entry_init(end, sizeof(*u)), 1007 + struct jset_entry_usage, entry); 1003 1008 1004 - entry_init_size(entry, sizeof(*u)); 1005 1009 u->entry.type = BCH_JSET_ENTRY_usage; 1006 1010 u->entry.btree_id = FS_USAGE_INODES; 1007 1011 u->v = cpu_to_le64(c->usage_base->nr_inodes); 1008 - 1009 - entry = vstruct_next(entry); 1010 1012 } 1011 1013 1012 1014 { 1013 1015 struct jset_entry_usage *u = 1014 - container_of(entry, struct jset_entry_usage, entry); 1016 + container_of(jset_entry_init(end, sizeof(*u)), 1017 + struct jset_entry_usage, entry); 1015 1018 1016 - entry_init_size(entry, sizeof(*u)); 1017 1019 u->entry.type = BCH_JSET_ENTRY_usage; 1018 1020 u->entry.btree_id = FS_USAGE_KEY_VERSION; 1019 1021 u->v = 
cpu_to_le64(atomic64_read(&c->key_version)); 1020 - 1021 - entry = vstruct_next(entry); 1022 1022 } 1023 1023 1024 1024 for (i = 0; i < BCH_REPLICAS_MAX; i++) { 1025 1025 struct jset_entry_usage *u = 1026 - container_of(entry, struct jset_entry_usage, entry); 1026 + container_of(jset_entry_init(end, sizeof(*u)), 1027 + struct jset_entry_usage, entry); 1027 1028 1028 - entry_init_size(entry, sizeof(*u)); 1029 1029 u->entry.type = BCH_JSET_ENTRY_usage; 1030 1030 u->entry.btree_id = FS_USAGE_RESERVED; 1031 1031 u->entry.level = i; 1032 1032 u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); 1033 - 1034 - entry = vstruct_next(entry); 1035 1033 } 1036 1034 1037 1035 for (i = 0; i < c->replicas.nr; i++) { 1038 1036 struct bch_replicas_entry *e = 1039 1037 cpu_replicas_entry(&c->replicas, i); 1040 1038 struct jset_entry_data_usage *u = 1041 - container_of(entry, struct jset_entry_data_usage, entry); 1039 + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), 1040 + struct jset_entry_data_usage, entry); 1042 1041 1043 - entry_init_size(entry, sizeof(*u) + e->nr_devs); 1044 1042 u->entry.type = BCH_JSET_ENTRY_data_usage; 1045 1043 u->v = cpu_to_le64(c->usage_base->replicas[i]); 1046 1044 unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), 1047 1045 "embedded variable length struct"); 1048 - 1049 - entry = vstruct_next(entry); 1050 1046 } 1051 1047 1052 1048 percpu_up_read(&c->mark_lock); 1053 1049 1054 - return entry; 1050 + for (i = 0; i < 2; i++) { 1051 + struct jset_entry_clock *clock = 1052 + container_of(jset_entry_init(end, sizeof(*clock)), 1053 + struct jset_entry_clock, entry); 1054 + 1055 + clock->entry.type = BCH_JSET_ENTRY_clock; 1056 + clock->rw = i; 1057 + clock->time = atomic64_read(&c->io_clock[i].now); 1058 + } 1055 1059 } 1056 1060 1057 1061 void bch2_fs_mark_clean(struct bch_fs *c) ··· 1080 1084 } 1081 1085 1082 1086 sb_clean->flags = 0; 1083 - sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); 1084 - sb_clean->write_clock = 
cpu_to_le16(c->bucket_clock[WRITE].hand); 1085 1087 sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); 1086 1088 1087 1089 /* Trying to catch outstanding bug: */ 1088 1090 BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); 1089 1091 1090 1092 entry = sb_clean->start; 1091 - entry = bch2_journal_super_entries_add_common(c, entry, 0); 1093 + bch2_journal_super_entries_add_common(c, &entry, 0); 1092 1094 entry = bch2_btree_roots_to_journal_entries(c, entry, entry); 1093 1095 BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); 1094 1096
+2 -3
fs/bcachefs/super-io.h
··· 122 122 123 123 /* BCH_SB_FIELD_clean: */ 124 124 125 - struct jset_entry * 126 - bch2_journal_super_entries_add_common(struct bch_fs *, 127 - struct jset_entry *, u64); 125 + void bch2_journal_super_entries_add_common(struct bch_fs *, 126 + struct jset_entry **, u64); 128 127 129 128 void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); 130 129
-6
fs/bcachefs/super.c
··· 181 181 bch2_copygc_stop(c); 182 182 bch2_gc_thread_stop(c); 183 183 184 - bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); 185 - bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); 186 - 187 184 /* 188 185 * Flush journal before stopping allocators, because flushing journal 189 186 * blacklist entries involves allocating new btree nodes: ··· 402 405 for_each_rw_member(ca, c, i) 403 406 bch2_dev_allocator_add(c, ca); 404 407 bch2_recalc_capacity(c); 405 - 406 - bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); 407 - bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); 408 408 409 409 for_each_rw_member(ca, c, i) { 410 410 ret = bch2_dev_allocator_start(ca);
+2 -2
fs/bcachefs/sysfs.c
··· 705 705 { 706 706 int rw = (private ? 1 : 0); 707 707 708 - return bucket_last_io(c, bucket(ca, b), rw); 708 + return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; 709 709 } 710 710 711 711 static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, ··· 718 718 static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, 719 719 size_t b, void *private) 720 720 { 721 - return bucket_gc_gen(ca, b); 721 + return bucket_gc_gen(bucket(ca, b)); 722 722 } 723 723 724 724 static int unsigned_cmp(const void *_l, const void *_r)