Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'bcachefs-2024-03-13' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs updates from Kent Overstreet:

- Subvolume children btree; this is needed for providing a userspace
interface for walking subvolumes, which will come later

- Lots of improvements to directory structure checking

- Improved journal pipelining, significantly improving performance on
high iodepth write workloads

- Discard path improvements: the discard path is more efficient, and no
longer flushes the journal unnecessarily

- Buffered write path can now avoid taking the inode lock

- New mm helper: memalloc_flags_{save|restore}

- Mempool now does kvmalloc mempools

* tag 'bcachefs-2024-03-13' of https://evilpiepirate.org/git/bcachefs: (128 commits)
bcachefs: time_stats: shrink time_stat_buffer for better alignment
bcachefs: time_stats: split stats-with-quantiles into a separate structure
bcachefs: mean_and_variance: put struct mean_and_variance_weighted on a diet
bcachefs: time_stats: add larger units
bcachefs: pull out time_stats.[ch]
bcachefs: reconstruct_alloc cleanup
bcachefs: fix bch_folio_sector padding
bcachefs: Fix btree key cache coherency during replay
bcachefs: Always flush write buffer in delete_dead_inodes()
bcachefs: Fix order of gc_done passes
bcachefs: fix deletion of indirect extents in btree_gc
bcachefs: Prefer struct_size over open coded arithmetic
bcachefs: Kill unused flags argument to btree_split()
bcachefs: Check for writing superblocks with nonsense member seq fields
bcachefs: fix bch2_journal_buf_to_text()
lib/generic-radix-tree.c: Make nodes more reasonably sized
bcachefs: copy_(to|from)_user_errcode()
bcachefs: Split out bkey_types.h
bcachefs: fix lost journal buf wakeup due to improved pipelining
bcachefs: intercept mountoption value for bool type
...

+3797 -2280
+30
Documentation/filesystems/bcachefs/errorcodes.rst
··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + bcachefs private error codes 4 + ---------------------------- 5 + 6 + In bcachefs, as a hard rule we do not throw or directly use standard error 7 + codes (-EINVAL, -EBUSY, etc.). Instead, we define private error codes as needed 8 + in fs/bcachefs/errcode.h. 9 + 10 + This gives us much better error messages and makes debugging much easier. Any 11 + direct uses of standard error codes you see in the source code are simply old 12 + code that has yet to be converted - feel free to clean it up! 13 + 14 + Private error codes may subtype another error code; this allows for grouping of 15 + related errors that should be handled similarly (e.g. transaction restart 16 + errors), as well as specifying which standard error code should be returned at 17 + the bcachefs module boundary. 18 + 19 + At the module boundary, we use bch2_err_class() to convert to a standard error 20 + code; this also emits a trace event so that the original error code can be 21 + recovered even if it wasn't logged. 22 + 23 + Do not reuse error codes! Generally speaking, a private error code should only 24 + be thrown in one place. That means that when we see it in a log message we can 25 + see, unambiguously, exactly which file and line number it was returned from. 26 + 27 + Try to give error codes names that are as reasonably descriptive of the error 28 + as possible. Frequently, the error will be logged at a place far removed from 29 + where the error was generated; good names for error codes mean much more 30 + descriptive and useful error messages.
+1
MAINTAINERS
··· 3555 3555 L: linux-bcachefs@vger.kernel.org 3556 3556 S: Supported 3557 3557 C: irc://irc.oftc.net/bcache 3558 + T: git https://evilpiepirate.org/git/bcachefs.git 3558 3559 F: fs/bcachefs/ 3559 3560 3560 3561 BDISP ST MEDIA DRIVER
+4
fs/bcachefs/Makefile
··· 82 82 super-io.o \ 83 83 sysfs.o \ 84 84 tests.o \ 85 + time_stats.o \ 85 86 thread_with_file.o \ 86 87 trace.o \ 87 88 two_state_shared_lock.o \ ··· 91 90 xattr.o 92 91 93 92 obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o 93 + 94 + # Silence "note: xyz changed in GCC X.X" messages 95 + subdir-ccflags-y += $(call cc-disable-warning, psabi)
+177 -42
fs/bcachefs/alloc_background.c
··· 29 29 #include <linux/sched/task.h> 30 30 #include <linux/sort.h> 31 31 32 + static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); 33 + 32 34 /* Persistent alloc info: */ 33 35 34 36 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { ··· 862 860 *bucket_gen(ca, new.k->p.offset) = new_a->gen; 863 861 864 862 bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); 863 + percpu_up_read(&c->mark_lock); 865 864 866 - if (new_a->data_type == BCH_DATA_free && 867 - (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) 865 + #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) 866 + #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) 867 + #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) 868 + 869 + if (statechange(a->data_type == BCH_DATA_free) && 870 + bucket_flushed(new_a)) 868 871 closure_wake_up(&c->freelist_wait); 869 872 870 - if (new_a->data_type == BCH_DATA_need_discard && 871 - (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) 872 - bch2_do_discards(c); 873 + if (statechange(a->data_type == BCH_DATA_need_discard) && 874 + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && 875 + bucket_flushed(new_a)) 876 + bch2_discard_one_bucket_fast(c, new.k->p); 873 877 874 - if (old_a->data_type != BCH_DATA_cached && 875 - new_a->data_type == BCH_DATA_cached && 878 + if (statechange(a->data_type == BCH_DATA_cached) && 879 + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && 876 880 should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) 877 881 bch2_do_invalidates(c); 878 882 879 - if (new_a->data_type == BCH_DATA_need_gc_gens) 883 + if (statechange(a->data_type == BCH_DATA_need_gc_gens)) 880 884 bch2_do_gc_gens(c); 881 - percpu_up_read(&c->mark_lock); 882 885 } 883 886 884 887 if ((flags & BTREE_TRIGGER_GC) && ··· 1052 1045 if (ret) 1053 1046 goto err; 1054 1047 1055 - if 
(k.k->type != discard_key_type && 1056 - (c->opts.reconstruct_alloc || 1057 - fsck_err(c, need_discard_key_wrong, 1058 - "incorrect key in need_discard btree (got %s should be %s)\n" 1059 - " %s", 1060 - bch2_bkey_types[k.k->type], 1061 - bch2_bkey_types[discard_key_type], 1062 - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { 1048 + if (fsck_err_on(k.k->type != discard_key_type, 1049 + c, need_discard_key_wrong, 1050 + "incorrect key in need_discard btree (got %s should be %s)\n" 1051 + " %s", 1052 + bch2_bkey_types[k.k->type], 1053 + bch2_bkey_types[discard_key_type], 1054 + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1063 1055 struct bkey_i *update = 1064 1056 bch2_trans_kmalloc(trans, sizeof(*update)); 1065 1057 ··· 1082 1076 if (ret) 1083 1077 goto err; 1084 1078 1085 - if (k.k->type != freespace_key_type && 1086 - (c->opts.reconstruct_alloc || 1087 - fsck_err(c, freespace_key_wrong, 1088 - "incorrect key in freespace btree (got %s should be %s)\n" 1089 - " %s", 1090 - bch2_bkey_types[k.k->type], 1091 - bch2_bkey_types[freespace_key_type], 1092 - (printbuf_reset(&buf), 1093 - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { 1079 + if (fsck_err_on(k.k->type != freespace_key_type, 1080 + c, freespace_key_wrong, 1081 + "incorrect key in freespace btree (got %s should be %s)\n" 1082 + " %s", 1083 + bch2_bkey_types[k.k->type], 1084 + bch2_bkey_types[freespace_key_type], 1085 + (printbuf_reset(&buf), 1086 + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1094 1087 struct bkey_i *update = 1095 1088 bch2_trans_kmalloc(trans, sizeof(*update)); 1096 1089 ··· 1113 1108 if (ret) 1114 1109 goto err; 1115 1110 1116 - if (a->gen != alloc_gen(k, gens_offset) && 1117 - (c->opts.reconstruct_alloc || 1118 - fsck_err(c, bucket_gens_key_wrong, 1119 - "incorrect gen in bucket_gens btree (got %u should be %u)\n" 1120 - " %s", 1121 - alloc_gen(k, gens_offset), a->gen, 1122 - (printbuf_reset(&buf), 1123 - bch2_bkey_val_to_text(&buf, c, alloc_k), 
buf.buf)))) { 1111 + if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), 1112 + c, bucket_gens_key_wrong, 1113 + "incorrect gen in bucket_gens btree (got %u should be %u)\n" 1114 + " %s", 1115 + alloc_gen(k, gens_offset), a->gen, 1116 + (printbuf_reset(&buf), 1117 + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1124 1118 struct bkey_i_bucket_gens *g = 1125 1119 bch2_trans_kmalloc(trans, sizeof(*g)); 1126 1120 ··· 1171 1167 1172 1168 *end = bkey_min(k.k->p, *end); 1173 1169 1174 - if (k.k->type != KEY_TYPE_set && 1175 - (c->opts.reconstruct_alloc || 1176 - fsck_err(c, freespace_hole_missing, 1177 - "hole in alloc btree missing in freespace btree\n" 1178 - " device %llu buckets %llu-%llu", 1179 - freespace_iter->pos.inode, 1180 - freespace_iter->pos.offset, 1181 - end->offset))) { 1170 + if (fsck_err_on(k.k->type != KEY_TYPE_set, 1171 + c, freespace_hole_missing, 1172 + "hole in alloc btree missing in freespace btree\n" 1173 + " device %llu buckets %llu-%llu", 1174 + freespace_iter->pos.inode, 1175 + freespace_iter->pos.offset, 1176 + end->offset)) { 1182 1177 struct bkey_i *update = 1183 1178 bch2_trans_kmalloc(trans, sizeof(*update)); 1184 1179 ··· 1607 1604 return ret; 1608 1605 } 1609 1606 1607 + static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) 1608 + { 1609 + int ret; 1610 + 1611 + mutex_lock(&c->discard_buckets_in_flight_lock); 1612 + darray_for_each(c->discard_buckets_in_flight, i) 1613 + if (bkey_eq(*i, bucket)) { 1614 + ret = -EEXIST; 1615 + goto out; 1616 + } 1617 + 1618 + ret = darray_push(&c->discard_buckets_in_flight, bucket); 1619 + out: 1620 + mutex_unlock(&c->discard_buckets_in_flight_lock); 1621 + return ret; 1622 + } 1623 + 1624 + static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) 1625 + { 1626 + mutex_lock(&c->discard_buckets_in_flight_lock); 1627 + darray_for_each(c->discard_buckets_in_flight, i) 1628 + if (bkey_eq(*i, bucket)) { 1629 + darray_remove_item(&c->discard_buckets_in_flight, i); 
1630 + goto found; 1631 + } 1632 + BUG(); 1633 + found: 1634 + mutex_unlock(&c->discard_buckets_in_flight_lock); 1635 + } 1636 + 1610 1637 struct discard_buckets_state { 1611 1638 u64 seen; 1612 1639 u64 open; ··· 1675 1642 struct bch_dev *ca; 1676 1643 struct bkey_i_alloc_v4 *a; 1677 1644 struct printbuf buf = PRINTBUF; 1645 + bool discard_locked = false; 1678 1646 int ret = 0; 1679 1647 1680 1648 ca = bch_dev_bkey_exists(c, pos.inode); ··· 1743 1709 goto out; 1744 1710 } 1745 1711 1712 + if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) 1713 + goto out; 1714 + 1715 + discard_locked = true; 1716 + 1746 1717 if (!bkey_eq(*discard_pos_done, iter.pos) && 1747 1718 ca->mi.discard && !c->opts.nochanges) { 1748 1719 /* ··· 1779 1740 count_event(c, bucket_discard); 1780 1741 s->discarded++; 1781 1742 out: 1743 + if (discard_locked) 1744 + discard_in_flight_remove(c, iter.pos); 1782 1745 s->seen++; 1783 1746 bch2_trans_iter_exit(trans, &iter); 1784 1747 percpu_ref_put(&ca->io_ref); ··· 1818 1777 if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && 1819 1778 !queue_work(c->write_ref_wq, &c->discard_work)) 1820 1779 bch2_write_ref_put(c, BCH_WRITE_REF_discard); 1780 + } 1781 + 1782 + static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) 1783 + { 1784 + struct btree_iter iter; 1785 + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); 1786 + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); 1787 + int ret = bkey_err(k); 1788 + if (ret) 1789 + goto err; 1790 + 1791 + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); 1792 + ret = PTR_ERR_OR_ZERO(a); 1793 + if (ret) 1794 + goto err; 1795 + 1796 + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); 1797 + a->v.data_type = alloc_data_type(a->v, a->v.data_type); 1798 + 1799 + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); 1800 + err: 1801 + bch2_trans_iter_exit(trans, &iter); 1802 + return ret; 1803 + } 1804 + 1805 + static void 
bch2_do_discards_fast_work(struct work_struct *work) 1806 + { 1807 + struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); 1808 + 1809 + while (1) { 1810 + bool got_bucket = false; 1811 + struct bpos bucket; 1812 + struct bch_dev *ca; 1813 + 1814 + mutex_lock(&c->discard_buckets_in_flight_lock); 1815 + darray_for_each(c->discard_buckets_in_flight, i) { 1816 + if (i->snapshot) 1817 + continue; 1818 + 1819 + ca = bch_dev_bkey_exists(c, i->inode); 1820 + 1821 + if (!percpu_ref_tryget(&ca->io_ref)) { 1822 + darray_remove_item(&c->discard_buckets_in_flight, i); 1823 + continue; 1824 + } 1825 + 1826 + got_bucket = true; 1827 + bucket = *i; 1828 + i->snapshot = true; 1829 + break; 1830 + } 1831 + mutex_unlock(&c->discard_buckets_in_flight_lock); 1832 + 1833 + if (!got_bucket) 1834 + break; 1835 + 1836 + if (ca->mi.discard && !c->opts.nochanges) 1837 + blkdev_issue_discard(ca->disk_sb.bdev, 1838 + bucket.offset * ca->mi.bucket_size, 1839 + ca->mi.bucket_size, 1840 + GFP_KERNEL); 1841 + 1842 + int ret = bch2_trans_do(c, NULL, NULL, 1843 + BCH_WATERMARK_btree| 1844 + BCH_TRANS_COMMIT_no_enospc, 1845 + bch2_clear_bucket_needs_discard(trans, bucket)); 1846 + bch_err_fn(c, ret); 1847 + 1848 + percpu_ref_put(&ca->io_ref); 1849 + discard_in_flight_remove(c, bucket); 1850 + 1851 + if (ret) 1852 + break; 1853 + } 1854 + 1855 + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); 1856 + } 1857 + 1858 + static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) 1859 + { 1860 + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); 1861 + 1862 + if (!percpu_ref_is_dying(&ca->io_ref) && 1863 + !discard_in_flight_add(c, bucket) && 1864 + bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && 1865 + !queue_work(c->write_ref_wq, &c->discard_fast_work)) 1866 + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); 1821 1867 } 1822 1868 1823 1869 static int invalidate_one_bucket(struct btree_trans *trans, ··· 2338 2210 set_bit(ca->dev_idx, 
c->rw_devs[i].d); 2339 2211 } 2340 2212 2213 + void bch2_fs_allocator_background_exit(struct bch_fs *c) 2214 + { 2215 + darray_exit(&c->discard_buckets_in_flight); 2216 + } 2217 + 2341 2218 void bch2_fs_allocator_background_init(struct bch_fs *c) 2342 2219 { 2343 2220 spin_lock_init(&c->freelist_lock); 2221 + mutex_init(&c->discard_buckets_in_flight_lock); 2344 2222 INIT_WORK(&c->discard_work, bch2_do_discards_work); 2223 + INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); 2345 2224 INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); 2346 2225 }
+1
fs/bcachefs/alloc_background.h
··· 269 269 void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); 270 270 void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); 271 271 272 + void bch2_fs_allocator_background_exit(struct bch_fs *); 272 273 void bch2_fs_allocator_background_init(struct bch_fs *); 273 274 274 275 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
+4 -9
fs/bcachefs/alloc_foreground.c
··· 236 236 if (cl) 237 237 closure_wait(&c->open_buckets_wait, cl); 238 238 239 - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], 240 - &c->blocked_allocate_open_bucket, true); 239 + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); 241 240 spin_unlock(&c->freelist_lock); 242 241 return ERR_PTR(-BCH_ERR_open_buckets_empty); 243 242 } ··· 262 263 ca->nr_open_buckets++; 263 264 bch2_open_bucket_hash_add(c, ob); 264 265 265 - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], 266 - &c->blocked_allocate_open_bucket, false); 267 - 268 - track_event_change(&c->times[BCH_TIME_blocked_allocate], 269 - &c->blocked_allocate, false); 266 + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); 267 + track_event_change(&c->times[BCH_TIME_blocked_allocate], false); 270 268 271 269 spin_unlock(&c->freelist_lock); 272 270 return ob; ··· 551 555 goto again; 552 556 } 553 557 554 - track_event_change(&c->times[BCH_TIME_blocked_allocate], 555 - &c->blocked_allocate, true); 558 + track_event_change(&c->times[BCH_TIME_blocked_allocate], true); 556 559 557 560 ob = ERR_PTR(-BCH_ERR_freelist_empty); 558 561 goto err;
+49 -94
fs/bcachefs/backpointers.c
··· 131 131 printbuf_exit(&buf); 132 132 133 133 if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { 134 - bch2_inconsistent_error(c); 135 - return -EIO; 134 + return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0; 136 135 } else { 137 136 return 0; 138 137 } ··· 477 478 prt_printf(&buf, "\nbp pos "); 478 479 bch2_bpos_to_text(&buf, bp_iter.pos); 479 480 480 - if (c->opts.reconstruct_alloc || 481 - fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) 481 + if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) 482 482 ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); 483 483 484 484 goto out; ··· 553 555 }; 554 556 } 555 557 556 - static size_t btree_nodes_fit_in_ram(struct bch_fs *c) 558 + static u64 mem_may_pin_bytes(struct bch_fs *c) 557 559 { 558 560 struct sysinfo i; 559 - u64 mem_bytes; 560 - 561 561 si_meminfo(&i); 562 - mem_bytes = i.totalram * i.mem_unit; 563 - return div_u64(mem_bytes >> 1, c->opts.btree_node_size); 562 + 563 + u64 mem_bytes = i.totalram * i.mem_unit; 564 + return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); 565 + } 566 + 567 + static size_t btree_nodes_fit_in_ram(struct bch_fs *c) 568 + { 569 + return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); 564 570 } 565 571 566 572 static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, 567 - unsigned btree_leaf_mask, 568 - unsigned btree_interior_mask, 573 + u64 btree_leaf_mask, 574 + u64 btree_interior_mask, 569 575 struct bbpos start, struct bbpos *end) 570 576 { 571 - struct btree_iter iter; 572 - struct bkey_s_c k; 573 - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); 574 - enum btree_id btree; 577 + struct bch_fs *c = trans->c; 578 + s64 mem_may_pin = mem_may_pin_bytes(c); 575 579 int ret = 0; 576 580 577 - for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { 578 - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 
1 : 2; 581 + btree_interior_mask |= btree_leaf_mask; 582 + 583 + c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; 584 + c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; 585 + c->btree_cache.pinned_nodes_start = start; 586 + c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; 587 + 588 + for (enum btree_id btree = start.btree; 589 + btree < BTREE_ID_NR && !ret; 590 + btree++) { 591 + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1; 592 + struct btree_iter iter; 593 + struct btree *b; 579 594 580 595 if (!((1U << btree) & btree_leaf_mask) && 581 596 !((1U << btree) & btree_interior_mask)) 582 597 continue; 583 598 584 - bch2_trans_node_iter_init(trans, &iter, btree, 585 - btree == start.btree ? start.pos : POS_MIN, 586 - 0, depth, 0); 587 - /* 588 - * for_each_btree_key_contineu() doesn't check the return value 589 - * from bch2_btree_iter_advance(), which is needed when 590 - * iterating over interior nodes where we'll see keys at 591 - * SPOS_MAX: 592 - */ 593 - do { 594 - k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); 595 - ret = bkey_err(k); 596 - if (!k.k || ret) 597 - break; 598 - 599 - --btree_nodes; 600 - if (!btree_nodes) { 601 - *end = BBPOS(btree, k.k->p); 599 + __for_each_btree_node(trans, iter, btree, 600 + btree == start.btree ? start.pos : POS_MIN, 601 + 0, depth, BTREE_ITER_PREFETCH, b, ret) { 602 + mem_may_pin -= btree_buf_bytes(b); 603 + if (mem_may_pin <= 0) { 604 + c->btree_cache.pinned_nodes_end = *end = 605 + BBPOS(btree, b->key.k.p); 602 606 bch2_trans_iter_exit(trans, &iter); 603 607 return 0; 604 608 } 605 - } while (bch2_btree_iter_advance(&iter)); 609 + } 606 610 bch2_trans_iter_exit(trans, &iter); 607 611 } 608 612 609 - *end = BBPOS_MAX; 610 613 return ret; 611 614 } 612 615 ··· 665 666 return 0; 666 667 } 667 668 668 - static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, 669 - struct bpos bucket) 670 - { 671 - return bch2_dev_exists2(c, bucket.inode) 672 - ? 
bucket_pos_to_bp(c, bucket, 0) 673 - : bucket; 674 - } 675 - 676 - static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, 677 - struct bpos start, struct bpos *end) 678 - { 679 - struct btree_iter alloc_iter; 680 - struct btree_iter bp_iter; 681 - struct bkey_s_c alloc_k, bp_k; 682 - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); 683 - bool alloc_end = false, bp_end = false; 684 - int ret = 0; 685 - 686 - bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, 687 - start, 0, 1, 0); 688 - bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, 689 - bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); 690 - while (1) { 691 - alloc_k = !alloc_end 692 - ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) 693 - : bkey_s_c_null; 694 - bp_k = !bp_end 695 - ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) 696 - : bkey_s_c_null; 697 - 698 - ret = bkey_err(alloc_k) ?: bkey_err(bp_k); 699 - if ((!alloc_k.k && !bp_k.k) || ret) { 700 - *end = SPOS_MAX; 701 - break; 702 - } 703 - 704 - --btree_nodes; 705 - if (!btree_nodes) { 706 - *end = alloc_k.k ? 
alloc_k.k->p : SPOS_MAX; 707 - break; 708 - } 709 - 710 - if (bpos_lt(alloc_iter.pos, SPOS_MAX) && 711 - bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { 712 - if (!bch2_btree_iter_advance(&alloc_iter)) 713 - alloc_end = true; 714 - } else { 715 - if (!bch2_btree_iter_advance(&bp_iter)) 716 - bp_end = true; 717 - } 718 - } 719 - bch2_trans_iter_exit(trans, &bp_iter); 720 - bch2_trans_iter_exit(trans, &alloc_iter); 721 - return ret; 722 - } 723 - 724 669 int bch2_check_extents_to_backpointers(struct bch_fs *c) 725 670 { 726 671 struct btree_trans *trans = bch2_trans_get(c); ··· 675 732 bkey_init(&s.last_flushed.k->k); 676 733 677 734 while (1) { 678 - ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); 735 + struct bbpos end; 736 + ret = bch2_get_btree_in_memory_pos(trans, 737 + BIT_ULL(BTREE_ID_backpointers), 738 + BIT_ULL(BTREE_ID_backpointers), 739 + BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); 679 740 if (ret) 680 741 break; 742 + 743 + s.bucket_end = end.pos; 681 744 682 745 if ( bpos_eq(s.bucket_start, POS_MIN) && 683 746 !bpos_eq(s.bucket_end, SPOS_MAX)) ··· 711 762 } 712 763 bch2_trans_put(trans); 713 764 bch2_bkey_buf_exit(&s.last_flushed, c); 765 + 766 + c->btree_cache.pinned_nodes_leaf_mask = 0; 767 + c->btree_cache.pinned_nodes_interior_mask = 0; 714 768 715 769 bch_err_fn(c, ret); 716 770 return ret; ··· 819 867 start = bbpos_successor(end); 820 868 } 821 869 bch2_trans_put(trans); 870 + 871 + c->btree_cache.pinned_nodes_leaf_mask = 0; 872 + c->btree_cache.pinned_nodes_interior_mask = 0; 822 873 823 874 bch_err_fn(c, ret); 824 875 return ret;
+1 -1
fs/bcachefs/bbpos_types.h
··· 13 13 } 14 14 15 15 #define BBPOS_MIN BBPOS(0, POS_MIN) 16 - #define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) 16 + #define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) 17 17 18 18 #endif /* _BCACHEFS_BBPOS_TYPES_H */
+14 -7
fs/bcachefs/bcachefs.h
··· 212 212 #include "recovery_types.h" 213 213 #include "sb-errors_types.h" 214 214 #include "seqmutex.h" 215 + #include "time_stats.h" 215 216 #include "util.h" 216 217 217 218 #ifdef CONFIG_BCACHEFS_DEBUG ··· 265 264 #endif 266 265 267 266 #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") 267 + 268 + __printf(2, 3) 269 + void bch2_print_opts(struct bch_opts *, const char *, ...); 268 270 269 271 __printf(2, 3) 270 272 void __bch2_print(struct bch_fs *c, const char *fmt, ...); ··· 508 504 GC_PHASE_BTREE_deleted_inodes, 509 505 GC_PHASE_BTREE_logged_ops, 510 506 GC_PHASE_BTREE_rebalance_work, 507 + GC_PHASE_BTREE_subvolume_children, 511 508 512 509 GC_PHASE_PENDING_DELETE, 513 510 }; ··· 598 593 599 594 /* The rest of this all shows up in sysfs */ 600 595 atomic64_t cur_latency[2]; 601 - struct bch2_time_stats io_latency[2]; 596 + struct bch2_time_stats_quantiles io_latency[2]; 602 597 603 598 #define CONGESTED_MAX 1024 604 599 atomic_t congested; ··· 668 663 }; 669 664 670 665 struct journal_keys { 666 + /* must match layout in darray_types.h */ 667 + size_t nr, size; 671 668 struct journal_key { 672 669 u64 journal_seq; 673 670 u32 journal_offset; ··· 678 671 bool allocated; 679 672 bool overwritten; 680 673 struct bkey_i *k; 681 - } *d; 674 + } *data; 682 675 /* 683 676 * Gap buffer: instead of all the empty space in the array being at the 684 677 * end of the buffer - from @nr to @size - the empty space is at @gap. 685 678 * This means that sequential insertions are O(n) instead of O(n^2). 
686 679 */ 687 680 size_t gap; 688 - size_t nr; 689 - size_t size; 690 681 atomic_t ref; 691 682 bool initial_ref_held; 692 683 }; ··· 708 703 x(reflink) \ 709 704 x(fallocate) \ 710 705 x(discard) \ 706 + x(discard_fast) \ 711 707 x(invalidate) \ 712 708 x(delete_dead_snapshots) \ 713 709 x(snapshot_delete_pagecache) \ ··· 925 919 /* ALLOCATOR */ 926 920 spinlock_t freelist_lock; 927 921 struct closure_waitlist freelist_wait; 928 - u64 blocked_allocate; 929 - u64 blocked_allocate_open_bucket; 930 922 931 923 open_bucket_idx_t open_buckets_freelist; 932 924 open_bucket_idx_t open_buckets_nr_free; ··· 944 940 unsigned write_points_nr; 945 941 946 942 struct buckets_waiting_for_journal buckets_waiting_for_journal; 947 - struct work_struct discard_work; 948 943 struct work_struct invalidate_work; 944 + struct work_struct discard_work; 945 + struct mutex discard_buckets_in_flight_lock; 946 + DARRAY(struct bpos) discard_buckets_in_flight; 947 + struct work_struct discard_fast_work; 949 948 950 949 /* GARBAGE COLLECTION */ 951 950 struct task_struct *gc_thread;
+48 -5
fs/bcachefs/bcachefs_format.h
··· 189 189 __u32 hi; 190 190 __u64 lo; 191 191 #endif 192 - } __packed __aligned(4); 192 + } __packed 193 + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 194 + __aligned(4) 195 + #endif 196 + ; 193 197 194 198 struct bkey { 195 199 /* Size of combined key and value, in u64s */ ··· 226 222 227 223 __u8 pad[1]; 228 224 #endif 229 - } __packed __aligned(8); 225 + } __packed 226 + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 227 + /* 228 + * The big-endian version of bkey can't be compiled by rustc with the "aligned" 229 + * attr since it doesn't allow types to have both "packed" and "aligned" attrs. 230 + * So for Rust compatibility, don't include this. It can be included in the LE 231 + * version because the "packed" attr is redundant in that case. 232 + * 233 + * History: (quoting Kent) 234 + * 235 + * Specifically, when i was designing bkey, I wanted the header to be no 236 + * bigger than necessary so that bkey_packed could use the rest. That means that 237 + * decently offten extent keys will fit into only 8 bytes, instead of spilling over 238 + * to 16. 239 + * 240 + * But packed_bkey treats the part after the header - the packed section - 241 + * as a single multi word, variable length integer. And bkey, the unpacked 242 + * version, is just a special case version of a bkey_packed; all the packed 243 + * bkey code will work on keys in any packed format, the in-memory 244 + * representation of an unpacked key also is just one type of packed key... 245 + * 246 + * So that constrains the key part of a bkig endian bkey to start right 247 + * after the header. 248 + * 249 + * If we ever do a bkey_v2 and need to expand the hedaer by another byte for 250 + * some reason - that will clean up this wart. 
251 + */ 252 + __aligned(8) 253 + #endif 254 + ; 230 255 231 256 struct bkey_packed { 232 257 __u64 _data[0]; ··· 873 840 x(snapshot_skiplists, BCH_VERSION(1, 1)) \ 874 841 x(deleted_inodes, BCH_VERSION(1, 2)) \ 875 842 x(rebalance_work, BCH_VERSION(1, 3)) \ 876 - x(member_seq, BCH_VERSION(1, 4)) 843 + x(member_seq, BCH_VERSION(1, 4)) \ 844 + x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ 845 + x(btree_subvolume_children, BCH_VERSION(1, 6)) 877 846 878 847 enum bcachefs_metadata_version { 879 848 bcachefs_metadata_version_min = 9, ··· 1310 1275 x(dev_usage, 8) \ 1311 1276 x(log, 9) \ 1312 1277 x(overwrite, 10) \ 1313 - x(write_buffer_keys, 11) 1278 + x(write_buffer_keys, 11) \ 1279 + x(datetime, 12) 1314 1280 1315 1281 enum { 1316 1282 #define x(f, nr) BCH_JSET_ENTRY_##f = nr, ··· 1410 1374 struct jset_entry_log { 1411 1375 struct jset_entry entry; 1412 1376 u8 d[]; 1377 + } __packed __aligned(8); 1378 + 1379 + struct jset_entry_datetime { 1380 + struct jset_entry entry; 1381 + __le64 seconds; 1413 1382 } __packed __aligned(8); 1414 1383 1415 1384 /* ··· 1523 1482 BIT_ULL(KEY_TYPE_logged_op_truncate)| \ 1524 1483 BIT_ULL(KEY_TYPE_logged_op_finsert)) \ 1525 1484 x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ 1526 - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) 1485 + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ 1486 + x(subvolume_children, 19, 0, \ 1487 + BIT_ULL(KEY_TYPE_set)) 1527 1488 1528 1489 enum btree_id { 1529 1490 #define x(name, nr, ...) BTREE_ID_##name = nr,
+2 -205
fs/bcachefs/bkey.h
··· 4 4 5 5 #include <linux/bug.h> 6 6 #include "bcachefs_format.h" 7 - 7 + #include "bkey_types.h" 8 8 #include "btree_types.h" 9 9 #include "util.h" 10 10 #include "vstructs.h" ··· 30 30 void bch2_bkey_packed_to_binary_text(struct printbuf *, 31 31 const struct bkey_format *, 32 32 const struct bkey_packed *); 33 - 34 - /* bkey with split value, const */ 35 - struct bkey_s_c { 36 - const struct bkey *k; 37 - const struct bch_val *v; 38 - }; 39 - 40 - /* bkey with split value */ 41 - struct bkey_s { 42 - union { 43 - struct { 44 - struct bkey *k; 45 - struct bch_val *v; 46 - }; 47 - struct bkey_s_c s_c; 48 - }; 49 - }; 50 - 51 - #define bkey_p_next(_k) vstruct_next(_k) 52 - 53 - static inline struct bkey_i *bkey_next(struct bkey_i *k) 54 - { 55 - return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); 56 - } 57 - 58 - #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) 59 - 60 - static inline size_t bkey_val_bytes(const struct bkey *k) 61 - { 62 - return bkey_val_u64s(k) * sizeof(u64); 63 - } 64 - 65 - static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) 66 - { 67 - unsigned u64s = BKEY_U64s + val_u64s; 68 - 69 - BUG_ON(u64s > U8_MAX); 70 - k->u64s = u64s; 71 - } 72 - 73 - static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) 74 - { 75 - set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); 76 - } 77 - 78 - #define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) 79 - 80 - #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) 81 - 82 - #define bkey_whiteout(_k) \ 83 - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) 84 33 85 34 enum bkey_lr_packed { 86 35 BKEY_PACKED_BOTH, ··· 311 362 static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, 312 363 const struct bkey_packed *k) 313 364 { 314 - unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; 315 - 316 - EBUG_ON(k->u64s < ret); 317 - return ret; 365 + return bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; 318 366 } 319 367 320 368 static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, ··· 498 552 dst->k = *src.k; 499 553 memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); 500 554 } 501 - 502 - #define bkey_s_null ((struct bkey_s) { .k = NULL }) 503 - #define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) 504 - 505 - #define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) 506 - #define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) 507 - 508 - static inline struct bkey_s bkey_to_s(struct bkey *k) 509 - { 510 - return (struct bkey_s) { .k = k, .v = NULL }; 511 - } 512 - 513 - static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) 514 - { 515 - return (struct bkey_s_c) { .k = k, .v = NULL }; 516 - } 517 - 518 - static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) 519 - { 520 - return (struct bkey_s) { .k = &k->k, .v = &k->v }; 521 - } 522 - 523 - static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) 524 - { 525 - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; 526 - } 527 - 528 - /* 529 - * For a given type of value (e.g. struct bch_extent), generates the types for 530 - * bkey + bch_extent - inline, split, split const - and also all the conversion 531 - * functions, which also check that the value is of the correct type. 532 - * 533 - * We use anonymous unions for upcasting - e.g. converting from e.g. a 534 - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion 535 - * functions. 536 - */ 537 - #define x(name, ...) 
\ 538 - struct bkey_i_##name { \ 539 - union { \ 540 - struct bkey k; \ 541 - struct bkey_i k_i; \ 542 - }; \ 543 - struct bch_##name v; \ 544 - }; \ 545 - \ 546 - struct bkey_s_c_##name { \ 547 - union { \ 548 - struct { \ 549 - const struct bkey *k; \ 550 - const struct bch_##name *v; \ 551 - }; \ 552 - struct bkey_s_c s_c; \ 553 - }; \ 554 - }; \ 555 - \ 556 - struct bkey_s_##name { \ 557 - union { \ 558 - struct { \ 559 - struct bkey *k; \ 560 - struct bch_##name *v; \ 561 - }; \ 562 - struct bkey_s_c_##name c; \ 563 - struct bkey_s s; \ 564 - struct bkey_s_c s_c; \ 565 - }; \ 566 - }; \ 567 - \ 568 - static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ 569 - { \ 570 - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ 571 - return container_of(&k->k, struct bkey_i_##name, k); \ 572 - } \ 573 - \ 574 - static inline const struct bkey_i_##name * \ 575 - bkey_i_to_##name##_c(const struct bkey_i *k) \ 576 - { \ 577 - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ 578 - return container_of(&k->k, struct bkey_i_##name, k); \ 579 - } \ 580 - \ 581 - static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ 582 - { \ 583 - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ 584 - return (struct bkey_s_##name) { \ 585 - .k = k.k, \ 586 - .v = container_of(k.v, struct bch_##name, v), \ 587 - }; \ 588 - } \ 589 - \ 590 - static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ 591 - { \ 592 - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ 593 - return (struct bkey_s_c_##name) { \ 594 - .k = k.k, \ 595 - .v = container_of(k.v, struct bch_##name, v), \ 596 - }; \ 597 - } \ 598 - \ 599 - static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ 600 - { \ 601 - return (struct bkey_s_##name) { \ 602 - .k = &k->k, \ 603 - .v = &k->v, \ 604 - }; \ 605 - } \ 606 - \ 607 - static inline struct bkey_s_c_##name \ 608 - name##_i_to_s_c(const struct 
bkey_i_##name *k) \ 609 - { \ 610 - return (struct bkey_s_c_##name) { \ 611 - .k = &k->k, \ 612 - .v = &k->v, \ 613 - }; \ 614 - } \ 615 - \ 616 - static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ 617 - { \ 618 - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ 619 - return (struct bkey_s_##name) { \ 620 - .k = &k->k, \ 621 - .v = container_of(&k->v, struct bch_##name, v), \ 622 - }; \ 623 - } \ 624 - \ 625 - static inline struct bkey_s_c_##name \ 626 - bkey_i_to_s_c_##name(const struct bkey_i *k) \ 627 - { \ 628 - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ 629 - return (struct bkey_s_c_##name) { \ 630 - .k = &k->k, \ 631 - .v = container_of(&k->v, struct bch_##name, v), \ 632 - }; \ 633 - } \ 634 - \ 635 - static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ 636 - { \ 637 - struct bkey_i_##name *k = \ 638 - container_of(&_k->k, struct bkey_i_##name, k); \ 639 - \ 640 - bkey_init(&k->k); \ 641 - memset(&k->v, 0, sizeof(k->v)); \ 642 - k->k.type = KEY_TYPE_##name; \ 643 - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ 644 - \ 645 - return k; \ 646 - } 647 - 648 - BCH_BKEY_TYPES(); 649 - #undef x 650 555 651 556 /* byte order helpers */ 652 557
+213
fs/bcachefs/bkey_types.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _BCACHEFS_BKEY_TYPES_H 3 + #define _BCACHEFS_BKEY_TYPES_H 4 + 5 + #include "bcachefs_format.h" 6 + 7 + /* 8 + * bkey_i - bkey with inline value 9 + * bkey_s - bkey with split value 10 + * bkey_s_c - bkey with split value, const 11 + */ 12 + 13 + #define bkey_p_next(_k) vstruct_next(_k) 14 + 15 + static inline struct bkey_i *bkey_next(struct bkey_i *k) 16 + { 17 + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); 18 + } 19 + 20 + #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) 21 + 22 + static inline size_t bkey_val_bytes(const struct bkey *k) 23 + { 24 + return bkey_val_u64s(k) * sizeof(u64); 25 + } 26 + 27 + static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) 28 + { 29 + unsigned u64s = BKEY_U64s + val_u64s; 30 + 31 + BUG_ON(u64s > U8_MAX); 32 + k->u64s = u64s; 33 + } 34 + 35 + static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) 36 + { 37 + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); 38 + } 39 + 40 + #define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) 41 + 42 + #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) 43 + 44 + #define bkey_whiteout(_k) \ 45 + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) 46 + 47 + /* bkey with split value, const */ 48 + struct bkey_s_c { 49 + const struct bkey *k; 50 + const struct bch_val *v; 51 + }; 52 + 53 + /* bkey with split value */ 54 + struct bkey_s { 55 + union { 56 + struct { 57 + struct bkey *k; 58 + struct bch_val *v; 59 + }; 60 + struct bkey_s_c s_c; 61 + }; 62 + }; 63 + 64 + #define bkey_s_null ((struct bkey_s) { .k = NULL }) 65 + #define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) 66 + 67 + #define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) 68 + #define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) 69 + 70 + static inline struct bkey_s bkey_to_s(struct bkey *k) 71 + { 72 + return (struct bkey_s) { .k = k, .v = NULL }; 
73 + } 74 + 75 + static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) 76 + { 77 + return (struct bkey_s_c) { .k = k, .v = NULL }; 78 + } 79 + 80 + static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) 81 + { 82 + return (struct bkey_s) { .k = &k->k, .v = &k->v }; 83 + } 84 + 85 + static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) 86 + { 87 + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; 88 + } 89 + 90 + /* 91 + * For a given type of value (e.g. struct bch_extent), generates the types for 92 + * bkey + bch_extent - inline, split, split const - and also all the conversion 93 + * functions, which also check that the value is of the correct type. 94 + * 95 + * We use anonymous unions for upcasting - e.g. converting from e.g. a 96 + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion 97 + * functions. 98 + */ 99 + #define x(name, ...) \ 100 + struct bkey_i_##name { \ 101 + union { \ 102 + struct bkey k; \ 103 + struct bkey_i k_i; \ 104 + }; \ 105 + struct bch_##name v; \ 106 + }; \ 107 + \ 108 + struct bkey_s_c_##name { \ 109 + union { \ 110 + struct { \ 111 + const struct bkey *k; \ 112 + const struct bch_##name *v; \ 113 + }; \ 114 + struct bkey_s_c s_c; \ 115 + }; \ 116 + }; \ 117 + \ 118 + struct bkey_s_##name { \ 119 + union { \ 120 + struct { \ 121 + struct bkey *k; \ 122 + struct bch_##name *v; \ 123 + }; \ 124 + struct bkey_s_c_##name c; \ 125 + struct bkey_s s; \ 126 + struct bkey_s_c s_c; \ 127 + }; \ 128 + }; \ 129 + \ 130 + static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ 131 + { \ 132 + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ 133 + return container_of(&k->k, struct bkey_i_##name, k); \ 134 + } \ 135 + \ 136 + static inline const struct bkey_i_##name * \ 137 + bkey_i_to_##name##_c(const struct bkey_i *k) \ 138 + { \ 139 + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ 140 + return container_of(&k->k, struct bkey_i_##name, k); \ 141 
+ } \ 142 + \ 143 + static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ 144 + { \ 145 + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ 146 + return (struct bkey_s_##name) { \ 147 + .k = k.k, \ 148 + .v = container_of(k.v, struct bch_##name, v), \ 149 + }; \ 150 + } \ 151 + \ 152 + static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ 153 + { \ 154 + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ 155 + return (struct bkey_s_c_##name) { \ 156 + .k = k.k, \ 157 + .v = container_of(k.v, struct bch_##name, v), \ 158 + }; \ 159 + } \ 160 + \ 161 + static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ 162 + { \ 163 + return (struct bkey_s_##name) { \ 164 + .k = &k->k, \ 165 + .v = &k->v, \ 166 + }; \ 167 + } \ 168 + \ 169 + static inline struct bkey_s_c_##name \ 170 + name##_i_to_s_c(const struct bkey_i_##name *k) \ 171 + { \ 172 + return (struct bkey_s_c_##name) { \ 173 + .k = &k->k, \ 174 + .v = &k->v, \ 175 + }; \ 176 + } \ 177 + \ 178 + static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ 179 + { \ 180 + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ 181 + return (struct bkey_s_##name) { \ 182 + .k = &k->k, \ 183 + .v = container_of(&k->v, struct bch_##name, v), \ 184 + }; \ 185 + } \ 186 + \ 187 + static inline struct bkey_s_c_##name \ 188 + bkey_i_to_s_c_##name(const struct bkey_i *k) \ 189 + { \ 190 + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ 191 + return (struct bkey_s_c_##name) { \ 192 + .k = &k->k, \ 193 + .v = container_of(&k->v, struct bch_##name, v), \ 194 + }; \ 195 + } \ 196 + \ 197 + static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ 198 + { \ 199 + struct bkey_i_##name *k = \ 200 + container_of(&_k->k, struct bkey_i_##name, k); \ 201 + \ 202 + bkey_init(&k->k); \ 203 + memset(&k->v, 0, sizeof(k->v)); \ 204 + k->k.type = KEY_TYPE_##name; \ 205 + set_bkey_val_bytes(&k->k, 
sizeof(k->v)); \ 206 + \ 207 + return k; \ 208 + } 209 + 210 + BCH_BKEY_TYPES(); 211 + #undef x 212 + 213 + #endif /* _BCACHEFS_BKEY_TYPES_H */
+27 -10
fs/bcachefs/btree_cache.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 3 #include "bcachefs.h" 4 + #include "bbpos.h" 4 5 #include "bkey_buf.h" 5 6 #include "btree_cache.h" 6 7 #include "btree_io.h" ··· 61 60 62 61 clear_btree_node_just_written(b); 63 62 64 - kvpfree(b->data, btree_buf_bytes(b)); 63 + kvfree(b->data); 65 64 b->data = NULL; 66 65 #ifdef __KERNEL__ 67 66 kvfree(b->aux_data); ··· 95 94 { 96 95 BUG_ON(b->data || b->aux_data); 97 96 98 - b->data = kvpmalloc(btree_buf_bytes(b), gfp); 97 + b->data = kvmalloc(btree_buf_bytes(b), gfp); 99 98 if (!b->data) 100 99 return -BCH_ERR_ENOMEM_btree_node_mem_alloc; 101 100 #ifdef __KERNEL__ ··· 108 107 b->aux_data = NULL; 109 108 #endif 110 109 if (!b->aux_data) { 111 - kvpfree(b->data, btree_buf_bytes(b)); 110 + kvfree(b->data); 112 111 b->data = NULL; 113 112 return -BCH_ERR_ENOMEM_btree_node_mem_alloc; 114 113 } ··· 209 208 int ret = 0; 210 209 211 210 lockdep_assert_held(&bc->lock); 211 + 212 + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); 213 + 214 + u64 mask = b->c.level 215 + ? 
bc->pinned_nodes_interior_mask 216 + : bc->pinned_nodes_leaf_mask; 217 + 218 + if ((mask & BIT_ULL(b->c.btree_id)) && 219 + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && 220 + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) 221 + return -BCH_ERR_ENOMEM_btree_node_reclaim; 222 + 212 223 wait_on_io: 213 224 if (b->flags & ((1U << BTREE_NODE_dirty)| 214 225 (1U << BTREE_NODE_read_in_flight)| ··· 421 408 if (c->verify_data) 422 409 list_move(&c->verify_data->list, &bc->live); 423 410 424 - kvpfree(c->verify_ondisk, c->opts.btree_node_size); 411 + kvfree(c->verify_ondisk); 425 412 426 413 for (i = 0; i < btree_id_nr_alive(c); i++) { 427 414 struct btree_root *r = bch2_btree_id_root(c, i); ··· 724 711 b = bch2_btree_node_mem_alloc(trans, level != 0); 725 712 726 713 if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { 714 + if (!path) 715 + return b; 716 + 727 717 trans->memory_allocation_failure = true; 728 718 trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); 729 719 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); ··· 776 760 } 777 761 778 762 if (!six_relock_type(&b->c.lock, lock_type, seq)) { 779 - if (path) 780 - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); 763 + BUG_ON(!path); 764 + 765 + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); 781 766 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); 782 767 } 783 768 ··· 918 901 919 902 if (unlikely(btree_node_read_error(b))) { 920 903 six_unlock_type(&b->c.lock, lock_type); 921 - return ERR_PTR(-EIO); 904 + return ERR_PTR(-BCH_ERR_btree_node_read_error); 922 905 } 923 906 924 907 EBUG_ON(b->c.btree_id != path->btree_id); ··· 1009 992 1010 993 if (unlikely(btree_node_read_error(b))) { 1011 994 six_unlock_type(&b->c.lock, lock_type); 1012 - return ERR_PTR(-EIO); 995 + return ERR_PTR(-BCH_ERR_btree_node_read_error); 1013 996 } 1014 997 1015 998 
EBUG_ON(b->c.btree_id != path->btree_id); ··· 1092 1075 1093 1076 if (unlikely(btree_node_read_error(b))) { 1094 1077 six_unlock_read(&b->c.lock); 1095 - b = ERR_PTR(-EIO); 1078 + b = ERR_PTR(-BCH_ERR_btree_node_read_error); 1096 1079 goto out; 1097 1080 } 1098 1081 ··· 1113 1096 struct btree_cache *bc = &c->btree_cache; 1114 1097 struct btree *b; 1115 1098 1116 - BUG_ON(trans && !btree_node_locked(path, level + 1)); 1099 + BUG_ON(path && !btree_node_locked(path, level + 1)); 1117 1100 BUG_ON(level >= BTREE_MAX_DEPTH); 1118 1101 1119 1102 b = btree_cache_find(bc, k);
+73 -78
fs/bcachefs/btree_gc.c
··· 389 389 have_child = dropped_children = false; 390 390 bch2_bkey_buf_init(&prev_k); 391 391 bch2_bkey_buf_init(&cur_k); 392 - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); 392 + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); 393 + iter.prefetch = true; 393 394 394 395 while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { 395 396 BUG_ON(bpos_lt(k.k->p, b->data->min_key)); ··· 407 406 printbuf_reset(&buf); 408 407 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); 409 408 410 - if (mustfix_fsck_err_on(ret == -EIO, c, 409 + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c, 411 410 btree_node_unreadable, 412 411 "Topology repair: unreadable btree node at btree %s level %u:\n" 413 412 " %s", ··· 479 478 goto err; 480 479 481 480 bch2_btree_and_journal_iter_exit(&iter); 482 - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); 481 + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); 482 + iter.prefetch = true; 483 483 484 484 while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { 485 485 bch2_bkey_buf_reassemble(&cur_k, c, k); ··· 593 591 struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); 594 592 enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); 595 593 596 - if (!g->gen_valid && 597 - (c->opts.reconstruct_alloc || 598 - fsck_err(c, ptr_to_missing_alloc_key, 599 - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" 600 - "while marking %s", 601 - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), 602 - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), 603 - p.ptr.gen, 604 - (printbuf_reset(&buf), 605 - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { 594 + if (fsck_err_on(!g->gen_valid, 595 + c, ptr_to_missing_alloc_key, 596 + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" 597 + "while marking %s", 598 + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), 599 + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), 600 + p.ptr.gen, 601 + (printbuf_reset(&buf), 602 + bch2_bkey_val_to_text(&buf, 
c, *k), buf.buf))) { 606 603 if (!p.ptr.cached) { 607 604 g->gen_valid = true; 608 605 g->gen = p.ptr.gen; ··· 610 609 } 611 610 } 612 611 613 - if (gen_cmp(p.ptr.gen, g->gen) > 0 && 614 - (c->opts.reconstruct_alloc || 615 - fsck_err(c, ptr_gen_newer_than_bucket_gen, 616 - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" 617 - "while marking %s", 618 - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), 619 - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), 620 - p.ptr.gen, g->gen, 621 - (printbuf_reset(&buf), 622 - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { 612 + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, 613 + c, ptr_gen_newer_than_bucket_gen, 614 + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" 615 + "while marking %s", 616 + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), 617 + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), 618 + p.ptr.gen, g->gen, 619 + (printbuf_reset(&buf), 620 + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { 623 621 if (!p.ptr.cached) { 624 622 g->gen_valid = true; 625 623 g->gen = p.ptr.gen; ··· 631 631 } 632 632 } 633 633 634 - if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && 635 - (c->opts.reconstruct_alloc || 636 - fsck_err(c, ptr_gen_newer_than_bucket_gen, 637 - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" 638 - "while marking %s", 639 - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, 640 - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), 641 - p.ptr.gen, 642 - (printbuf_reset(&buf), 643 - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) 634 + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, 635 + c, ptr_gen_newer_than_bucket_gen, 636 + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" 637 + "while marking %s", 638 + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, 639 + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), 640 + p.ptr.gen, 641 + (printbuf_reset(&buf), 642 + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) 644 643 do_update = true; 645 644 646 - if (!p.ptr.cached && 
gen_cmp(p.ptr.gen, g->gen) < 0 && 647 - (c->opts.reconstruct_alloc || 648 - fsck_err(c, stale_dirty_ptr, 649 - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" 650 - "while marking %s", 651 - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), 652 - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), 653 - p.ptr.gen, g->gen, 654 - (printbuf_reset(&buf), 655 - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) 645 + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, 646 + c, stale_dirty_ptr, 647 + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" 648 + "while marking %s", 649 + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), 650 + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), 651 + p.ptr.gen, g->gen, 652 + (printbuf_reset(&buf), 653 + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) 656 654 do_update = true; 657 655 658 656 if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) ··· 929 931 struct printbuf buf = PRINTBUF; 930 932 int ret = 0; 931 933 932 - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); 934 + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); 933 935 bch2_bkey_buf_init(&prev); 934 936 bch2_bkey_buf_init(&cur); 935 937 bkey_init(&prev.k->k); ··· 961 963 962 964 if (b->c.level > target_depth) { 963 965 bch2_btree_and_journal_iter_exit(&iter); 964 - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); 966 + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); 967 + iter.prefetch = true; 965 968 966 969 while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { 967 970 struct btree *child; ··· 975 976 false); 976 977 ret = PTR_ERR_OR_ZERO(child); 977 978 978 - if (ret == -EIO) { 979 + if (bch2_err_matches(ret, EIO)) { 979 980 bch2_topology_error(c); 980 981 981 982 if (__fsck_err(c, ··· 1189 1190 genradix_free(&c->gc_stripes); 1190 1191 1191 1192 for_each_member_device(c, ca) { 1192 - kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), 1193 - sizeof(struct bucket_array) + 1194 - ca->mi.nbuckets * sizeof(struct bucket)); 
1193 + kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); 1195 1194 ca->buckets_gc = NULL; 1196 1195 1197 1196 free_percpu(ca->usage_gc); ··· 1362 1365 { 1363 1366 struct bch_fs *c = trans->c; 1364 1367 struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); 1365 - struct bucket gc, *b; 1368 + struct bucket old_gc, gc, *b; 1366 1369 struct bkey_i_alloc_v4 *a; 1367 1370 struct bch_alloc_v4 old_convert, new; 1368 1371 const struct bch_alloc_v4 *old; 1369 - enum bch_data_type type; 1370 1372 int ret; 1371 1373 1372 1374 old = bch2_alloc_to_v4(k, &old_convert); ··· 1373 1377 1374 1378 percpu_down_read(&c->mark_lock); 1375 1379 b = gc_bucket(ca, iter->pos.offset); 1380 + old_gc = *b; 1381 + 1382 + if ((old->data_type == BCH_DATA_sb || 1383 + old->data_type == BCH_DATA_journal) && 1384 + !bch2_dev_is_online(ca)) { 1385 + b->data_type = old->data_type; 1386 + b->dirty_sectors = old->dirty_sectors; 1387 + } 1376 1388 1377 1389 /* 1378 1390 * b->data_type doesn't yet include need_discard & need_gc_gen states - 1379 1391 * fix that here: 1380 1392 */ 1381 - type = __alloc_data_type(b->dirty_sectors, 1382 - b->cached_sectors, 1383 - b->stripe, 1384 - *old, 1385 - b->data_type); 1386 - if (b->data_type != type) { 1387 - struct bch_dev_usage *u; 1388 - 1389 - preempt_disable(); 1390 - u = this_cpu_ptr(ca->usage_gc); 1391 - u->d[b->data_type].buckets--; 1392 - b->data_type = type; 1393 - u->d[b->data_type].buckets++; 1394 - preempt_enable(); 1395 - } 1396 - 1393 + b->data_type = __alloc_data_type(b->dirty_sectors, 1394 + b->cached_sectors, 1395 + b->stripe, 1396 + *old, 1397 + b->data_type); 1397 1398 gc = *b; 1398 1399 percpu_up_read(&c->mark_lock); 1400 + 1401 + if (gc.data_type != old_gc.data_type || 1402 + gc.dirty_sectors != old_gc.dirty_sectors) 1403 + bch2_dev_usage_update_m(c, ca, &old_gc, &gc); 1399 1404 1400 1405 if (metadata_only && 1401 1406 gc.data_type != BCH_DATA_sb && ··· 1407 1410 if (gen_after(old->gen, gc.gen)) 1408 1411 return 0; 1409 1412 1410 - if 
(c->opts.reconstruct_alloc || 1411 - fsck_err_on(new.data_type != gc.data_type, c, 1413 + if (fsck_err_on(new.data_type != gc.data_type, c, 1412 1414 alloc_key_data_type_wrong, 1413 1415 "bucket %llu:%llu gen %u has wrong data_type" 1414 1416 ": got %s, should be %s", ··· 1418 1422 new.data_type = gc.data_type; 1419 1423 1420 1424 #define copy_bucket_field(_errtype, _f) \ 1421 - if (c->opts.reconstruct_alloc || \ 1422 - fsck_err_on(new._f != gc._f, c, _errtype, \ 1425 + if (fsck_err_on(new._f != gc._f, c, _errtype, \ 1423 1426 "bucket %llu:%llu gen %u data type %s has wrong " #_f \ 1424 1427 ": got %u, should be %u", \ 1425 1428 iter->pos.inode, iter->pos.offset, \ ··· 1486 1491 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) 1487 1492 { 1488 1493 for_each_member_device(c, ca) { 1489 - struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + 1494 + struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + 1490 1495 ca->mi.nbuckets * sizeof(struct bucket), 1491 1496 GFP_KERNEL|__GFP_ZERO); 1492 1497 if (!buckets) { ··· 1580 1585 " should be %u", 1581 1586 (bch2_bkey_val_to_text(&buf, c, k), buf.buf), 1582 1587 r->refcount)) { 1583 - struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); 1584 - 1588 + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); 1585 1589 ret = PTR_ERR_OR_ZERO(new); 1586 1590 if (ret) 1587 1591 return ret; ··· 1589 1595 new->k.type = KEY_TYPE_deleted; 1590 1596 else 1591 1597 *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); 1598 + ret = bch2_trans_update(trans, iter, new, 0); 1592 1599 } 1593 1600 fsck_err: 1594 1601 printbuf_exit(&buf); ··· 1812 1817 if (!ret) { 1813 1818 bch2_journal_block(&c->journal); 1814 1819 1815 - ret = bch2_gc_stripes_done(c, metadata_only) ?: 1816 - bch2_gc_reflink_done(c, metadata_only) ?: 1817 - bch2_gc_alloc_done(c, metadata_only) ?: 1818 - bch2_gc_done(c, initial, metadata_only); 1820 + ret = bch2_gc_alloc_done(c, metadata_only) ?: 1821 + 
bch2_gc_done(c, initial, metadata_only) ?: 1822 + bch2_gc_stripes_done(c, metadata_only) ?: 1823 + bch2_gc_reflink_done(c, metadata_only); 1819 1824 1820 1825 bch2_journal_unblock(&c->journal); 1821 1826 }
+15 -7
fs/bcachefs/btree_io.c
··· 103 103 if (used_mempool) 104 104 mempool_free(p, &c->btree_bounce_pool); 105 105 else 106 - vpfree(p, size); 106 + kvfree(p); 107 107 } 108 108 109 109 static void *btree_bounce_alloc(struct bch_fs *c, size_t size, ··· 115 115 BUG_ON(size > c->opts.btree_node_size); 116 116 117 117 *used_mempool = false; 118 - p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); 118 + p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); 119 119 if (!p) { 120 120 *used_mempool = true; 121 121 p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); ··· 581 581 break; 582 582 case -BCH_ERR_btree_node_read_err_bad_node: 583 583 bch2_print_string_as_lines(KERN_ERR, out.buf); 584 - bch2_topology_error(c); 585 - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; 584 + ret = bch2_topology_error(c); 586 585 break; 587 586 case -BCH_ERR_btree_node_read_err_incompatible: 588 587 bch2_print_string_as_lines(KERN_ERR, out.buf); ··· 839 840 if (k->format > KEY_FORMAT_CURRENT) 840 841 return false; 841 842 843 + if (k->u64s < bkeyp_key_u64s(&b->format, k)) 844 + return false; 845 + 842 846 struct printbuf buf = PRINTBUF; 843 847 struct bkey tmp; 844 848 struct bkey_s u = __bkey_disassemble(b, k, &tmp); ··· 883 881 "invalid bkey format %u", k->format)) 884 882 goto drop_this_key; 885 883 886 - /* XXX: validate k->u64s */ 884 + if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k), 885 + -BCH_ERR_btree_node_read_err_fixable, 886 + c, NULL, b, i, 887 + btree_node_bkey_bad_u64s, 888 + "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k))) 889 + goto drop_this_key; 890 + 887 891 if (!write) 888 892 bch2_bkey_compat(b->c.level, b->c.btree_id, version, 889 893 BSET_BIG_ENDIAN(i), write, ··· 1745 1737 list_move(&b->list, &c->btree_cache.freeable); 1746 1738 mutex_unlock(&c->btree_cache.lock); 1747 1739 1748 - ret = -EIO; 1740 + ret = -BCH_ERR_btree_node_read_error; 1749 1741 goto err; 1750 1742 } 1751 1743 ··· 1849 1841 bch2_dev_list_has_dev(wbio->wbio.failed, 
ptr->dev)); 1850 1842 1851 1843 if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { 1852 - ret = -BCH_ERR_btree_write_all_failed; 1844 + ret = -BCH_ERR_btree_node_write_all_failed; 1853 1845 goto err; 1854 1846 } 1855 1847
+13 -7
fs/bcachefs/btree_iter.c
··· 891 891 struct bkey_s_c k; 892 892 int ret = 0; 893 893 894 - __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); 894 + __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); 895 895 896 896 k = bch2_btree_and_journal_iter_peek(&jiter); 897 897 ··· 1146 1146 path = &trans->paths[path_idx]; 1147 1147 1148 1148 if (unlikely(path->level >= BTREE_MAX_DEPTH)) 1149 - goto out; 1149 + goto out_uptodate; 1150 1150 1151 1151 path->level = btree_path_up_until_good_node(trans, path, 0); 1152 1152 ··· 1179 1179 goto out; 1180 1180 } 1181 1181 } 1182 - 1182 + out_uptodate: 1183 1183 path->uptodate = BTREE_ITER_UPTODATE; 1184 1184 out: 1185 1185 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) ··· 1520 1520 { 1521 1521 unsigned nr = trans->nr_paths * 2; 1522 1522 1523 - void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + 1523 + void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + 1524 1524 sizeof(struct btree_trans_paths) + 1525 1525 nr * sizeof(struct btree_path) + 1526 1526 nr * sizeof(btree_path_idx_t) + 8 + ··· 1729 1729 if (ret) 1730 1730 return ret; 1731 1731 1732 - btree_path_set_should_be_locked(trans->paths + iter->path); 1732 + struct btree_path *path = btree_iter_path(trans, iter); 1733 + if (btree_path_node(path, path->level)) 1734 + btree_path_set_should_be_locked(path); 1733 1735 return 0; 1734 1736 } 1735 1737 ··· 2307 2305 btree_iter_path(trans, iter)->level); 2308 2306 2309 2307 if (iter->flags & BTREE_ITER_WITH_JOURNAL) 2310 - return bkey_s_c_err(-EIO); 2308 + return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); 2311 2309 2312 2310 bch2_btree_iter_verify(iter); 2313 2311 bch2_btree_iter_verify_entry_exit(iter); ··· 2505 2503 k = bch2_btree_iter_peek_upto(&iter2, end); 2506 2504 2507 2505 if (k.k && !bkey_err(k)) { 2506 + swap(iter->key_cache_path, iter2.key_cache_path); 2508 2507 iter->k = iter2.k; 2509 2508 k.k = &iter->k; 2510 
2509 } ··· 2765 2762 struct btree_trans *trans = src->trans; 2766 2763 2767 2764 *dst = *src; 2765 + #ifdef TRACK_PATH_ALLOCATED 2766 + dst->ip_allocated = _RET_IP_; 2767 + #endif 2768 2768 if (src->path) 2769 2769 __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); 2770 2770 if (src->update_path) ··· 3091 3085 trans->paths = NULL; 3092 3086 3093 3087 if (paths_allocated != trans->_paths_allocated) 3094 - kfree_rcu_mightsleep(paths_allocated); 3088 + kvfree_rcu_mightsleep(paths_allocated); 3095 3089 3096 3090 if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) 3097 3091 mempool_free(trans->mem, &c->btree_trans_mem_pool);
+87 -93
fs/bcachefs/btree_journal_iter.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 3 #include "bcachefs.h" 4 + #include "bkey_buf.h" 4 5 #include "bset.h" 6 + #include "btree_cache.h" 5 7 #include "btree_journal_iter.h" 6 8 #include "journal_io.h" 7 9 ··· 42 40 43 41 static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) 44 42 { 45 - return keys->d + idx_to_pos(keys, idx); 43 + return keys->data + idx_to_pos(keys, idx); 46 44 } 47 45 48 46 static size_t __bch2_journal_key_search(struct journal_keys *keys, ··· 182 180 BUG_ON(test_bit(BCH_FS_rw, &c->flags)); 183 181 184 182 if (idx < keys->size && 185 - journal_key_cmp(&n, &keys->d[idx]) == 0) { 186 - if (keys->d[idx].allocated) 187 - kfree(keys->d[idx].k); 188 - keys->d[idx] = n; 183 + journal_key_cmp(&n, &keys->data[idx]) == 0) { 184 + if (keys->data[idx].allocated) 185 + kfree(keys->data[idx].k); 186 + keys->data[idx] = n; 189 187 return 0; 190 188 } 191 189 ··· 198 196 .size = max_t(size_t, keys->size, 8) * 2, 199 197 }; 200 198 201 - new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); 202 - if (!new_keys.d) { 199 + new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL); 200 + if (!new_keys.data) { 203 201 bch_err(c, "%s: error allocating new key array (size %zu)", 204 202 __func__, new_keys.size); 205 203 return -BCH_ERR_ENOMEM_journal_key_insert; 206 204 } 207 205 208 206 /* Since @keys was full, there was no gap: */ 209 - memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); 210 - kvfree(keys->d); 211 - keys->d = new_keys.d; 207 + memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr); 208 + kvfree(keys->data); 209 + keys->data = new_keys.data; 212 210 keys->nr = new_keys.nr; 213 211 keys->size = new_keys.size; 214 212 ··· 218 216 219 217 journal_iters_move_gap(c, keys->gap, idx); 220 218 221 - move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); 222 - keys->gap = idx; 219 + move_gap(keys, idx); 223 220 224 221 keys->nr++; 225 - 
keys->d[keys->gap++] = n; 222 + keys->data[keys->gap++] = n; 226 223 227 224 journal_iters_fix(c); 228 225 ··· 268 267 size_t idx = bch2_journal_key_search(keys, btree, level, pos); 269 268 270 269 if (idx < keys->size && 271 - keys->d[idx].btree_id == btree && 272 - keys->d[idx].level == level && 273 - bpos_eq(keys->d[idx].k->k.p, pos)) 274 - keys->d[idx].overwritten = true; 270 + keys->data[idx].btree_id == btree && 271 + keys->data[idx].level == level && 272 + bpos_eq(keys->data[idx].k->k.p, pos)) 273 + keys->data[idx].overwritten = true; 275 274 } 276 275 277 276 static void bch2_journal_iter_advance(struct journal_iter *iter) ··· 285 284 286 285 static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) 287 286 { 288 - struct journal_key *k = iter->keys->d + iter->idx; 287 + struct journal_key *k = iter->keys->data + iter->idx; 289 288 290 - while (k < iter->keys->d + iter->keys->size && 289 + while (k < iter->keys->data + iter->keys->size && 291 290 k->btree_id == iter->btree_id && 292 291 k->level == iter->level) { 293 292 if (!k->overwritten) 294 293 return bkey_i_to_s_c(k->k); 295 294 296 295 bch2_journal_iter_advance(iter); 297 - k = iter->keys->d + iter->idx; 296 + k = iter->keys->data + iter->idx; 298 297 } 299 298 300 299 return bkey_s_c_null; ··· 335 334 iter->pos = bpos_successor(iter->pos); 336 335 } 337 336 337 + static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) 338 + { 339 + struct btree_and_journal_iter iter = *_iter; 340 + struct bch_fs *c = iter.trans->c; 341 + unsigned level = iter.journal.level; 342 + struct bkey_buf tmp; 343 + unsigned nr = test_bit(BCH_FS_started, &c->flags) 344 + ? (level > 1 ? 0 : 2) 345 + : (level > 1 ? 
1 : 16); 346 + 347 + iter.prefetch = false; 348 + bch2_bkey_buf_init(&tmp); 349 + 350 + while (nr--) { 351 + bch2_btree_and_journal_iter_advance(&iter); 352 + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); 353 + if (!k.k) 354 + break; 355 + 356 + bch2_bkey_buf_reassemble(&tmp, c, k); 357 + bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); 358 + } 359 + 360 + bch2_bkey_buf_exit(&tmp, c); 361 + } 362 + 338 363 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) 339 364 { 340 365 struct bkey_s_c btree_k, journal_k, ret; 366 + 367 + if (iter->prefetch && iter->journal.level) 368 + btree_and_journal_iter_prefetch(iter); 341 369 again: 342 370 if (iter->at_end) 343 371 return bkey_s_c_null; ··· 406 376 bch2_journal_iter_exit(&iter->journal); 407 377 } 408 378 409 - void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, 410 - struct bch_fs *c, 379 + void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, 380 + struct btree_and_journal_iter *iter, 411 381 struct btree *b, 412 382 struct btree_node_iter node_iter, 413 383 struct bpos pos) 414 384 { 415 385 memset(iter, 0, sizeof(*iter)); 416 386 387 + iter->trans = trans; 417 388 iter->b = b; 418 389 iter->node_iter = node_iter; 419 - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); 390 + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); 420 391 INIT_LIST_HEAD(&iter->journal.list); 421 392 iter->pos = b->data->min_key; 422 393 iter->at_end = false; ··· 427 396 * this version is used by btree_gc before filesystem has gone RW and 428 397 * multithreaded, so uses the journal_iters list: 429 398 */ 430 - void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, 431 - struct bch_fs *c, 399 + void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, 400 + struct btree_and_journal_iter *iter, 432 
401 struct btree *b) 433 402 { 434 403 struct btree_node_iter node_iter; 435 404 436 405 bch2_btree_node_iter_init_from_start(&node_iter, b); 437 - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); 438 - list_add(&iter->journal.list, &c->journal_iters); 406 + __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); 407 + list_add(&iter->journal.list, &trans->c->journal_iters); 439 408 } 440 409 441 410 /* sort and dedup all keys in the journal: */ ··· 446 415 struct genradix_iter iter; 447 416 448 417 genradix_for_each(&c->journal_entries, iter, i) 449 - if (*i) 450 - kvpfree(*i, offsetof(struct journal_replay, j) + 451 - vstruct_bytes(&(*i)->j)); 418 + kvfree(*i); 452 419 genradix_free(&c->journal_entries); 453 420 } 454 421 ··· 466 437 void bch2_journal_keys_put(struct bch_fs *c) 467 438 { 468 439 struct journal_keys *keys = &c->journal_keys; 469 - struct journal_key *i; 470 440 471 441 BUG_ON(atomic_read(&keys->ref) <= 0); 472 442 473 443 if (!atomic_dec_and_test(&keys->ref)) 474 444 return; 475 445 476 - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); 477 - keys->gap = keys->nr; 446 + move_gap(keys, keys->nr); 478 447 479 - for (i = keys->d; i < keys->d + keys->nr; i++) 448 + darray_for_each(*keys, i) 480 449 if (i->allocated) 481 450 kfree(i->k); 482 451 483 - kvfree(keys->d); 484 - keys->d = NULL; 452 + kvfree(keys->data); 453 + keys->data = NULL; 485 454 keys->nr = keys->gap = keys->size = 0; 486 455 487 456 bch2_journal_entries_free(c); ··· 487 460 488 461 static void __journal_keys_sort(struct journal_keys *keys) 489 462 { 490 - struct journal_key *src, *dst; 463 + sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); 491 464 492 - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); 465 + struct journal_key *dst = keys->data; 493 466 494 - src = dst = keys->d; 495 - while (src < keys->d + keys->nr) { 496 - while (src + 1 < 
keys->d + keys->nr && 497 - !journal_key_cmp(src, src + 1)) 498 - src++; 467 + darray_for_each(*keys, src) { 468 + if (src + 1 < &darray_top(*keys) && 469 + !journal_key_cmp(src, src + 1)) 470 + continue; 499 471 500 - *dst++ = *src++; 472 + *dst++ = *src; 501 473 } 502 474 503 - keys->nr = dst - keys->d; 475 + keys->nr = dst - keys->data; 504 476 } 505 477 506 478 int bch2_journal_keys_sort(struct bch_fs *c) 507 479 { 508 480 struct genradix_iter iter; 509 481 struct journal_replay *i, **_i; 510 - struct jset_entry *entry; 511 - struct bkey_i *k; 512 482 struct journal_keys *keys = &c->journal_keys; 513 - size_t nr_keys = 0, nr_read = 0; 483 + size_t nr_read = 0; 514 484 515 485 genradix_for_each(&c->journal_entries, iter, _i) { 516 486 i = *_i; 517 487 518 - if (!i || i->ignore) 519 - continue; 520 - 521 - for_each_jset_key(k, entry, &i->j) 522 - nr_keys++; 523 - } 524 - 525 - if (!nr_keys) 526 - return 0; 527 - 528 - keys->size = roundup_pow_of_two(nr_keys); 529 - 530 - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); 531 - if (!keys->d) { 532 - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", 533 - nr_keys); 534 - 535 - do { 536 - keys->size >>= 1; 537 - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); 538 - } while (!keys->d && keys->size > nr_keys / 8); 539 - 540 - if (!keys->d) { 541 - bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", 542 - keys->size); 543 - return -BCH_ERR_ENOMEM_journal_keys_sort; 544 - } 545 - } 546 - 547 - genradix_for_each(&c->journal_entries, iter, _i) { 548 - i = *_i; 549 - 550 - if (!i || i->ignore) 488 + if (journal_replay_ignore(i)) 551 489 continue; 552 490 553 491 cond_resched(); 554 492 555 493 for_each_jset_key(k, entry, &i->j) { 556 - if (keys->nr == keys->size) { 557 - __journal_keys_sort(keys); 558 - 559 - if (keys->nr > keys->size * 7 / 8) { 560 - bch_err(c, "Too many journal keys for slowpath; have %zu 
compacted, buf size %zu, processed %zu/%zu", 561 - keys->nr, keys->size, nr_read, nr_keys); 562 - return -BCH_ERR_ENOMEM_journal_keys_sort; 563 - } 564 - } 565 - 566 - keys->d[keys->nr++] = (struct journal_key) { 494 + struct journal_key n = (struct journal_key) { 567 495 .btree_id = entry->btree_id, 568 496 .level = entry->level, 569 497 .k = k, 570 498 .journal_seq = le64_to_cpu(i->j.seq), 571 499 .journal_offset = k->_data - i->j._data, 572 500 }; 501 + 502 + if (darray_push(keys, n)) { 503 + __journal_keys_sort(keys); 504 + 505 + if (keys->nr * 8 > keys->size * 7) { 506 + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", 507 + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); 508 + return -BCH_ERR_ENOMEM_journal_keys_sort; 509 + } 510 + 511 + BUG_ON(darray_push(keys, n)); 512 + } 573 513 574 514 nr_read++; 575 515 } ··· 545 551 __journal_keys_sort(keys); 546 552 keys->gap = keys->nr; 547 553 548 - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); 554 + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); 549 555 return 0; 550 556 }
+9 -5
fs/bcachefs/btree_journal_iter.h
··· 15 15 */ 16 16 17 17 struct btree_and_journal_iter { 18 + struct btree_trans *trans; 18 19 struct btree *b; 19 20 struct btree_node_iter node_iter; 20 21 struct bkey unpacked; ··· 23 22 struct journal_iter journal; 24 23 struct bpos pos; 25 24 bool at_end; 25 + bool prefetch; 26 26 }; 27 27 28 28 struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, 29 29 unsigned, struct bpos, struct bpos, size_t *); 30 30 struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, 31 31 unsigned, struct bpos); 32 + 33 + int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, 34 + struct btree_and_journal_iter *); 32 35 33 36 int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, 34 37 unsigned, struct bkey_i *); ··· 47 42 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); 48 43 49 44 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); 50 - void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, 51 - struct bch_fs *, struct btree *, 45 + void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, 46 + struct btree_and_journal_iter *, struct btree *, 52 47 struct btree_node_iter, struct bpos); 53 - void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, 54 - struct bch_fs *, 55 - struct btree *); 48 + void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, 49 + struct btree_and_journal_iter *, struct btree *); 56 50 57 51 void bch2_journal_keys_put(struct bch_fs *); 58 52
+5 -3
fs/bcachefs/btree_key_cache.c
··· 380 380 struct bkey_i *new_k = NULL; 381 381 int ret; 382 382 383 - k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, 384 - BTREE_ITER_KEY_CACHE_FILL| 385 - BTREE_ITER_CACHED_NOFILL); 383 + bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, 384 + BTREE_ITER_KEY_CACHE_FILL| 385 + BTREE_ITER_CACHED_NOFILL); 386 + iter.flags &= ~BTREE_ITER_WITH_JOURNAL; 387 + k = bch2_btree_iter_peek_slot(&iter); 386 388 ret = bkey_err(k); 387 389 if (ret) 388 390 goto err;
+2 -1
fs/bcachefs/btree_locking.c
··· 747 747 return; 748 748 749 749 trans_for_each_path(trans, path, i) 750 - bch2_btree_path_downgrade(trans, path); 750 + if (path->ref) 751 + bch2_btree_path_downgrade(trans, path); 751 752 } 752 753 753 754 int bch2_trans_relock(struct btree_trans *trans)
+8 -1
fs/bcachefs/btree_types.h
··· 5 5 #include <linux/list.h> 6 6 #include <linux/rhashtable.h> 7 7 8 + #include "bbpos_types.h" 8 9 #include "btree_key_cache_types.h" 9 10 #include "buckets_types.h" 10 11 #include "darray.h" ··· 174 173 */ 175 174 struct task_struct *alloc_lock; 176 175 struct closure_waitlist alloc_wait; 176 + 177 + struct bbpos pinned_nodes_start; 178 + struct bbpos pinned_nodes_end; 179 + u64 pinned_nodes_leaf_mask; 180 + u64 pinned_nodes_interior_mask; 177 181 }; 178 182 179 183 struct btree_node_iter { ··· 660 654 BIT_ULL(BKEY_TYPE_inodes)| \ 661 655 BIT_ULL(BKEY_TYPE_stripes)| \ 662 656 BIT_ULL(BKEY_TYPE_reflink)| \ 657 + BIT_ULL(BKEY_TYPE_subvolumes)| \ 663 658 BIT_ULL(BKEY_TYPE_btree)) 664 659 665 660 #define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ ··· 734 727 __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); 735 728 u8 level; 736 729 u8 alive; 737 - s8 error; 730 + s16 error; 738 731 }; 739 732 740 733 enum btree_gc_coalesce_fail_reason {
+22 -1
fs/bcachefs/btree_update.c
··· 452 452 * the key cache - but the key has to exist in the btree for that to 453 453 * work: 454 454 */ 455 - if (path->cached && bkey_deleted(&i->old_k)) 455 + if (path->cached && !i->old_btree_u64s) 456 456 return flush_new_cached_update(trans, i, flags, ip); 457 457 458 458 return 0; ··· 787 787 788 788 int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, 789 789 struct bpos pos, bool set) 790 + { 791 + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); 792 + int ret = PTR_ERR_OR_ZERO(k); 793 + if (ret) 794 + return ret; 795 + 796 + bkey_init(&k->k); 797 + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; 798 + k->k.p = pos; 799 + 800 + struct btree_iter iter; 801 + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); 802 + 803 + ret = bch2_btree_iter_traverse(&iter) ?: 804 + bch2_trans_update(trans, &iter, k, 0); 805 + bch2_trans_iter_exit(trans, &iter); 806 + return ret; 807 + } 808 + 809 + int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, 810 + struct bpos pos, bool set) 790 811 { 791 812 struct bkey_i k; 792 813
+2 -1
fs/bcachefs/btree_update.h
··· 63 63 struct bpos, struct bpos, unsigned, u64 *); 64 64 65 65 int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); 66 + int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); 66 67 67 68 static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, 68 69 enum btree_id btree, struct bpos pos) 69 70 { 70 - return bch2_btree_bit_mod(trans, btree, pos, false); 71 + return bch2_btree_bit_mod_buffered(trans, btree, pos, false); 71 72 } 72 73 73 74 int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
+64 -19
fs/bcachefs/btree_update_interior.c
··· 25 25 #include <linux/random.h> 26 26 27 27 static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, 28 - btree_path_idx_t, struct btree *, 29 - struct keylist *, unsigned); 28 + btree_path_idx_t, struct btree *, struct keylist *); 30 29 static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); 31 30 32 31 static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, ··· 1207 1208 mutex_unlock(&c->btree_cache.lock); 1208 1209 1209 1210 mutex_lock(&c->btree_root_lock); 1210 - BUG_ON(btree_node_root(c, b) && 1211 - (b->c.level < btree_node_root(c, b)->c.level || 1212 - !btree_node_dying(btree_node_root(c, b)))); 1213 - 1214 1211 bch2_btree_id_root(c, b->c.btree_id)->b = b; 1215 1212 mutex_unlock(&c->btree_root_lock); 1216 1213 ··· 1472 1477 1473 1478 static int btree_split(struct btree_update *as, struct btree_trans *trans, 1474 1479 btree_path_idx_t path, struct btree *b, 1475 - struct keylist *keys, unsigned flags) 1480 + struct keylist *keys) 1476 1481 { 1477 1482 struct bch_fs *c = as->c; 1478 1483 struct btree *parent = btree_node_parent(trans->paths + path, b); ··· 1573 1578 1574 1579 if (parent) { 1575 1580 /* Split a non root node */ 1576 - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); 1581 + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); 1577 1582 if (ret) 1578 1583 goto err; 1579 1584 } else if (n3) { ··· 1668 1673 * @path_idx: path that points to current node 1669 1674 * @b: node to insert keys into 1670 1675 * @keys: list of keys to insert 1671 - * @flags: transaction commit flags 1672 1676 * 1673 1677 * Returns: 0 on success, typically transaction restart error on failure 1674 1678 * ··· 1677 1683 */ 1678 1684 static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, 1679 1685 btree_path_idx_t path_idx, struct btree *b, 1680 - struct keylist *keys, unsigned flags) 1686 + struct keylist *keys) 1681 1687 { 
1682 1688 struct bch_fs *c = as->c; 1683 1689 struct btree_path *path = trans->paths + path_idx; ··· 1733 1739 return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); 1734 1740 } 1735 1741 1736 - return btree_split(as, trans, path_idx, b, keys, flags); 1742 + return btree_split(as, trans, path_idx, b, keys); 1737 1743 } 1738 1744 1739 1745 int bch2_btree_split_leaf(struct btree_trans *trans, ··· 1741 1747 unsigned flags) 1742 1748 { 1743 1749 /* btree_split & merge may both cause paths array to be reallocated */ 1744 - 1745 1750 struct btree *b = path_l(trans->paths + path)->b; 1746 1751 struct btree_update *as; 1747 1752 unsigned l; ··· 1752 1759 if (IS_ERR(as)) 1753 1760 return PTR_ERR(as); 1754 1761 1755 - ret = btree_split(as, trans, path, b, NULL, flags); 1762 + ret = btree_split(as, trans, path, b, NULL); 1756 1763 if (ret) { 1757 1764 bch2_btree_update_free(as, trans); 1758 1765 return ret; ··· 1766 1773 ret = bch2_foreground_maybe_merge(trans, path, l, flags); 1767 1774 1768 1775 return ret; 1776 + } 1777 + 1778 + static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans, 1779 + btree_path_idx_t path_idx) 1780 + { 1781 + struct bch_fs *c = as->c; 1782 + struct btree_path *path = trans->paths + path_idx; 1783 + struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; 1784 + 1785 + BUG_ON(!btree_node_locked(path, b->c.level)); 1786 + 1787 + n = __btree_root_alloc(as, trans, b->c.level + 1); 1788 + 1789 + bch2_btree_update_add_new_node(as, n); 1790 + six_unlock_write(&n->c.lock); 1791 + 1792 + path->locks_want++; 1793 + BUG_ON(btree_node_locked(path, n->c.level)); 1794 + six_lock_increment(&n->c.lock, SIX_LOCK_intent); 1795 + mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); 1796 + bch2_btree_path_level_init(trans, path, n); 1797 + 1798 + n->sib_u64s[0] = U16_MAX; 1799 + n->sib_u64s[1] = U16_MAX; 1800 + 1801 + bch2_keylist_add(&as->parent_keys, &b->key); 1802 + 
btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); 1803 + 1804 + bch2_btree_set_root(as, trans, path, n); 1805 + bch2_btree_update_get_open_buckets(as, n); 1806 + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); 1807 + bch2_trans_node_add(trans, path, n); 1808 + six_unlock_intent(&n->c.lock); 1809 + 1810 + mutex_lock(&c->btree_cache.lock); 1811 + list_add_tail(&b->list, &c->btree_cache.live); 1812 + mutex_unlock(&c->btree_cache.lock); 1813 + 1814 + bch2_trans_verify_locks(trans); 1815 + } 1816 + 1817 + int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) 1818 + { 1819 + struct bch_fs *c = trans->c; 1820 + struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; 1821 + struct btree_update *as = 1822 + bch2_btree_update_start(trans, trans->paths + path, 1823 + b->c.level, true, flags); 1824 + if (IS_ERR(as)) 1825 + return PTR_ERR(as); 1826 + 1827 + __btree_increase_depth(as, trans, path); 1828 + bch2_btree_update_done(as, trans); 1829 + return 0; 1769 1830 } 1770 1831 1771 1832 int __bch2_foreground_maybe_merge(struct btree_trans *trans, ··· 1892 1845 __func__, buf1.buf, buf2.buf); 1893 1846 printbuf_exit(&buf1); 1894 1847 printbuf_exit(&buf2); 1895 - bch2_topology_error(c); 1896 - ret = -EIO; 1848 + ret = bch2_topology_error(c); 1897 1849 goto err; 1898 1850 } 1899 1851 ··· 1962 1916 1963 1917 bch2_trans_verify_paths(trans); 1964 1918 1965 - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); 1919 + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); 1966 1920 if (ret) 1967 1921 goto err_free_update; 1968 1922 ··· 2033 1987 2034 1988 if (parent) { 2035 1989 bch2_keylist_add(&as->parent_keys, &n->key); 2036 - ret = bch2_btree_insert_node(as, trans, iter->path, 2037 - parent, &as->parent_keys, flags); 1990 + ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); 2038 1991 if (ret) 2039 1992 goto err; 2040 1993 } else { ··· 
2530 2485 int bch2_fs_btree_interior_update_init(struct bch_fs *c) 2531 2486 { 2532 2487 c->btree_interior_update_worker = 2533 - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); 2488 + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); 2534 2489 if (!c->btree_interior_update_worker) 2535 2490 return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; 2536 2491
+2
fs/bcachefs/btree_update_interior.h
··· 119 119 120 120 int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); 121 121 122 + int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned); 123 + 122 124 int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, 123 125 unsigned, unsigned, enum btree_node_sibling); 124 126
+2 -2
fs/bcachefs/btree_write_buffer.c
··· 574 574 static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) 575 575 { 576 576 struct journal_keys_to_wb dst; 577 - struct jset_entry *entry; 578 - struct bkey_i *k; 579 577 int ret = 0; 580 578 581 579 bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); ··· 588 590 entry->type = BCH_JSET_ENTRY_btree_keys; 589 591 } 590 592 593 + spin_lock(&c->journal.lock); 591 594 buf->need_flush_to_write_buffer = false; 595 + spin_unlock(&c->journal.lock); 592 596 out: 593 597 bch2_journal_keys_to_write_buffer_end(c, &dst); 594 598 return ret;
+13 -19
fs/bcachefs/buckets.c
··· 1053 1053 (int) bch2_bkey_needs_rebalance(c, old); 1054 1054 1055 1055 if (mod) { 1056 - int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); 1056 + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, 1057 + new.k->p, mod > 0); 1057 1058 if (ret) 1058 1059 return ret; 1059 1060 } ··· 1336 1335 struct bucket_gens *buckets = 1337 1336 container_of(rcu, struct bucket_gens, rcu); 1338 1337 1339 - kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); 1338 + kvfree(buckets); 1340 1339 } 1341 1340 1342 1341 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ··· 1346 1345 bool resize = ca->bucket_gens != NULL; 1347 1346 int ret; 1348 1347 1349 - if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, 1350 - GFP_KERNEL|__GFP_ZERO))) { 1348 + if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, 1349 + GFP_KERNEL|__GFP_ZERO))) { 1351 1350 ret = -BCH_ERR_ENOMEM_bucket_gens; 1352 1351 goto err; 1353 1352 } 1354 1353 1355 1354 if ((c->opts.buckets_nouse && 1356 - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * 1357 - sizeof(unsigned long), 1358 - GFP_KERNEL|__GFP_ZERO)))) { 1355 + !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) * 1356 + sizeof(unsigned long), 1357 + GFP_KERNEL|__GFP_ZERO)))) { 1359 1358 ret = -BCH_ERR_ENOMEM_buckets_nouse; 1360 1359 goto err; 1361 1360 } ··· 1398 1397 1399 1398 ret = 0; 1400 1399 err: 1401 - kvpfree(buckets_nouse, 1402 - BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); 1400 + kvfree(buckets_nouse); 1403 1401 if (bucket_gens) 1404 1402 call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); 1405 1403 ··· 1407 1407 1408 1408 void bch2_dev_buckets_free(struct bch_dev *ca) 1409 1409 { 1410 - unsigned i; 1410 + kvfree(ca->buckets_nouse); 1411 + kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); 1411 1412 1412 - kvpfree(ca->buckets_nouse, 1413 - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); 1414 - 
kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), 1415 - sizeof(struct bucket_gens) + ca->mi.nbuckets); 1416 - 1417 - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) 1413 + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) 1418 1414 free_percpu(ca->usage[i]); 1419 1415 kfree(ca->usage_base); 1420 1416 } 1421 1417 1422 1418 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) 1423 1419 { 1424 - unsigned i; 1425 - 1426 1420 ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); 1427 1421 if (!ca->usage_base) 1428 1422 return -BCH_ERR_ENOMEM_usage_init; 1429 1423 1430 - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { 1424 + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) { 1431 1425 ca->usage[i] = alloc_percpu(struct bch_dev_usage); 1432 1426 if (!ca->usage[i]) 1433 1427 return -BCH_ERR_ENOMEM_usage_init;
+33 -24
fs/bcachefs/chardev.c
··· 22 22 #include <linux/slab.h> 23 23 #include <linux/uaccess.h> 24 24 25 - __must_check 26 - static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) 27 - { 28 - return copy_to_user(to, from, n) ? -EFAULT : 0; 29 - } 30 - 31 25 /* returns with ref on ca->ref */ 32 26 static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, 33 27 unsigned flags) ··· 149 155 kfree(thr); 150 156 } 151 157 152 - static int bch2_fsck_offline_thread_fn(void *arg) 158 + static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) 153 159 { 154 - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); 160 + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); 155 161 struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); 156 162 157 - thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); 158 - if (!thr->thr.thr.ret) 159 - bch2_fs_stop(c); 163 + if (IS_ERR(c)) 164 + return PTR_ERR(c); 160 165 161 - thread_with_stdio_done(&thr->thr); 162 - return 0; 166 + int ret = 0; 167 + if (test_bit(BCH_FS_errors_fixed, &c->flags)) 168 + ret |= 1; 169 + if (test_bit(BCH_FS_error, &c->flags)) 170 + ret |= 4; 171 + 172 + bch2_fs_stop(c); 173 + 174 + if (ret & 1) 175 + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); 176 + if (ret & 4) 177 + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); 178 + 179 + return ret; 163 180 } 181 + 182 + static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { 183 + .exit = bch2_fsck_thread_exit, 184 + .fn = bch2_fsck_offline_thread_fn, 185 + }; 164 186 165 187 static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) 166 188 { ··· 230 220 231 221 opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); 232 222 233 - ret = bch2_run_thread_with_stdio(&thr->thr, 234 - bch2_fsck_thread_exit, 235 - bch2_fsck_offline_thread_fn); 223 + ret = bch2_run_thread_with_stdio(&thr->thr, 
&bch2_offline_fsck_ops); 236 224 err: 237 225 if (ret < 0) { 238 226 if (thr) ··· 771 763 return ret; 772 764 } 773 765 774 - static int bch2_fsck_online_thread_fn(void *arg) 766 + static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) 775 767 { 776 - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); 768 + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); 777 769 struct bch_fs *c = thr->c; 778 770 779 771 c->stdio_filter = current; ··· 801 793 c->stdio_filter = NULL; 802 794 c->opts.fix_errors = old_fix_errors; 803 795 804 - thread_with_stdio_done(&thr->thr); 805 - 806 796 up(&c->online_fsck_mutex); 807 797 bch2_ro_ref_put(c); 808 - return 0; 798 + return ret; 809 799 } 800 + 801 + static const struct thread_with_stdio_ops bch2_online_fsck_ops = { 802 + .exit = bch2_fsck_thread_exit, 803 + .fn = bch2_fsck_online_thread_fn, 804 + }; 810 805 811 806 static long bch2_ioctl_fsck_online(struct bch_fs *c, 812 807 struct bch_ioctl_fsck_online arg) ··· 851 840 goto err; 852 841 } 853 842 854 - ret = bch2_run_thread_with_stdio(&thr->thr, 855 - bch2_fsck_thread_exit, 856 - bch2_fsck_online_thread_fn); 843 + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); 857 844 err: 858 845 if (ret < 0) { 859 846 bch_err_fn(c, ret);
+1 -1
fs/bcachefs/checksum.c
··· 558 558 return 0; 559 559 } 560 560 561 - #include "../crypto.h" 561 + #include "crypto.h" 562 562 #endif 563 563 564 564 int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+7 -7
fs/bcachefs/compress.c
··· 601 601 return 0; 602 602 603 603 if (!mempool_initialized(&c->compression_bounce[READ]) && 604 - mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], 605 - 1, c->opts.encoded_extent_max)) 604 + mempool_init_kvmalloc_pool(&c->compression_bounce[READ], 605 + 1, c->opts.encoded_extent_max)) 606 606 return -BCH_ERR_ENOMEM_compression_bounce_read_init; 607 607 608 608 if (!mempool_initialized(&c->compression_bounce[WRITE]) && 609 - mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], 610 - 1, c->opts.encoded_extent_max)) 609 + mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], 610 + 1, c->opts.encoded_extent_max)) 611 611 return -BCH_ERR_ENOMEM_compression_bounce_write_init; 612 612 613 613 for (i = compression_types; ··· 622 622 if (mempool_initialized(&c->compress_workspace[i->type])) 623 623 continue; 624 624 625 - if (mempool_init_kvpmalloc_pool( 625 + if (mempool_init_kvmalloc_pool( 626 626 &c->compress_workspace[i->type], 627 627 1, i->compress_workspace)) 628 628 return -BCH_ERR_ENOMEM_compression_workspace_init; 629 629 } 630 630 631 631 if (!mempool_initialized(&c->decompress_workspace) && 632 - mempool_init_kvpmalloc_pool(&c->decompress_workspace, 633 - 1, decompress_workspace_size)) 632 + mempool_init_kvmalloc_pool(&c->decompress_workspace, 633 + 1, decompress_workspace_size)) 634 634 return -BCH_ERR_ENOMEM_decompression_workspace_init; 635 635 636 636 return 0;
+3 -3
fs/bcachefs/debug.c
··· 137 137 mutex_lock(&c->verify_lock); 138 138 139 139 if (!c->verify_ondisk) { 140 - c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); 140 + c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); 141 141 if (!c->verify_ondisk) 142 142 goto out; 143 143 } ··· 199 199 return; 200 200 } 201 201 202 - n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); 202 + n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); 203 203 if (!n_ondisk) { 204 204 prt_printf(out, "memory allocation failure\n"); 205 205 goto out; ··· 293 293 out: 294 294 if (bio) 295 295 bio_put(bio); 296 - kvpfree(n_ondisk, btree_buf_bytes(b)); 296 + kvfree(n_ondisk); 297 297 percpu_ref_put(&ca->io_ref); 298 298 } 299 299
+76 -69
fs/bcachefs/dirent.c
··· 144 144 return ret; 145 145 } 146 146 147 - void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, 148 - struct bkey_s_c k) 147 + void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 149 148 { 150 149 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 151 150 struct qstr d_name = bch2_dirent_get_name(d); 152 151 153 - prt_printf(out, "%.*s -> %llu type %s", 154 - d_name.len, 155 - d_name.name, 156 - d.v->d_type != DT_SUBVOL 157 - ? le64_to_cpu(d.v->d_inum) 158 - : le32_to_cpu(d.v->d_child_subvol), 159 - bch2_d_type_str(d.v->d_type)); 152 + prt_printf(out, "%.*s -> ", d_name.len, d_name.name); 153 + 154 + if (d.v->d_type != DT_SUBVOL) 155 + prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); 156 + else 157 + prt_printf(out, "%u -> %u", 158 + le32_to_cpu(d.v->d_parent_subvol), 159 + le32_to_cpu(d.v->d_child_subvol)); 160 + 161 + prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); 160 162 } 161 163 162 164 static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, ··· 201 199 } 202 200 203 201 int bch2_dirent_create_snapshot(struct btree_trans *trans, 204 - u64 dir, u32 snapshot, 202 + u32 dir_subvol, u64 dir, u32 snapshot, 205 203 const struct bch_hash_info *hash_info, 206 204 u8 type, const struct qstr *name, u64 dst_inum, 207 205 u64 *dir_offset, 208 206 bch_str_hash_flags_t str_hash_flags) 209 207 { 210 - subvol_inum zero_inum = { 0 }; 208 + subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; 211 209 struct bkey_i_dirent *dirent; 212 210 int ret; 213 211 214 - dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); 212 + dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); 215 213 ret = PTR_ERR_OR_ZERO(dirent); 216 214 if (ret) 217 215 return ret; ··· 219 217 dirent->k.p.inode = dir; 220 218 dirent->k.p.snapshot = snapshot; 221 219 222 - ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, 223 - zero_inum, snapshot, 224 - &dirent->k_i, 
str_hash_flags, 225 - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 220 + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, 221 + dir_inum, snapshot, 222 + &dirent->k_i, str_hash_flags, 223 + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 226 224 *dir_offset = dirent->k.p.offset; 227 225 228 226 return ret; ··· 293 291 struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; 294 292 struct bpos dst_pos = 295 293 POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); 296 - unsigned src_type = 0, dst_type = 0, src_update_flags = 0; 294 + unsigned src_update_flags = 0; 295 + bool delete_src, delete_dst; 297 296 int ret = 0; 298 - 299 - if (src_dir.subvol != dst_dir.subvol) 300 - return -EXDEV; 301 297 302 298 memset(src_inum, 0, sizeof(*src_inum)); 303 299 memset(dst_inum, 0, sizeof(*dst_inum)); ··· 316 316 bkey_s_c_to_dirent(old_src), src_inum); 317 317 if (ret) 318 318 goto out; 319 - 320 - src_type = bkey_s_c_to_dirent(old_src).v->d_type; 321 - 322 - if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) 323 - return -EOPNOTSUPP; 324 - 325 319 326 320 /* Lookup dst: */ 327 321 if (mode == BCH_RENAME) { ··· 344 350 bkey_s_c_to_dirent(old_dst), dst_inum); 345 351 if (ret) 346 352 goto out; 347 - 348 - dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; 349 - 350 - if (dst_type == DT_SUBVOL) 351 - return -EOPNOTSUPP; 352 353 } 353 354 354 355 if (mode != BCH_RENAME_EXCHANGE) ··· 413 424 } 414 425 } 415 426 427 + if (new_dst->v.d_type == DT_SUBVOL) 428 + new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol); 429 + 430 + if ((mode == BCH_RENAME_EXCHANGE) && 431 + new_src->v.d_type == DT_SUBVOL) 432 + new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); 433 + 416 434 ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); 417 435 if (ret) 418 436 goto out; 419 437 out_set_src: 420 - 421 438 /* 422 - * If we're deleting a subvolume, we need to really delete the dirent, 423 - * not just emit a whiteout in the current snapshot: 439 + * If we're deleting 
a subvolume we need to really delete the dirent, 440 + * not just emit a whiteout in the current snapshot - there can only be 441 + * single dirent that points to a given subvolume. 442 + * 443 + * IOW, we don't maintain multiple versions in different snapshots of 444 + * dirents that point to subvolumes - dirents that point to subvolumes 445 + * are only visible in one particular subvolume so it's not necessary, 446 + * and it would be particularly confusing for fsck to have to deal with. 424 447 */ 425 - if (src_type == DT_SUBVOL) { 426 - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); 427 - ret = bch2_btree_iter_traverse(&src_iter); 448 + delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL && 449 + new_src->k.p.snapshot != old_src.k->p.snapshot; 450 + 451 + delete_dst = old_dst.k && 452 + bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL && 453 + new_dst->k.p.snapshot != old_dst.k->p.snapshot; 454 + 455 + if (!delete_src || !bkey_deleted(&new_src->k)) { 456 + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); 428 457 if (ret) 429 458 goto out; 430 - 431 - new_src->k.p = src_iter.pos; 432 - src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; 433 459 } 434 460 435 - ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); 436 - if (ret) 437 - goto out; 461 + if (delete_src) { 462 + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); 463 + ret = bch2_btree_iter_traverse(&src_iter) ?: 464 + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 465 + if (ret) 466 + goto out; 467 + } 468 + 469 + if (delete_dst) { 470 + bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); 471 + ret = bch2_btree_iter_traverse(&dst_iter) ?: 472 + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 473 + if (ret) 474 + goto out; 475 + } 438 476 439 477 if (mode == BCH_RENAME_EXCHANGE) 440 478 *src_offset = new_src->k.p.offset; ··· 472 456 
return ret; 473 457 } 474 458 475 - int __bch2_dirent_lookup_trans(struct btree_trans *trans, 476 - struct btree_iter *iter, 477 - subvol_inum dir, 478 - const struct bch_hash_info *hash_info, 479 - const struct qstr *name, subvol_inum *inum, 480 - unsigned flags) 459 + int bch2_dirent_lookup_trans(struct btree_trans *trans, 460 + struct btree_iter *iter, 461 + subvol_inum dir, 462 + const struct bch_hash_info *hash_info, 463 + const struct qstr *name, subvol_inum *inum, 464 + unsigned flags) 481 465 { 482 - struct bkey_s_c k; 483 - struct bkey_s_c_dirent d; 484 - u32 snapshot; 485 - int ret; 486 - 487 - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); 466 + int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, 467 + hash_info, dir, name, flags); 488 468 if (ret) 489 469 return ret; 490 470 491 - ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, 492 - hash_info, dir, name, flags); 493 - if (ret) 494 - return ret; 495 - 496 - k = bch2_btree_iter_peek_slot(iter); 471 + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); 497 472 ret = bkey_err(k); 498 473 if (ret) 499 474 goto err; 500 475 501 - d = bkey_s_c_to_dirent(k); 502 - 503 - ret = bch2_dirent_read_target(trans, dir, d, inum); 476 + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); 504 477 if (ret > 0) 505 478 ret = -ENOENT; 506 479 err: 507 480 if (ret) 508 481 bch2_trans_iter_exit(trans, iter); 509 - 510 482 return ret; 511 483 } 512 484 ··· 506 502 struct btree_iter iter = { NULL }; 507 503 508 504 int ret = lockrestart_do(trans, 509 - __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); 505 + bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); 510 506 bch2_trans_iter_exit(trans, &iter); 511 507 bch2_trans_put(trans); 512 508 return ret; 513 509 } 514 510 515 - int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) 511 + int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 
subvol, u32 snapshot) 516 512 { 517 513 struct btree_iter iter; 518 514 struct bkey_s_c k; ··· 522 518 SPOS(dir, 0, snapshot), 523 519 POS(dir, U64_MAX), 0, k, ret) 524 520 if (k.k->type == KEY_TYPE_dirent) { 525 - ret = -ENOTEMPTY; 521 + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 522 + if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) 523 + continue; 524 + ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; 526 525 break; 527 526 } 528 527 bch2_trans_iter_exit(trans, &iter); ··· 538 531 u32 snapshot; 539 532 540 533 return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: 541 - bch2_empty_dir_snapshot(trans, dir.inum, snapshot); 534 + bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); 542 535 } 543 536 544 537 int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
+3 -3
fs/bcachefs/dirent.h
··· 35 35 int bch2_dirent_read_target(struct btree_trans *, subvol_inum, 36 36 struct bkey_s_c_dirent, subvol_inum *); 37 37 38 - int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, 38 + int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, 39 39 const struct bch_hash_info *, u8, 40 40 const struct qstr *, u64, u64 *, 41 41 bch_str_hash_flags_t); ··· 62 62 const struct qstr *, subvol_inum *, u64 *, 63 63 enum bch_rename_mode); 64 64 65 - int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, 65 + int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, 66 66 subvol_inum, const struct bch_hash_info *, 67 67 const struct qstr *, subvol_inum *, unsigned); 68 68 u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, 69 69 const struct bch_hash_info *, 70 70 const struct qstr *, subvol_inum *); 71 71 72 - int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32); 72 + int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); 73 73 int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); 74 74 int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); 75 75
+2 -2
fs/bcachefs/ec.c
··· 504 504 unsigned i; 505 505 506 506 for (i = 0; i < s->v.nr_blocks; i++) { 507 - kvpfree(buf->data[i], buf->size << 9); 507 + kvfree(buf->data[i]); 508 508 buf->data[i] = NULL; 509 509 } 510 510 } ··· 531 531 memset(buf->valid, 0xFF, sizeof(buf->valid)); 532 532 533 533 for (i = 0; i < v->nr_blocks; i++) { 534 - buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); 534 + buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); 535 535 if (!buf->data[i]) 536 536 goto err; 537 537 }
+9 -6
fs/bcachefs/errcode.c
··· 2 2 3 3 #include "bcachefs.h" 4 4 #include "errcode.h" 5 + #include "trace.h" 5 6 6 7 #include <linux/errname.h> 7 8 ··· 50 49 return err == class; 51 50 } 52 51 53 - int __bch2_err_class(int err) 52 + int __bch2_err_class(int bch_err) 54 53 { 55 - err = -err; 56 - BUG_ON((unsigned) err >= BCH_ERR_MAX); 54 + int std_err = -bch_err; 55 + BUG_ON((unsigned) std_err >= BCH_ERR_MAX); 57 56 58 - while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) 59 - err = bch2_errcode_parents[err - BCH_ERR_START]; 57 + while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START]) 58 + std_err = bch2_errcode_parents[std_err - BCH_ERR_START]; 60 59 61 - return -err; 60 + trace_error_downcast(bch_err, std_err, _RET_IP_); 61 + 62 + return -std_err; 62 63 } 63 64 64 65 const char *bch2_blk_status_to_str(blk_status_t status)
+16 -2
fs/bcachefs/errcode.h
··· 5 5 #define BCH_ERRCODES() \ 6 6 x(ERANGE, ERANGE_option_too_small) \ 7 7 x(ERANGE, ERANGE_option_too_big) \ 8 + x(EINVAL, mount_option) \ 9 + x(BCH_ERR_mount_option, option_name) \ 10 + x(BCH_ERR_mount_option, option_value) \ 11 + x(BCH_ERR_mount_option, option_not_bool) \ 8 12 x(ENOMEM, ENOMEM_stripe_buf) \ 9 13 x(ENOMEM, ENOMEM_replicas_table) \ 10 14 x(ENOMEM, ENOMEM_cpu_replicas) \ ··· 82 78 x(ENOMEM, ENOMEM_fs_name_alloc) \ 83 79 x(ENOMEM, ENOMEM_fs_other_alloc) \ 84 80 x(ENOMEM, ENOMEM_dev_alloc) \ 81 + x(ENOMEM, ENOMEM_disk_accounting) \ 85 82 x(ENOSPC, ENOSPC_disk_reservation) \ 86 83 x(ENOSPC, ENOSPC_bucket_alloc) \ 87 84 x(ENOSPC, ENOSPC_disk_label_add) \ ··· 114 109 x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ 115 110 x(ENOENT, ENOENT_dev_not_found) \ 116 111 x(ENOENT, ENOENT_dev_idx_not_found) \ 112 + x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ 113 + x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ 117 114 x(0, open_buckets_empty) \ 118 115 x(0, freelist_empty) \ 119 116 x(BCH_ERR_freelist_empty, no_buckets_found) \ ··· 183 176 x(EINVAL, invalid) \ 184 177 x(EINVAL, internal_fsck_err) \ 185 178 x(EINVAL, opt_parse_error) \ 179 + x(EINVAL, remove_with_metadata_missing_unimplemented)\ 180 + x(EINVAL, remove_would_lose_data) \ 181 + x(EINVAL, btree_iter_with_journal_not_supported) \ 186 182 x(EROFS, erofs_trans_commit) \ 187 183 x(EROFS, erofs_no_writes) \ 188 184 x(EROFS, erofs_journal_err) \ ··· 235 225 x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ 236 226 x(EIO, btree_node_read_err) \ 237 227 x(EIO, sb_not_downgraded) \ 238 - x(EIO, btree_write_all_failed) \ 228 + x(EIO, btree_node_write_all_failed) \ 229 + x(EIO, btree_node_read_error) \ 230 + x(EIO, btree_node_read_validate_error) \ 231 + x(EIO, btree_need_topology_repair) \ 239 232 x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ 240 233 x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ 241 234 x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ ··· 251 238 
x(BCH_ERR_nopromote, nopromote_congested) \ 252 239 x(BCH_ERR_nopromote, nopromote_in_flight) \ 253 240 x(BCH_ERR_nopromote, nopromote_no_writes) \ 254 - x(BCH_ERR_nopromote, nopromote_enomem) 241 + x(BCH_ERR_nopromote, nopromote_enomem) \ 242 + x(0, need_inode_lock) 255 243 256 244 enum bch_errcode { 257 245 BCH_ERR_START = 2048,
+8 -2
fs/bcachefs/error.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include "bcachefs.h" 3 3 #include "error.h" 4 + #include "recovery.h" 4 5 #include "super.h" 5 6 #include "thread_with_file.h" 6 7 ··· 26 25 } 27 26 } 28 27 29 - void bch2_topology_error(struct bch_fs *c) 28 + int bch2_topology_error(struct bch_fs *c) 30 29 { 31 30 set_bit(BCH_FS_topology_error, &c->flags); 32 - if (!test_bit(BCH_FS_fsck_running, &c->flags)) 31 + if (!test_bit(BCH_FS_fsck_running, &c->flags)) { 33 32 bch2_inconsistent_error(c); 33 + return -BCH_ERR_btree_need_topology_repair; 34 + } else { 35 + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: 36 + -BCH_ERR_btree_node_read_validate_error; 37 + } 34 38 } 35 39 36 40 void bch2_fatal_error(struct bch_fs *c)
+1 -1
fs/bcachefs/error.h
··· 30 30 31 31 bool bch2_inconsistent_error(struct bch_fs *); 32 32 33 - void bch2_topology_error(struct bch_fs *); 33 + int bch2_topology_error(struct bch_fs *); 34 34 35 35 #define bch2_fs_inconsistent(c, ...) \ 36 36 ({ \
+8 -3
fs/bcachefs/extents.h
··· 43 43 #define extent_entry_next(_entry) \ 44 44 ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) 45 45 46 + #define extent_entry_next_safe(_entry, _end) \ 47 + (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ 48 + ? extent_entry_next(_entry) \ 49 + : _end) 50 + 46 51 static inline unsigned 47 52 __extent_entry_type(const union bch_extent_entry *e) 48 53 { ··· 285 280 #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ 286 281 for ((_entry) = (_start); \ 287 282 (_entry) < (_end); \ 288 - (_entry) = extent_entry_next(_entry)) 283 + (_entry) = extent_entry_next_safe(_entry, _end)) 289 284 290 285 #define __bkey_ptr_next(_ptr, _end) \ 291 286 ({ \ ··· 323 318 (_ptr).has_ec = false; \ 324 319 \ 325 320 __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ 326 - switch (extent_entry_type(_entry)) { \ 321 + switch (__extent_entry_type(_entry)) { \ 327 322 case BCH_EXTENT_ENTRY_ptr: \ 328 323 (_ptr).ptr = _entry->ptr; \ 329 324 goto out; \ ··· 349 344 for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ 350 345 (_entry) = _start; \ 351 346 __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ 352 - (_entry) = extent_entry_next(_entry)) 347 + (_entry) = extent_entry_next_safe(_entry, _end)) 353 348 354 349 #define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ 355 350 __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
+2 -2
fs/bcachefs/fifo.h
··· 24 24 (fifo)->mask = (fifo)->size \ 25 25 ? roundup_pow_of_two((fifo)->size) - 1 \ 26 26 : 0; \ 27 - (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ 27 + (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ 28 28 }) 29 29 30 30 #define free_fifo(fifo) \ 31 31 do { \ 32 - kvpfree((fifo)->data, fifo_buf_size(fifo)); \ 32 + kvfree((fifo)->data); \ 33 33 (fifo)->data = NULL; \ 34 34 } while (0) 35 35
+64 -10
fs/bcachefs/fs-common.c
··· 107 107 u32 new_subvol, dir_snapshot; 108 108 109 109 ret = bch2_subvolume_create(trans, new_inode->bi_inum, 110 + dir.subvol, 110 111 snapshot_src.subvol, 111 112 &new_subvol, &snapshot, 112 113 (flags & BCH_CREATE_SNAPSHOT_RO) != 0); ··· 243 242 struct bch_inode_unpacked *dir_u, 244 243 struct bch_inode_unpacked *inode_u, 245 244 const struct qstr *name, 246 - bool deleting_snapshot) 245 + bool deleting_subvol) 247 246 { 248 247 struct bch_fs *c = trans->c; 249 248 struct btree_iter dir_iter = { NULL }; ··· 261 260 262 261 dir_hash = bch2_hash_info_init(c, dir_u); 263 262 264 - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, 265 - name, &inum, BTREE_ITER_INTENT); 263 + ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, 264 + name, &inum, BTREE_ITER_INTENT); 266 265 if (ret) 267 266 goto err; 268 267 ··· 271 270 if (ret) 272 271 goto err; 273 272 274 - if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { 273 + if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { 275 274 ret = bch2_empty_dir_trans(trans, inum); 276 275 if (ret) 277 276 goto err; 278 277 } 279 278 280 - if (deleting_snapshot && !inode_u->bi_subvol) { 279 + if (deleting_subvol && !inode_u->bi_subvol) { 281 280 ret = -BCH_ERR_ENOENT_not_subvol; 282 281 goto err; 283 282 } 284 283 285 - if (deleting_snapshot || inode_u->bi_subvol) { 284 + if (inode_u->bi_subvol) { 285 + /* Recursive subvolume destroy not allowed (yet?) 
*/ 286 + ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); 287 + if (ret) 288 + goto err; 289 + } 290 + 291 + if (deleting_subvol || inode_u->bi_subvol) { 286 292 ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); 287 293 if (ret) 288 294 goto err; ··· 357 349 return ret; 358 350 } 359 351 352 + static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) 353 + { 354 + struct btree_iter iter; 355 + struct bkey_i_subvolume *s = 356 + bch2_bkey_get_mut_typed(trans, &iter, 357 + BTREE_ID_subvolumes, POS(0, subvol), 358 + BTREE_ITER_CACHED, subvolume); 359 + int ret = PTR_ERR_OR_ZERO(s); 360 + if (ret) 361 + return ret; 362 + 363 + s->v.fs_path_parent = cpu_to_le32(new_parent); 364 + bch2_trans_iter_exit(trans, &iter); 365 + return 0; 366 + } 367 + 360 368 int bch2_rename_trans(struct btree_trans *trans, 361 369 subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, 362 370 subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, ··· 434 410 goto err; 435 411 } 436 412 413 + if (src_inode_u->bi_subvol && 414 + dst_dir.subvol != src_inode_u->bi_parent_subvol) { 415 + ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol); 416 + if (ret) 417 + goto err; 418 + } 419 + 420 + if (mode == BCH_RENAME_EXCHANGE && 421 + dst_inode_u->bi_subvol && 422 + src_dir.subvol != dst_inode_u->bi_parent_subvol) { 423 + ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol); 424 + if (ret) 425 + goto err; 426 + } 427 + 428 + /* Can't move across subvolumes, unless it's a subvolume root: */ 429 + if (src_dir.subvol != dst_dir.subvol && 430 + (!src_inode_u->bi_subvol || 431 + (dst_inum.inum && !dst_inode_u->bi_subvol))) { 432 + ret = -EXDEV; 433 + goto err; 434 + } 435 + 436 + if (src_inode_u->bi_parent_subvol) 437 + src_inode_u->bi_parent_subvol = dst_dir.subvol; 438 + 439 + if ((mode == BCH_RENAME_EXCHANGE) && 440 + dst_inode_u->bi_parent_subvol) 441 + dst_inode_u->bi_parent_subvol = src_dir.subvol; 442 + 
437 443 src_inode_u->bi_dir = dst_dir_u->bi_inum; 438 444 src_inode_u->bi_dir_offset = dst_offset; 439 445 ··· 486 432 goto err; 487 433 } 488 434 489 - if (S_ISDIR(dst_inode_u->bi_mode) && 490 - bch2_empty_dir_trans(trans, dst_inum)) { 491 - ret = -ENOTEMPTY; 492 - goto err; 435 + if (S_ISDIR(dst_inode_u->bi_mode)) { 436 + ret = bch2_empty_dir_trans(trans, dst_inum); 437 + if (ret) 438 + goto err; 493 439 } 494 440 } 495 441
+110 -41
fs/bcachefs/fs-io-buffered.c
··· 810 810 static int __bch2_buffered_write(struct bch_inode_info *inode, 811 811 struct address_space *mapping, 812 812 struct iov_iter *iter, 813 - loff_t pos, unsigned len) 813 + loff_t pos, unsigned len, 814 + bool inode_locked) 814 815 { 815 816 struct bch_fs *c = inode->v.i_sb->s_fs_info; 816 817 struct bch2_folio_reservation res; ··· 835 834 goto out; 836 835 837 836 BUG_ON(!fs.nr); 837 + 838 + /* 839 + * If we're not using the inode lock, we need to lock all the folios for 840 + * atomiticity of writes vs. other writes: 841 + */ 842 + if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { 843 + ret = -BCH_ERR_need_inode_lock; 844 + goto out; 845 + } 838 846 839 847 f = darray_first(fs); 840 848 if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ··· 939 929 end = pos + copied; 940 930 941 931 spin_lock(&inode->v.i_lock); 942 - if (end > inode->v.i_size) 932 + if (end > inode->v.i_size) { 933 + BUG_ON(!inode_locked); 943 934 i_size_write(&inode->v, end); 935 + } 944 936 spin_unlock(&inode->v.i_lock); 945 937 946 938 f_pos = pos; ··· 986 974 struct file *file = iocb->ki_filp; 987 975 struct address_space *mapping = file->f_mapping; 988 976 struct bch_inode_info *inode = file_bch_inode(file); 989 - loff_t pos = iocb->ki_pos; 990 - ssize_t written = 0; 991 - int ret = 0; 977 + loff_t pos; 978 + bool inode_locked = false; 979 + ssize_t written = 0, written2 = 0, ret = 0; 980 + 981 + /* 982 + * We don't take the inode lock unless i_size will be changing. Folio 983 + * locks provide exclusion with other writes, and the pagecache add lock 984 + * provides exclusion with truncate and hole punching. 
985 + * 986 + * There is one nasty corner case where atomicity would be broken 987 + * without great care: when copying data from userspace to the page 988 + * cache, we do that with faults disable - a page fault would recurse 989 + * back into the filesystem, taking filesystem locks again, and 990 + * deadlock; so it's done with faults disabled, and we fault in the user 991 + * buffer when we aren't holding locks. 992 + * 993 + * If we do part of the write, but we then race and in the userspace 994 + * buffer have been evicted and are no longer resident, then we have to 995 + * drop our folio locks to re-fault them in, breaking write atomicity. 996 + * 997 + * To fix this, we restart the write from the start, if we weren't 998 + * holding the inode lock. 999 + * 1000 + * There is another wrinkle after that; if we restart the write from the 1001 + * start, and then get an unrecoverable error, we _cannot_ claim to 1002 + * userspace that we did not write data we actually did - so we must 1003 + * track (written2) the most we ever wrote. 1004 + */ 1005 + 1006 + if ((iocb->ki_flags & IOCB_APPEND) || 1007 + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { 1008 + inode_lock(&inode->v); 1009 + inode_locked = true; 1010 + } 1011 + 1012 + ret = generic_write_checks(iocb, iter); 1013 + if (ret <= 0) 1014 + goto unlock; 1015 + 1016 + ret = file_remove_privs_flags(file, !inode_locked ? 
IOCB_NOWAIT : 0); 1017 + if (ret) { 1018 + if (!inode_locked) { 1019 + inode_lock(&inode->v); 1020 + inode_locked = true; 1021 + ret = file_remove_privs_flags(file, 0); 1022 + } 1023 + if (ret) 1024 + goto unlock; 1025 + } 1026 + 1027 + ret = file_update_time(file); 1028 + if (ret) 1029 + goto unlock; 1030 + 1031 + pos = iocb->ki_pos; 992 1032 993 1033 bch2_pagecache_add_get(inode); 1034 + 1035 + if (!inode_locked && 1036 + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) 1037 + goto get_inode_lock; 994 1038 995 1039 do { 996 1040 unsigned offset = pos & (PAGE_SIZE - 1); ··· 1072 1004 } 1073 1005 } 1074 1006 1007 + if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) 1008 + goto get_inode_lock; 1009 + 1075 1010 if (unlikely(fatal_signal_pending(current))) { 1076 1011 ret = -EINTR; 1077 1012 break; 1078 1013 } 1079 1014 1080 - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); 1015 + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); 1016 + if (ret == -BCH_ERR_need_inode_lock) 1017 + goto get_inode_lock; 1081 1018 if (unlikely(ret < 0)) 1082 1019 break; 1083 1020 ··· 1103 1030 } 1104 1031 pos += ret; 1105 1032 written += ret; 1033 + written2 = max(written, written2); 1034 + 1035 + if (ret != bytes && !inode_locked) 1036 + goto get_inode_lock; 1106 1037 ret = 0; 1107 1038 1108 1039 balance_dirty_pages_ratelimited(mapping); 1040 + 1041 + if (0) { 1042 + get_inode_lock: 1043 + bch2_pagecache_add_put(inode); 1044 + inode_lock(&inode->v); 1045 + inode_locked = true; 1046 + bch2_pagecache_add_get(inode); 1047 + 1048 + iov_iter_revert(iter, written); 1049 + pos -= written; 1050 + written = 0; 1051 + ret = 0; 1052 + } 1109 1053 } while (iov_iter_count(iter)); 1110 - 1111 1054 bch2_pagecache_add_put(inode); 1112 - 1113 - return written ? 
written : ret; 1114 - } 1115 - 1116 - ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) 1117 - { 1118 - struct file *file = iocb->ki_filp; 1119 - struct bch_inode_info *inode = file_bch_inode(file); 1120 - ssize_t ret; 1121 - 1122 - if (iocb->ki_flags & IOCB_DIRECT) { 1123 - ret = bch2_direct_write(iocb, from); 1124 - goto out; 1125 - } 1126 - 1127 - inode_lock(&inode->v); 1128 - 1129 - ret = generic_write_checks(iocb, from); 1130 - if (ret <= 0) 1131 - goto unlock; 1132 - 1133 - ret = file_remove_privs(file); 1134 - if (ret) 1135 - goto unlock; 1136 - 1137 - ret = file_update_time(file); 1138 - if (ret) 1139 - goto unlock; 1140 - 1141 - ret = bch2_buffered_write(iocb, from); 1142 - if (likely(ret > 0)) 1143 - iocb->ki_pos += ret; 1144 1055 unlock: 1145 - inode_unlock(&inode->v); 1056 + if (inode_locked) 1057 + inode_unlock(&inode->v); 1146 1058 1059 + iocb->ki_pos += written; 1060 + 1061 + ret = max(written, written2) ?: ret; 1147 1062 if (ret > 0) 1148 1063 ret = generic_write_sync(iocb, ret); 1149 - out: 1064 + return ret; 1065 + } 1066 + 1067 + ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) 1068 + { 1069 + ssize_t ret = iocb->ki_flags & IOCB_DIRECT 1070 + ? bch2_direct_write(iocb, iter) 1071 + : bch2_buffered_write(iocb, iter); 1072 + 1150 1073 return bch2_err_class(ret); 1151 1074 } 1152 1075
+3 -6
fs/bcachefs/fs-io-pagecache.h
··· 51 51 52 52 struct bch_folio_sector { 53 53 /* Uncompressed, fully allocated replicas (or on disk reservation): */ 54 - unsigned nr_replicas:4; 55 - 54 + u8 nr_replicas:4, 56 55 /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ 57 - unsigned replicas_reserved:4; 58 - 59 - /* i_sectors: */ 60 - enum bch_folio_sector_state state:8; 56 + replicas_reserved:4; 57 + u8 state; 61 58 }; 62 59 63 60 struct bch_folio {
+148 -80
fs/bcachefs/fs.c
··· 176 176 return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); 177 177 } 178 178 179 - struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) 179 + static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) 180 180 { 181 - struct bch_inode_unpacked inode_u; 182 - struct bch_inode_info *inode; 183 - struct btree_trans *trans; 184 - struct bch_subvolume subvol; 185 - int ret; 181 + subvol_inum inum = inode_inum(inode); 182 + struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, 183 + bch2_inode_hash(inum), 184 + bch2_iget5_test, 185 + bch2_iget5_set, 186 + &inum)); 187 + BUG_ON(!old); 186 188 187 - inode = to_bch_ei(iget5_locked(c->vfs_sb, 188 - bch2_inode_hash(inum), 189 - bch2_iget5_test, 190 - bch2_iget5_set, 191 - &inum)); 192 - if (unlikely(!inode)) 193 - return ERR_PTR(-ENOMEM); 194 - if (!(inode->v.i_state & I_NEW)) 195 - return &inode->v; 196 - 197 - trans = bch2_trans_get(c); 198 - ret = lockrestart_do(trans, 199 - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: 200 - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); 201 - 202 - if (!ret) 203 - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); 204 - bch2_trans_put(trans); 205 - 206 - if (ret) { 207 - iget_failed(&inode->v); 208 - return ERR_PTR(bch2_err_class(ret)); 189 + if (unlikely(old != inode)) { 190 + discard_new_inode(&inode->v); 191 + inode = old; 192 + } else { 193 + mutex_lock(&c->vfs_inodes_lock); 194 + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); 195 + mutex_unlock(&c->vfs_inodes_lock); 196 + /* 197 + * we really don't want insert_inode_locked2() to be setting 198 + * I_NEW... 
199 + */ 200 + unlock_new_inode(&inode->v); 209 201 } 210 202 211 - mutex_lock(&c->vfs_inodes_lock); 212 - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); 213 - mutex_unlock(&c->vfs_inodes_lock); 203 + return inode; 204 + } 214 205 215 - unlock_new_inode(&inode->v); 206 + #define memalloc_flags_do(_flags, _do) \ 207 + ({ \ 208 + unsigned _saved_flags = memalloc_flags_save(_flags); \ 209 + typeof(_do) _ret = _do; \ 210 + memalloc_noreclaim_restore(_saved_flags); \ 211 + _ret; \ 212 + }) 216 213 217 - return &inode->v; 214 + /* 215 + * Allocate a new inode, dropping/retaking btree locks if necessary: 216 + */ 217 + static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) 218 + { 219 + struct bch_fs *c = trans->c; 220 + 221 + struct bch_inode_info *inode = 222 + memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, 223 + to_bch_ei(new_inode(c->vfs_sb))); 224 + 225 + if (unlikely(!inode)) { 226 + int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM); 227 + if (ret && inode) 228 + discard_new_inode(&inode->v); 229 + if (ret) 230 + return ERR_PTR(ret); 231 + } 232 + 233 + return inode; 234 + } 235 + 236 + struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) 237 + { 238 + struct bch_inode_info *inode = 239 + to_bch_ei(ilookup5_nowait(c->vfs_sb, 240 + bch2_inode_hash(inum), 241 + bch2_iget5_test, 242 + &inum)); 243 + if (inode) 244 + return &inode->v; 245 + 246 + struct btree_trans *trans = bch2_trans_get(c); 247 + 248 + struct bch_inode_unpacked inode_u; 249 + struct bch_subvolume subvol; 250 + int ret = lockrestart_do(trans, 251 + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: 252 + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: 253 + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); 254 + if (!ret) { 255 + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); 256 + inode = bch2_inode_insert(c, inode); 257 + } 258 + bch2_trans_put(trans); 259 + 260 + return ret ? 
ERR_PTR(ret) : &inode->v; 218 261 } 219 262 220 263 struct bch_inode_info * ··· 269 226 struct bch_fs *c = dir->v.i_sb->s_fs_info; 270 227 struct btree_trans *trans; 271 228 struct bch_inode_unpacked dir_u; 272 - struct bch_inode_info *inode, *old; 229 + struct bch_inode_info *inode; 273 230 struct bch_inode_unpacked inode_u; 274 231 struct posix_acl *default_acl = NULL, *acl = NULL; 275 232 subvol_inum inum; ··· 336 293 mutex_unlock(&dir->ei_update_lock); 337 294 } 338 295 339 - bch2_iget5_set(&inode->v, &inum); 340 296 bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); 341 297 342 298 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ··· 346 304 * bch2_trans_exit() and dropping locks, else we could race with another 347 305 * thread pulling the inode in and modifying it: 348 306 */ 349 - 350 - inode->v.i_state |= I_CREATING; 351 - 352 - old = to_bch_ei(inode_insert5(&inode->v, 353 - bch2_inode_hash(inum), 354 - bch2_iget5_test, 355 - bch2_iget5_set, 356 - &inum)); 357 - BUG_ON(!old); 358 - 359 - if (unlikely(old != inode)) { 360 - /* 361 - * We raced, another process pulled the new inode into cache 362 - * before us: 363 - */ 364 - make_bad_inode(&inode->v); 365 - iput(&inode->v); 366 - 367 - inode = old; 368 - } else { 369 - mutex_lock(&c->vfs_inodes_lock); 370 - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); 371 - mutex_unlock(&c->vfs_inodes_lock); 372 - /* 373 - * we really don't want insert_inode_locked2() to be setting 374 - * I_NEW... 
375 - */ 376 - unlock_new_inode(&inode->v); 377 - } 378 - 307 + inode = bch2_inode_insert(c, inode); 379 308 bch2_trans_put(trans); 380 309 err: 381 310 posix_acl_release(default_acl); ··· 365 352 366 353 /* methods */ 367 354 355 + static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, 356 + subvol_inum dir, struct bch_hash_info *dir_hash_info, 357 + const struct qstr *name) 358 + { 359 + struct bch_fs *c = trans->c; 360 + struct btree_iter dirent_iter = {}; 361 + subvol_inum inum = {}; 362 + 363 + int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, 364 + dir_hash_info, dir, name, 0); 365 + if (ret) 366 + return ERR_PTR(ret); 367 + 368 + struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter); 369 + ret = bkey_err(k); 370 + if (ret) 371 + goto err; 372 + 373 + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); 374 + if (ret > 0) 375 + ret = -ENOENT; 376 + if (ret) 377 + goto err; 378 + 379 + struct bch_inode_info *inode = 380 + to_bch_ei(ilookup5_nowait(c->vfs_sb, 381 + bch2_inode_hash(inum), 382 + bch2_iget5_test, 383 + &inum)); 384 + if (inode) 385 + goto out; 386 + 387 + struct bch_subvolume subvol; 388 + struct bch_inode_unpacked inode_u; 389 + ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: 390 + bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: 391 + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); 392 + if (bch2_err_matches(ret, ENOENT)) { 393 + struct printbuf buf = PRINTBUF; 394 + 395 + bch2_bkey_val_to_text(&buf, c, k); 396 + bch_err(c, "%s points to missing inode", buf.buf); 397 + printbuf_exit(&buf); 398 + } 399 + if (ret) 400 + goto err; 401 + 402 + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); 403 + inode = bch2_inode_insert(c, inode); 404 + out: 405 + bch2_trans_iter_exit(trans, &dirent_iter); 406 + return inode; 407 + err: 408 + inode = ERR_PTR(ret); 409 + goto out; 410 + } 411 + 368 412 static struct dentry *bch2_lookup(struct inode *vdir, 
struct dentry *dentry, 369 413 unsigned int flags) 370 414 { 371 415 struct bch_fs *c = vdir->i_sb->s_fs_info; 372 416 struct bch_inode_info *dir = to_bch_ei(vdir); 373 417 struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); 374 - struct inode *vinode = NULL; 375 - subvol_inum inum = { .subvol = 1 }; 376 - int ret; 377 418 378 - ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, 379 - &dentry->d_name, &inum); 419 + struct bch_inode_info *inode; 420 + bch2_trans_do(c, NULL, NULL, 0, 421 + PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), 422 + &hash, &dentry->d_name))); 423 + if (IS_ERR(inode)) 424 + inode = NULL; 380 425 381 - if (!ret) 382 - vinode = bch2_vfs_inode_get(c, inum); 383 - 384 - return d_splice_alias(vinode, dentry); 426 + return d_splice_alias(&inode->v, dentry); 385 427 } 386 428 387 429 static int bch2_mknod(struct mnt_idmap *idmap, ··· 1440 1372 struct bch_inode_unpacked *bi, 1441 1373 struct bch_subvolume *subvol) 1442 1374 { 1375 + bch2_iget5_set(&inode->v, &inum); 1443 1376 bch2_inode_update_after_write(trans, inode, bi, ~0); 1444 1377 1445 1378 if (BCH_SUBVOLUME_SNAP(subvol)) ··· 1641 1572 * number: 1642 1573 */ 1643 1574 u64 avail_inodes = ((usage.capacity - usage.used) << 3); 1644 - u64 fsid; 1645 1575 1646 1576 buf->f_type = BCACHEFS_STATFS_MAGIC; 1647 1577 buf->f_bsize = sb->s_blocksize; ··· 1651 1583 buf->f_files = usage.nr_inodes + avail_inodes; 1652 1584 buf->f_ffree = avail_inodes; 1653 1585 1654 - fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ 1655 - le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); 1656 - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 1657 - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 1586 + buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); 1658 1587 buf->f_namelen = BCH_NAME_MAX; 1659 1588 1660 1589 return 0; ··· 1870 1805 opt_set(opts, read_only, (flags & SB_RDONLY) != 0); 1871 1806 1872 1807 ret = bch2_parse_mount_opts(NULL, &opts, data); 1873 - if (ret) 1808 + if (ret) { 
1809 + ret = bch2_err_class(ret); 1874 1810 return ERR_PTR(ret); 1811 + } 1875 1812 1876 1813 if (!dev_name || strlen(dev_name) == 0) 1877 1814 return ERR_PTR(-EINVAL); ··· 1949 1882 sb->s_time_gran = c->sb.nsec_per_time_unit; 1950 1883 sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; 1951 1884 sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); 1885 + sb->s_uuid = c->sb.user_uuid; 1952 1886 c->vfs_sb = sb; 1953 1887 strscpy(sb->s_id, c->name, sizeof(sb->s_id)); 1954 1888
+569 -294
fs/bcachefs/fsck.c
··· 100 100 } 101 101 102 102 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, 103 - struct bch_inode_unpacked *inode, 104 - u32 *snapshot) 103 + struct bch_inode_unpacked *inode, 104 + u32 *snapshot) 105 105 { 106 106 struct btree_iter iter; 107 107 struct bkey_s_c k; ··· 140 140 *type = d.v->d_type; 141 141 bch2_trans_iter_exit(trans, &iter); 142 142 return 0; 143 - } 144 - 145 - static int __write_inode(struct btree_trans *trans, 146 - struct bch_inode_unpacked *inode, 147 - u32 snapshot) 148 - { 149 - struct bkey_inode_buf *inode_p = 150 - bch2_trans_kmalloc(trans, sizeof(*inode_p)); 151 - 152 - if (IS_ERR(inode_p)) 153 - return PTR_ERR(inode_p); 154 - 155 - bch2_inode_pack(inode_p, inode); 156 - inode_p->inode.k.p.snapshot = snapshot; 157 - 158 - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, 159 - &inode_p->inode.k_i, 160 - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 161 - } 162 - 163 - static int fsck_write_inode(struct btree_trans *trans, 164 - struct bch_inode_unpacked *inode, 165 - u32 snapshot) 166 - { 167 - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 168 - __write_inode(trans, inode, snapshot)); 169 - bch_err_fn(trans->c, ret); 170 - return ret; 171 143 } 172 144 173 145 static int __remove_dirent(struct btree_trans *trans, struct bpos pos) ··· 252 280 goto err; 253 281 254 282 ret = bch2_dirent_create_snapshot(trans, 255 - root_inode.bi_inum, snapshot, &root_hash_info, 283 + 0, root_inode.bi_inum, snapshot, &root_hash_info, 256 284 mode_to_type(lostfound->bi_mode), 257 285 &lostfound_str, 258 286 lostfound->bi_inum, ··· 275 303 char name_buf[20]; 276 304 struct qstr name; 277 305 u64 dir_offset = 0; 306 + u32 dirent_snapshot = inode_snapshot; 278 307 int ret; 279 308 280 - ret = lookup_lostfound(trans, inode_snapshot, &lostfound); 309 + if (inode->bi_subvol) { 310 + inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; 311 + 312 + u64 root_inum; 313 + ret = subvol_lookup(trans, inode->bi_parent_subvol, 314 + 
&dirent_snapshot, &root_inum); 315 + if (ret) 316 + return ret; 317 + 318 + snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); 319 + } else { 320 + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); 321 + } 322 + 323 + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); 281 324 if (ret) 282 325 return ret; 283 326 284 327 if (S_ISDIR(inode->bi_mode)) { 285 328 lostfound.bi_nlink++; 286 329 287 - ret = __write_inode(trans, &lostfound, U32_MAX); 330 + ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX); 288 331 if (ret) 289 332 return ret; 290 333 } 291 334 292 335 dir_hash = bch2_hash_info_init(trans->c, &lostfound); 293 336 294 - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); 295 337 name = (struct qstr) QSTR(name_buf); 296 338 297 339 ret = bch2_dirent_create_snapshot(trans, 298 - lostfound.bi_inum, inode_snapshot, 340 + inode->bi_parent_subvol, lostfound.bi_inum, 341 + dirent_snapshot, 299 342 &dir_hash, 300 343 inode_d_type(inode), 301 - &name, inode->bi_inum, &dir_offset, 344 + &name, 345 + inode->bi_subvol ?: inode->bi_inum, 346 + &dir_offset, 302 347 BCH_HASH_SET_MUST_CREATE); 303 348 if (ret) 304 349 return ret; ··· 323 334 inode->bi_dir = lostfound.bi_inum; 324 335 inode->bi_dir_offset = dir_offset; 325 336 326 - return __write_inode(trans, inode, inode_snapshot); 337 + return __bch2_fsck_write_inode(trans, inode, inode_snapshot); 327 338 } 328 339 329 340 static int remove_backpointer(struct btree_trans *trans, ··· 339 350 ret = bkey_err(d) ?: 340 351 __remove_dirent(trans, d.k->p); 341 352 bch2_trans_iter_exit(trans, &iter); 353 + return ret; 354 + } 355 + 356 + static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) 357 + { 358 + struct bch_fs *c = trans->c; 359 + 360 + struct bch_inode_unpacked inode; 361 + int ret = bch2_inode_find_by_inum_trans(trans, 362 + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 363 + &inode); 364 + if (ret) 365 + return ret; 366 + 
367 + ret = remove_backpointer(trans, &inode); 368 + bch_err_msg(c, ret, "removing dirent"); 369 + if (ret) 370 + return ret; 371 + 372 + ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot)); 373 + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 342 374 return ret; 343 375 } 344 376 ··· 602 592 } 603 593 604 594 static struct inode_walker_entry * 605 - lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, 606 - u32 snapshot, bool is_whiteout) 595 + lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) 607 596 { 597 + bool is_whiteout = k.k->type == KEY_TYPE_whiteout; 598 + u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); 599 + 608 600 struct inode_walker_entry *i; 609 - 610 - snapshot = bch2_snapshot_equiv(c, snapshot); 611 - 612 601 __darray_for_each(w->inodes, i) 613 602 if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) 614 603 goto found; ··· 618 609 619 610 if (snapshot != i->snapshot && !is_whiteout) { 620 611 struct inode_walker_entry new = *i; 621 - size_t pos; 622 - int ret; 623 612 624 613 new.snapshot = snapshot; 625 614 new.count = 0; 626 615 627 - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", 628 - w->last_pos.inode, snapshot, i->snapshot); 616 + struct printbuf buf = PRINTBUF; 617 + bch2_bkey_val_to_text(&buf, c, k); 618 + 619 + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" 620 + "unexpected because we should always update the inode when we update a key in that inode\n" 621 + "%s", 622 + w->last_pos.inode, snapshot, i->snapshot, buf.buf); 623 + printbuf_exit(&buf); 629 624 630 625 while (i > w->inodes.data && i[-1].snapshot > snapshot) 631 626 --i; 632 627 633 - pos = i - w->inodes.data; 634 - ret = darray_insert_item(&w->inodes, pos, new); 628 + size_t pos = i - w->inodes.data; 629 + int ret = darray_insert_item(&w->inodes, pos, new); 635 630 if (ret) 636 631 return ERR_PTR(ret); 637 632 ··· 646 
633 } 647 634 648 635 static struct inode_walker_entry *walk_inode(struct btree_trans *trans, 649 - struct inode_walker *w, struct bpos pos, 650 - bool is_whiteout) 636 + struct inode_walker *w, 637 + struct bkey_s_c k) 651 638 { 652 - if (w->last_pos.inode != pos.inode) { 653 - int ret = get_inodes_all_snapshots(trans, w, pos.inode); 639 + if (w->last_pos.inode != k.k->p.inode) { 640 + int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); 654 641 if (ret) 655 642 return ERR_PTR(ret); 656 - } else if (bkey_cmp(w->last_pos, pos)) { 643 + } else if (bkey_cmp(w->last_pos, k.k->p)) { 657 644 darray_for_each(w->inodes, i) 658 645 i->seen_this_pos = false; 659 646 } 660 647 661 - w->last_pos = pos; 648 + w->last_pos = k.k->p; 662 649 663 - return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); 650 + return lookup_inode_for_snapshot(trans->c, w, k); 664 651 } 665 652 666 653 static int __get_visible_inodes(struct btree_trans *trans, ··· 735 722 delete->k.p = k_iter->pos; 736 723 return bch2_btree_iter_traverse(k_iter) ?: 737 724 bch2_trans_update(trans, k_iter, delete, 0) ?: 738 - bch2_hash_set_snapshot(trans, desc, hash_info, 725 + bch2_hash_set_in_snapshot(trans, desc, hash_info, 739 726 (subvol_inum) { 0, k.k->p.inode }, 740 727 k.k->p.snapshot, tmp, 741 728 BCH_HASH_SET_MUST_CREATE, ··· 808 795 goto out; 809 796 } 810 797 798 + static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, 799 + struct btree_iter *iter, 800 + struct bpos pos) 801 + { 802 + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); 803 + } 804 + 805 + static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, 806 + struct btree_iter *iter, 807 + struct bch_inode_unpacked *inode, 808 + u32 *snapshot) 809 + { 810 + if (inode->bi_subvol) { 811 + u64 inum; 812 + int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); 813 + if (ret) 814 + return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); 815 + 
} 816 + 817 + return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); 818 + } 819 + 820 + static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, 821 + struct bkey_s_c_dirent d) 822 + { 823 + return inode->bi_dir == d.k->p.inode && 824 + inode->bi_dir_offset == d.k->p.offset; 825 + } 826 + 827 + static bool dirent_points_to_inode(struct bkey_s_c_dirent d, 828 + struct bch_inode_unpacked *inode) 829 + { 830 + return d.v->d_type == DT_SUBVOL 831 + ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol 832 + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; 833 + } 834 + 811 835 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) 812 836 { 813 837 struct btree_iter iter; 814 838 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); 815 - int ret = bkey_err(k); 816 - if (ret) 839 + int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; 840 + bch2_trans_iter_exit(trans, &iter); 841 + return ret; 842 + } 843 + 844 + static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, 845 + struct bch_inode_unpacked *inode, 846 + u32 inode_snapshot, bool *write_inode) 847 + { 848 + struct bch_fs *c = trans->c; 849 + struct printbuf buf = PRINTBUF; 850 + 851 + struct btree_iter dirent_iter = {}; 852 + struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); 853 + int ret = bkey_err(d); 854 + if (ret && !bch2_err_matches(ret, ENOENT)) 817 855 return ret; 818 856 819 - bch2_trans_iter_exit(trans, &iter); 820 - return k.k->type == KEY_TYPE_set; 857 + if (fsck_err_on(ret, 858 + c, inode_points_to_missing_dirent, 859 + "inode points to missing dirent\n%s", 860 + (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || 861 + fsck_err_on(!ret && !dirent_points_to_inode(d, inode), 862 + c, inode_points_to_wrong_dirent, 863 + "inode points to dirent that does not point back:\n%s", 864 + (bch2_bkey_val_to_text(&buf, c, inode_k), 865 + 
prt_newline(&buf), 866 + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 867 + /* 868 + * We just clear the backpointer fields for now. If we find a 869 + * dirent that points to this inode in check_dirents(), we'll 870 + * update it then; then when we get to check_path() if the 871 + * backpointer is still 0 we'll reattach it. 872 + */ 873 + inode->bi_dir = 0; 874 + inode->bi_dir_offset = 0; 875 + inode->bi_flags &= ~BCH_INODE_backptr_untrusted; 876 + *write_inode = true; 877 + } 878 + 879 + ret = 0; 880 + fsck_err: 881 + bch2_trans_iter_exit(trans, &dirent_iter); 882 + printbuf_exit(&buf); 883 + bch_err_fn(c, ret); 884 + return ret; 821 885 } 822 886 823 887 static int check_inode(struct btree_trans *trans, ··· 951 861 952 862 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; 953 863 954 - ret = __write_inode(trans, &u, iter->pos.snapshot); 864 + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); 865 + 955 866 bch_err_msg(c, ret, "in fsck updating inode"); 956 867 if (ret) 957 868 return ret; ··· 967 876 if (ret < 0) 968 877 return ret; 969 878 970 - fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, 879 + fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list, 971 880 "inode %llu:%u unlinked, but not on deleted list", 972 881 u.bi_inum, k.k->p.snapshot); 973 882 ret = 0; ··· 1041 950 do_update = true; 1042 951 } 1043 952 953 + if (u.bi_dir || u.bi_dir_offset) { 954 + ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); 955 + if (ret) 956 + goto err; 957 + } 958 + 959 + if (fsck_err_on(u.bi_parent_subvol && 960 + (u.bi_subvol == 0 || 961 + u.bi_subvol == BCACHEFS_ROOT_SUBVOL), 962 + c, inode_bi_parent_nonzero, 963 + "inode %llu:%u has subvol %u but nonzero parent subvol %u", 964 + u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { 965 + u.bi_parent_subvol = 0; 966 + do_update = true; 967 + } 968 + 969 + if (u.bi_subvol) { 970 + struct bch_subvolume s; 971 + 972 + ret = bch2_subvolume_get(trans, 
u.bi_subvol, false, 0, &s); 973 + if (ret && !bch2_err_matches(ret, ENOENT)) 974 + goto err; 975 + 976 + if (fsck_err_on(ret, 977 + c, inode_bi_subvol_missing, 978 + "inode %llu:%u bi_subvol points to missing subvolume %u", 979 + u.bi_inum, k.k->p.snapshot, u.bi_subvol) || 980 + fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || 981 + !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), 982 + k.k->p.snapshot), 983 + c, inode_bi_subvol_wrong, 984 + "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", 985 + u.bi_inum, k.k->p.snapshot, u.bi_subvol, 986 + le64_to_cpu(s.inode), 987 + le32_to_cpu(s.snapshot))) { 988 + u.bi_subvol = 0; 989 + u.bi_parent_subvol = 0; 990 + do_update = true; 991 + } 992 + } 993 + 1044 994 if (do_update) { 1045 - ret = __write_inode(trans, &u, iter->pos.snapshot); 995 + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); 1046 996 bch_err_msg(c, ret, "in fsck updating inode"); 1047 997 if (ret) 1048 998 return ret; ··· 1114 982 return ret; 1115 983 } 1116 984 1117 - static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, 1118 - struct btree_iter *iter, 1119 - struct bpos pos) 1120 - { 1121 - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); 1122 - } 1123 - 1124 - static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, 1125 - struct bkey_s_c_dirent d) 1126 - { 1127 - return inode->bi_dir == d.k->p.inode && 1128 - inode->bi_dir_offset == d.k->p.offset; 1129 - } 1130 - 1131 - static bool dirent_points_to_inode(struct bkey_s_c_dirent d, 1132 - struct bch_inode_unpacked *inode) 1133 - { 1134 - return d.v->d_type == DT_SUBVOL 1135 - ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol 1136 - : le64_to_cpu(d.v->d_inum) == inode->bi_inum; 1137 - } 1138 - 1139 985 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) 1140 986 { 1141 987 struct bch_fs *c = trans->c; ··· 1142 1032 w->last_pos.inode, i->snapshot, 1143 1033 i->inode.bi_sectors, i->count)) { 1144 1034 i->inode.bi_sectors = i->count; 1145 - ret = fsck_write_inode(trans, &i->inode, i->snapshot); 1035 + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); 1146 1036 if (ret) 1147 1037 break; 1148 1038 } ··· 1422 1312 goto err; 1423 1313 } 1424 1314 1425 - i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); 1315 + i = walk_inode(trans, inode, k); 1426 1316 ret = PTR_ERR_OR_ZERO(i); 1427 1317 if (ret) 1428 1318 goto err; ··· 1591 1481 "directory %llu:%u with wrong i_nlink: got %u, should be %llu", 1592 1482 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { 1593 1483 i->inode.bi_nlink = i->count; 1594 - ret = fsck_write_inode(trans, &i->inode, i->snapshot); 1484 + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); 1595 1485 if (ret) 1596 1486 break; 1597 1487 } ··· 1599 1489 fsck_err: 1600 1490 bch_err_fn(c, ret); 1601 1491 return ret ?: trans_was_restarted(trans, restart_count); 1492 + } 1493 + 1494 + static int check_dirent_inode_dirent(struct btree_trans *trans, 1495 + struct btree_iter *iter, 1496 + struct bkey_s_c_dirent d, 1497 + struct bch_inode_unpacked *target, 1498 + u32 target_snapshot) 1499 + { 1500 + struct bch_fs *c = trans->c; 1501 + struct printbuf buf = PRINTBUF; 1502 + int ret = 0; 1503 + 1504 + if (inode_points_to_dirent(target, d)) 1505 + return 0; 1506 + 1507 + if (!target->bi_dir && 1508 + !target->bi_dir_offset) { 1509 + target->bi_dir = d.k->p.inode; 1510 + target->bi_dir_offset = d.k->p.offset; 1511 + return __bch2_fsck_write_inode(trans, target, target_snapshot); 1512 + } 1513 + 1514 + struct btree_iter bp_iter = { NULL }; 1515 + struct 
bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, 1516 + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); 1517 + ret = bkey_err(bp_dirent); 1518 + if (ret && !bch2_err_matches(ret, ENOENT)) 1519 + goto err; 1520 + 1521 + bool backpointer_exists = !ret; 1522 + ret = 0; 1523 + 1524 + if (fsck_err_on(!backpointer_exists, 1525 + c, inode_wrong_backpointer, 1526 + "inode %llu:%u has wrong backpointer:\n" 1527 + "got %llu:%llu\n" 1528 + "should be %llu:%llu", 1529 + target->bi_inum, target_snapshot, 1530 + target->bi_dir, 1531 + target->bi_dir_offset, 1532 + d.k->p.inode, 1533 + d.k->p.offset)) { 1534 + target->bi_dir = d.k->p.inode; 1535 + target->bi_dir_offset = d.k->p.offset; 1536 + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); 1537 + goto out; 1538 + } 1539 + 1540 + bch2_bkey_val_to_text(&buf, c, d.s_c); 1541 + prt_newline(&buf); 1542 + if (backpointer_exists) 1543 + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); 1544 + 1545 + if (fsck_err_on(backpointer_exists && 1546 + (S_ISDIR(target->bi_mode) || 1547 + target->bi_subvol), 1548 + c, inode_dir_multiple_links, 1549 + "%s %llu:%u with multiple links\n%s", 1550 + S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", 1551 + target->bi_inum, target_snapshot, buf.buf)) { 1552 + ret = __remove_dirent(trans, d.k->p); 1553 + goto out; 1554 + } 1555 + 1556 + /* 1557 + * hardlinked file with nlink 0: 1558 + * We're just adjusting nlink here so check_nlinks() will pick 1559 + * it up, it ignores inodes with nlink 0 1560 + */ 1561 + if (fsck_err_on(backpointer_exists && !target->bi_nlink, 1562 + c, inode_multiple_links_but_nlink_0, 1563 + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", 1564 + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { 1565 + target->bi_nlink++; 1566 + target->bi_flags &= ~BCH_INODE_unlinked; 1567 + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); 1568 + if (ret) 1569 + goto err; 1570 + } 1571 + out: 1572 + err: 1573 + fsck_err: 1574 + bch2_trans_iter_exit(trans, &bp_iter); 1575 + printbuf_exit(&buf); 1576 + bch_err_fn(c, ret); 1577 + return ret; 1602 1578 } 1603 1579 1604 1580 static int check_dirent_target(struct btree_trans *trans, ··· 1696 1500 struct bch_fs *c = trans->c; 1697 1501 struct bkey_i_dirent *n; 1698 1502 struct printbuf buf = PRINTBUF; 1699 - struct btree_iter bp_iter = { NULL }; 1700 1503 int ret = 0; 1701 1504 1702 - if (!target->bi_dir && 1703 - !target->bi_dir_offset) { 1704 - target->bi_dir = d.k->p.inode; 1705 - target->bi_dir_offset = d.k->p.offset; 1706 - 1707 - ret = __write_inode(trans, target, target_snapshot); 1708 - if (ret) 1709 - goto err; 1710 - } 1711 - 1712 - if (!inode_points_to_dirent(target, d)) { 1713 - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, 1714 - SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); 1715 - ret = bkey_err(bp_dirent); 1716 - if (ret && !bch2_err_matches(ret, ENOENT)) 1717 - goto err; 1718 - 1719 - bool backpointer_exists = !ret; 1720 - ret = 0; 1721 - 1722 - bch2_bkey_val_to_text(&buf, c, d.s_c); 1723 - prt_newline(&buf); 1724 - if (backpointer_exists) 1725 - bch2_bkey_val_to_text(&buf, c, 
bp_dirent.s_c); 1726 - 1727 - if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, 1728 - c, inode_dir_multiple_links, 1729 - "directory %llu:%u with multiple links\n%s", 1730 - target->bi_inum, target_snapshot, buf.buf)) { 1731 - ret = __remove_dirent(trans, d.k->p); 1732 - goto out; 1733 - } 1734 - 1735 - /* 1736 - * hardlinked file with nlink 0: 1737 - * We're just adjusting nlink here so check_nlinks() will pick 1738 - * it up, it ignores inodes with nlink 0 1739 - */ 1740 - if (fsck_err_on(backpointer_exists && !target->bi_nlink, 1741 - c, inode_multiple_links_but_nlink_0, 1742 - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", 1743 - target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { 1744 - target->bi_nlink++; 1745 - target->bi_flags &= ~BCH_INODE_unlinked; 1746 - 1747 - ret = __write_inode(trans, target, target_snapshot); 1748 - if (ret) 1749 - goto err; 1750 - } 1751 - 1752 - if (fsck_err_on(!backpointer_exists, 1753 - c, inode_wrong_backpointer, 1754 - "inode %llu:%u has wrong backpointer:\n" 1755 - "got %llu:%llu\n" 1756 - "should be %llu:%llu", 1757 - target->bi_inum, target_snapshot, 1758 - target->bi_dir, 1759 - target->bi_dir_offset, 1760 - d.k->p.inode, 1761 - d.k->p.offset)) { 1762 - target->bi_dir = d.k->p.inode; 1763 - target->bi_dir_offset = d.k->p.offset; 1764 - 1765 - ret = __write_inode(trans, target, target_snapshot); 1766 - if (ret) 1767 - goto err; 1768 - } 1769 - } 1505 + ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot); 1506 + if (ret) 1507 + goto err; 1770 1508 1771 1509 if (fsck_err_on(d.v->d_type != inode_d_type(target), 1772 1510 c, dirent_d_type_wrong, ··· 1716 1586 1717 1587 bkey_reassemble(&n->k_i, d.s_c); 1718 1588 n->v.d_type = inode_d_type(target); 1589 + if (n->v.d_type == DT_SUBVOL) { 1590 + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); 1591 + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); 1592 + } else { 1593 + n->v.d_inum = 
cpu_to_le64(target->bi_inum); 1594 + } 1719 1595 1720 1596 ret = bch2_trans_update(trans, iter, &n->k_i, 0); 1721 1597 if (ret) ··· 1729 1593 1730 1594 d = dirent_i_to_s_c(n); 1731 1595 } 1596 + err: 1597 + fsck_err: 1598 + printbuf_exit(&buf); 1599 + bch_err_fn(c, ret); 1600 + return ret; 1601 + } 1732 1602 1733 - if (fsck_err_on(d.v->d_type == DT_SUBVOL && 1734 - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), 1735 - c, dirent_d_parent_subvol_wrong, 1736 - "dirent has wrong d_parent_subvol field: got %u, should be %u", 1737 - le32_to_cpu(d.v->d_parent_subvol), 1738 - target->bi_parent_subvol)) { 1739 - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); 1603 + /* find a subvolume that's a descendent of @snapshot: */ 1604 + static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) 1605 + { 1606 + struct btree_iter iter; 1607 + struct bkey_s_c k; 1608 + int ret; 1609 + 1610 + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { 1611 + if (k.k->type != KEY_TYPE_subvolume) 1612 + continue; 1613 + 1614 + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 1615 + if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { 1616 + bch2_trans_iter_exit(trans, &iter); 1617 + *subvolid = k.k->p.offset; 1618 + goto found; 1619 + } 1620 + } 1621 + if (!ret) 1622 + ret = -ENOENT; 1623 + found: 1624 + bch2_trans_iter_exit(trans, &iter); 1625 + return ret; 1626 + } 1627 + 1628 + static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, 1629 + struct bkey_s_c_dirent d) 1630 + { 1631 + struct bch_fs *c = trans->c; 1632 + struct btree_iter subvol_iter = {}; 1633 + struct bch_inode_unpacked subvol_root; 1634 + u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); 1635 + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 1636 + u32 parent_snapshot; 1637 + u64 parent_inum; 1638 + struct printbuf buf = PRINTBUF; 1639 + int ret = 0; 1640 + 1641 + ret = 
subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); 1642 + if (ret && !bch2_err_matches(ret, ENOENT)) 1643 + return ret; 1644 + 1645 + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, 1646 + "dirent parent_subvol points to missing subvolume\n%s", 1647 + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || 1648 + fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), 1649 + c, dirent_not_visible_in_parent_subvol, 1650 + "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", 1651 + parent_snapshot, 1652 + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 1653 + u32 new_parent_subvol; 1654 + ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 1655 + if (ret) 1656 + goto err; 1657 + 1658 + struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); 1659 + ret = PTR_ERR_OR_ZERO(new_dirent); 1660 + if (ret) 1661 + goto err; 1662 + 1663 + new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); 1664 + } 1665 + 1666 + struct bkey_s_c_subvolume s = 1667 + bch2_bkey_get_iter_typed(trans, &subvol_iter, 1668 + BTREE_ID_subvolumes, POS(0, target_subvol), 1669 + 0, subvolume); 1670 + ret = bkey_err(s.s_c); 1671 + if (ret && !bch2_err_matches(ret, ENOENT)) 1672 + return ret; 1673 + 1674 + if (ret) { 1675 + if (fsck_err(c, dirent_to_missing_subvol, 1676 + "dirent points to missing subvolume\n%s", 1677 + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) 1678 + return __remove_dirent(trans, d.k->p); 1679 + ret = 0; 1680 + goto out; 1681 + } 1682 + 1683 + if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, 1684 + c, subvol_fs_path_parent_wrong, 1685 + "subvol with wrong fs_path_parent, should be be %u\n%s", 1686 + parent_subvol, 1687 + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 1688 + struct bkey_i_subvolume *n = 1689 + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); 1740 1690 ret = 
PTR_ERR_OR_ZERO(n); 1741 1691 if (ret) 1742 1692 goto err; 1743 1693 1744 - bkey_reassemble(&n->k_i, d.s_c); 1745 - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); 1746 - 1747 - ret = bch2_trans_update(trans, iter, &n->k_i, 0); 1748 - if (ret) 1749 - goto err; 1750 - 1751 - d = dirent_i_to_s_c(n); 1694 + n->v.fs_path_parent = cpu_to_le32(parent_subvol); 1752 1695 } 1696 + 1697 + u64 target_inum = le64_to_cpu(s.v->inode); 1698 + u32 target_snapshot = le32_to_cpu(s.v->snapshot); 1699 + 1700 + ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); 1701 + if (ret && !bch2_err_matches(ret, ENOENT)) 1702 + return ret; 1703 + 1704 + if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, 1705 + c, inode_bi_parent_wrong, 1706 + "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", 1707 + target_inum, 1708 + subvol_root.bi_parent_subvol, parent_subvol)) { 1709 + subvol_root.bi_parent_subvol = parent_subvol; 1710 + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); 1711 + if (ret) 1712 + return ret; 1713 + } 1714 + 1715 + ret = check_dirent_target(trans, iter, d, &subvol_root, 1716 + target_snapshot); 1717 + if (ret) 1718 + return ret; 1753 1719 out: 1754 1720 err: 1755 1721 fsck_err: 1756 - bch2_trans_iter_exit(trans, &bp_iter); 1722 + bch2_trans_iter_exit(trans, &subvol_iter); 1757 1723 printbuf_exit(&buf); 1758 - bch_err_fn(c, ret); 1759 1724 return ret; 1760 1725 } 1761 1726 ··· 1898 1661 1899 1662 BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); 1900 1663 1901 - i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); 1664 + i = walk_inode(trans, dir, k); 1902 1665 ret = PTR_ERR_OR_ZERO(i); 1903 1666 if (ret < 0) 1904 1667 goto err; ··· 1944 1707 d = bkey_s_c_to_dirent(k); 1945 1708 1946 1709 if (d.v->d_type == DT_SUBVOL) { 1947 - struct bch_inode_unpacked subvol_root; 1948 - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 1949 - u32 target_snapshot; 1950 - u64 target_inum; 
1951 - 1952 - ret = subvol_lookup(trans, target_subvol, 1953 - &target_snapshot, &target_inum); 1954 - if (ret && !bch2_err_matches(ret, ENOENT)) 1955 - goto err; 1956 - 1957 - if (fsck_err_on(ret, c, dirent_to_missing_subvol, 1958 - "dirent points to missing subvolume %u", 1959 - le32_to_cpu(d.v->d_child_subvol))) { 1960 - ret = __remove_dirent(trans, d.k->p); 1961 - goto err; 1962 - } 1963 - 1964 - ret = lookup_inode(trans, target_inum, 1965 - &subvol_root, &target_snapshot); 1966 - if (ret && !bch2_err_matches(ret, ENOENT)) 1967 - goto err; 1968 - 1969 - if (fsck_err_on(ret, c, subvol_to_missing_root, 1970 - "subvolume %u points to missing subvolume root %llu", 1971 - target_subvol, 1972 - target_inum)) { 1973 - bch_err(c, "repair not implemented yet"); 1974 - ret = -EINVAL; 1975 - goto err; 1976 - } 1977 - 1978 - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, 1979 - c, subvol_root_wrong_bi_subvol, 1980 - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", 1981 - target_inum, 1982 - subvol_root.bi_subvol, target_subvol)) { 1983 - subvol_root.bi_subvol = target_subvol; 1984 - ret = __write_inode(trans, &subvol_root, target_snapshot); 1985 - if (ret) 1986 - goto err; 1987 - } 1988 - 1989 - ret = check_dirent_target(trans, iter, d, &subvol_root, 1990 - target_snapshot); 1710 + ret = check_dirent_to_subvol(trans, iter, d); 1991 1711 if (ret) 1992 1712 goto err; 1993 1713 } else { ··· 1970 1776 if (ret) 1971 1777 goto err; 1972 1778 } 1779 + 1780 + if (d.v->d_type == DT_DIR) 1781 + for_each_visible_inode(c, s, dir, equiv.snapshot, i) 1782 + i->count++; 1973 1783 } 1974 - 1975 - if (d.v->d_type == DT_DIR) 1976 - for_each_visible_inode(c, s, dir, equiv.snapshot, i) 1977 - i->count++; 1978 - 1979 1784 out: 1980 1785 err: 1981 1786 fsck_err: ··· 2025 1832 if (ret) 2026 1833 return ret; 2027 1834 2028 - i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); 1835 + i = walk_inode(trans, inode, k); 2029 1836 ret = PTR_ERR_OR_ZERO(i); 
2030 1837 if (ret) 2031 1838 return ret; ··· 2112 1919 0, NULL); 2113 1920 root_inode.bi_inum = inum; 2114 1921 2115 - ret = __write_inode(trans, &root_inode, snapshot); 1922 + ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot); 2116 1923 bch_err_msg(c, ret, "writing root inode"); 2117 1924 } 2118 1925 err: ··· 2125 1932 { 2126 1933 int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2127 1934 check_root_trans(trans)); 1935 + bch_err_fn(c, ret); 1936 + return ret; 1937 + } 1938 + 1939 + typedef DARRAY(u32) darray_u32; 1940 + 1941 + static bool darray_u32_has(darray_u32 *d, u32 v) 1942 + { 1943 + darray_for_each(*d, i) 1944 + if (*i == v) 1945 + return true; 1946 + return false; 1947 + } 1948 + 1949 + /* 1950 + * We've checked that inode backpointers point to valid dirents; here, it's 1951 + * sufficient to check that the subvolume root has a dirent: 1952 + */ 1953 + static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s) 1954 + { 1955 + struct bch_inode_unpacked inode; 1956 + int ret = bch2_inode_find_by_inum_trans(trans, 1957 + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 1958 + &inode); 1959 + if (ret) 1960 + return ret; 1961 + 1962 + return inode.bi_dir != 0; 1963 + } 1964 + 1965 + static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) 1966 + { 1967 + struct bch_fs *c = trans->c; 1968 + struct btree_iter parent_iter = {}; 1969 + darray_u32 subvol_path = {}; 1970 + struct printbuf buf = PRINTBUF; 1971 + int ret = 0; 1972 + 1973 + if (k.k->type != KEY_TYPE_subvolume) 1974 + return 0; 1975 + 1976 + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { 1977 + ret = darray_push(&subvol_path, k.k->p.offset); 1978 + if (ret) 1979 + goto err; 1980 + 1981 + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 1982 + 1983 + ret = subvol_has_dirent(trans, s); 1984 + if (ret < 0) 1985 + break; 1986 + 1987 + if (fsck_err_on(!ret, 1988 + c, subvol_unreachable, 1989 + 
"unreachable subvolume %s", 1990 + (bch2_bkey_val_to_text(&buf, c, s.s_c), 1991 + buf.buf))) { 1992 + ret = reattach_subvol(trans, s); 1993 + break; 1994 + } 1995 + 1996 + u32 parent = le32_to_cpu(s.v->fs_path_parent); 1997 + 1998 + if (darray_u32_has(&subvol_path, parent)) { 1999 + if (fsck_err(c, subvol_loop, "subvolume loop")) 2000 + ret = reattach_subvol(trans, s); 2001 + break; 2002 + } 2003 + 2004 + bch2_trans_iter_exit(trans, &parent_iter); 2005 + bch2_trans_iter_init(trans, &parent_iter, 2006 + BTREE_ID_subvolumes, POS(0, parent), 0); 2007 + k = bch2_btree_iter_peek_slot(&parent_iter); 2008 + ret = bkey_err(k); 2009 + if (ret) 2010 + goto err; 2011 + 2012 + if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, 2013 + c, subvol_unreachable, 2014 + "unreachable subvolume %s", 2015 + (bch2_bkey_val_to_text(&buf, c, s.s_c), 2016 + buf.buf))) { 2017 + ret = reattach_subvol(trans, s); 2018 + break; 2019 + } 2020 + } 2021 + fsck_err: 2022 + err: 2023 + printbuf_exit(&buf); 2024 + darray_exit(&subvol_path); 2025 + bch2_trans_iter_exit(trans, &parent_iter); 2026 + return ret; 2027 + } 2028 + 2029 + int bch2_check_subvolume_structure(struct bch_fs *c) 2030 + { 2031 + int ret = bch2_trans_run(c, 2032 + for_each_btree_key_commit(trans, iter, 2033 + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, 2034 + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2035 + check_subvol_path(trans, &iter, k))); 2128 2036 bch_err_fn(c, ret); 2129 2037 return ret; 2130 2038 } ··· 2246 1952 return false; 2247 1953 } 2248 1954 2249 - static int path_down(struct bch_fs *c, pathbuf *p, 2250 - u64 inum, u32 snapshot) 2251 - { 2252 - int ret = darray_push(p, ((struct pathbuf_entry) { 2253 - .inum = inum, 2254 - .snapshot = snapshot, 2255 - })); 2256 - 2257 - if (ret) 2258 - bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", 2259 - p->size); 2260 - return ret; 2261 - } 2262 - 2263 1955 /* 2264 - * Check that a given inode is reachable from the root: 1956 + * Check that a given inode is 
reachable from its subvolume root - we already 1957 + * verified subvolume connectivity: 2265 1958 * 2266 1959 * XXX: we should also be verifying that inodes are in the right subvolumes 2267 1960 */ 2268 - static int check_path(struct btree_trans *trans, 2269 - pathbuf *p, 2270 - struct bch_inode_unpacked *inode, 2271 - u32 snapshot) 1961 + static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) 2272 1962 { 2273 1963 struct bch_fs *c = trans->c; 1964 + struct btree_iter inode_iter = {}; 1965 + struct bch_inode_unpacked inode; 1966 + struct printbuf buf = PRINTBUF; 1967 + u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); 2274 1968 int ret = 0; 2275 1969 2276 - snapshot = bch2_snapshot_equiv(c, snapshot); 2277 1970 p->nr = 0; 2278 1971 2279 - while (!(inode->bi_inum == BCACHEFS_ROOT_INO && 2280 - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { 1972 + BUG_ON(bch2_inode_unpack(inode_k, &inode)); 1973 + 1974 + while (!inode.bi_subvol) { 2281 1975 struct btree_iter dirent_iter; 2282 1976 struct bkey_s_c_dirent d; 2283 1977 u32 parent_snapshot = snapshot; 2284 1978 2285 - if (inode->bi_subvol) { 2286 - u64 inum; 2287 - 2288 - ret = subvol_lookup(trans, inode->bi_parent_subvol, 2289 - &parent_snapshot, &inum); 2290 - if (ret) 2291 - break; 2292 - } 2293 - 2294 - d = dirent_get_by_pos(trans, &dirent_iter, 2295 - SPOS(inode->bi_dir, inode->bi_dir_offset, 2296 - parent_snapshot)); 1979 + d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); 2297 1980 ret = bkey_err(d.s_c); 2298 1981 if (ret && !bch2_err_matches(ret, ENOENT)) 2299 1982 break; 2300 1983 2301 - if (!ret && !dirent_points_to_inode(d, inode)) { 1984 + if (!ret && !dirent_points_to_inode(d, &inode)) { 2302 1985 bch2_trans_iter_exit(trans, &dirent_iter); 2303 1986 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 2304 1987 } 2305 1988 2306 1989 if (bch2_err_matches(ret, ENOENT)) { 2307 - if (fsck_err(c, inode_unreachable, 2308 - "unreachable inode %llu:%u, type %s 
nlink %u backptr %llu:%llu", 2309 - inode->bi_inum, snapshot, 2310 - bch2_d_type_str(inode_d_type(inode)), 2311 - inode->bi_nlink, 2312 - inode->bi_dir, 2313 - inode->bi_dir_offset)) 2314 - ret = reattach_inode(trans, inode, snapshot); 2315 - break; 1990 + ret = 0; 1991 + if (fsck_err(c, inode_unreachable, 1992 + "unreachable inode\n%s", 1993 + (printbuf_reset(&buf), 1994 + bch2_bkey_val_to_text(&buf, c, inode_k), 1995 + buf.buf))) 1996 + ret = reattach_inode(trans, &inode, snapshot); 1997 + goto out; 2316 1998 } 2317 1999 2318 2000 bch2_trans_iter_exit(trans, &dirent_iter); 2319 2001 2320 - if (!S_ISDIR(inode->bi_mode)) 2002 + if (!S_ISDIR(inode.bi_mode)) 2321 2003 break; 2322 2004 2323 - ret = path_down(c, p, inode->bi_inum, snapshot); 2324 - if (ret) { 2325 - bch_err(c, "memory allocation failure"); 2005 + ret = darray_push(p, ((struct pathbuf_entry) { 2006 + .inum = inode.bi_inum, 2007 + .snapshot = snapshot, 2008 + })); 2009 + if (ret) 2326 2010 return ret; 2327 - } 2328 2011 2329 2012 snapshot = parent_snapshot; 2330 2013 2331 - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); 2014 + bch2_trans_iter_exit(trans, &inode_iter); 2015 + inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, 2016 + SPOS(0, inode.bi_dir, snapshot), 0); 2017 + ret = bkey_err(inode_k) ?: 2018 + !bkey_is_inode(inode_k.k) ? 
-BCH_ERR_ENOENT_inode 2019 + : bch2_inode_unpack(inode_k, &inode); 2332 2020 if (ret) { 2333 2021 /* Should have been caught in dirents pass */ 2334 2022 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) ··· 2318 2042 break; 2319 2043 } 2320 2044 2321 - if (path_is_dup(p, inode->bi_inum, snapshot)) { 2045 + snapshot = inode_k.k->p.snapshot; 2046 + 2047 + if (path_is_dup(p, inode.bi_inum, snapshot)) { 2322 2048 /* XXX print path */ 2323 2049 bch_err(c, "directory structure loop"); 2324 2050 2325 2051 darray_for_each(*p, i) 2326 2052 pr_err("%llu:%u", i->inum, i->snapshot); 2327 - pr_err("%llu:%u", inode->bi_inum, snapshot); 2053 + pr_err("%llu:%u", inode.bi_inum, snapshot); 2328 2054 2329 - if (!fsck_err(c, dir_loop, "directory structure loop")) 2330 - return 0; 2331 - 2332 - ret = remove_backpointer(trans, inode); 2333 - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 2055 + if (fsck_err(c, dir_loop, "directory structure loop")) { 2056 + ret = remove_backpointer(trans, &inode); 2334 2057 bch_err_msg(c, ret, "removing dirent"); 2335 - if (ret) 2336 - break; 2058 + if (ret) 2059 + break; 2337 2060 2338 - ret = reattach_inode(trans, inode, snapshot); 2339 - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 2340 - bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); 2061 + ret = reattach_inode(trans, &inode, snapshot); 2062 + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 2063 + } 2341 2064 break; 2342 2065 } 2343 2066 } 2067 + out: 2344 2068 fsck_err: 2069 + bch2_trans_iter_exit(trans, &inode_iter); 2070 + printbuf_exit(&buf); 2345 2071 bch_err_fn(c, ret); 2346 2072 return ret; 2347 2073 } ··· 2355 2077 */ 2356 2078 int bch2_check_directory_structure(struct bch_fs *c) 2357 2079 { 2358 - struct bch_inode_unpacked u; 2359 2080 pathbuf path = { 0, }; 2360 2081 int ret; 2361 2082 ··· 2367 2090 if (!bkey_is_inode(k.k)) 2368 2091 continue; 2369 2092 2370 - BUG_ON(bch2_inode_unpack(k, &u)); 2371 - 2372 - if 
(u.bi_flags & BCH_INODE_unlinked) 2093 + if (bch2_inode_flags(k) & BCH_INODE_unlinked) 2373 2094 continue; 2374 2095 2375 - check_path(trans, &path, &u, iter.pos.snapshot); 2096 + check_path(trans, &path, k); 2376 2097 }))); 2377 2098 darray_exit(&path); 2378 2099 ··· 2566 2291 u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], 2567 2292 bch2_inode_nlink_get(&u), link->count)) { 2568 2293 bch2_inode_nlink_set(&u, link->count); 2569 - ret = __write_inode(trans, &u, k.k->p.snapshot); 2294 + ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot); 2570 2295 } 2571 2296 fsck_err: 2572 2297 return ret;
+1
fs/bcachefs/fsck.h
··· 8 8 int bch2_check_dirents(struct bch_fs *); 9 9 int bch2_check_xattrs(struct bch_fs *); 10 10 int bch2_check_root(struct bch_fs *); 11 + int bch2_check_subvolume_structure(struct bch_fs *); 11 12 int bch2_check_directory_structure(struct bch_fs *); 12 13 int bch2_check_nlinks(struct bch_fs *); 13 14 int bch2_fix_reflink_p(struct bch_fs *);
+45 -10
fs/bcachefs/inode.c
··· 324 324 return bch2_inode_unpack_slowpath(k, unpacked); 325 325 } 326 326 327 - static int bch2_inode_peek_nowarn(struct btree_trans *trans, 327 + int bch2_inode_peek_nowarn(struct btree_trans *trans, 328 328 struct btree_iter *iter, 329 329 struct bch_inode_unpacked *inode, 330 330 subvol_inum inum, unsigned flags) ··· 382 382 bch2_inode_pack_inlined(inode_p, inode); 383 383 inode_p->inode.k.p.snapshot = iter->snapshot; 384 384 return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); 385 + } 386 + 387 + int __bch2_fsck_write_inode(struct btree_trans *trans, 388 + struct bch_inode_unpacked *inode, 389 + u32 snapshot) 390 + { 391 + struct bkey_inode_buf *inode_p = 392 + bch2_trans_kmalloc(trans, sizeof(*inode_p)); 393 + 394 + if (IS_ERR(inode_p)) 395 + return PTR_ERR(inode_p); 396 + 397 + bch2_inode_pack(inode_p, inode); 398 + inode_p->inode.k.p.snapshot = snapshot; 399 + 400 + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, 401 + &inode_p->inode.k_i, 402 + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 403 + } 404 + 405 + int bch2_fsck_write_inode(struct btree_trans *trans, 406 + struct bch_inode_unpacked *inode, 407 + u32 snapshot) 408 + { 409 + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 410 + __bch2_fsck_write_inode(trans, inode, snapshot)); 411 + bch_err_fn(trans->c, ret); 412 + return ret; 385 413 } 386 414 387 415 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) ··· 620 592 bool old_deleted = bkey_is_deleted_inode(old); 621 593 bool new_deleted = bkey_is_deleted_inode(new.s_c); 622 594 if (old_deleted != new_deleted) { 623 - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); 595 + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, 596 + new.k->p, new_deleted); 624 597 if (ret) 625 598 return ret; 626 599 } ··· 1117 1088 goto out; 1118 1089 1119 1090 if (S_ISDIR(inode.bi_mode)) { 1120 - ret = bch2_empty_dir_snapshot(trans, pos.offset, 
pos.snapshot); 1121 - if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir, 1091 + ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); 1092 + if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), 1093 + c, deleted_inode_is_dir, 1122 1094 "non empty directory %llu:%u in deleted_inodes btree", 1123 1095 pos.offset, pos.snapshot)) 1124 1096 goto delete; ··· 1171 1141 bch2_trans_iter_exit(trans, &inode_iter); 1172 1142 return ret; 1173 1143 delete: 1174 - ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); 1144 + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); 1175 1145 goto out; 1176 1146 } 1177 1147 ··· 1181 1151 bool need_another_pass; 1182 1152 int ret; 1183 1153 again: 1154 + /* 1155 + * if we ran check_inodes() unlinked inodes will have already been 1156 + * cleaned up but the write buffer will be out of sync; therefore we 1157 + * alway need a write buffer flush 1158 + */ 1159 + ret = bch2_btree_write_buffer_flush_sync(trans); 1160 + if (ret) 1161 + goto err; 1162 + 1184 1163 need_another_pass = false; 1185 1164 1186 1165 /* ··· 1222 1183 ret; 1223 1184 })); 1224 1185 1225 - if (!ret && need_another_pass) { 1226 - ret = bch2_btree_write_buffer_flush_sync(trans); 1227 - if (ret) 1228 - goto err; 1186 + if (!ret && need_another_pass) 1229 1187 goto again; 1230 - } 1231 1188 err: 1232 1189 bch2_trans_put(trans); 1233 1190 return ret;
+19
fs/bcachefs/inode.h
··· 95 95 96 96 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); 97 97 98 + int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *, 99 + struct bch_inode_unpacked *, subvol_inum, unsigned); 98 100 int bch2_inode_peek(struct btree_trans *, struct btree_iter *, 99 101 struct bch_inode_unpacked *, subvol_inum, unsigned); 100 102 ··· 109 107 { 110 108 return bch2_inode_write_flags(trans, iter, inode, 0); 111 109 } 110 + 111 + int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); 112 + int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); 112 113 113 114 void bch2_inode_init_early(struct bch_fs *, 114 115 struct bch_inode_unpacked *); ··· 175 170 static inline u8 inode_d_type(struct bch_inode_unpacked *inode) 176 171 { 177 172 return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); 173 + } 174 + 175 + static inline u32 bch2_inode_flags(struct bkey_s_c k) 176 + { 177 + switch (k.k->type) { 178 + case KEY_TYPE_inode: 179 + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); 180 + case KEY_TYPE_inode_v2: 181 + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); 182 + case KEY_TYPE_inode_v3: 183 + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); 184 + default: 185 + return 0; 186 + } 178 187 } 179 188 180 189 /* i_nlink: */
+1 -1
fs/bcachefs/io_read.c
··· 174 174 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) 175 175 return ERR_PTR(-BCH_ERR_nopromote_no_writes); 176 176 177 - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL); 177 + op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); 178 178 if (!op) { 179 179 ret = -BCH_ERR_nopromote_enomem; 180 180 goto err;
+12 -6
fs/bcachefs/io_write.c
··· 88 88 89 89 bch2_congested_acct(ca, io_latency, now, rw); 90 90 91 - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); 91 + __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); 92 92 } 93 93 94 94 #endif ··· 530 530 531 531 bch_err_inum_offset_ratelimited(c, 532 532 insert->k.p.inode, insert->k.p.offset << 9, 533 - "write error while doing btree update: %s", 533 + "%s write error while doing btree update: %s", 534 + op->flags & BCH_WRITE_MOVE ? "move" : "user", 534 535 bch2_err_str(ret)); 535 536 } 536 537 ··· 1068 1067 *_dst = dst; 1069 1068 return more; 1070 1069 csum_err: 1071 - bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); 1070 + bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)", 1071 + op->flags & BCH_WRITE_MOVE ? "move" : "user"); 1072 1072 ret = -EIO; 1073 1073 err: 1074 1074 if (to_wbio(dst)->bounce) ··· 1171 1169 1172 1170 bch_err_inum_offset_ratelimited(c, 1173 1171 insert->k.p.inode, insert->k.p.offset << 9, 1174 - "write error while doing btree update: %s", 1172 + "%s write error while doing btree update: %s", 1173 + op->flags & BCH_WRITE_MOVE ? "move" : "user", 1175 1174 bch2_err_str(ret)); 1176 1175 } 1177 1176 ··· 1452 1449 bch_err_inum_offset_ratelimited(c, 1453 1450 op->pos.inode, 1454 1451 op->pos.offset << 9, 1455 - "%s(): error: %s", __func__, bch2_err_str(ret)); 1452 + "%s(): %s error: %s", __func__, 1453 + op->flags & BCH_WRITE_MOVE ? "move" : "user", 1454 + bch2_err_str(ret)); 1456 1455 op->error = ret; 1457 1456 break; 1458 1457 } ··· 1578 1573 bch_err_inum_offset_ratelimited(c, 1579 1574 op->pos.inode, 1580 1575 op->pos.offset << 9, 1581 - "misaligned write"); 1576 + "%s write error: misaligned write", 1577 + op->flags & BCH_WRITE_MOVE ? "move" : "user"); 1582 1578 op->error = -EIO; 1583 1579 goto err; 1584 1580 }
+182 -132
fs/bcachefs/journal.c
··· 27 27 NULL 28 28 }; 29 29 30 - static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) 31 - { 32 - union journal_res_state s = READ_ONCE(j->reservations); 33 - unsigned i = seq & JOURNAL_BUF_MASK; 34 - struct journal_buf *buf = j->buf + i; 35 - 36 - prt_printf(out, "seq:"); 37 - prt_tab(out); 38 - prt_printf(out, "%llu", seq); 39 - prt_newline(out); 40 - printbuf_indent_add(out, 2); 41 - 42 - prt_printf(out, "refcount:"); 43 - prt_tab(out); 44 - prt_printf(out, "%u", journal_state_count(s, i)); 45 - prt_newline(out); 46 - 47 - prt_printf(out, "size:"); 48 - prt_tab(out); 49 - prt_human_readable_u64(out, vstruct_bytes(buf->data)); 50 - prt_newline(out); 51 - 52 - prt_printf(out, "expires"); 53 - prt_tab(out); 54 - prt_printf(out, "%li jiffies", buf->expires - jiffies); 55 - prt_newline(out); 56 - 57 - printbuf_indent_sub(out, 2); 58 - } 59 - 60 - static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) 61 - { 62 - if (!out->nr_tabstops) 63 - printbuf_tabstop_push(out, 24); 64 - 65 - for (u64 seq = journal_last_unwritten_seq(j); 66 - seq <= journal_cur_seq(j); 67 - seq++) 68 - bch2_journal_buf_to_text(out, j, seq); 69 - } 70 - 71 30 static inline bool journal_seq_unwritten(struct journal *j, u64 seq) 72 31 { 73 32 return seq > j->seq_ondisk; ··· 45 86 static bool journal_entry_is_open(struct journal *j) 46 87 { 47 88 return __journal_entry_is_open(j->reservations); 89 + } 90 + 91 + static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) 92 + { 93 + union journal_res_state s = READ_ONCE(j->reservations); 94 + unsigned i = seq & JOURNAL_BUF_MASK; 95 + struct journal_buf *buf = j->buf + i; 96 + 97 + prt_str(out, "seq:"); 98 + prt_tab(out); 99 + prt_printf(out, "%llu", seq); 100 + prt_newline(out); 101 + printbuf_indent_add(out, 2); 102 + 103 + prt_str(out, "refcount:"); 104 + prt_tab(out); 105 + prt_printf(out, "%u", journal_state_count(s, i)); 106 + prt_newline(out); 107 + 108 + 
prt_str(out, "size:"); 109 + prt_tab(out); 110 + prt_human_readable_u64(out, vstruct_bytes(buf->data)); 111 + prt_newline(out); 112 + 113 + prt_str(out, "expires:"); 114 + prt_tab(out); 115 + prt_printf(out, "%li jiffies", buf->expires - jiffies); 116 + prt_newline(out); 117 + 118 + prt_str(out, "flags:"); 119 + prt_tab(out); 120 + if (buf->noflush) 121 + prt_str(out, "noflush "); 122 + if (buf->must_flush) 123 + prt_str(out, "must_flush "); 124 + if (buf->separate_flush) 125 + prt_str(out, "separate_flush "); 126 + if (buf->need_flush_to_write_buffer) 127 + prt_str(out, "need_flush_to_write_buffer "); 128 + if (buf->write_started) 129 + prt_str(out, "write_started "); 130 + if (buf->write_allocated) 131 + prt_str(out, "write allocated "); 132 + if (buf->write_done) 133 + prt_str(out, "write done"); 134 + prt_newline(out); 135 + 136 + printbuf_indent_sub(out, 2); 137 + } 138 + 139 + static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) 140 + { 141 + if (!out->nr_tabstops) 142 + printbuf_tabstop_push(out, 24); 143 + 144 + for (u64 seq = journal_last_unwritten_seq(j); 145 + seq <= journal_cur_seq(j); 146 + seq++) 147 + bch2_journal_buf_to_text(out, j, seq); 148 + prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? 
"open" : "closed"); 48 149 } 49 150 50 151 static inline struct journal_buf * ··· 193 174 return stuck; 194 175 } 195 176 177 + void bch2_journal_do_writes(struct journal *j) 178 + { 179 + for (u64 seq = journal_last_unwritten_seq(j); 180 + seq <= journal_cur_seq(j); 181 + seq++) { 182 + unsigned idx = seq & JOURNAL_BUF_MASK; 183 + struct journal_buf *w = j->buf + idx; 184 + 185 + if (w->write_started && !w->write_allocated) 186 + break; 187 + if (w->write_started) 188 + continue; 189 + 190 + if (!journal_state_count(j->reservations, idx)) { 191 + w->write_started = true; 192 + closure_call(&w->io, bch2_journal_write, j->wq, NULL); 193 + } 194 + 195 + break; 196 + } 197 + } 198 + 196 199 /* 197 200 * Final processing when the last reference of a journal buffer has been 198 201 * dropped. Drop the pin list reference acquired at journal entry open and write 199 202 * the buffer, if requested. 200 203 */ 201 - void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) 204 + void bch2_journal_buf_put_final(struct journal *j, u64 seq) 202 205 { 203 - struct bch_fs *c = container_of(j, struct bch_fs, journal); 204 - 205 206 lockdep_assert_held(&j->lock); 206 207 207 208 if (__bch2_journal_pin_put(j, seq)) 208 209 bch2_journal_reclaim_fast(j); 209 - if (write) 210 - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); 210 + bch2_journal_do_writes(j); 211 211 } 212 212 213 213 /* ··· 418 380 BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); 419 381 420 382 bkey_extent_init(&buf->key); 421 - buf->noflush = false; 422 - buf->must_flush = false; 423 - buf->separate_flush = false; 424 - buf->flush_time = 0; 383 + buf->noflush = false; 384 + buf->must_flush = false; 385 + buf->separate_flush = false; 386 + buf->flush_time = 0; 425 387 buf->need_flush_to_write_buffer = true; 388 + buf->write_started = false; 389 + buf->write_allocated = false; 390 + buf->write_done = false; 426 391 427 392 memset(buf->data, 0, sizeof(*buf->data)); 428 
393 buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ··· 459 418 } while ((v = atomic64_cmpxchg(&j->reservations.counter, 460 419 old.v, new.v)) != old.v); 461 420 462 - mod_delayed_work(c->io_complete_wq, 463 - &j->write_work, 464 - msecs_to_jiffies(c->opts.journal_flush_delay)); 421 + if (nr_unwritten_journal_entries(j) == 1) 422 + mod_delayed_work(j->wq, 423 + &j->write_work, 424 + msecs_to_jiffies(c->opts.journal_flush_delay)); 465 425 journal_wake(j); 466 426 467 427 if (j->early_journal_entries.nr) ··· 487 445 static void journal_write_work(struct work_struct *work) 488 446 { 489 447 struct journal *j = container_of(work, struct journal, write_work.work); 490 - struct bch_fs *c = container_of(j, struct bch_fs, journal); 491 - long delta; 492 448 493 449 spin_lock(&j->lock); 494 - if (!__journal_entry_is_open(j->reservations)) 495 - goto unlock; 450 + if (__journal_entry_is_open(j->reservations)) { 451 + long delta = journal_cur_buf(j)->expires - jiffies; 496 452 497 - delta = journal_cur_buf(j)->expires - jiffies; 498 - 499 - if (delta > 0) 500 - mod_delayed_work(c->io_complete_wq, &j->write_work, delta); 501 - else 502 - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); 503 - unlock: 453 + if (delta > 0) 454 + mod_delayed_work(j->wq, &j->write_work, delta); 455 + else 456 + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); 457 + } 504 458 spin_unlock(&j->lock); 505 459 } 506 460 ··· 511 473 if (journal_res_get_fast(j, res, flags)) 512 474 return 0; 513 475 476 + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { 477 + ret = JOURNAL_ERR_journal_full; 478 + can_discard = j->can_discard; 479 + goto out; 480 + } 481 + 482 + if (j->blocked) 483 + return -BCH_ERR_journal_res_get_blocked; 484 + 514 485 if (bch2_journal_error(j)) 515 486 return -BCH_ERR_erofs_journal_err; 516 487 517 - spin_lock(&j->lock); 518 - 519 - /* check once more in case somebody else shut things down... 
*/ 520 - if (bch2_journal_error(j)) { 521 - spin_unlock(&j->lock); 522 - return -BCH_ERR_erofs_journal_err; 488 + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { 489 + ret = JOURNAL_ERR_max_in_flight; 490 + goto out; 523 491 } 492 + 493 + spin_lock(&j->lock); 524 494 525 495 /* 526 496 * Recheck after taking the lock, so we don't race with another thread ··· 536 490 * unnecessarily 537 491 */ 538 492 if (journal_res_get_fast(j, res, flags)) { 539 - spin_unlock(&j->lock); 540 - return 0; 541 - } 542 - 543 - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { 544 - /* 545 - * Don't want to close current journal entry, just need to 546 - * invoke reclaim: 547 - */ 548 - ret = JOURNAL_ERR_journal_full; 493 + ret = 0; 549 494 goto unlock; 550 495 } 551 496 ··· 552 515 j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); 553 516 554 517 __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); 555 - ret = journal_entry_open(j); 556 - 557 - if (ret == JOURNAL_ERR_max_in_flight) { 558 - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], 559 - &j->max_in_flight_start, true); 560 - if (trace_journal_entry_full_enabled()) { 561 - struct printbuf buf = PRINTBUF; 562 - buf.atomic++; 563 - 564 - bch2_journal_bufs_to_text(&buf, j); 565 - trace_journal_entry_full(c, buf.buf); 566 - printbuf_exit(&buf); 567 - } 568 - count_event(c, journal_entry_full); 569 - } 518 + ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; 570 519 unlock: 571 520 can_discard = j->can_discard; 572 521 spin_unlock(&j->lock); 573 - 574 - if (!ret) 522 + out: 523 + if (ret == JOURNAL_ERR_retry) 575 524 goto retry; 525 + if (!ret) 526 + return 0; 527 + 576 528 if (journal_error_check_stuck(j, ret, flags)) 577 529 ret = -BCH_ERR_journal_res_get_blocked; 530 + 531 + if (ret == JOURNAL_ERR_max_in_flight && 532 + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { 533 + 534 + struct printbuf buf = PRINTBUF; 535 + 
prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); 536 + bch2_journal_bufs_to_text(&buf, j); 537 + trace_journal_entry_full(c, buf.buf); 538 + printbuf_exit(&buf); 539 + count_event(c, journal_entry_full); 540 + } 578 541 579 542 /* 580 543 * Journal is full - can't rely on reclaim from work item due to ··· 711 674 return ret; 712 675 713 676 seq = res.seq; 714 - buf = j->buf + (seq & JOURNAL_BUF_MASK); 677 + buf = journal_seq_to_buf(j, seq); 715 678 buf->must_flush = true; 716 679 717 680 if (!buf->flush_time) { ··· 729 692 } 730 693 731 694 /* 732 - * if write was kicked off without a flush, flush the next sequence 733 - * number instead 695 + * if write was kicked off without a flush, or if we promised it 696 + * wouldn't be a flush, flush the next sequence number instead 734 697 */ 735 698 buf = journal_seq_to_buf(j, seq); 736 699 if (buf->noflush) { ··· 808 771 unwritten_seq++) { 809 772 struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); 810 773 811 - /* journal write is already in flight, and was a flush write: */ 812 - if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) 774 + /* journal flush already in flight, or flush requseted */ 775 + if (buf->must_flush) 813 776 goto out; 814 777 815 778 buf->noflush = true; ··· 1194 1157 struct journal_replay *i, **_i; 1195 1158 struct genradix_iter iter; 1196 1159 bool had_entries = false; 1197 - unsigned ptr; 1198 1160 u64 last_seq = cur_seq, nr, seq; 1199 1161 1200 1162 genradix_for_each_reverse(&c->journal_entries, iter, _i) { 1201 1163 i = *_i; 1202 1164 1203 - if (!i || i->ignore) 1165 + if (journal_replay_ignore(i)) 1204 1166 continue; 1205 1167 1206 1168 last_seq = le64_to_cpu(i->j.last_seq); ··· 1232 1196 genradix_for_each(&c->journal_entries, iter, _i) { 1233 1197 i = *_i; 1234 1198 1235 - if (!i || i->ignore) 1199 + if (journal_replay_ignore(i)) 1236 1200 continue; 1237 1201 1238 1202 seq = le64_to_cpu(i->j.seq); ··· 1247 1211 p = journal_seq_pin(j, seq); 1248 1212 1249 1213 
p->devs.nr = 0; 1250 - for (ptr = 0; ptr < i->nr_ptrs; ptr++) 1251 - bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); 1214 + darray_for_each(i->ptrs, ptr) 1215 + bch2_dev_list_add_dev(&p->devs, ptr->dev); 1252 1216 1253 1217 had_entries = true; 1254 1218 } ··· 1276 1240 1277 1241 void bch2_dev_journal_exit(struct bch_dev *ca) 1278 1242 { 1279 - kfree(ca->journal.bio); 1280 - kfree(ca->journal.buckets); 1281 - kfree(ca->journal.bucket_seq); 1243 + struct journal_device *ja = &ca->journal; 1282 1244 1283 - ca->journal.bio = NULL; 1284 - ca->journal.buckets = NULL; 1285 - ca->journal.bucket_seq = NULL; 1245 + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { 1246 + kfree(ja->bio[i]); 1247 + ja->bio[i] = NULL; 1248 + } 1249 + 1250 + kfree(ja->buckets); 1251 + kfree(ja->bucket_seq); 1252 + ja->buckets = NULL; 1253 + ja->bucket_seq = NULL; 1286 1254 } 1287 1255 1288 1256 int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ··· 1296 1256 bch2_sb_field_get(sb, journal); 1297 1257 struct bch_sb_field_journal_v2 *journal_buckets_v2 = 1298 1258 bch2_sb_field_get(sb, journal_v2); 1299 - unsigned i, nr_bvecs; 1300 1259 1301 1260 ja->nr = 0; 1302 1261 1303 1262 if (journal_buckets_v2) { 1304 1263 unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); 1305 1264 1306 - for (i = 0; i < nr; i++) 1265 + for (unsigned i = 0; i < nr; i++) 1307 1266 ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); 1308 1267 } else if (journal_buckets) { 1309 1268 ja->nr = bch2_nr_journal_buckets(journal_buckets); ··· 1312 1273 if (!ja->bucket_seq) 1313 1274 return -BCH_ERR_ENOMEM_dev_journal_init; 1314 1275 1315 - nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); 1276 + unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); 1316 1277 1317 - ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1318 - if (!ca->journal.bio) 1319 - return -BCH_ERR_ENOMEM_dev_journal_init; 1278 + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { 1279 + ja->bio[i] 
= kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, 1280 + nr_bvecs), GFP_KERNEL); 1281 + if (!ja->bio[i]) 1282 + return -BCH_ERR_ENOMEM_dev_journal_init; 1320 1283 1321 - bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); 1284 + ja->bio[i]->ca = ca; 1285 + ja->bio[i]->buf_idx = i; 1286 + bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); 1287 + } 1322 1288 1323 1289 ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); 1324 1290 if (!ja->buckets) ··· 1331 1287 1332 1288 if (journal_buckets_v2) { 1333 1289 unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); 1334 - unsigned j, dst = 0; 1290 + unsigned dst = 0; 1335 1291 1336 - for (i = 0; i < nr; i++) 1337 - for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) 1292 + for (unsigned i = 0; i < nr; i++) 1293 + for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) 1338 1294 ja->buckets[dst++] = 1339 1295 le64_to_cpu(journal_buckets_v2->d[i].start) + j; 1340 1296 } else if (journal_buckets) { 1341 - for (i = 0; i < ja->nr; i++) 1297 + for (unsigned i = 0; i < ja->nr; i++) 1342 1298 ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); 1343 1299 } 1344 1300 ··· 1347 1303 1348 1304 void bch2_fs_journal_exit(struct journal *j) 1349 1305 { 1350 - unsigned i; 1306 + if (j->wq) 1307 + destroy_workqueue(j->wq); 1351 1308 1352 1309 darray_exit(&j->early_journal_entries); 1353 1310 1354 - for (i = 0; i < ARRAY_SIZE(j->buf); i++) 1355 - kvpfree(j->buf[i].data, j->buf[i].buf_size); 1311 + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) 1312 + kvfree(j->buf[i].data); 1356 1313 free_fifo(&j->pin); 1357 1314 } 1358 1315 1359 1316 int bch2_fs_journal_init(struct journal *j) 1360 1317 { 1361 1318 static struct lock_class_key res_key; 1362 - unsigned i; 1363 1319 1364 1320 mutex_init(&j->buf_lock); 1365 1321 spin_lock_init(&j->lock); ··· 1380 1336 if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) 1381 1337 return 
-BCH_ERR_ENOMEM_journal_pin_fifo; 1382 1338 1383 - for (i = 0; i < ARRAY_SIZE(j->buf); i++) { 1339 + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { 1384 1340 j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; 1385 - j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); 1341 + j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); 1386 1342 if (!j->buf[i].data) 1387 1343 return -BCH_ERR_ENOMEM_journal_buf; 1344 + j->buf[i].idx = i; 1388 1345 } 1389 1346 1390 1347 j->pin.front = j->pin.back = 1; 1348 + 1349 + j->wq = alloc_workqueue("bcachefs_journal", 1350 + WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); 1351 + if (!j->wq) 1352 + return -BCH_ERR_ENOMEM_fs_other_alloc; 1391 1353 return 0; 1392 1354 } 1393 1355 ··· 1431 1381 prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); 1432 1382 prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) 1433 1383 ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); 1384 + prt_printf(out, "blocked:\t\t%u\n", j->blocked); 1434 1385 prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); 1435 1386 prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); 1436 1387 prt_printf(out, "current entry:\t\t"); ··· 1506 1455 { 1507 1456 struct journal_entry_pin_list *pin_list; 1508 1457 struct journal_entry_pin *pin; 1509 - unsigned i; 1510 1458 1511 1459 spin_lock(&j->lock); 1512 1460 *seq = max(*seq, j->pin.front); ··· 1523 1473 prt_newline(out); 1524 1474 printbuf_indent_add(out, 2); 1525 1475 1526 - for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) 1476 + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) 1527 1477 list_for_each_entry(pin, &pin_list->list[i], list) { 1528 1478 prt_printf(out, "\t%px %ps", pin, pin->flush); 1529 1479 prt_newline(out);
+4 -3
fs/bcachefs/journal.h
··· 264 264 } 265 265 266 266 bool bch2_journal_entry_close(struct journal *); 267 - void bch2_journal_buf_put_final(struct journal *, u64, bool); 267 + void bch2_journal_do_writes(struct journal *); 268 + void bch2_journal_buf_put_final(struct journal *, u64); 268 269 269 270 static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) 270 271 { ··· 273 272 274 273 s = journal_state_buf_put(j, idx); 275 274 if (!journal_state_count(s, idx)) 276 - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); 275 + bch2_journal_buf_put_final(j, seq); 277 276 } 278 277 279 278 static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) ··· 283 282 s = journal_state_buf_put(j, idx); 284 283 if (!journal_state_count(s, idx)) { 285 284 spin_lock(&j->lock); 286 - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); 285 + bch2_journal_buf_put_final(j, seq); 287 286 spin_unlock(&j->lock); 288 287 } 289 288 }
+227 -176
fs/bcachefs/journal_io.c
··· 17 17 #include "sb-clean.h" 18 18 #include "trace.h" 19 19 20 + void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 21 + struct journal_replay *j) 22 + { 23 + darray_for_each(j->ptrs, i) { 24 + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); 25 + u64 offset; 26 + 27 + div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); 28 + 29 + if (i != j->ptrs.data) 30 + prt_printf(out, " "); 31 + prt_printf(out, "%u:%u:%u (sector %llu)", 32 + i->dev, i->bucket, i->bucket_offset, i->sector); 33 + } 34 + } 35 + 36 + static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 37 + struct journal_replay *j) 38 + { 39 + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 40 + 41 + bch2_journal_ptrs_to_text(out, c, j); 42 + 43 + for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { 44 + struct jset_entry_datetime *datetime = 45 + container_of(entry, struct jset_entry_datetime, entry); 46 + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 47 + break; 48 + } 49 + } 50 + 20 51 static struct nonce journal_nonce(const struct jset *jset) 21 52 { 22 53 return (struct nonce) {{ ··· 83 52 84 53 BUG_ON(*p != i); 85 54 *p = NULL; 86 - kvpfree(i, offsetof(struct journal_replay, j) + 87 - vstruct_bytes(&i->j)); 55 + kvfree(i); 88 56 } 89 57 90 - static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) 58 + static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) 91 59 { 92 - i->ignore = true; 60 + if (blacklisted) 61 + i->ignore_blacklisted = true; 62 + else 63 + i->ignore_not_dirty = true; 93 64 94 65 if (!c->opts.read_entire_journal) 95 66 __journal_replay_free(c, i); ··· 117 84 { 118 85 struct genradix_iter iter; 119 86 struct journal_replay **_i, *i, *dup; 120 - struct journal_ptr *ptr; 121 87 size_t bytes = vstruct_bytes(j); 122 88 u64 last_seq = !JSET_NO_FLUSH(j) ? 
le64_to_cpu(j->last_seq) : 0; 89 + struct printbuf buf = PRINTBUF; 123 90 int ret = JOURNAL_ENTRY_ADD_OK; 124 91 125 92 /* Is this entry older than the range we need? */ ··· 141 108 journal_entry_radix_idx(c, jlist->last_seq)) { 142 109 i = *_i; 143 110 144 - if (!i || i->ignore) 111 + if (journal_replay_ignore(i)) 145 112 continue; 146 113 147 114 if (le64_to_cpu(i->j.seq) >= last_seq) 148 115 break; 149 - journal_replay_free(c, i); 116 + 117 + journal_replay_free(c, i, false); 150 118 } 151 119 } 152 120 ··· 165 131 */ 166 132 dup = *_i; 167 133 if (dup) { 168 - if (bytes == vstruct_bytes(&dup->j) && 169 - !memcmp(j, &dup->j, bytes)) { 170 - i = dup; 171 - goto found; 172 - } 134 + bool identical = bytes == vstruct_bytes(&dup->j) && 135 + !memcmp(j, &dup->j, bytes); 136 + bool not_identical = !identical && 137 + entry_ptr.csum_good && 138 + dup->csum_good; 173 139 174 - if (!entry_ptr.csum_good) { 175 - i = dup; 176 - goto found; 177 - } 140 + bool same_device = false; 141 + darray_for_each(dup->ptrs, ptr) 142 + if (ptr->dev == ca->dev_idx) 143 + same_device = true; 178 144 179 - if (!dup->csum_good) 145 + ret = darray_push(&dup->ptrs, entry_ptr); 146 + if (ret) 147 + goto out; 148 + 149 + bch2_journal_replay_to_text(&buf, c, dup); 150 + 151 + fsck_err_on(same_device, 152 + c, journal_entry_dup_same_device, 153 + "duplicate journal entry on same device\n %s", 154 + buf.buf); 155 + 156 + fsck_err_on(not_identical, 157 + c, journal_entry_replicas_data_mismatch, 158 + "found duplicate but non identical journal entries\n %s", 159 + buf.buf); 160 + 161 + if (entry_ptr.csum_good && !identical) 180 162 goto replace; 181 163 182 - fsck_err(c, journal_entry_replicas_data_mismatch, 183 - "found duplicate but non identical journal entries (seq %llu)", 184 - le64_to_cpu(j->seq)); 185 - i = dup; 186 - goto found; 164 + goto out; 187 165 } 188 166 replace: 189 - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 167 + i = kvmalloc(offsetof(struct 
journal_replay, j) + bytes, GFP_KERNEL); 190 168 if (!i) 191 169 return -BCH_ERR_ENOMEM_journal_entry_add; 192 170 193 - i->nr_ptrs = 0; 194 - i->csum_good = entry_ptr.csum_good; 195 - i->ignore = false; 171 + darray_init(&i->ptrs); 172 + i->csum_good = entry_ptr.csum_good; 173 + i->ignore_blacklisted = false; 174 + i->ignore_not_dirty = false; 196 175 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 197 - i->ptrs[i->nr_ptrs++] = entry_ptr; 198 176 199 177 if (dup) { 200 - if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { 201 - bch_err(c, "found too many copies of journal entry %llu", 202 - le64_to_cpu(i->j.seq)); 203 - dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; 204 - } 205 - 206 178 /* The first ptr should represent the jset we kept: */ 207 - memcpy(i->ptrs + i->nr_ptrs, 208 - dup->ptrs, 209 - sizeof(dup->ptrs[0]) * dup->nr_ptrs); 210 - i->nr_ptrs += dup->nr_ptrs; 179 + darray_for_each(dup->ptrs, ptr) 180 + darray_push(&i->ptrs, *ptr); 211 181 __journal_replay_free(c, dup); 182 + } else { 183 + darray_push(&i->ptrs, entry_ptr); 212 184 } 213 185 214 186 *_i = i; 215 - return 0; 216 - found: 217 - for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { 218 - if (ptr->dev == ca->dev_idx) { 219 - bch_err(c, "duplicate journal entry %llu on same device", 220 - le64_to_cpu(i->j.seq)); 221 - goto out; 222 - } 223 - } 224 - 225 - if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { 226 - bch_err(c, "found too many copies of journal entry %llu", 227 - le64_to_cpu(i->j.seq)); 228 - goto out; 229 - } 230 - 231 - i->ptrs[i->nr_ptrs++] = entry_ptr; 232 187 out: 233 188 fsck_err: 189 + printbuf_exit(&buf); 234 190 return ret; 235 191 } 236 192 ··· 398 374 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 399 375 struct jset_entry *entry) 400 376 { 401 - struct bkey_i *k; 402 377 bool first = true; 403 378 404 379 jset_entry_for_each_key(entry, k) { ··· 764 741 journal_entry_btree_keys_to_text(out, c, entry); 765 742 } 766 743 744 + static 
int journal_entry_datetime_validate(struct bch_fs *c, 745 + struct jset *jset, 746 + struct jset_entry *entry, 747 + unsigned version, int big_endian, 748 + enum bkey_invalid_flags flags) 749 + { 750 + unsigned bytes = vstruct_bytes(entry); 751 + unsigned expected = 16; 752 + int ret = 0; 753 + 754 + if (journal_entry_err_on(vstruct_bytes(entry) < expected, 755 + c, version, jset, entry, 756 + journal_entry_dev_usage_bad_size, 757 + "bad size (%u < %u)", 758 + bytes, expected)) { 759 + journal_entry_null_range(entry, vstruct_next(entry)); 760 + return ret; 761 + } 762 + fsck_err: 763 + return ret; 764 + } 765 + 766 + static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 767 + struct jset_entry *entry) 768 + { 769 + struct jset_entry_datetime *datetime = 770 + container_of(entry, struct jset_entry_datetime, entry); 771 + 772 + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 773 + } 774 + 767 775 struct jset_entry_ops { 768 776 int (*validate)(struct bch_fs *, struct jset *, 769 777 struct jset_entry *, unsigned, int, ··· 967 913 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 968 914 969 915 new_size = roundup_pow_of_two(new_size); 970 - n = kvpmalloc(new_size, GFP_KERNEL); 916 + n = kvmalloc(new_size, GFP_KERNEL); 971 917 if (!n) 972 918 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 973 919 974 - kvpfree(b->data, b->size); 920 + kvfree(b->data); 975 921 b->data = n; 976 922 b->size = new_size; 977 923 return 0; ··· 1156 1102 if (!r) 1157 1103 continue; 1158 1104 1159 - for (i = 0; i < r->nr_ptrs; i++) { 1160 - if (r->ptrs[i].dev == ca->dev_idx) { 1161 - unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + 1105 + darray_for_each(r->ptrs, i) 1106 + if (i->dev == ca->dev_idx) { 1107 + unsigned wrote = bucket_remainder(ca, i->sector) + 1162 1108 vstruct_sectors(&r->j, c->block_bits); 1163 1109 1164 - ja->cur_idx = r->ptrs[i].bucket; 1110 + ja->cur_idx = i->bucket; 1165 1111 ja->sectors_free = ca->mi.bucket_size - wrote; 
1166 1112 goto found; 1167 1113 } 1168 - } 1169 1114 } 1170 1115 found: 1171 1116 mutex_unlock(&jlist->lock); ··· 1197 1144 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1198 1145 out: 1199 1146 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1200 - kvpfree(buf.data, buf.size); 1147 + kvfree(buf.data); 1201 1148 percpu_ref_put(&ca->io_ref); 1202 1149 closure_return(cl); 1203 1150 return; ··· 1206 1153 jlist->ret = ret; 1207 1154 mutex_unlock(&jlist->lock); 1208 1155 goto out; 1209 - } 1210 - 1211 - void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 1212 - struct journal_replay *j) 1213 - { 1214 - unsigned i; 1215 - 1216 - for (i = 0; i < j->nr_ptrs; i++) { 1217 - struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); 1218 - u64 offset; 1219 - 1220 - div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); 1221 - 1222 - if (i) 1223 - prt_printf(out, " "); 1224 - prt_printf(out, "%u:%u:%u (sector %llu)", 1225 - j->ptrs[i].dev, 1226 - j->ptrs[i].bucket, 1227 - j->ptrs[i].bucket_offset, 1228 - j->ptrs[i].sector); 1229 - } 1230 1156 } 1231 1157 1232 1158 int bch2_journal_read(struct bch_fs *c, ··· 1260 1228 1261 1229 i = *_i; 1262 1230 1263 - if (!i || i->ignore) 1231 + if (journal_replay_ignore(i)) 1264 1232 continue; 1265 1233 1266 1234 if (!*start_seq) 1267 1235 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1268 1236 1269 1237 if (JSET_NO_FLUSH(&i->j)) { 1270 - i->ignore = true; 1238 + i->ignore_blacklisted = true; 1271 1239 continue; 1272 1240 } 1273 1241 1274 1242 if (!last_write_torn && !i->csum_good) { 1275 1243 last_write_torn = true; 1276 - i->ignore = true; 1244 + i->ignore_blacklisted = true; 1277 1245 continue; 1278 1246 } 1279 1247 ··· 1312 1280 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1313 1281 i = *_i; 1314 1282 1315 - if (!i || i->ignore) 1283 + if (journal_replay_ignore(i)) 1316 1284 continue; 1317 1285 1318 1286 seq = le64_to_cpu(i->j.seq); 1319 1287 if (seq < *last_seq) 
{ 1320 - journal_replay_free(c, i); 1288 + journal_replay_free(c, i, false); 1321 1289 continue; 1322 1290 } 1323 1291 ··· 1325 1293 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1326 1294 jset_seq_blacklisted, 1327 1295 "found blacklisted journal entry %llu", seq); 1328 - i->ignore = true; 1296 + i->ignore_blacklisted = true; 1329 1297 } 1330 1298 } 1331 1299 ··· 1334 1302 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1335 1303 i = *_i; 1336 1304 1337 - if (!i || i->ignore) 1305 + if (journal_replay_ignore(i)) 1338 1306 continue; 1339 1307 1340 1308 BUG_ON(seq > le64_to_cpu(i->j.seq)); ··· 1385 1353 .e.data_type = BCH_DATA_journal, 1386 1354 .e.nr_required = 1, 1387 1355 }; 1388 - unsigned ptr; 1389 1356 1390 1357 i = *_i; 1391 - if (!i || i->ignore) 1358 + if (journal_replay_ignore(i)) 1392 1359 continue; 1393 1360 1394 - for (ptr = 0; ptr < i->nr_ptrs; ptr++) { 1395 - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); 1361 + darray_for_each(i->ptrs, ptr) { 1362 + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); 1396 1363 1397 - if (!i->ptrs[ptr].csum_good) 1398 - bch_err_dev_offset(ca, i->ptrs[ptr].sector, 1364 + if (!ptr->csum_good) 1365 + bch_err_dev_offset(ca, ptr->sector, 1399 1366 "invalid journal checksum, seq %llu%s", 1400 1367 le64_to_cpu(i->j.seq), 1401 1368 i->csum_good ? 
" (had good copy on another device)" : ""); 1402 1369 } 1403 1370 1404 1371 ret = jset_validate(c, 1405 - bch_dev_bkey_exists(c, i->ptrs[0].dev), 1372 + bch_dev_bkey_exists(c, i->ptrs.data[0].dev), 1406 1373 &i->j, 1407 - i->ptrs[0].sector, 1374 + i->ptrs.data[0].sector, 1408 1375 READ); 1409 1376 if (ret) 1410 1377 goto err; 1411 1378 1412 - for (ptr = 0; ptr < i->nr_ptrs; ptr++) 1413 - replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; 1379 + darray_for_each(i->ptrs, ptr) 1380 + replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; 1414 1381 1415 1382 bch2_replicas_entry_sort(&replicas.e); 1416 1383 ··· 1578 1547 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1579 1548 return; 1580 1549 1581 - new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1550 + new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1582 1551 if (!new_buf) 1583 1552 return; 1584 1553 ··· 1589 1558 swap(buf->buf_size, new_size); 1590 1559 spin_unlock(&j->lock); 1591 1560 1592 - kvpfree(new_buf, new_size); 1561 + kvfree(new_buf); 1593 1562 } 1594 1563 1595 1564 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) ··· 1599 1568 1600 1569 static CLOSURE_CALLBACK(journal_write_done) 1601 1570 { 1602 - closure_type(j, struct journal, io); 1571 + closure_type(w, struct journal_buf, io); 1572 + struct journal *j = container_of(w, struct journal, buf[w->idx]); 1603 1573 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1604 - struct journal_buf *w = journal_last_unwritten_buf(j); 1605 1574 struct bch_replicas_padded replicas; 1606 1575 union journal_res_state old, new; 1607 - u64 v, seq; 1576 + u64 v, seq = le64_to_cpu(w->data->seq); 1608 1577 int err = 0; 1609 1578 1610 1579 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) ··· 1624 1593 if (err) 1625 1594 bch2_fatal_error(c); 1626 1595 1627 - spin_lock(&j->lock); 1628 - seq = le64_to_cpu(w->data->seq); 1596 + closure_debug_destroy(cl); 1629 1597 1598 + spin_lock(&j->lock); 1630 1599 if (seq >= 
j->pin.front) 1631 1600 journal_seq_pin(j, seq)->devs = w->devs_written; 1601 + if (err && (!j->err_seq || seq < j->err_seq)) 1602 + j->err_seq = seq; 1603 + w->write_done = true; 1632 1604 1633 - if (!err) { 1634 - if (!JSET_NO_FLUSH(w->data)) { 1605 + bool completed = false; 1606 + 1607 + for (seq = journal_last_unwritten_seq(j); 1608 + seq <= journal_cur_seq(j); 1609 + seq++) { 1610 + w = j->buf + (seq & JOURNAL_BUF_MASK); 1611 + if (!w->write_done) 1612 + break; 1613 + 1614 + if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { 1635 1615 j->flushed_seq_ondisk = seq; 1636 1616 j->last_seq_ondisk = w->last_seq; 1637 1617 1638 1618 bch2_do_discards(c); 1639 1619 closure_wake_up(&c->freelist_wait); 1640 - 1641 1620 bch2_reset_alloc_cursors(c); 1642 1621 } 1643 - } else if (!j->err_seq || seq < j->err_seq) 1644 - j->err_seq = seq; 1645 1622 1646 - j->seq_ondisk = seq; 1623 + j->seq_ondisk = seq; 1647 1624 1648 - /* 1649 - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1650 - * more buckets: 1651 - * 1652 - * Must come before signaling write completion, for 1653 - * bch2_fs_journal_stop(): 1654 - */ 1655 - if (j->watermark != BCH_WATERMARK_stripe) 1656 - journal_reclaim_kick(&c->journal); 1625 + /* 1626 + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1627 + * more buckets: 1628 + * 1629 + * Must come before signaling write completion, for 1630 + * bch2_fs_journal_stop(): 1631 + */ 1632 + if (j->watermark != BCH_WATERMARK_stripe) 1633 + journal_reclaim_kick(&c->journal); 1657 1634 1658 - /* also must come before signalling write completion: */ 1659 - closure_debug_destroy(cl); 1635 + v = atomic64_read(&j->reservations.counter); 1636 + do { 1637 + old.v = new.v = v; 1638 + BUG_ON(journal_state_count(new, new.unwritten_idx)); 1639 + BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); 1660 1640 1661 - v = atomic64_read(&j->reservations.counter); 1662 - do { 1663 - old.v = new.v = v; 1664 - BUG_ON(journal_state_count(new, 
new.unwritten_idx)); 1641 + new.unwritten_idx++; 1642 + } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); 1665 1643 1666 - new.unwritten_idx++; 1667 - } while ((v = atomic64_cmpxchg(&j->reservations.counter, 1668 - old.v, new.v)) != old.v); 1644 + closure_wake_up(&w->wait); 1645 + completed = true; 1646 + } 1669 1647 1670 - bch2_journal_reclaim_fast(j); 1671 - bch2_journal_space_available(j); 1648 + if (completed) { 1649 + bch2_journal_reclaim_fast(j); 1650 + bch2_journal_space_available(j); 1672 1651 1673 - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], 1674 - &j->max_in_flight_start, false); 1652 + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1675 1653 1676 - closure_wake_up(&w->wait); 1677 - journal_wake(j); 1654 + journal_wake(j); 1655 + } 1678 1656 1679 - if (!journal_state_count(new, new.unwritten_idx) && 1680 - journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { 1681 - spin_unlock(&j->lock); 1682 - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); 1683 - } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1657 + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1684 1658 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1685 1659 struct journal_buf *buf = journal_cur_buf(j); 1686 1660 long delta = buf->expires - jiffies; ··· 1695 1659 * previous entries still in flight - the current journal entry 1696 1660 * might want to be written now: 1697 1661 */ 1698 - 1699 - spin_unlock(&j->lock); 1700 - mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); 1701 - } else { 1702 - spin_unlock(&j->lock); 1662 + mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1703 1663 } 1664 + 1665 + spin_unlock(&j->lock); 1704 1666 } 1705 1667 1706 1668 static void journal_write_endio(struct bio *bio) 1707 1669 { 1708 - struct bch_dev *ca = bio->bi_private; 1670 + struct journal_bio *jbio = container_of(bio, struct journal_bio, 
bio); 1671 + struct bch_dev *ca = jbio->ca; 1709 1672 struct journal *j = &ca->fs->journal; 1710 - struct journal_buf *w = journal_last_unwritten_buf(j); 1711 - unsigned long flags; 1673 + struct journal_buf *w = j->buf + jbio->buf_idx; 1712 1674 1713 1675 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1714 1676 "error writing journal entry %llu: %s", 1715 1677 le64_to_cpu(w->data->seq), 1716 1678 bch2_blk_status_to_str(bio->bi_status)) || 1717 1679 bch2_meta_write_fault("journal")) { 1680 + unsigned long flags; 1681 + 1718 1682 spin_lock_irqsave(&j->err_lock, flags); 1719 1683 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1720 1684 spin_unlock_irqrestore(&j->err_lock, flags); 1721 1685 } 1722 1686 1723 - closure_put(&j->io); 1687 + closure_put(&w->io); 1724 1688 percpu_ref_put(&ca->io_ref); 1725 1689 } 1726 1690 1727 1691 static CLOSURE_CALLBACK(do_journal_write) 1728 1692 { 1729 - closure_type(j, struct journal, io); 1693 + closure_type(w, struct journal_buf, io); 1694 + struct journal *j = container_of(w, struct journal, buf[w->idx]); 1730 1695 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1731 - struct bch_dev *ca; 1732 - struct journal_buf *w = journal_last_unwritten_buf(j); 1733 - struct bio *bio; 1734 1696 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1735 1697 1736 1698 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1737 - ca = bch_dev_bkey_exists(c, ptr->dev); 1699 + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); 1700 + struct journal_device *ja = &ca->journal; 1701 + 1738 1702 if (!percpu_ref_tryget(&ca->io_ref)) { 1739 1703 /* XXX: fix this */ 1740 1704 bch_err(c, "missing device for journal write\n"); ··· 1744 1708 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1745 1709 sectors); 1746 1710 1747 - bio = ca->journal.bio; 1711 + struct bio *bio = &ja->bio[w->idx]->bio; 1748 1712 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1749 1713 
bio->bi_iter.bi_sector = ptr->offset; 1750 1714 bio->bi_end_io = journal_write_endio; ··· 1763 1727 trace_and_count(c, journal_write, bio); 1764 1728 closure_bio_submit(bio, cl); 1765 1729 1766 - ca->journal.bucket_seq[ca->journal.cur_idx] = 1767 - le64_to_cpu(w->data->seq); 1730 + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1768 1731 } 1769 1732 1770 - continue_at(cl, journal_write_done, c->io_complete_wq); 1733 + continue_at(cl, journal_write_done, j->wq); 1771 1734 } 1772 1735 1773 1736 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) ··· 1817 1782 if (!wb.wb) 1818 1783 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1819 1784 1820 - struct bkey_i *k; 1821 1785 jset_entry_for_each_key(i, k) { 1822 1786 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1823 1787 if (ret) { ··· 1832 1798 1833 1799 if (wb.wb) 1834 1800 bch2_journal_keys_to_write_buffer_end(c, &wb); 1801 + 1802 + spin_lock(&c->journal.lock); 1835 1803 w->need_flush_to_write_buffer = false; 1804 + spin_unlock(&c->journal.lock); 1836 1805 1837 1806 start = end = vstruct_last(jset); 1838 1807 1839 1808 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1809 + 1810 + struct jset_entry_datetime *d = 1811 + container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); 1812 + d->entry.type = BCH_JSET_ENTRY_datetime; 1813 + d->seconds = cpu_to_le64(ktime_get_real_seconds()); 1840 1814 1841 1815 bch2_journal_super_entries_add_common(c, &end, seq); 1842 1816 u64s = (u64 *) end - (u64 *) start; ··· 1935 1893 1936 1894 j->nr_noflush_writes++; 1937 1895 } else { 1896 + w->must_flush = true; 1938 1897 j->last_flush_write = jiffies; 1939 1898 j->nr_flush_writes++; 1940 1899 clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); ··· 1946 1903 1947 1904 CLOSURE_CALLBACK(bch2_journal_write) 1948 1905 { 1949 - closure_type(j, struct journal, io); 1906 + closure_type(w, struct journal_buf, io); 1907 + struct journal *j = container_of(w, 
struct journal, buf[w->idx]); 1950 1908 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1951 - struct journal_buf *w = journal_last_unwritten_buf(j); 1952 1909 struct bch_replicas_padded replicas; 1953 - struct bio *bio; 1954 1910 struct printbuf journal_debug_buf = PRINTBUF; 1955 1911 unsigned nr_rw_members = 0; 1956 1912 int ret; 1957 1913 1914 + for_each_rw_member(c, ca) 1915 + nr_rw_members++; 1916 + 1958 1917 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1918 + BUG_ON(!w->write_started); 1919 + BUG_ON(w->write_allocated); 1920 + BUG_ON(w->write_done); 1959 1921 1960 1922 j->write_start_time = local_clock(); 1961 1923 1962 1924 spin_lock(&j->lock); 1925 + if (nr_rw_members > 1) 1926 + w->separate_flush = true; 1927 + 1963 1928 ret = bch2_journal_write_pick_flush(j, w); 1964 1929 spin_unlock(&j->lock); 1965 1930 if (ret) ··· 2007 1956 * bch2_journal_space_available(): 2008 1957 */ 2009 1958 w->sectors = 0; 1959 + w->write_allocated = true; 2010 1960 2011 1961 /* 2012 1962 * journal entry has been compacted and allocated, recalculate space 2013 1963 * available: 2014 1964 */ 2015 1965 bch2_journal_space_available(j); 1966 + bch2_journal_do_writes(j); 2016 1967 spin_unlock(&j->lock); 2017 1968 2018 1969 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 2019 1970 2020 1971 if (c->opts.nochanges) 2021 1972 goto no_io; 2022 - 2023 - for_each_rw_member(c, ca) 2024 - nr_rw_members++; 2025 - 2026 - if (nr_rw_members > 1) 2027 - w->separate_flush = true; 2028 1973 2029 1974 /* 2030 1975 * Mark journal replicas before we submit the write to guarantee ··· 2032 1985 if (ret) 2033 1986 goto err; 2034 1987 1988 + if (!JSET_NO_FLUSH(w->data)) 1989 + closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq)); 1990 + 2035 1991 if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { 2036 1992 for_each_rw_member(c, ca) { 2037 1993 percpu_ref_get(&ca->io_ref); 2038 1994 2039 - bio = ca->journal.bio; 1995 + struct journal_device *ja = &ca->journal; 
1996 + struct bio *bio = &ja->bio[w->idx]->bio; 2040 1997 bio_reset(bio, ca->disk_sb.bdev, 2041 - REQ_OP_WRITE|REQ_PREFLUSH); 1998 + REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 2042 1999 bio->bi_end_io = journal_write_endio; 2043 2000 bio->bi_private = ca; 2044 2001 closure_bio_submit(bio, cl); 2045 2002 } 2046 2003 } 2047 2004 2048 - continue_at(cl, do_journal_write, c->io_complete_wq); 2005 + continue_at(cl, do_journal_write, j->wq); 2049 2006 return; 2050 2007 no_io: 2051 - continue_at(cl, journal_write_done, c->io_complete_wq); 2008 + continue_at(cl, journal_write_done, j->wq); 2052 2009 return; 2053 2010 err: 2054 2011 bch2_fatal_error(c); 2055 - continue_at(cl, journal_write_done, c->io_complete_wq); 2012 + continue_at(cl, journal_write_done, j->wq); 2056 2013 }
+36 -11
fs/bcachefs/journal_io.h
··· 2 2 #ifndef _BCACHEFS_JOURNAL_IO_H 3 3 #define _BCACHEFS_JOURNAL_IO_H 4 4 5 + #include "darray.h" 6 + 7 + struct journal_ptr { 8 + bool csum_good; 9 + u8 dev; 10 + u32 bucket; 11 + u32 bucket_offset; 12 + u64 sector; 13 + }; 14 + 5 15 /* 6 16 * Only used for holding the journal entries we read in btree_journal_read() 7 17 * during cache_registration 8 18 */ 9 19 struct journal_replay { 10 - struct journal_ptr { 11 - bool csum_good; 12 - u8 dev; 13 - u32 bucket; 14 - u32 bucket_offset; 15 - u64 sector; 16 - } ptrs[BCH_REPLICAS_MAX]; 17 - unsigned nr_ptrs; 20 + DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; 18 21 19 22 bool csum_good; 20 - bool ignore; 23 + bool ignore_blacklisted; 24 + bool ignore_not_dirty; 21 25 /* must be last: */ 22 26 struct jset j; 23 27 }; 28 + 29 + static inline bool journal_replay_ignore(struct journal_replay *i) 30 + { 31 + return !i || i->ignore_blacklisted || i->ignore_not_dirty; 32 + } 24 33 25 34 static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, 26 35 struct jset_entry *entry, unsigned type) ··· 45 36 } 46 37 47 38 #define for_each_jset_entry_type(entry, jset, type) \ 48 - for (entry = (jset)->start; \ 39 + for (struct jset_entry *entry = (jset)->start; \ 49 40 (entry = __jset_entry_type_next(jset, entry, type)); \ 50 41 entry = vstruct_next(entry)) 51 42 52 43 #define jset_entry_for_each_key(_e, _k) \ 53 - for (_k = (_e)->start; \ 44 + for (struct bkey_i *_k = (_e)->start; \ 54 45 _k < vstruct_last(_e); \ 55 46 _k = bkey_next(_k)) 56 47 ··· 70 61 int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); 71 62 72 63 CLOSURE_CALLBACK(bch2_journal_write); 64 + 65 + static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) 66 + { 67 + struct jset_entry *entry = *end; 68 + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); 69 + 70 + memset(entry, 0, u64s * sizeof(u64)); 71 + /* 72 + * The u64s field counts from the start of data, ignoring the shared 73 + * fields. 
74 + */ 75 + entry->u64s = cpu_to_le16(u64s - 1); 76 + 77 + *end = vstruct_next(*end); 78 + return entry; 79 + } 73 80 74 81 #endif /* _BCACHEFS_JOURNAL_IO_H */
+12 -17
fs/bcachefs/journal_reclaim.c
··· 62 62 ? BCH_WATERMARK_reclaim 63 63 : BCH_WATERMARK_stripe; 64 64 65 - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], 66 - &j->low_on_space_start, low_on_space) || 67 - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], 68 - &j->low_on_pin_start, low_on_pin) || 69 - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], 70 - &j->write_buffer_full_start, low_on_wb)) 65 + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || 66 + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || 67 + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) 71 68 trace_and_count(c, journal_full, c); 72 69 73 70 swap(watermark, j->watermark); ··· 391 394 struct journal_entry_pin *src, 392 395 journal_pin_flush_fn flush_fn) 393 396 { 394 - bool reclaim; 395 - 396 397 spin_lock(&j->lock); 397 398 398 399 u64 seq = READ_ONCE(src->seq); ··· 406 411 return; 407 412 } 408 413 409 - reclaim = __journal_pin_drop(j, dst); 414 + bool reclaim = __journal_pin_drop(j, dst); 410 415 411 416 bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); 412 417 413 418 if (reclaim) 414 419 bch2_journal_reclaim_fast(j); 415 - spin_unlock(&j->lock); 416 420 417 421 /* 418 422 * If the journal is currently full, we might want to call flush_fn 419 423 * immediately: 420 424 */ 421 - journal_wake(j); 425 + if (seq == journal_last_seq(j)) 426 + journal_wake(j); 427 + spin_unlock(&j->lock); 422 428 } 423 429 424 430 void bch2_journal_pin_set(struct journal *j, u64 seq, 425 431 struct journal_entry_pin *pin, 426 432 journal_pin_flush_fn flush_fn) 427 433 { 428 - bool reclaim; 429 - 430 434 spin_lock(&j->lock); 431 435 432 436 BUG_ON(seq < journal_last_seq(j)); 433 437 434 - reclaim = __journal_pin_drop(j, pin); 438 + bool reclaim = __journal_pin_drop(j, pin); 435 439 436 440 bch2_journal_pin_set_locked(j, seq, pin, flush_fn, 
journal_pin_type(flush_fn)); 437 441 438 442 if (reclaim) 439 443 bch2_journal_reclaim_fast(j); 440 - spin_unlock(&j->lock); 441 - 442 444 /* 443 445 * If the journal is currently full, we might want to call flush_fn 444 446 * immediately: 445 447 */ 446 - journal_wake(j); 448 + if (seq == journal_last_seq(j)) 449 + journal_wake(j); 450 + 451 + spin_unlock(&j->lock); 447 452 } 448 453 449 454 /**
+21 -46
fs/bcachefs/journal_seq_blacklist.c
··· 43 43 return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); 44 44 } 45 45 46 - static struct bch_sb_field_journal_seq_blacklist * 47 - blacklist_entry_try_merge(struct bch_fs *c, 48 - struct bch_sb_field_journal_seq_blacklist *bl, 49 - unsigned i) 50 - { 51 - unsigned nr = blacklist_nr_entries(bl); 52 - 53 - if (le64_to_cpu(bl->start[i].end) >= 54 - le64_to_cpu(bl->start[i + 1].start)) { 55 - bl->start[i].end = bl->start[i + 1].end; 56 - --nr; 57 - memmove(&bl->start[i], 58 - &bl->start[i + 1], 59 - sizeof(bl->start[0]) * (nr - i)); 60 - 61 - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, 62 - sb_blacklist_u64s(nr)); 63 - BUG_ON(!bl); 64 - } 65 - 66 - return bl; 67 - } 68 - 69 - static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, 70 - u64 start, u64 end) 71 - { 72 - return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); 73 - } 74 - 75 46 int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) 76 47 { 77 48 struct bch_sb_field_journal_seq_blacklist *bl; 78 - unsigned i, nr; 49 + unsigned i = 0, nr; 79 50 int ret = 0; 80 51 81 52 mutex_lock(&c->sb_lock); 82 53 bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); 83 54 nr = blacklist_nr_entries(bl); 84 55 85 - for (i = 0; i < nr; i++) { 56 + while (i < nr) { 86 57 struct journal_seq_blacklist_entry *e = 87 58 bl->start + i; 88 59 89 - if (bl_entry_contig_or_overlaps(e, start, end)) { 90 - e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); 91 - e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); 60 + if (end < le64_to_cpu(e->start)) 61 + break; 92 62 93 - if (i + 1 < nr) 94 - bl = blacklist_entry_try_merge(c, 95 - bl, i); 96 - if (i) 97 - bl = blacklist_entry_try_merge(c, 98 - bl, i - 1); 99 - goto out_write_sb; 63 + if (start > le64_to_cpu(e->end)) { 64 + i++; 65 + continue; 100 66 } 67 + 68 + /* 69 + * Entry is contiguous or overlapping with new entry: merge it 70 + * with new entry, and delete: 71 + */ 72 + 73 + 
start = min(start, le64_to_cpu(e->start)); 74 + end = max(end, le64_to_cpu(e->end)); 75 + array_remove_item(bl->start, nr, i); 101 76 } 102 77 103 78 bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, ··· 82 107 goto out; 83 108 } 84 109 85 - bl->start[nr].start = cpu_to_le64(start); 86 - bl->start[nr].end = cpu_to_le64(end); 87 - out_write_sb: 110 + array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { 111 + .start = cpu_to_le64(start), 112 + .end = cpu_to_le64(end), 113 + })); 88 114 c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); 89 115 90 116 ret = bch2_write_super(c); ··· 141 165 if (!bl) 142 166 return 0; 143 167 144 - t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, 145 - GFP_KERNEL); 168 + t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); 146 169 if (!t) 147 170 return -BCH_ERR_ENOMEM_blacklist_table_init; 148 171
+19 -11
fs/bcachefs/journal_types.h
··· 18 18 * the journal that are being staged or in flight. 19 19 */ 20 20 struct journal_buf { 21 + struct closure io; 21 22 struct jset *data; 22 23 23 24 __BKEY_PADDED(key, BCH_REPLICAS_MAX); ··· 34 33 unsigned disk_sectors; /* maximum size entry could have been, if 35 34 buf_size was bigger */ 36 35 unsigned u64s_reserved; 37 - bool noflush; /* write has already been kicked off, and was noflush */ 38 - bool must_flush; /* something wants a flush */ 39 - bool separate_flush; 40 - bool need_flush_to_write_buffer; 36 + bool noflush:1; /* write has already been kicked off, and was noflush */ 37 + bool must_flush:1; /* something wants a flush */ 38 + bool separate_flush:1; 39 + bool need_flush_to_write_buffer:1; 40 + bool write_started:1; 41 + bool write_allocated:1; 42 + bool write_done:1; 43 + u8 idx; 41 44 }; 42 45 43 46 /* ··· 139 134 /* Reasons we may fail to get a journal reservation: */ 140 135 #define JOURNAL_ERRORS() \ 141 136 x(ok) \ 137 + x(retry) \ 142 138 x(blocked) \ 143 139 x(max_in_flight) \ 144 140 x(journal_full) \ ··· 154 148 }; 155 149 156 150 typedef DARRAY(u64) darray_u64; 151 + 152 + struct journal_bio { 153 + struct bch_dev *ca; 154 + unsigned buf_idx; 155 + 156 + struct bio bio; 157 + }; 157 158 158 159 /* Embedded in struct bch_fs */ 159 160 struct journal { ··· 216 203 wait_queue_head_t wait; 217 204 struct closure_waitlist async_wait; 218 205 219 - struct closure io; 220 206 struct delayed_work write_work; 207 + struct workqueue_struct *wq; 221 208 222 209 /* Sequence number of most recent journal entry (last entry in @pin) */ 223 210 atomic64_t seq; ··· 287 274 u64 nr_noflush_writes; 288 275 u64 entry_bytes_written; 289 276 290 - u64 low_on_space_start; 291 - u64 low_on_pin_start; 292 - u64 max_in_flight_start; 293 - u64 write_buffer_full_start; 294 - 295 277 struct bch2_time_stats *flush_write_time; 296 278 struct bch2_time_stats *noflush_write_time; 297 279 struct bch2_time_stats *flush_seq_time; ··· 321 313 u64 *buckets; 322 314 323 
315 /* Bio for journal reads/writes to this device */ 324 - struct bio *bio; 316 + struct journal_bio *bio[JOURNAL_BUF_NR]; 325 317 326 318 /* for bch_journal_read_device */ 327 319 struct closure read;
+3 -4
fs/bcachefs/lru.c
··· 44 44 u64 dev_bucket, u64 time, bool set) 45 45 { 46 46 return time 47 - ? bch2_btree_bit_mod(trans, BTREE_ID_lru, 48 - lru_pos(lru_id, dev_bucket, time), set) 47 + ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, 48 + lru_pos(lru_id, dev_bucket, time), set) 49 49 : 0; 50 50 } 51 51 ··· 125 125 goto out; 126 126 } 127 127 128 - if (c->opts.reconstruct_alloc || 129 - fsck_err(c, lru_entry_bad, 128 + if (fsck_err(c, lru_entry_bad, 130 129 "incorrect lru entry: lru %s time %llu\n" 131 130 " %s\n" 132 131 " for %s",
+18 -10
fs/bcachefs/mean_and_variance.c
··· 103 103 * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() 104 104 * @s: mean and variance number of samples and their sums 105 105 * @x: new value to include in the &mean_and_variance_weighted 106 + * @initted: caller must track whether this is the first use or not 107 + * @weight: ewma weight 106 108 * 107 109 * see linked pdf: function derived from equations 140-143 where alpha = 2^w. 108 110 * values are stored bitshifted for performance and added precision. 109 111 */ 110 - void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) 112 + void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, 113 + s64 x, bool initted, u8 weight) 111 114 { 112 115 // previous weighted variance. 113 - u8 w = s->weight; 116 + u8 w = weight; 114 117 u64 var_w0 = s->variance; 115 118 // new value weighted. 116 119 s64 x_w = x << w; ··· 122 119 // new mean weighted. 123 120 s64 u_w1 = s->mean + diff; 124 121 125 - if (!s->init) { 122 + if (!initted) { 126 123 s->mean = x_w; 127 124 s->variance = 0; 128 125 } else { 129 126 s->mean = u_w1; 130 127 s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; 131 128 } 132 - s->init = true; 133 129 } 134 130 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); 135 131 136 132 /** 137 133 * mean_and_variance_weighted_get_mean() - get mean from @s 138 134 * @s: mean and variance number of samples and their sums 135 + * @weight: ewma weight 139 136 */ 140 - s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) 137 + s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, 138 + u8 weight) 141 139 { 142 - return fast_divpow2(s.mean, s.weight); 140 + return fast_divpow2(s.mean, weight); 143 141 } 144 142 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); 145 143 146 144 /** 147 145 * mean_and_variance_weighted_get_variance() -- get variance from @s 148 146 * @s: mean and 
variance number of samples and their sums 147 + * @weight: ewma weight 149 148 */ 150 - u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) 149 + u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, 150 + u8 weight) 151 151 { 152 152 // always positive don't need fast divpow2 153 - return s.variance >> s.weight; 153 + return s.variance >> weight; 154 154 } 155 155 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); 156 156 157 157 /** 158 158 * mean_and_variance_weighted_get_stddev() - get standard deviation from @s 159 159 * @s: mean and variance number of samples and their sums 160 + * @weight: ewma weight 160 161 */ 161 - u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) 162 + u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, 163 + u8 weight) 162 164 { 163 - return int_sqrt64(mean_and_variance_weighted_get_variance(s)); 165 + return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); 164 166 } 165 167 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); 166 168
+8 -6
fs/bcachefs/mean_and_variance.h
··· 154 154 155 155 /* expontentially weighted variant */ 156 156 struct mean_and_variance_weighted { 157 - bool init; 158 - u8 weight; /* base 2 logarithim */ 159 157 s64 mean; 160 158 u64 variance; 161 159 }; ··· 190 192 u64 mean_and_variance_get_variance(struct mean_and_variance s1); 191 193 u32 mean_and_variance_get_stddev(struct mean_and_variance s); 192 194 193 - void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); 195 + void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, 196 + s64 v, bool initted, u8 weight); 194 197 195 - s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); 196 - u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); 197 - u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); 198 + s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, 199 + u8 weight); 200 + u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, 201 + u8 weight); 202 + u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, 203 + u8 weight); 198 204 199 205 #endif // MEAN_AND_VAIRANCE_H_
+43 -37
fs/bcachefs/mean_and_variance_test.c
··· 31 31 32 32 static void mean_and_variance_weighted_test(struct kunit *test) 33 33 { 34 - struct mean_and_variance_weighted s = { .weight = 2 }; 34 + struct mean_and_variance_weighted s = { }; 35 35 36 - mean_and_variance_weighted_update(&s, 10); 37 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); 38 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); 36 + mean_and_variance_weighted_update(&s, 10, false, 2); 37 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10); 38 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); 39 39 40 - mean_and_variance_weighted_update(&s, 20); 41 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); 42 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); 40 + mean_and_variance_weighted_update(&s, 20, true, 2); 41 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12); 42 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); 43 43 44 - mean_and_variance_weighted_update(&s, 30); 45 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); 46 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); 44 + mean_and_variance_weighted_update(&s, 30, true, 2); 45 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16); 46 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); 47 47 48 - s = (struct mean_and_variance_weighted) { .weight = 2 }; 48 + s = (struct mean_and_variance_weighted) { }; 49 49 50 - mean_and_variance_weighted_update(&s, -10); 51 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); 52 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); 50 + mean_and_variance_weighted_update(&s, -10, false, 2); 51 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10); 52 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); 53 53 54 - 
mean_and_variance_weighted_update(&s, -20); 55 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); 56 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); 54 + mean_and_variance_weighted_update(&s, -20, true, 2); 55 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12); 56 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); 57 57 58 - mean_and_variance_weighted_update(&s, -30); 59 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); 60 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); 58 + mean_and_variance_weighted_update(&s, -30, true, 2); 59 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16); 60 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); 61 61 } 62 62 63 63 static void mean_and_variance_weighted_advanced_test(struct kunit *test) 64 64 { 65 - struct mean_and_variance_weighted s = { .weight = 8 }; 65 + struct mean_and_variance_weighted s = { }; 66 + bool initted = false; 66 67 s64 i; 67 68 68 - for (i = 10; i <= 100; i += 10) 69 - mean_and_variance_weighted_update(&s, i); 69 + for (i = 10; i <= 100; i += 10) { 70 + mean_and_variance_weighted_update(&s, i, initted, 8); 71 + initted = true; 72 + } 70 73 71 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); 72 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); 74 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11); 75 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); 73 76 74 - s = (struct mean_and_variance_weighted) { .weight = 8 }; 77 + s = (struct mean_and_variance_weighted) { }; 78 + initted = false; 75 79 76 - for (i = -10; i >= -100; i -= 10) 77 - mean_and_variance_weighted_update(&s, i); 80 + for (i = -10; i >= -100; i -= 10) { 81 + mean_and_variance_weighted_update(&s, i, initted, 8); 82 + initted = true; 83 + } 78 84 79 - KUNIT_EXPECT_EQ(test, 
mean_and_variance_weighted_get_mean(s), -11); 80 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); 85 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11); 86 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); 81 87 } 82 88 83 89 static void do_mean_and_variance_test(struct kunit *test, ··· 98 92 s64 *weighted_stddev) 99 93 { 100 94 struct mean_and_variance mv = {}; 101 - struct mean_and_variance_weighted vw = { .weight = weight }; 95 + struct mean_and_variance_weighted vw = { }; 102 96 103 97 for (unsigned i = 0; i < initial_n; i++) { 104 98 mean_and_variance_update(&mv, initial_value); 105 - mean_and_variance_weighted_update(&vw, initial_value); 99 + mean_and_variance_weighted_update(&vw, initial_value, false, weight); 106 100 107 101 KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); 108 102 KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); 109 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); 110 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0); 103 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value); 104 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0); 111 105 } 112 106 113 107 for (unsigned i = 0; i < n; i++) { 114 108 mean_and_variance_update(&mv, data[i]); 115 - mean_and_variance_weighted_update(&vw, data[i]); 109 + mean_and_variance_weighted_update(&vw, data[i], true, weight); 116 110 117 111 KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); 118 112 KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); 119 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); 120 - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); 113 + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]); 114 + KUNIT_EXPECT_EQ(test, 
mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]); 121 115 } 122 116 123 117 KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
+3 -5
fs/bcachefs/migrate.c
··· 31 31 nr_good = bch2_bkey_durability(c, k.s_c); 32 32 if ((!nr_good && !(flags & lost)) || 33 33 (nr_good < replicas && !(flags & degraded))) 34 - return -EINVAL; 34 + return -BCH_ERR_remove_would_lose_data; 35 35 36 36 return 0; 37 37 } ··· 111 111 112 112 /* don't handle this yet: */ 113 113 if (flags & BCH_FORCE_IF_METADATA_LOST) 114 - return -EINVAL; 114 + return -BCH_ERR_remove_with_metadata_missing_unimplemented; 115 115 116 116 trans = bch2_trans_get(c); 117 117 bch2_bkey_buf_init(&k); ··· 132 132 133 133 ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), 134 134 dev_idx, flags, true); 135 - if (ret) { 136 - bch_err(c, "Cannot drop device without losing data"); 135 + if (ret) 137 136 break; 138 - } 139 137 140 138 ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); 141 139 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+4 -4
fs/bcachefs/opts.c
··· 314 314 if (ret < 0 || (*res != 0 && *res != 1)) { 315 315 if (err) 316 316 prt_printf(err, "%s: must be bool", opt->attr.name); 317 - return ret; 317 + return ret < 0 ? ret : -BCH_ERR_option_not_bool; 318 318 } 319 319 break; 320 320 case BCH_OPT_UINT: ··· 456 456 457 457 copied_opts = kstrdup(options, GFP_KERNEL); 458 458 if (!copied_opts) 459 - return -1; 459 + return -ENOMEM; 460 460 copied_opts_start = copied_opts; 461 461 462 462 while ((opt = strsep(&copied_opts, ",")) != NULL) { ··· 501 501 502 502 bad_opt: 503 503 pr_err("Bad mount option %s", name); 504 - ret = -1; 504 + ret = -BCH_ERR_option_name; 505 505 goto out; 506 506 bad_val: 507 507 pr_err("Invalid mount option %s", err.buf); 508 - ret = -1; 508 + ret = -BCH_ERR_option_value; 509 509 goto out; 510 510 out: 511 511 kfree(copied_opts_start);
+10
fs/bcachefs/opts.h
··· 290 290 OPT_BOOL(), \ 291 291 BCH2_NO_SB_OPT, false, \ 292 292 NULL, "Allow mounting in when data will be missing") \ 293 + x(no_splitbrain_check, u8, \ 294 + OPT_FS|OPT_MOUNT, \ 295 + OPT_BOOL(), \ 296 + BCH2_NO_SB_OPT, false, \ 297 + NULL, "Don't kick drives out when splitbrain detected")\ 293 298 x(discard, u8, \ 294 299 OPT_FS|OPT_MOUNT|OPT_DEVICE, \ 295 300 OPT_BOOL(), \ ··· 337 332 OPT_BOOL(), \ 338 333 BCH2_NO_SB_OPT, false, \ 339 334 NULL, "Run fsck on mount") \ 335 + x(fsck_memory_usage_percent, u8, \ 336 + OPT_FS|OPT_MOUNT, \ 337 + OPT_UINT(20, 70), \ 338 + BCH2_NO_SB_OPT, 50, \ 339 + NULL, "Maximum percentage of system ram fsck is allowed to pin")\ 340 340 x(fix_errors, u8, \ 341 341 OPT_FS|OPT_MOUNT, \ 342 342 OPT_FN(bch2_opt_fix_errors), \
+2 -2
fs/bcachefs/rebalance.c
··· 412 412 u64 now = atomic64_read(&c->io_clock[WRITE].now); 413 413 414 414 prt_str(out, "io wait duration: "); 415 - bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); 415 + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); 416 416 prt_newline(out); 417 417 418 418 prt_str(out, "io wait remaining: "); 419 - bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); 419 + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); 420 420 prt_newline(out); 421 421 422 422 prt_str(out, "duration waited: ");
+60 -28
fs/bcachefs/recovery.c
··· 52 52 } 53 53 54 54 /* for -o reconstruct_alloc: */ 55 - static void drop_alloc_keys(struct journal_keys *keys) 55 + static void do_reconstruct_alloc(struct bch_fs *c) 56 56 { 57 + bch2_journal_log_msg(c, "dropping alloc info"); 58 + bch_info(c, "dropping and reconstructing all alloc info"); 59 + 60 + mutex_lock(&c->sb_lock); 61 + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); 62 + 63 + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); 64 + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); 65 + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); 66 + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); 67 + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); 68 + 69 + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); 70 + __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); 71 + __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); 72 + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); 73 + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); 74 + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); 75 + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); 76 + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); 77 + __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); 78 + __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); 79 + __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); 80 + __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); 81 + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); 82 + __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, 
ext->errors_silent); 83 + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); 84 + 85 + bch2_write_super(c); 86 + mutex_unlock(&c->sb_lock); 87 + 88 + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); 89 + 90 + struct journal_keys *keys = &c->journal_keys; 57 91 size_t src, dst; 58 92 59 93 for (src = 0, dst = 0; src < keys->nr; src++) 60 - if (!btree_id_is_alloc(keys->d[src].btree_id)) 61 - keys->d[dst++] = keys->d[src]; 62 - 94 + if (!btree_id_is_alloc(keys->data[src].btree_id)) 95 + keys->data[dst++] = keys->data[src]; 63 96 keys->nr = dst; 64 97 } 65 98 ··· 103 70 */ 104 71 static void zero_out_btree_mem_ptr(struct journal_keys *keys) 105 72 { 106 - struct journal_key *i; 107 - 108 - for (i = keys->d; i < keys->d + keys->nr; i++) 73 + darray_for_each(*keys, i) 109 74 if (i->k->k.type == KEY_TYPE_btree_ptr_v2) 110 75 bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; 111 76 } ··· 155 124 if (ret) 156 125 goto out; 157 126 127 + struct btree_path *path = btree_iter_path(trans, &iter); 128 + if (unlikely(!btree_path_node(path, k->level))) { 129 + bch2_trans_iter_exit(trans, &iter); 130 + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, 131 + BTREE_MAX_DEPTH, 0, iter_flags); 132 + ret = bch2_btree_iter_traverse(&iter) ?: 133 + bch2_btree_increase_depth(trans, iter.path, 0) ?: 134 + -BCH_ERR_transaction_restart_nested; 135 + goto out; 136 + } 137 + 158 138 /* Must be checked with btree locked: */ 159 139 if (k->overwritten) 160 140 goto out; ··· 208 166 * efficient - better locality of btree access - but some might fail if 209 167 * that would cause a journal deadlock. 210 168 */ 211 - for (size_t i = 0; i < keys->nr; i++) { 169 + darray_for_each(*keys, k) { 212 170 cond_resched(); 213 - 214 - struct journal_key *k = keys->d + i; 215 171 216 172 /* Skip fastpath if we're low on space in the journal */ 217 173 ret = c->journal.watermark ? 
-1 : ··· 304 264 bkey_copy(&r->key, (struct bkey_i *) entry->start); 305 265 r->error = 0; 306 266 } else { 307 - r->error = -EIO; 267 + r->error = -BCH_ERR_btree_node_read_error; 308 268 } 309 269 r->alive = true; 310 270 break; ··· 399 359 genradix_for_each(&c->journal_entries, iter, _i) { 400 360 i = *_i; 401 361 402 - if (!i || i->ignore) 362 + if (journal_replay_ignore(i)) 403 363 continue; 404 364 405 365 vstruct_for_each(&i->j, entry) { ··· 428 388 if (!r->alive) 429 389 continue; 430 390 431 - if (btree_id_is_alloc(i) && 432 - c->opts.reconstruct_alloc) { 433 - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); 391 + if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) 434 392 continue; 435 - } 436 393 437 394 if (r->error) { 438 395 __fsck_err(c, ··· 561 524 * setting journal_key->overwritten: it will be accessed by multiple 562 525 * threads 563 526 */ 564 - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); 565 - keys->gap = keys->nr; 527 + move_gap(keys, keys->nr); 566 528 567 529 set_bit(BCH_FS_may_go_rw, &c->flags); 568 530 ··· 898 862 goto out; 899 863 900 864 genradix_for_each_reverse(&c->journal_entries, iter, i) 901 - if (*i && !(*i)->ignore) { 865 + if (!journal_replay_ignore(*i)) { 902 866 last_journal_entry = &(*i)->j; 903 867 break; 904 868 } ··· 923 887 genradix_for_each_reverse(&c->journal_entries, iter, i) 924 888 if (*i) { 925 889 last_journal_entry = &(*i)->j; 926 - (*i)->ignore = false; 890 + (*i)->ignore_blacklisted = false; 891 + (*i)->ignore_not_dirty= false; 927 892 /* 928 893 * This was probably a NO_FLUSH entry, 929 894 * so last_seq was garbage - but we know ··· 960 923 c->journal_replay_seq_start = last_seq; 961 924 c->journal_replay_seq_end = blacklist_seq - 1; 962 925 963 - if (c->opts.reconstruct_alloc) { 964 - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); 965 - drop_alloc_keys(&c->journal_keys); 966 - } 926 + if (c->opts.reconstruct_alloc) 927 + do_reconstruct_alloc(c); 967 928 968 929 
zero_out_btree_mem_ptr(&c->journal_keys); 969 930 ··· 985 950 bch2_journal_seq_blacklist_add(c, 986 951 blacklist_seq, journal_seq); 987 952 if (ret) { 988 - bch_err(c, "error creating new journal seq blacklist entry"); 953 + bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); 989 954 goto err; 990 955 } 991 956 } ··· 995 960 bch2_fs_journal_start(&c->journal, journal_seq); 996 961 if (ret) 997 962 goto err; 998 - 999 - if (c->opts.reconstruct_alloc) 1000 - bch2_journal_log_msg(c, "dropping alloc info"); 1001 963 1002 964 /* 1003 965 * Skip past versions that might have possibly been used (as nonces),
+2
fs/bcachefs/recovery_types.h
··· 34 34 x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ 35 35 x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ 36 36 x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ 37 + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ 37 38 x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ 38 39 x(fs_upgrade_for_subvolumes, 22, 0) \ 39 40 x(resume_logged_ops, 23, PASS_ALWAYS) \ ··· 44 43 x(check_dirents, 27, PASS_FSCK) \ 45 44 x(check_xattrs, 28, PASS_FSCK) \ 46 45 x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ 46 + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ 47 47 x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ 48 48 x(check_nlinks, 31, PASS_FSCK) \ 49 49 x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
-16
fs/bcachefs/sb-clean.c
··· 171 171 return ERR_PTR(ret); 172 172 } 173 173 174 - static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) 175 - { 176 - struct jset_entry *entry = *end; 177 - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); 178 - 179 - memset(entry, 0, u64s * sizeof(u64)); 180 - /* 181 - * The u64s field counts from the start of data, ignoring the shared 182 - * fields. 183 - */ 184 - entry->u64s = cpu_to_le16(u64s - 1); 185 - 186 - *end = vstruct_next(*end); 187 - return entry; 188 - } 189 - 190 174 void bch2_journal_super_entries_add_common(struct bch_fs *c, 191 175 struct jset_entry **end, 192 176 u64 journal_seq)
+8 -2
fs/bcachefs/sb-downgrade.c
··· 45 45 BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ 46 46 BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ 47 47 x(rebalance_work, \ 48 - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) 48 + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ 49 + x(subvolume_fs_parent, \ 50 + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ 51 + BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ 52 + x(btree_subvolume_children, \ 53 + BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ 54 + BCH_FSCK_ERR_subvol_children_not_set) 49 55 50 56 #define DOWNGRADE_TABLE() 51 57 ··· 259 253 if (e < BCH_SB_ERR_MAX) 260 254 __set_bit(e, c->sb.errors_silent); 261 255 if (e < sizeof(ext->errors_silent) * 8) 262 - ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64)); 256 + __set_bit_le64(e, ext->errors_silent); 263 257 } 264 258 } 265 259 }
+17 -2
fs/bcachefs/sb-errors_types.h
··· 231 231 x(dirent_name_dot_or_dotdot, 223) \ 232 232 x(dirent_name_has_slash, 224) \ 233 233 x(dirent_d_type_wrong, 225) \ 234 - x(dirent_d_parent_subvol_wrong, 226) \ 234 + x(inode_bi_parent_wrong, 226) \ 235 235 x(dirent_in_missing_dir_inode, 227) \ 236 236 x(dirent_in_non_dir_inode, 228) \ 237 237 x(dirent_to_missing_inode, 229) \ ··· 250 250 x(hash_table_key_duplicate, 242) \ 251 251 x(hash_table_key_wrong_offset, 243) \ 252 252 x(unlinked_inode_not_on_deleted_list, 244) \ 253 - x(reflink_p_front_pad_bad, 245) 253 + x(reflink_p_front_pad_bad, 245) \ 254 + x(journal_entry_dup_same_device, 246) \ 255 + x(inode_bi_subvol_missing, 247) \ 256 + x(inode_bi_subvol_wrong, 248) \ 257 + x(inode_points_to_missing_dirent, 249) \ 258 + x(inode_points_to_wrong_dirent, 250) \ 259 + x(inode_bi_parent_nonzero, 251) \ 260 + x(dirent_to_missing_parent_subvol, 252) \ 261 + x(dirent_not_visible_in_parent_subvol, 253) \ 262 + x(subvol_fs_path_parent_wrong, 254) \ 263 + x(subvol_root_fs_path_parent_nonzero, 255) \ 264 + x(subvol_children_not_set, 256) \ 265 + x(subvol_children_bad, 257) \ 266 + x(subvol_loop, 258) \ 267 + x(subvol_unreachable, 259) \ 268 + x(btree_node_bkey_bad_u64s, 260) 254 269 255 270 enum bch_sb_error_id { 256 271 #define x(t, n) BCH_FSCK_ERR_##t = n,
+5 -10
fs/bcachefs/str_hash.h
··· 259 259 } 260 260 261 261 static __always_inline 262 - int bch2_hash_set_snapshot(struct btree_trans *trans, 262 + int bch2_hash_set_in_snapshot(struct btree_trans *trans, 263 263 const struct bch_hash_desc desc, 264 264 const struct bch_hash_info *info, 265 265 subvol_inum inum, u32 snapshot, ··· 328 328 struct bkey_i *insert, 329 329 bch_str_hash_flags_t str_hash_flags) 330 330 { 331 - u32 snapshot; 332 - int ret; 333 - 334 - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 335 - if (ret) 336 - return ret; 337 - 338 331 insert->k.p.inode = inum.inum; 339 332 340 - return bch2_hash_set_snapshot(trans, desc, info, inum, 341 - snapshot, insert, str_hash_flags, 0); 333 + u32 snapshot; 334 + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: 335 + bch2_hash_set_in_snapshot(trans, desc, info, inum, 336 + snapshot, insert, str_hash_flags, 0); 342 337 } 343 338 344 339 static __always_inline
+174 -13
fs/bcachefs/subvolume.c
··· 13 13 14 14 static int bch2_subvolume_delete(struct btree_trans *, u32); 15 15 16 + static struct bpos subvolume_children_pos(struct bkey_s_c k) 17 + { 18 + if (k.k->type != KEY_TYPE_subvolume) 19 + return POS_MIN; 20 + 21 + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 22 + if (!s.v->fs_path_parent) 23 + return POS_MIN; 24 + return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); 25 + } 26 + 16 27 static int check_subvol(struct btree_trans *trans, 17 28 struct btree_iter *iter, 18 29 struct bkey_s_c k) 19 30 { 20 31 struct bch_fs *c = trans->c; 21 32 struct bkey_s_c_subvolume subvol; 33 + struct btree_iter subvol_children_iter = {}; 22 34 struct bch_snapshot snapshot; 35 + struct printbuf buf = PRINTBUF; 23 36 unsigned snapid; 24 37 int ret = 0; 25 38 ··· 53 40 ret = bch2_subvolume_delete(trans, iter->pos.offset); 54 41 bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); 55 42 return ret ?: -BCH_ERR_transaction_restart_nested; 43 + } 44 + 45 + if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && 46 + subvol.v->fs_path_parent, 47 + c, subvol_root_fs_path_parent_nonzero, 48 + "root subvolume has nonzero fs_path_parent\n%s", 49 + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 50 + struct bkey_i_subvolume *n = 51 + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); 52 + ret = PTR_ERR_OR_ZERO(n); 53 + if (ret) 54 + goto err; 55 + 56 + n->v.fs_path_parent = 0; 57 + } 58 + 59 + if (subvol.v->fs_path_parent) { 60 + struct bpos pos = subvolume_children_pos(k); 61 + 62 + struct bkey_s_c subvol_children_k = 63 + bch2_bkey_get_iter(trans, &subvol_children_iter, 64 + BTREE_ID_subvolume_children, pos, 0); 65 + ret = bkey_err(subvol_children_k); 66 + if (ret) 67 + goto err; 68 + 69 + if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, 70 + c, subvol_children_not_set, 71 + "subvolume not set in subvolume_children btree at %llu:%llu\n%s", 72 + pos.inode, pos.offset, 73 + (printbuf_reset(&buf), 74 + 
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 75 + ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); 76 + if (ret) 77 + goto err; 78 + } 79 + } 80 + 81 + struct bch_inode_unpacked inode; 82 + struct btree_iter inode_iter = {}; 83 + ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, 84 + (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, 85 + 0); 86 + bch2_trans_iter_exit(trans, &inode_iter); 87 + 88 + if (ret && !bch2_err_matches(ret, ENOENT)) 89 + return ret; 90 + 91 + if (fsck_err_on(ret, c, subvol_to_missing_root, 92 + "subvolume %llu points to missing subvolume root %llu:%u", 93 + k.k->p.offset, le64_to_cpu(subvol.v->inode), 94 + le32_to_cpu(subvol.v->snapshot))) { 95 + ret = bch2_subvolume_delete(trans, iter->pos.offset); 96 + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); 97 + return ret ?: -BCH_ERR_transaction_restart_nested; 98 + } 99 + 100 + if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, 101 + c, subvol_root_wrong_bi_subvol, 102 + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", 103 + inode.bi_inum, inode_iter.k.p.snapshot, 104 + inode.bi_subvol, subvol.k->p.offset)) { 105 + inode.bi_subvol = subvol.k->p.offset; 106 + ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); 107 + if (ret) 108 + goto err; 56 109 } 57 110 58 111 if (!BCH_SUBVOLUME_SNAP(subvol.v)) { ··· 151 72 SET_BCH_SUBVOLUME_SNAP(&s->v, true); 152 73 } 153 74 } 154 - 75 + err: 155 76 fsck_err: 77 + bch2_trans_iter_exit(trans, &subvol_children_iter); 78 + printbuf_exit(&buf); 156 79 return ret; 157 80 } 158 81 ··· 167 86 check_subvol(trans, &iter, k))); 168 87 bch_err_fn(c, ret); 169 88 return ret; 89 + } 90 + 91 + static int check_subvol_child(struct btree_trans *trans, 92 + struct btree_iter *child_iter, 93 + struct bkey_s_c child_k) 94 + { 95 + struct bch_fs *c = trans->c; 96 + struct bch_subvolume s; 97 + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, 
child_k.k->p.offset), 98 + 0, subvolume, &s); 99 + if (ret && !bch2_err_matches(ret, ENOENT)) 100 + return ret; 101 + 102 + if (fsck_err_on(ret || 103 + le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, 104 + c, subvol_children_bad, 105 + "incorrect entry in subvolume_children btree %llu:%llu", 106 + child_k.k->p.inode, child_k.k->p.offset)) { 107 + ret = bch2_btree_delete_at(trans, child_iter, 0); 108 + if (ret) 109 + goto err; 110 + } 111 + err: 112 + fsck_err: 113 + return ret; 114 + } 115 + 116 + int bch2_check_subvol_children(struct bch_fs *c) 117 + { 118 + int ret = bch2_trans_run(c, 119 + for_each_btree_key_commit(trans, iter, 120 + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k, 121 + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 122 + check_subvol_child(trans, &iter, k))); 123 + bch_err_fn(c, ret); 124 + return 0; 170 125 } 171 126 172 127 /* Subvolumes: */ ··· 229 112 le64_to_cpu(s.v->inode), 230 113 le32_to_cpu(s.v->snapshot)); 231 114 232 - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) 233 - prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); 115 + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { 116 + prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); 117 + prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); 118 + } 119 + } 120 + 121 + static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) 122 + { 123 + return !bpos_eq(pos, POS_MIN) 124 + ? 
bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) 125 + : 0; 126 + } 127 + 128 + int bch2_subvolume_trigger(struct btree_trans *trans, 129 + enum btree_id btree_id, unsigned level, 130 + struct bkey_s_c old, struct bkey_s new, 131 + unsigned flags) 132 + { 133 + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { 134 + struct bpos children_pos_old = subvolume_children_pos(old); 135 + struct bpos children_pos_new = subvolume_children_pos(new.s_c); 136 + 137 + if (!bpos_eq(children_pos_old, children_pos_new)) { 138 + int ret = subvolume_children_mod(trans, children_pos_old, false) ?: 139 + subvolume_children_mod(trans, children_pos_new, true); 140 + if (ret) 141 + return ret; 142 + } 143 + } 144 + 145 + return 0; 146 + } 147 + 148 + int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) 149 + { 150 + struct btree_iter iter; 151 + 152 + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); 153 + struct bkey_s_c k = bch2_btree_iter_peek(&iter); 154 + bch2_trans_iter_exit(trans, &iter); 155 + 156 + return bkey_err(k) ?: k.k && k.k->p.inode == subvol 157 + ? 
-BCH_ERR_ENOTEMPTY_subvol_not_empty 158 + : 0; 234 159 } 235 160 236 161 static __always_inline int ··· 356 197 if (k.k->type != KEY_TYPE_subvolume) 357 198 return 0; 358 199 359 - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && 360 - le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) 200 + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && 201 + le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) 361 202 return 0; 362 203 363 204 s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); ··· 365 206 if (ret) 366 207 return ret; 367 208 368 - s->v.parent = cpu_to_le32(new_parent); 209 + s->v.creation_parent = cpu_to_le32(new_parent); 369 210 return 0; 370 211 } 371 212 ··· 388 229 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, 389 230 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 390 231 bch2_subvolume_reparent(trans, &iter, k, 391 - subvolid_to_delete, le32_to_cpu(s.parent))); 232 + subvolid_to_delete, le32_to_cpu(s.creation_parent))); 392 233 } 393 234 394 235 /* ··· 519 360 } 520 361 521 362 int bch2_subvolume_create(struct btree_trans *trans, u64 inode, 363 + u32 parent_subvolid, 522 364 u32 src_subvolid, 523 365 u32 *new_subvolid, 524 366 u32 *new_snapshotid, ··· 576 416 if (ret) 577 417 goto err; 578 418 579 - new_subvol->v.flags = 0; 580 - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); 581 - new_subvol->v.inode = cpu_to_le64(inode); 582 - new_subvol->v.parent = cpu_to_le32(src_subvolid); 583 - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); 584 - new_subvol->v.otime.hi = 0; 419 + new_subvol->v.flags = 0; 420 + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); 421 + new_subvol->v.inode = cpu_to_le64(inode); 422 + new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); 423 + new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); 424 + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); 425 + new_subvol->v.otime.hi = 0; 585 426 586 427 
SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); 587 428 SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+6 -2
fs/bcachefs/subvolume.h
··· 8 8 enum bkey_invalid_flags; 9 9 10 10 int bch2_check_subvols(struct bch_fs *); 11 + int bch2_check_subvol_children(struct bch_fs *); 11 12 12 13 int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, 13 14 enum bkey_invalid_flags, struct printbuf *); 14 15 void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); 16 + int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, 17 + struct bkey_s_c, struct bkey_s, unsigned); 15 18 16 19 #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ 17 20 .key_invalid = bch2_subvolume_invalid, \ 18 21 .val_to_text = bch2_subvolume_to_text, \ 22 + .trigger = bch2_subvolume_trigger, \ 19 23 .min_val_size = 16, \ 20 24 }) 21 25 26 + int bch2_subvol_has_children(struct btree_trans *, u32); 22 27 int bch2_subvolume_get(struct btree_trans *, unsigned, 23 28 bool, int, struct bch_subvolume *); 24 29 int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); ··· 35 30 void bch2_delete_dead_snapshots_async(struct bch_fs *); 36 31 37 32 int bch2_subvolume_unlink(struct btree_trans *, u32); 38 - int bch2_subvolume_create(struct btree_trans *, u64, u32, 39 - u32 *, u32 *, bool); 33 + int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); 40 34 41 35 int bch2_fs_subvolumes_init(struct bch_fs *); 42 36
+2 -2
fs/bcachefs/subvolume_format.h
··· 19 19 * This is _not_ necessarily the subvolume of the directory containing 20 20 * this subvolume: 21 21 */ 22 - __le32 parent; 23 - __le32 pad; 22 + __le32 creation_parent; 23 + __le32 fs_path_parent; 24 24 bch_le128 otime; 25 25 }; 26 26
+15 -7
fs/bcachefs/super-io.c
··· 470 470 return ret; 471 471 } 472 472 473 + if (rw == WRITE && 474 + bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { 475 + prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", 476 + le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), 477 + le64_to_cpu(sb->seq)); 478 + return -BCH_ERR_invalid_sb_members_missing; 479 + } 480 + 473 481 return 0; 474 482 } 475 483 ··· 725 717 726 718 if (IS_ERR(sb->s_bdev_file)) { 727 719 ret = PTR_ERR(sb->s_bdev_file); 720 + prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); 728 721 goto err; 729 722 } 730 723 sb->bdev = file_bdev(sb->s_bdev_file); ··· 752 743 prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", 753 744 path, err.buf); 754 745 if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) 755 - printk(KERN_INFO "%s", err2.buf); 746 + bch2_print_opts(opts, KERN_INFO "%s", err2.buf); 756 747 else 757 - printk(KERN_ERR "%s", err2.buf); 748 + bch2_print_opts(opts, KERN_ERR "%s", err2.buf); 758 749 759 750 printbuf_exit(&err2); 760 751 printbuf_reset(&err); ··· 812 803 goto err; 813 804 } 814 805 815 - ret = 0; 816 806 sb->have_layout = true; 817 807 818 808 ret = bch2_sb_validate(sb, &err, READ); 819 809 if (ret) { 820 - printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", 821 - path, err.buf); 810 + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", 811 + path, err.buf); 822 812 goto err_no_print; 823 813 } 824 814 out: 825 815 printbuf_exit(&err); 826 816 return ret; 827 817 err: 828 - printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", 829 - path, err.buf); 818 + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", 819 + path, err.buf); 830 820 err_no_print: 831 821 bch2_free_super(sb); 832 822 goto out;
+57 -36
fs/bcachefs/super.c
··· 56 56 #include "super.h" 57 57 #include "super-io.h" 58 58 #include "sysfs.h" 59 + #include "thread_with_file.h" 59 60 #include "trace.h" 60 61 61 62 #include <linux/backing-dev.h> ··· 87 86 NULL 88 87 }; 89 88 89 + void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) 90 + { 91 + struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; 92 + 93 + va_list args; 94 + va_start(args, fmt); 95 + if (likely(!stdio)) { 96 + vprintk(fmt, args); 97 + } else { 98 + if (fmt[0] == KERN_SOH[0]) 99 + fmt += 2; 100 + 101 + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); 102 + } 103 + va_end(args); 104 + } 105 + 90 106 void __bch2_print(struct bch_fs *c, const char *fmt, ...) 91 107 { 92 108 struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); ··· 113 95 if (likely(!stdio)) { 114 96 vprintk(fmt, args); 115 97 } else { 116 - unsigned long flags; 117 - 118 98 if (fmt[0] == KERN_SOH[0]) 119 99 fmt += 2; 120 100 121 - spin_lock_irqsave(&stdio->output_lock, flags); 122 - prt_vprintf(&stdio->output_buf, fmt, args); 123 - spin_unlock_irqrestore(&stdio->output_lock, flags); 124 - 125 - wake_up(&stdio->output_wait); 101 + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); 126 102 } 127 103 va_end(args); 128 104 } ··· 588 576 destroy_workqueue(c->btree_update_wq); 589 577 590 578 bch2_free_super(&c->disk_sb); 591 - kvpfree(c, sizeof(*c)); 579 + kvfree(c); 592 580 module_put(THIS_MODULE); 593 581 } 594 582 ··· 727 715 unsigned i, iter_size; 728 716 int ret = 0; 729 717 730 - c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); 718 + c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); 731 719 if (!c) { 732 720 c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); 733 721 goto out; ··· 830 818 goto err; 831 819 832 820 pr_uuid(&name, c->sb.user_uuid.b); 833 - strscpy(c->name, name.buf, sizeof(c->name)); 834 - printbuf_exit(&name); 835 - 836 821 ret = name.allocation_failure ? 
-BCH_ERR_ENOMEM_fs_name_alloc : 0; 837 822 if (ret) 838 823 goto err; 824 + 825 + strscpy(c->name, name.buf, sizeof(c->name)); 826 + printbuf_exit(&name); 839 827 840 828 /* Compat: */ 841 829 if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && ··· 874 862 c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); 875 863 876 864 if (!(c->btree_update_wq = alloc_workqueue("bcachefs", 877 - WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || 865 + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || 878 866 !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", 879 - WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || 867 + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || 880 868 !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", 881 - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || 869 + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || 882 870 !(c->io_complete_wq = alloc_workqueue("bcachefs_io", 883 - WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) || 871 + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || 884 872 !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", 885 873 WQ_FREEZABLE, 0)) || 886 874 #ifndef BCH_WRITE_REF_DEBUG ··· 894 882 BIOSET_NEED_BVECS) || 895 883 !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || 896 884 !(c->online_reserved = alloc_percpu(u64)) || 897 - mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, 898 - c->opts.btree_node_size) || 885 + mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, 886 + c->opts.btree_node_size) || 899 887 mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || 900 888 !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, 901 889 sizeof(u64), GFP_KERNEL))) { ··· 1073 1061 } 1074 1062 1075 1063 static int bch2_dev_in_fs(struct bch_sb_handle *fs, 1076 - struct bch_sb_handle *sb) 1064 + struct bch_sb_handle *sb, 1065 + struct bch_opts *opts) 1077 1066 { 1078 1067 if (fs == sb) 1079 1068 return 0; ··· 1115 1102 bch2_prt_datetime(&buf, 
le64_to_cpu(sb->sb->write_time));; 1116 1103 prt_newline(&buf); 1117 1104 1118 - prt_printf(&buf, "Not using older sb"); 1105 + if (!opts->no_splitbrain_check) 1106 + prt_printf(&buf, "Not using older sb"); 1119 1107 1120 1108 pr_err("%s", buf.buf); 1121 1109 printbuf_exit(&buf); 1122 - return -BCH_ERR_device_splitbrain; 1110 + 1111 + if (!opts->no_splitbrain_check) 1112 + return -BCH_ERR_device_splitbrain; 1123 1113 } 1124 1114 1125 1115 struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); ··· 1140 1124 prt_newline(&buf); 1141 1125 1142 1126 prt_bdevname(&buf, fs->bdev); 1143 - prt_str(&buf, "believes seq of "); 1127 + prt_str(&buf, " believes seq of "); 1144 1128 prt_bdevname(&buf, sb->bdev); 1145 1129 prt_printf(&buf, " to be %llu, but ", seq_from_fs); 1146 1130 prt_bdevname(&buf, sb->bdev); 1147 1131 prt_printf(&buf, " has %llu\n", seq_from_member); 1148 - prt_str(&buf, "Not using "); 1149 - prt_bdevname(&buf, sb->bdev); 1132 + 1133 + if (!opts->no_splitbrain_check) { 1134 + prt_str(&buf, "Not using "); 1135 + prt_bdevname(&buf, sb->bdev); 1136 + } 1150 1137 1151 1138 pr_err("%s", buf.buf); 1152 1139 printbuf_exit(&buf); 1153 - return -BCH_ERR_device_splitbrain; 1140 + 1141 + if (!opts->no_splitbrain_check) 1142 + return -BCH_ERR_device_splitbrain; 1154 1143 } 1155 1144 1156 1145 return 0; ··· 1189 1168 bch2_dev_buckets_free(ca); 1190 1169 free_page((unsigned long) ca->sb_read_scratch); 1191 1170 1192 - bch2_time_stats_exit(&ca->io_latency[WRITE]); 1193 - bch2_time_stats_exit(&ca->io_latency[READ]); 1171 + bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); 1172 + bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); 1194 1173 1195 1174 percpu_ref_exit(&ca->io_ref); 1196 1175 percpu_ref_exit(&ca->ref); ··· 1281 1260 1282 1261 INIT_WORK(&ca->io_error_work, bch2_io_error_work); 1283 1262 1284 - bch2_time_stats_init(&ca->io_latency[READ]); 1285 - bch2_time_stats_init(&ca->io_latency[WRITE]); 1263 + 
bch2_time_stats_quantiles_init(&ca->io_latency[READ]); 1264 + bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); 1286 1265 1287 1266 ca->mi = bch2_mi_to_cpu(member); 1288 1267 ··· 1618 1597 __bch2_dev_read_only(c, ca); 1619 1598 1620 1599 ret = bch2_dev_data_drop(c, ca->dev_idx, flags); 1621 - bch_err_msg(ca, ret, "dropping data"); 1600 + bch_err_msg(ca, ret, "bch2_dev_data_drop()"); 1622 1601 if (ret) 1623 1602 goto err; 1624 1603 1625 1604 ret = bch2_dev_remove_alloc(c, ca); 1626 - bch_err_msg(ca, ret, "deleting alloc info"); 1605 + bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); 1627 1606 if (ret) 1628 1607 goto err; 1629 1608 1630 1609 ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); 1631 - bch_err_msg(ca, ret, "flushing journal"); 1610 + bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); 1632 1611 if (ret) 1633 1612 goto err; 1634 1613 1635 1614 ret = bch2_journal_flush(&c->journal); 1636 - bch_err(ca, "journal error"); 1615 + bch_err_msg(ca, ret, "bch2_journal_flush()"); 1637 1616 if (ret) 1638 1617 goto err; 1639 1618 1640 1619 ret = bch2_replicas_gc2(c); 1641 - bch_err_msg(ca, ret, "in replicas_gc2()"); 1620 + bch_err_msg(ca, ret, "bch2_replicas_gc2()"); 1642 1621 if (ret) 1643 1622 goto err; 1644 1623 ··· 1856 1835 1857 1836 dev_idx = sb.sb->dev_idx; 1858 1837 1859 - ret = bch2_dev_in_fs(&c->disk_sb, &sb); 1838 + ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); 1860 1839 bch_err_msg(c, ret, "bringing %s online", path); 1861 1840 if (ret) 1862 1841 goto err; ··· 2044 2023 best = sb; 2045 2024 2046 2025 darray_for_each_reverse(sbs, sb) { 2047 - ret = bch2_dev_in_fs(best, sb); 2026 + ret = bch2_dev_in_fs(best, sb, &opts); 2048 2027 2049 2028 if (ret == -BCH_ERR_device_has_been_removed || 2050 2029 ret == -BCH_ERR_device_splitbrain) {
+2 -2
fs/bcachefs/sysfs.c
··· 930 930 sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); 931 931 932 932 if (attr == &sysfs_io_latency_stats_read) 933 - bch2_time_stats_to_text(out, &ca->io_latency[READ]); 933 + bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); 934 934 935 935 if (attr == &sysfs_io_latency_stats_write) 936 - bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); 936 + bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); 937 937 938 938 sysfs_printf(congested, "%u%%", 939 939 clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
+271 -120
fs/bcachefs/thread_with_file.c
··· 2 2 #ifndef NO_BCACHEFS_FS 3 3 4 4 #include "bcachefs.h" 5 - #include "printbuf.h" 6 5 #include "thread_with_file.h" 7 6 8 7 #include <linux/anon_inodes.h> ··· 9 10 #include <linux/kthread.h> 10 11 #include <linux/pagemap.h> 11 12 #include <linux/poll.h> 13 + #include <linux/sched/sysctl.h> 12 14 13 15 void bch2_thread_with_file_exit(struct thread_with_file *thr) 14 16 { ··· 65 65 return ret; 66 66 } 67 67 68 - static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) 68 + /* stdio_redirect */ 69 + 70 + static bool stdio_redirect_has_input(struct stdio_redirect *stdio) 69 71 { 70 - return thr->stdio.output_buf.pos || 71 - thr->output2.nr || 72 - thr->thr.done; 72 + return stdio->input.buf.nr || stdio->done; 73 73 } 74 74 75 - static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, 75 + static bool stdio_redirect_has_output(struct stdio_redirect *stdio) 76 + { 77 + return stdio->output.buf.nr || stdio->done; 78 + } 79 + 80 + #define STDIO_REDIRECT_BUFSIZE 4096 81 + 82 + static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) 83 + { 84 + return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; 85 + } 86 + 87 + static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) 88 + { 89 + return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; 90 + } 91 + 92 + static void stdio_buf_init(struct stdio_buf *buf) 93 + { 94 + spin_lock_init(&buf->lock); 95 + init_waitqueue_head(&buf->wait); 96 + darray_init(&buf->buf); 97 + } 98 + 99 + /* thread_with_stdio */ 100 + 101 + static void thread_with_stdio_done(struct thread_with_stdio *thr) 102 + { 103 + thr->thr.done = true; 104 + thr->stdio.done = true; 105 + wake_up(&thr->stdio.input.wait); 106 + wake_up(&thr->stdio.output.wait); 107 + } 108 + 109 + static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, 76 110 size_t len, loff_t *ppos) 77 111 { 78 112 struct thread_with_stdio *thr = 79 113 
container_of(file->private_data, struct thread_with_stdio, thr); 114 + struct stdio_buf *buf = &thr->stdio.output; 80 115 size_t copied = 0, b; 81 116 int ret = 0; 82 117 83 - if ((file->f_flags & O_NONBLOCK) && 84 - !thread_with_stdio_has_output(thr)) 118 + if (!(file->f_flags & O_NONBLOCK)) { 119 + ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); 120 + if (ret) 121 + return ret; 122 + } else if (!stdio_redirect_has_output(&thr->stdio)) 85 123 return -EAGAIN; 86 124 87 - ret = wait_event_interruptible(thr->stdio.output_wait, 88 - thread_with_stdio_has_output(thr)); 89 - if (ret) 90 - return ret; 91 - 92 - if (thr->thr.done) 93 - return 0; 94 - 95 - while (len) { 96 - ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); 97 - if (ret) 98 - break; 99 - 100 - spin_lock_irq(&thr->stdio.output_lock); 101 - b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); 102 - 103 - memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); 104 - memmove(thr->stdio.output_buf.buf, 105 - thr->stdio.output_buf.buf + b, 106 - thr->stdio.output_buf.pos - b); 107 - 108 - thr->output2.nr += b; 109 - thr->stdio.output_buf.pos -= b; 110 - spin_unlock_irq(&thr->stdio.output_lock); 111 - 112 - b = min(len, thr->output2.nr); 113 - if (!b) 114 - break; 115 - 116 - b -= copy_to_user(buf, thr->output2.data, b); 117 - if (!b) { 125 + while (len && buf->buf.nr) { 126 + if (fault_in_writeable(ubuf, len) == len) { 118 127 ret = -EFAULT; 119 128 break; 120 129 } 121 130 122 - copied += b; 123 - buf += b; 124 - len -= b; 131 + spin_lock_irq(&buf->lock); 132 + b = min_t(size_t, len, buf->buf.nr); 125 133 126 - memmove(thr->output2.data, 127 - thr->output2.data + b, 128 - thr->output2.nr - b); 129 - thr->output2.nr -= b; 134 + if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { 135 + ubuf += b; 136 + len -= b; 137 + copied += b; 138 + buf->buf.nr -= b; 139 + memmove(buf->buf.data, 140 + buf->buf.data + b, 141 + buf->buf.nr); 
142 + } 143 + spin_unlock_irq(&buf->lock); 130 144 } 131 145 132 146 return copied ?: ret; ··· 151 137 struct thread_with_stdio *thr = 152 138 container_of(file->private_data, struct thread_with_stdio, thr); 153 139 140 + thread_with_stdio_done(thr); 154 141 bch2_thread_with_file_exit(&thr->thr); 155 - printbuf_exit(&thr->stdio.input_buf); 156 - printbuf_exit(&thr->stdio.output_buf); 157 - darray_exit(&thr->output2); 158 - thr->exit(thr); 142 + darray_exit(&thr->stdio.input.buf); 143 + darray_exit(&thr->stdio.output.buf); 144 + thr->ops->exit(thr); 159 145 return 0; 160 - } 161 - 162 - #define WRITE_BUFFER 4096 163 - 164 - static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) 165 - { 166 - return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; 167 146 } 168 147 169 148 static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, ··· 164 157 { 165 158 struct thread_with_stdio *thr = 166 159 container_of(file->private_data, struct thread_with_stdio, thr); 167 - struct printbuf *buf = &thr->stdio.input_buf; 160 + struct stdio_buf *buf = &thr->stdio.input; 168 161 size_t copied = 0; 169 162 ssize_t ret = 0; 170 163 ··· 180 173 break; 181 174 } 182 175 183 - spin_lock(&thr->stdio.input_lock); 184 - if (buf->pos < WRITE_BUFFER) 185 - bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); 186 - b = min(len, printbuf_remaining_size(buf)); 176 + spin_lock(&buf->lock); 177 + if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE) 178 + darray_make_room_gfp(&buf->buf, 179 + min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT); 180 + b = min(len, darray_room(buf->buf)); 187 181 188 - if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { 189 - ubuf += b; 190 - len -= b; 191 - copied += b; 192 - buf->pos += b; 182 + if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { 183 + buf->buf.nr += b; 184 + ubuf += b; 185 + len -= b; 186 + copied += b; 193 187 } 194 - 
spin_unlock(&thr->stdio.input_lock); 188 + spin_unlock(&buf->lock); 195 189 196 190 if (b) { 197 - wake_up(&thr->stdio.input_wait); 191 + wake_up(&buf->wait); 198 192 } else { 199 193 if ((file->f_flags & O_NONBLOCK)) { 200 194 ret = -EAGAIN; 201 195 break; 202 196 } 203 197 204 - ret = wait_event_interruptible(thr->stdio.input_wait, 205 - thread_with_stdio_has_input_space(thr)); 198 + ret = wait_event_interruptible(buf->wait, 199 + stdio_redirect_has_input_space(&thr->stdio)); 206 200 if (ret) 207 201 break; 208 202 } ··· 217 209 struct thread_with_stdio *thr = 218 210 container_of(file->private_data, struct thread_with_stdio, thr); 219 211 220 - poll_wait(file, &thr->stdio.output_wait, wait); 221 - poll_wait(file, &thr->stdio.input_wait, wait); 212 + poll_wait(file, &thr->stdio.output.wait, wait); 213 + poll_wait(file, &thr->stdio.input.wait, wait); 222 214 223 215 __poll_t mask = 0; 224 216 225 - if (thread_with_stdio_has_output(thr)) 217 + if (stdio_redirect_has_output(&thr->stdio)) 226 218 mask |= EPOLLIN; 227 - if (thread_with_stdio_has_input_space(thr)) 219 + if (stdio_redirect_has_input_space(&thr->stdio)) 228 220 mask |= EPOLLOUT; 229 221 if (thr->thr.done) 230 222 mask |= EPOLLHUP|EPOLLERR; 231 223 return mask; 232 224 } 233 225 226 + static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) 227 + { 228 + struct thread_with_stdio *thr = 229 + container_of(file->private_data, struct thread_with_stdio, thr); 230 + 231 + poll_wait(file, &thr->stdio.output.wait, wait); 232 + 233 + __poll_t mask = 0; 234 + 235 + if (stdio_redirect_has_output(&thr->stdio)) 236 + mask |= EPOLLIN; 237 + if (thr->thr.done) 238 + mask |= EPOLLHUP|EPOLLERR; 239 + return mask; 240 + } 241 + 242 + static int thread_with_stdio_flush(struct file *file, fl_owner_t id) 243 + { 244 + struct thread_with_stdio *thr = 245 + container_of(file->private_data, struct thread_with_stdio, thr); 246 + 247 + return thr->thr.ret; 248 + } 249 + 250 + static long 
thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) 251 + { 252 + struct thread_with_stdio *thr = 253 + container_of(file->private_data, struct thread_with_stdio, thr); 254 + 255 + if (thr->ops->unlocked_ioctl) 256 + return thr->ops->unlocked_ioctl(thr, cmd, p); 257 + return -ENOTTY; 258 + } 259 + 234 260 static const struct file_operations thread_with_stdio_fops = { 235 - .release = thread_with_stdio_release, 261 + .llseek = no_llseek, 236 262 .read = thread_with_stdio_read, 237 263 .write = thread_with_stdio_write, 238 264 .poll = thread_with_stdio_poll, 239 - .llseek = no_llseek, 265 + .flush = thread_with_stdio_flush, 266 + .release = thread_with_stdio_release, 267 + .unlocked_ioctl = thread_with_stdio_ioctl, 240 268 }; 241 269 242 - int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, 243 - void (*exit)(struct thread_with_stdio *), 244 - int (*fn)(void *)) 270 + static const struct file_operations thread_with_stdout_fops = { 271 + .llseek = no_llseek, 272 + .read = thread_with_stdio_read, 273 + .poll = thread_with_stdout_poll, 274 + .flush = thread_with_stdio_flush, 275 + .release = thread_with_stdio_release, 276 + .unlocked_ioctl = thread_with_stdio_ioctl, 277 + }; 278 + 279 + static int thread_with_stdio_fn(void *arg) 245 280 { 246 - thr->stdio.input_buf = PRINTBUF; 247 - thr->stdio.input_buf.atomic++; 248 - spin_lock_init(&thr->stdio.input_lock); 249 - init_waitqueue_head(&thr->stdio.input_wait); 281 + struct thread_with_stdio *thr = arg; 250 282 251 - thr->stdio.output_buf = PRINTBUF; 252 - thr->stdio.output_buf.atomic++; 253 - spin_lock_init(&thr->stdio.output_lock); 254 - init_waitqueue_head(&thr->stdio.output_wait); 283 + thr->thr.ret = thr->ops->fn(thr); 255 284 256 - darray_init(&thr->output2); 257 - thr->exit = exit; 258 - 259 - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); 285 + thread_with_stdio_done(thr); 286 + return 0; 260 287 } 261 288 262 - int bch2_stdio_redirect_read(struct 
stdio_redirect *stdio, char *buf, size_t len) 289 + int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, 290 + const struct thread_with_stdio_ops *ops) 263 291 { 264 - wait_event(stdio->input_wait, 265 - stdio->input_buf.pos || stdio->done); 292 + stdio_buf_init(&thr->stdio.input); 293 + stdio_buf_init(&thr->stdio.output); 294 + thr->ops = ops; 295 + 296 + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); 297 + } 298 + 299 + int bch2_run_thread_with_stdout(struct thread_with_stdio *thr, 300 + const struct thread_with_stdio_ops *ops) 301 + { 302 + stdio_buf_init(&thr->stdio.input); 303 + stdio_buf_init(&thr->stdio.output); 304 + thr->ops = ops; 305 + 306 + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); 307 + } 308 + EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout); 309 + 310 + int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) 311 + { 312 + struct stdio_buf *buf = &stdio->input; 313 + 314 + /* 315 + * we're waiting on user input (or for the file descriptor to be 316 + * closed), don't want a hung task warning: 317 + */ 318 + do { 319 + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), 320 + sysctl_hung_task_timeout_secs * HZ / 2); 321 + } while (!stdio_redirect_has_input(stdio)); 266 322 267 323 if (stdio->done) 268 324 return -1; 269 325 270 - spin_lock(&stdio->input_lock); 271 - int ret = min(len, stdio->input_buf.pos); 272 - stdio->input_buf.pos -= ret; 273 - memcpy(buf, stdio->input_buf.buf, ret); 274 - memmove(stdio->input_buf.buf, 275 - stdio->input_buf.buf + ret, 276 - stdio->input_buf.pos); 277 - spin_unlock(&stdio->input_lock); 326 + spin_lock(&buf->lock); 327 + int ret = min(len, buf->buf.nr); 328 + buf->buf.nr -= ret; 329 + memcpy(ubuf, buf->buf.data, ret); 330 + memmove(buf->buf.data, 331 + buf->buf.data + ret, 332 + buf->buf.nr); 333 + spin_unlock(&buf->lock); 278 334 279 - wake_up(&stdio->input_wait); 335 + 
wake_up(&buf->wait); 280 336 return ret; 281 337 } 282 338 283 - int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) 339 + int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) 284 340 { 285 - wait_event(stdio->input_wait, 286 - stdio->input_buf.pos || stdio->done); 341 + struct stdio_buf *buf = &stdio->input; 342 + size_t copied = 0; 343 + ssize_t ret = 0; 344 + again: 345 + do { 346 + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), 347 + sysctl_hung_task_timeout_secs * HZ / 2); 348 + } while (!stdio_redirect_has_input(stdio)); 287 349 288 - if (stdio->done) 289 - return -1; 350 + if (stdio->done) { 351 + ret = -1; 352 + goto out; 353 + } 290 354 291 - spin_lock(&stdio->input_lock); 292 - int ret = min(len, stdio->input_buf.pos); 293 - char *n = memchr(stdio->input_buf.buf, '\n', ret); 355 + spin_lock(&buf->lock); 356 + size_t b = min(len, buf->buf.nr); 357 + char *n = memchr(buf->buf.data, '\n', b); 294 358 if (n) 295 - ret = min(ret, n + 1 - stdio->input_buf.buf); 296 - stdio->input_buf.pos -= ret; 297 - memcpy(buf, stdio->input_buf.buf, ret); 298 - memmove(stdio->input_buf.buf, 299 - stdio->input_buf.buf + ret, 300 - stdio->input_buf.pos); 301 - spin_unlock(&stdio->input_lock); 359 + b = min_t(size_t, b, n + 1 - buf->buf.data); 360 + buf->buf.nr -= b; 361 + memcpy(ubuf, buf->buf.data, b); 362 + memmove(buf->buf.data, 363 + buf->buf.data + b, 364 + buf->buf.nr); 365 + ubuf += b; 366 + len -= b; 367 + copied += b; 368 + spin_unlock(&buf->lock); 302 369 303 - wake_up(&stdio->input_wait); 370 + wake_up(&buf->wait); 371 + 372 + if (!n && len) 373 + goto again; 374 + out: 375 + return copied ?: ret; 376 + } 377 + 378 + __printf(3, 0) 379 + static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) 380 + { 381 + ssize_t ret; 382 + 383 + do { 384 + va_list args2; 385 + size_t len; 386 + 387 + va_copy(args2, args); 388 + len = vsnprintf(out->data + 
out->nr, darray_room(*out), fmt, args2); 389 + va_end(args2); 390 + 391 + if (len + 1 <= darray_room(*out)) { 392 + out->nr += len; 393 + return len; 394 + } 395 + 396 + ret = darray_make_room_gfp(out, len + 1, gfp); 397 + } while (ret == 0); 398 + 399 + return ret; 400 + } 401 + 402 + ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, 403 + const char *fmt, va_list args) 404 + { 405 + struct stdio_buf *buf = &stdio->output; 406 + unsigned long flags; 407 + ssize_t ret; 408 + 409 + again: 410 + spin_lock_irqsave(&buf->lock, flags); 411 + ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); 412 + spin_unlock_irqrestore(&buf->lock, flags); 413 + 414 + if (ret < 0) { 415 + if (nonblocking) 416 + return -EAGAIN; 417 + 418 + ret = wait_event_interruptible(buf->wait, 419 + stdio_redirect_has_output_space(stdio)); 420 + if (ret) 421 + return ret; 422 + goto again; 423 + } 424 + 425 + wake_up(&buf->wait); 426 + return ret; 427 + } 428 + 429 + ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, 430 + const char *fmt, ...) 431 + { 432 + va_list args; 433 + ssize_t ret; 434 + 435 + va_start(args, fmt); 436 + ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); 437 + va_end(args); 438 + 304 439 return ret; 305 440 } 306 441
+47 -12
fs/bcachefs/thread_with_file.h
··· 4 4 5 5 #include "thread_with_file_types.h" 6 6 7 + /* 8 + * Thread with file: Run a kthread and connect it to a file descriptor, so that 9 + * it can be interacted with via fd read/write methods and closing the file 10 + * descriptor stops the kthread. 11 + * 12 + * We have two different APIs: 13 + * 14 + * thread_with_file, the low level version. 15 + * You get to define the full file_operations, including your release function, 16 + * which means that you must call bch2_thread_with_file_exit() from your 17 + * .release method 18 + * 19 + * thread_with_stdio, the higher level version 20 + * This implements full piping of input and output, including .poll. 21 + * 22 + * Notes on behaviour: 23 + * - kthread shutdown behaves like writing or reading from a pipe that has been 24 + * closed 25 + * - Input and output buffers are 4096 bytes, although buffers may in some 26 + * situations slightly exceed that limit so as to avoid chopping off a 27 + * message in the middle in nonblocking mode. 28 + * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations - 29 + * should be fine but might change in future revisions. 30 + * - Output buffer may grow past 4096 bytes to deal with messages that are 31 + * bigger than 4096 bytes 32 + * - Writing may be done blocking or nonblocking; in nonblocking mode, we only 33 + * drop entire messages. 
34 + * 35 + * To write, use stdio_redirect_printf() 36 + * To read, use stdio_redirect_read() or stdio_redirect_readline() 37 + */ 38 + 7 39 struct task_struct; 8 40 9 41 struct thread_with_file { ··· 49 17 const struct file_operations *, 50 18 int (*fn)(void *)); 51 19 20 + struct thread_with_stdio; 21 + 22 + struct thread_with_stdio_ops { 23 + void (*exit)(struct thread_with_stdio *); 24 + int (*fn)(struct thread_with_stdio *); 25 + long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); 26 + }; 27 + 52 28 struct thread_with_stdio { 53 29 struct thread_with_file thr; 54 30 struct stdio_redirect stdio; 55 - DARRAY(char) output2; 56 - void (*exit)(struct thread_with_stdio *); 31 + const struct thread_with_stdio_ops *ops; 57 32 }; 58 33 59 - static inline void thread_with_stdio_done(struct thread_with_stdio *thr) 60 - { 61 - thr->thr.done = true; 62 - thr->stdio.done = true; 63 - wake_up(&thr->stdio.input_wait); 64 - wake_up(&thr->stdio.output_wait); 65 - } 66 - 67 34 int bch2_run_thread_with_stdio(struct thread_with_stdio *, 68 - void (*exit)(struct thread_with_stdio *), 69 - int (*fn)(void *)); 35 + const struct thread_with_stdio_ops *); 36 + int bch2_run_thread_with_stdout(struct thread_with_stdio *, 37 + const struct thread_with_stdio_ops *); 70 38 int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); 71 39 int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); 40 + 41 + __printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); 42 + __printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); 72 43 73 44 #endif /* _BCACHEFS_THREAD_WITH_FILE_H */
+11 -4
fs/bcachefs/thread_with_file_types.h
··· 2 2 #ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H 3 3 #define _BCACHEFS_THREAD_WITH_FILE_TYPES_H 4 4 5 + #include "darray.h" 6 + 7 + struct stdio_buf { 8 + spinlock_t lock; 9 + wait_queue_head_t wait; 10 + darray_char buf; 11 + }; 12 + 5 13 struct stdio_redirect { 6 - spinlock_t output_lock; 7 - wait_queue_head_t output_wait; 8 - struct printbuf output_buf; 14 + struct stdio_buf input; 15 + struct stdio_buf output; 9 16 10 17 spinlock_t input_lock; 11 18 wait_queue_head_t input_wait; 12 - struct printbuf input_buf; 19 + darray_char input_buf; 13 20 bool done; 14 21 }; 15 22
+165
fs/bcachefs/time_stats.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/jiffies.h> 4 + #include <linux/module.h> 5 + #include <linux/percpu.h> 6 + #include <linux/preempt.h> 7 + #include <linux/time.h> 8 + #include <linux/spinlock.h> 9 + 10 + #include "eytzinger.h" 11 + #include "time_stats.h" 12 + 13 + static const struct time_unit time_units[] = { 14 + { "ns", 1 }, 15 + { "us", NSEC_PER_USEC }, 16 + { "ms", NSEC_PER_MSEC }, 17 + { "s", NSEC_PER_SEC }, 18 + { "m", (u64) NSEC_PER_SEC * 60}, 19 + { "h", (u64) NSEC_PER_SEC * 3600}, 20 + { "d", (u64) NSEC_PER_SEC * 3600 * 24}, 21 + { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, 22 + { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */ 23 + { "eon", U64_MAX }, 24 + }; 25 + 26 + const struct time_unit *bch2_pick_time_units(u64 ns) 27 + { 28 + const struct time_unit *u; 29 + 30 + for (u = time_units; 31 + u + 1 < time_units + ARRAY_SIZE(time_units) && 32 + ns >= u[1].nsecs << 1; 33 + u++) 34 + ; 35 + 36 + return u; 37 + } 38 + 39 + static void quantiles_update(struct quantiles *q, u64 v) 40 + { 41 + unsigned i = 0; 42 + 43 + while (i < ARRAY_SIZE(q->entries)) { 44 + struct quantile_entry *e = q->entries + i; 45 + 46 + if (unlikely(!e->step)) { 47 + e->m = v; 48 + e->step = max_t(unsigned, v / 2, 1024); 49 + } else if (e->m > v) { 50 + e->m = e->m >= e->step 51 + ? e->m - e->step 52 + : 0; 53 + } else if (e->m < v) { 54 + e->m = e->m + e->step > e->m 55 + ? e->m + e->step 56 + : U32_MAX; 57 + } 58 + 59 + if ((e->m > v ? 
e->m - v : v - e->m) < e->step) 60 + e->step = max_t(unsigned, e->step / 2, 1); 61 + 62 + if (v >= e->m) 63 + break; 64 + 65 + i = eytzinger0_child(i, v > e->m); 66 + } 67 + } 68 + 69 + static inline void time_stats_update_one(struct bch2_time_stats *stats, 70 + u64 start, u64 end) 71 + { 72 + u64 duration, freq; 73 + bool initted = stats->last_event != 0; 74 + 75 + if (time_after64(end, start)) { 76 + struct quantiles *quantiles = time_stats_to_quantiles(stats); 77 + 78 + duration = end - start; 79 + mean_and_variance_update(&stats->duration_stats, duration); 80 + mean_and_variance_weighted_update(&stats->duration_stats_weighted, 81 + duration, initted, TIME_STATS_MV_WEIGHT); 82 + stats->max_duration = max(stats->max_duration, duration); 83 + stats->min_duration = min(stats->min_duration, duration); 84 + stats->total_duration += duration; 85 + 86 + if (quantiles) 87 + quantiles_update(quantiles, duration); 88 + } 89 + 90 + if (stats->last_event && time_after64(end, stats->last_event)) { 91 + freq = end - stats->last_event; 92 + mean_and_variance_update(&stats->freq_stats, freq); 93 + mean_and_variance_weighted_update(&stats->freq_stats_weighted, 94 + freq, initted, TIME_STATS_MV_WEIGHT); 95 + stats->max_freq = max(stats->max_freq, freq); 96 + stats->min_freq = min(stats->min_freq, freq); 97 + } 98 + 99 + stats->last_event = end; 100 + } 101 + 102 + void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, 103 + struct time_stat_buffer *b) 104 + { 105 + for (struct time_stat_buffer_entry *i = b->entries; 106 + i < b->entries + ARRAY_SIZE(b->entries); 107 + i++) 108 + time_stats_update_one(stats, i->start, i->end); 109 + b->nr = 0; 110 + } 111 + 112 + static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats, 113 + struct time_stat_buffer *b) 114 + { 115 + unsigned long flags; 116 + 117 + spin_lock_irqsave(&stats->lock, flags); 118 + __bch2_time_stats_clear_buffer(stats, b); 119 + spin_unlock_irqrestore(&stats->lock, flags); 120 + } 121 + 
122 + void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) 123 + { 124 + unsigned long flags; 125 + 126 + if (!stats->buffer) { 127 + spin_lock_irqsave(&stats->lock, flags); 128 + time_stats_update_one(stats, start, end); 129 + 130 + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && 131 + stats->duration_stats.n > 1024) 132 + stats->buffer = 133 + alloc_percpu_gfp(struct time_stat_buffer, 134 + GFP_ATOMIC); 135 + spin_unlock_irqrestore(&stats->lock, flags); 136 + } else { 137 + struct time_stat_buffer *b; 138 + 139 + preempt_disable(); 140 + b = this_cpu_ptr(stats->buffer); 141 + 142 + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); 143 + b->entries[b->nr++] = (struct time_stat_buffer_entry) { 144 + .start = start, 145 + .end = end 146 + }; 147 + 148 + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) 149 + time_stats_clear_buffer(stats, b); 150 + preempt_enable(); 151 + } 152 + } 153 + 154 + void bch2_time_stats_exit(struct bch2_time_stats *stats) 155 + { 156 + free_percpu(stats->buffer); 157 + } 158 + 159 + void bch2_time_stats_init(struct bch2_time_stats *stats) 160 + { 161 + memset(stats, 0, sizeof(*stats)); 162 + stats->min_duration = U64_MAX; 163 + stats->min_freq = U64_MAX; 164 + spin_lock_init(&stats->lock); 165 + }
+159
fs/bcachefs/time_stats.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * bch2_time_stats - collect statistics on events that have a duration, with nicely 4 + * formatted textual output on demand 5 + * 6 + * - percpu buffering of event collection: cheap enough to shotgun 7 + * everywhere without worrying about overhead 8 + * 9 + * tracks: 10 + * - number of events 11 + * - maximum event duration ever seen 12 + * - sum of all event durations 13 + * - average event duration, standard and weighted 14 + * - standard deviation of event durations, standard and weighted 15 + * and analagous statistics for the frequency of events 16 + * 17 + * We provide both mean and weighted mean (exponentially weighted), and standard 18 + * deviation and weighted standard deviation, to give an efficient-to-compute 19 + * view of current behaviour versus. average behaviour - "did this event source 20 + * just become wonky, or is this typical?". 21 + * 22 + * Particularly useful for tracking down latency issues. 23 + */ 24 + #ifndef _BCACHEFS_TIME_STATS_H 25 + #define _BCACHEFS_TIME_STATS_H 26 + 27 + #include <linux/sched/clock.h> 28 + #include <linux/spinlock_types.h> 29 + #include <linux/string.h> 30 + 31 + #include "mean_and_variance.h" 32 + 33 + struct time_unit { 34 + const char *name; 35 + u64 nsecs; 36 + }; 37 + 38 + /* 39 + * given a nanosecond value, pick the preferred time units for printing: 40 + */ 41 + const struct time_unit *bch2_pick_time_units(u64 ns); 42 + 43 + /* 44 + * quantiles - do not use: 45 + * 46 + * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't 47 + * use in new code. 
48 + */ 49 + 50 + #define NR_QUANTILES 15 51 + #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) 52 + #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) 53 + #define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) 54 + 55 + struct quantiles { 56 + struct quantile_entry { 57 + u64 m; 58 + u64 step; 59 + } entries[NR_QUANTILES]; 60 + }; 61 + 62 + struct time_stat_buffer { 63 + unsigned nr; 64 + struct time_stat_buffer_entry { 65 + u64 start; 66 + u64 end; 67 + } entries[31]; 68 + }; 69 + 70 + struct bch2_time_stats { 71 + spinlock_t lock; 72 + bool have_quantiles; 73 + /* all fields are in nanoseconds */ 74 + u64 min_duration; 75 + u64 max_duration; 76 + u64 total_duration; 77 + u64 max_freq; 78 + u64 min_freq; 79 + u64 last_event; 80 + u64 last_event_start; 81 + 82 + struct mean_and_variance duration_stats; 83 + struct mean_and_variance freq_stats; 84 + 85 + /* default weight for weighted mean and variance calculations */ 86 + #define TIME_STATS_MV_WEIGHT 8 87 + 88 + struct mean_and_variance_weighted duration_stats_weighted; 89 + struct mean_and_variance_weighted freq_stats_weighted; 90 + struct time_stat_buffer __percpu *buffer; 91 + }; 92 + 93 + struct bch2_time_stats_quantiles { 94 + struct bch2_time_stats stats; 95 + struct quantiles quantiles; 96 + }; 97 + 98 + static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats) 99 + { 100 + return stats->have_quantiles 101 + ? 
&container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles 102 + : NULL; 103 + } 104 + 105 + void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *); 106 + void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); 107 + 108 + /** 109 + * time_stats_update - collect a new event being tracked 110 + * 111 + * @stats - bch2_time_stats to update 112 + * @start - start time of event, recorded with local_clock() 113 + * 114 + * The end duration of the event will be the current time 115 + */ 116 + static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) 117 + { 118 + __bch2_time_stats_update(stats, start, local_clock()); 119 + } 120 + 121 + /** 122 + * track_event_change - track state change events 123 + * 124 + * @stats - bch2_time_stats to update 125 + * @v - new state, true or false 126 + * 127 + * Use this when tracking time stats for state changes, i.e. resource X becoming 128 + * blocked/unblocked. 129 + */ 130 + static inline bool track_event_change(struct bch2_time_stats *stats, bool v) 131 + { 132 + if (v != !!stats->last_event_start) { 133 + if (!v) { 134 + bch2_time_stats_update(stats, stats->last_event_start); 135 + stats->last_event_start = 0; 136 + } else { 137 + stats->last_event_start = local_clock() ?: 1; 138 + return true; 139 + } 140 + } 141 + 142 + return false; 143 + } 144 + 145 + void bch2_time_stats_exit(struct bch2_time_stats *); 146 + void bch2_time_stats_init(struct bch2_time_stats *); 147 + 148 + static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) 149 + { 150 + bch2_time_stats_exit(&statq->stats); 151 + } 152 + static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq) 153 + { 154 + bch2_time_stats_init(&statq->stats); 155 + statq->stats.have_quantiles = true; 156 + memset(&statq->quantiles, 0, sizeof(statq->quantiles)); 157 + } 158 + 159 + #endif /* _BCACHEFS_TIME_STATS_H */
+19
fs/bcachefs/trace.h
··· 1431 1431 TP_ARGS(c, str) 1432 1432 ); 1433 1433 1434 + TRACE_EVENT(error_downcast, 1435 + TP_PROTO(int bch_err, int std_err, unsigned long ip), 1436 + TP_ARGS(bch_err, std_err, ip), 1437 + 1438 + TP_STRUCT__entry( 1439 + __array(char, bch_err, 32 ) 1440 + __array(char, std_err, 32 ) 1441 + __array(char, ip, 32 ) 1442 + ), 1443 + 1444 + TP_fast_assign( 1445 + strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); 1446 + strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); 1447 + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); 1448 + ), 1449 + 1450 + TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) 1451 + ); 1452 + 1434 1453 #endif /* _TRACE_BCACHEFS_H */ 1435 1454 1436 1455 /* This part must be outside protection */
+22 -201
fs/bcachefs/util.c
··· 337 337 } 338 338 #endif 339 339 340 - static const struct time_unit { 341 - const char *name; 342 - u64 nsecs; 343 - } time_units[] = { 344 - { "ns", 1 }, 345 - { "us", NSEC_PER_USEC }, 346 - { "ms", NSEC_PER_MSEC }, 347 - { "s", NSEC_PER_SEC }, 348 - { "m", (u64) NSEC_PER_SEC * 60}, 349 - { "h", (u64) NSEC_PER_SEC * 3600}, 350 - { "eon", U64_MAX }, 351 - }; 352 - 353 - static const struct time_unit *pick_time_units(u64 ns) 354 - { 355 - const struct time_unit *u; 356 - 357 - for (u = time_units; 358 - u + 1 < time_units + ARRAY_SIZE(time_units) && 359 - ns >= u[1].nsecs << 1; 360 - u++) 361 - ; 362 - 363 - return u; 364 - } 365 - 366 340 void bch2_pr_time_units(struct printbuf *out, u64 ns) 367 341 { 368 - const struct time_unit *u = pick_time_units(ns); 342 + const struct time_unit *u = bch2_pick_time_units(ns); 369 343 370 344 prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); 371 345 } 372 346 373 - /* time stats: */ 374 - 375 - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT 376 - static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) 377 - { 378 - unsigned i = 0; 379 - 380 - while (i < ARRAY_SIZE(q->entries)) { 381 - struct bch2_quantile_entry *e = q->entries + i; 382 - 383 - if (unlikely(!e->step)) { 384 - e->m = v; 385 - e->step = max_t(unsigned, v / 2, 1024); 386 - } else if (e->m > v) { 387 - e->m = e->m >= e->step 388 - ? e->m - e->step 389 - : 0; 390 - } else if (e->m < v) { 391 - e->m = e->m + e->step > e->m 392 - ? e->m + e->step 393 - : U32_MAX; 394 - } 395 - 396 - if ((e->m > v ? 
e->m - v : v - e->m) < e->step) 397 - e->step = max_t(unsigned, e->step / 2, 1); 398 - 399 - if (v >= e->m) 400 - break; 401 - 402 - i = eytzinger0_child(i, v > e->m); 403 - } 404 - } 405 - 406 - static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, 407 - u64 start, u64 end) 408 - { 409 - u64 duration, freq; 410 - 411 - if (time_after64(end, start)) { 412 - duration = end - start; 413 - mean_and_variance_update(&stats->duration_stats, duration); 414 - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); 415 - stats->max_duration = max(stats->max_duration, duration); 416 - stats->min_duration = min(stats->min_duration, duration); 417 - stats->total_duration += duration; 418 - bch2_quantiles_update(&stats->quantiles, duration); 419 - } 420 - 421 - if (stats->last_event && time_after64(end, stats->last_event)) { 422 - freq = end - stats->last_event; 423 - mean_and_variance_update(&stats->freq_stats, freq); 424 - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); 425 - stats->max_freq = max(stats->max_freq, freq); 426 - stats->min_freq = min(stats->min_freq, freq); 427 - } 428 - 429 - stats->last_event = end; 430 - } 431 - 432 - static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, 433 - struct bch2_time_stat_buffer *b) 434 - { 435 - for (struct bch2_time_stat_buffer_entry *i = b->entries; 436 - i < b->entries + ARRAY_SIZE(b->entries); 437 - i++) 438 - bch2_time_stats_update_one(stats, i->start, i->end); 439 - b->nr = 0; 440 - } 441 - 442 - static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, 443 - struct bch2_time_stat_buffer *b) 444 - { 445 - unsigned long flags; 446 - 447 - spin_lock_irqsave(&stats->lock, flags); 448 - __bch2_time_stats_clear_buffer(stats, b); 449 - spin_unlock_irqrestore(&stats->lock, flags); 450 - } 451 - 452 - void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) 453 - { 454 - unsigned long flags; 455 - 456 
- WARN_ONCE(!stats->duration_stats_weighted.weight || 457 - !stats->freq_stats_weighted.weight, 458 - "uninitialized time_stats"); 459 - 460 - if (!stats->buffer) { 461 - spin_lock_irqsave(&stats->lock, flags); 462 - bch2_time_stats_update_one(stats, start, end); 463 - 464 - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && 465 - stats->duration_stats.n > 1024) 466 - stats->buffer = 467 - alloc_percpu_gfp(struct bch2_time_stat_buffer, 468 - GFP_ATOMIC); 469 - spin_unlock_irqrestore(&stats->lock, flags); 470 - } else { 471 - struct bch2_time_stat_buffer *b; 472 - 473 - preempt_disable(); 474 - b = this_cpu_ptr(stats->buffer); 475 - 476 - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); 477 - b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { 478 - .start = start, 479 - .end = end 480 - }; 481 - 482 - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) 483 - bch2_time_stats_clear_buffer(stats, b); 484 - preempt_enable(); 485 - } 486 - } 487 - 488 347 static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) 489 348 { 490 - const struct time_unit *u = pick_time_units(ns); 349 + const struct time_unit *u = bch2_pick_time_units(ns); 491 350 492 351 prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); 493 352 prt_tab_rjust(out); ··· 365 506 366 507 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) 367 508 { 368 - const struct time_unit *u; 509 + struct quantiles *quantiles = time_stats_to_quantiles(stats); 369 510 s64 f_mean = 0, d_mean = 0; 370 - u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; 371 - int i; 511 + u64 f_stddev = 0, d_stddev = 0; 372 512 373 513 if (stats->buffer) { 374 514 int cpu; ··· 429 571 prt_tab(out); 430 572 bch2_pr_time_units_aligned(out, d_mean); 431 573 prt_tab(out); 432 - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); 574 + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, 
TIME_STATS_MV_WEIGHT)); 433 575 prt_newline(out); 434 576 435 577 prt_printf(out, "stddev:"); 436 578 prt_tab(out); 437 579 bch2_pr_time_units_aligned(out, d_stddev); 438 580 prt_tab(out); 439 - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); 581 + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); 440 582 441 583 printbuf_indent_sub(out, 2); 442 584 prt_newline(out); ··· 452 594 prt_tab(out); 453 595 bch2_pr_time_units_aligned(out, f_mean); 454 596 prt_tab(out); 455 - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); 597 + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); 456 598 prt_newline(out); 457 599 458 600 prt_printf(out, "stddev:"); 459 601 prt_tab(out); 460 602 bch2_pr_time_units_aligned(out, f_stddev); 461 603 prt_tab(out); 462 - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); 604 + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); 463 605 464 606 printbuf_indent_sub(out, 2); 465 607 prt_newline(out); 466 608 467 609 printbuf_tabstops_reset(out); 468 610 469 - i = eytzinger0_first(NR_QUANTILES); 470 - u = pick_time_units(stats->quantiles.entries[i].m); 611 + if (quantiles) { 612 + int i = eytzinger0_first(NR_QUANTILES); 613 + const struct time_unit *u = 614 + bch2_pick_time_units(quantiles->entries[i].m); 615 + u64 last_q = 0; 471 616 472 - prt_printf(out, "quantiles (%s):\t", u->name); 473 - eytzinger0_for_each(i, NR_QUANTILES) { 474 - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; 617 + prt_printf(out, "quantiles (%s):\t", u->name); 618 + eytzinger0_for_each(i, NR_QUANTILES) { 619 + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; 475 620 476 - q = 
max(stats->quantiles.entries[i].m, last_q); 477 - prt_printf(out, "%llu ", 478 - div_u64(q, u->nsecs)); 479 - if (is_last) 480 - prt_newline(out); 481 - last_q = q; 621 + u64 q = max(quantiles->entries[i].m, last_q); 622 + prt_printf(out, "%llu ", div_u64(q, u->nsecs)); 623 + if (is_last) 624 + prt_newline(out); 625 + last_q = q; 626 + } 482 627 } 483 - } 484 - #else 485 - void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} 486 - #endif 487 - 488 - void bch2_time_stats_exit(struct bch2_time_stats *stats) 489 - { 490 - free_percpu(stats->buffer); 491 - } 492 - 493 - void bch2_time_stats_init(struct bch2_time_stats *stats) 494 - { 495 - memset(stats, 0, sizeof(*stats)); 496 - stats->duration_stats_weighted.weight = 8; 497 - stats->freq_stats_weighted.weight = 8; 498 - stats->min_duration = U64_MAX; 499 - stats->min_freq = U64_MAX; 500 - spin_lock_init(&stats->lock); 501 628 } 502 629 503 630 /* ratelimit: */ ··· 848 1005 swap_func(base + r, base + c, size); 849 1006 } 850 1007 } 851 - } 852 - 853 - static void mempool_free_vp(void *element, void *pool_data) 854 - { 855 - size_t size = (size_t) pool_data; 856 - 857 - vpfree(element, size); 858 - } 859 - 860 - static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) 861 - { 862 - size_t size = (size_t) pool_data; 863 - 864 - return vpmalloc(size, gfp_mask); 865 - } 866 - 867 - int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) 868 - { 869 - return size < PAGE_SIZE 870 - ? mempool_init_kmalloc_pool(pool, min_nr, size) 871 - : mempool_init(pool, min_nr, mempool_alloc_vp, 872 - mempool_free_vp, (void *) size); 873 1008 } 874 1009 875 1010 #if 0
+30 -112
fs/bcachefs/util.h
··· 21 21 #include "mean_and_variance.h" 22 22 23 23 #include "darray.h" 24 + #include "time_stats.h" 24 25 25 26 struct closure; 26 27 ··· 54 53 PAGE_SIZE); 55 54 } 56 55 57 - static inline void vpfree(void *p, size_t size) 58 - { 59 - if (is_vmalloc_addr(p)) 60 - vfree(p); 61 - else 62 - free_pages((unsigned long) p, get_order(size)); 63 - } 64 - 65 - static inline void *vpmalloc(size_t size, gfp_t gfp_mask) 66 - { 67 - return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, 68 - get_order(size)) ?: 69 - __vmalloc(size, gfp_mask); 70 - } 71 - 72 - static inline void kvpfree(void *p, size_t size) 73 - { 74 - if (size < PAGE_SIZE) 75 - kfree(p); 76 - else 77 - vpfree(p, size); 78 - } 79 - 80 - static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) 81 - { 82 - return size < PAGE_SIZE 83 - ? kmalloc(size, gfp_mask) 84 - : vpmalloc(size, gfp_mask); 85 - } 86 - 87 - int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); 88 - 89 56 #define HEAP(type) \ 90 57 struct { \ 91 58 size_t size, used; \ ··· 66 97 ({ \ 67 98 (heap)->used = 0; \ 68 99 (heap)->size = (_size); \ 69 - (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ 100 + (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ 70 101 (gfp)); \ 71 102 }) 72 103 73 104 #define free_heap(heap) \ 74 105 do { \ 75 - kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ 106 + kvfree((heap)->data); \ 76 107 (heap)->data = NULL; \ 77 108 } while (0) 78 109 ··· 330 361 #endif 331 362 } 332 363 333 - #define NR_QUANTILES 15 334 - #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) 335 - #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) 336 - #define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) 337 - 338 - struct bch2_quantiles { 339 - struct bch2_quantile_entry { 340 - u64 m; 341 - u64 step; 342 - } entries[NR_QUANTILES]; 343 - }; 344 - 345 - struct bch2_time_stat_buffer { 346 - unsigned nr; 347 - struct bch2_time_stat_buffer_entry { 348 - u64 start; 349 - u64 end; 
350 - } entries[32]; 351 - }; 352 - 353 - struct bch2_time_stats { 354 - spinlock_t lock; 355 - /* all fields are in nanoseconds */ 356 - u64 min_duration; 357 - u64 max_duration; 358 - u64 total_duration; 359 - u64 max_freq; 360 - u64 min_freq; 361 - u64 last_event; 362 - struct bch2_quantiles quantiles; 363 - 364 - struct mean_and_variance duration_stats; 365 - struct mean_and_variance_weighted duration_stats_weighted; 366 - struct mean_and_variance freq_stats; 367 - struct mean_and_variance_weighted freq_stats_weighted; 368 - struct bch2_time_stat_buffer __percpu *buffer; 369 - }; 370 - 371 - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT 372 - void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); 373 - 374 - static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) 375 - { 376 - __bch2_time_stats_update(stats, start, local_clock()); 377 - } 378 - 379 - static inline bool track_event_change(struct bch2_time_stats *stats, 380 - u64 *start, bool v) 381 - { 382 - if (v != !!*start) { 383 - if (!v) { 384 - bch2_time_stats_update(stats, *start); 385 - *start = 0; 386 - } else { 387 - *start = local_clock() ?: 1; 388 - return true; 389 - } 390 - } 391 - 392 - return false; 393 - } 394 - #else 395 - static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} 396 - static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} 397 - static inline bool track_event_change(struct bch2_time_stats *stats, 398 - u64 *start, bool v) 399 - { 400 - bool ret = v && !*start; 401 - *start = v; 402 - return ret; 403 - } 404 - #endif 405 - 406 364 void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); 407 - 408 - void bch2_time_stats_exit(struct bch2_time_stats *); 409 - void bch2_time_stats_init(struct bch2_time_stats *); 410 365 411 366 #define ewma_add(ewma, val, weight) \ 412 367 ({ \ ··· 681 788 } 682 789 683 790 /* Move the gap in a gap buffer: */ 684 - #define 
move_gap(_array, _nr, _size, _old_gap, _new_gap) \ 685 - __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) 791 + #define move_gap(_d, _new_gap) \ 792 + do { \ 793 + __move_gap((_d)->data, sizeof((_d)->data[0]), \ 794 + (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \ 795 + (_d)->gap = _new_gap; \ 796 + } while (0) 686 797 687 798 #define bubble_sort(_base, _nr, _cmp) \ 688 799 do { \ ··· 772 875 773 876 void bch2_darray_str_exit(darray_str *); 774 877 int bch2_split_devs(const char *, darray_str *); 878 + 879 + #ifdef __KERNEL__ 880 + 881 + __must_check 882 + static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) 883 + { 884 + return copy_to_user(to, from, n) ? -EFAULT : 0; 885 + } 886 + 887 + __must_check 888 + static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n) 889 + { 890 + return copy_from_user(to, from, n) ? -EFAULT : 0; 891 + } 892 + 893 + #endif 894 + 895 + static inline void __set_bit_le64(size_t bit, __le64 *addr) 896 + { 897 + addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); 898 + } 775 899 776 900 #endif /* _BCACHEFS_UTIL_H */
+3 -2
fs/bcachefs/xattr.c
··· 544 544 kfree(buf); 545 545 546 546 if (ret < 0) 547 - return ret; 547 + goto err_class_exit; 548 548 549 549 ret = bch2_opt_check_may_set(c, opt_id, v); 550 550 if (ret < 0) 551 - return ret; 551 + goto err_class_exit; 552 552 553 553 s.v = v + 1; 554 554 s.defined = true; ··· 595 595 (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) 596 596 bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); 597 597 598 + err_class_exit: 598 599 return bch2_err_class(ret); 599 600 } 600 601
+4 -3
fs/inode.c
··· 2033 2033 return notify_change(idmap, dentry, &newattrs, NULL); 2034 2034 } 2035 2035 2036 - static int __file_remove_privs(struct file *file, unsigned int flags) 2036 + int file_remove_privs_flags(struct file *file, unsigned int flags) 2037 2037 { 2038 2038 struct dentry *dentry = file_dentry(file); 2039 2039 struct inode *inode = file_inode(file); ··· 2058 2058 inode_has_no_xattr(inode); 2059 2059 return error; 2060 2060 } 2061 + EXPORT_SYMBOL_GPL(file_remove_privs_flags); 2061 2062 2062 2063 /** 2063 2064 * file_remove_privs - remove special file privileges (suid, capabilities) ··· 2071 2070 */ 2072 2071 int file_remove_privs(struct file *file) 2073 2072 { 2074 - return __file_remove_privs(file, 0); 2073 + return file_remove_privs_flags(file, 0); 2075 2074 } 2076 2075 EXPORT_SYMBOL(file_remove_privs); 2077 2076 ··· 2164 2163 * Clear the security bits if the process is not being run by root. 2165 2164 * This keeps people from modifying setuid and setgid binaries. 2166 2165 */ 2167 - ret = __file_remove_privs(file, flags); 2166 + ret = file_remove_privs_flags(file, flags); 2168 2167 if (ret) 2169 2168 return ret; 2170 2169
+1
include/linux/fs.h
··· 3074 3074 extern struct inode *new_inode(struct super_block *sb); 3075 3075 extern void free_inode_nonrcu(struct inode *inode); 3076 3076 extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); 3077 + extern int file_remove_privs_flags(struct file *file, unsigned int flags); 3077 3078 extern int file_remove_privs(struct file *); 3078 3079 int setattr_should_drop_sgid(struct mnt_idmap *idmap, 3079 3080 const struct inode *inode);
+16 -13
include/linux/generic-radix-tree.h
··· 5 5 * DOC: Generic radix trees/sparse arrays 6 6 * 7 7 * Very simple and minimalistic, supporting arbitrary size entries up to 8 - * PAGE_SIZE. 8 + * GENRADIX_NODE_SIZE. 9 9 * 10 10 * A genradix is defined with the type it will store, like so: 11 11 * ··· 45 45 46 46 struct genradix_root; 47 47 48 + #define GENRADIX_NODE_SHIFT 9 49 + #define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) 50 + 48 51 struct __genradix { 49 52 struct genradix_root *root; 50 53 }; 51 54 52 55 /* 53 - * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE: 56 + * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: 54 57 */ 55 58 56 59 #define __GENRADIX_INITIALIZER \ ··· 104 101 static inline size_t __idx_to_offset(size_t idx, size_t obj_size) 105 102 { 106 103 if (__builtin_constant_p(obj_size)) 107 - BUILD_BUG_ON(obj_size > PAGE_SIZE); 104 + BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE); 108 105 else 109 - BUG_ON(obj_size > PAGE_SIZE); 106 + BUG_ON(obj_size > GENRADIX_NODE_SIZE); 110 107 111 108 if (!is_power_of_2(obj_size)) { 112 - size_t objs_per_page = PAGE_SIZE / obj_size; 109 + size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size; 113 110 114 - return (idx / objs_per_page) * PAGE_SIZE + 111 + return (idx / objs_per_page) * GENRADIX_NODE_SIZE + 115 112 (idx % objs_per_page) * obj_size; 116 113 } else { 117 114 return idx * obj_size; ··· 121 118 #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) 122 119 #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) 123 120 #define __genradix_objs_per_page(_radix) \ 124 - (PAGE_SIZE / sizeof((_radix)->type[0])) 121 + (GENRADIX_NODE_SIZE / sizeof((_radix)->type[0])) 125 122 #define __genradix_page_remainder(_radix) \ 126 - (PAGE_SIZE % sizeof((_radix)->type[0])) 123 + (GENRADIX_NODE_SIZE % sizeof((_radix)->type[0])) 127 124 128 125 #define __genradix_idx_to_offset(_radix, _idx) \ 129 126 __idx_to_offset(_idx, __genradix_obj_size(_radix)) ··· 220 217 iter->offset += obj_size; 221 218 222 
219 if (!is_power_of_2(obj_size) && 223 - (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE) 224 - iter->offset = round_up(iter->offset, PAGE_SIZE); 220 + (iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE) 221 + iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE); 225 222 226 223 iter->pos++; 227 224 } ··· 238 235 return; 239 236 } 240 237 241 - if ((iter->offset & (PAGE_SIZE - 1)) == 0) 242 - iter->offset -= PAGE_SIZE % obj_size; 238 + if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0) 239 + iter->offset -= GENRADIX_NODE_SIZE % obj_size; 243 240 244 241 iter->offset -= obj_size; 245 242 iter->pos--; ··· 266 263 genradix_for_each_from(_radix, _iter, _p, 0) 267 264 268 265 #define genradix_last_pos(_radix) \ 269 - (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) 266 + (SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1) 270 267 271 268 /** 272 269 * genradix_for_each_reverse - iterate over entry in a genradix, reverse order
+13
include/linux/mempool.h
··· 95 95 (void *) size); 96 96 } 97 97 98 + void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); 99 + void mempool_kvfree(void *element, void *pool_data); 100 + 101 + static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) 102 + { 103 + return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); 104 + } 105 + 106 + static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) 107 + { 108 + return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); 109 + } 110 + 98 111 /* 99 112 * A mempool_alloc_t and mempool_free_t for a simple page allocator that 100 113 * allocates pages of the order specified by pool_data
+2 -2
include/linux/sched.h
··· 1639 1639 * I am cleaning dirty pages from some other bdi. */ 1640 1640 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1641 1641 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ 1642 - #define PF__HOLE__00800000 0x00800000 1643 - #define PF__HOLE__01000000 0x01000000 1642 + #define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */ 1643 + #define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */ 1644 1644 #define PF__HOLE__02000000 0x02000000 1645 1645 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ 1646 1646 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
+39 -21
include/linux/sched/mm.h
··· 236 236 { 237 237 unsigned int pflags = READ_ONCE(current->flags); 238 238 239 - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { 239 + if (unlikely(pflags & (PF_MEMALLOC_NOIO | 240 + PF_MEMALLOC_NOFS | 241 + PF_MEMALLOC_NORECLAIM | 242 + PF_MEMALLOC_NOWARN | 243 + PF_MEMALLOC_PIN))) { 240 244 /* 241 - * NOIO implies both NOIO and NOFS and it is a weaker context 242 - * so always make sure it makes precedence 245 + * Stronger flags before weaker flags: 246 + * NORECLAIM implies NOIO, which in turn implies NOFS 243 247 */ 244 - if (pflags & PF_MEMALLOC_NOIO) 248 + if (pflags & PF_MEMALLOC_NORECLAIM) 249 + flags &= ~__GFP_DIRECT_RECLAIM; 250 + else if (pflags & PF_MEMALLOC_NOIO) 245 251 flags &= ~(__GFP_IO | __GFP_FS); 246 252 else if (pflags & PF_MEMALLOC_NOFS) 247 253 flags &= ~__GFP_FS; 254 + 255 + if (pflags & PF_MEMALLOC_NOWARN) 256 + flags |= __GFP_NOWARN; 248 257 249 258 if (pflags & PF_MEMALLOC_PIN) 250 259 flags &= ~__GFP_MOVABLE; ··· 316 307 } 317 308 318 309 /** 310 + * memalloc_flags_save - Add a PF_* flag to current->flags, save old value 311 + * 312 + * This allows PF_* flags to be conveniently added, irrespective of current 313 + * value, and then the old version restored with memalloc_flags_restore(). 314 + */ 315 + static inline unsigned memalloc_flags_save(unsigned flags) 316 + { 317 + unsigned oldflags = ~current->flags & flags; 318 + current->flags |= flags; 319 + return oldflags; 320 + } 321 + 322 + static inline void memalloc_flags_restore(unsigned flags) 323 + { 324 + current->flags &= ~flags; 325 + } 326 + 327 + /** 319 328 * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. 320 329 * 321 330 * This functions marks the beginning of the GFP_NOIO allocation scope. 
··· 347 320 */ 348 321 static inline unsigned int memalloc_noio_save(void) 349 322 { 350 - unsigned int flags = current->flags & PF_MEMALLOC_NOIO; 351 - current->flags |= PF_MEMALLOC_NOIO; 352 - return flags; 323 + return memalloc_flags_save(PF_MEMALLOC_NOIO); 353 324 } 354 325 355 326 /** ··· 360 335 */ 361 336 static inline void memalloc_noio_restore(unsigned int flags) 362 337 { 363 - current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; 338 + memalloc_flags_restore(flags); 364 339 } 365 340 366 341 /** ··· 377 352 */ 378 353 static inline unsigned int memalloc_nofs_save(void) 379 354 { 380 - unsigned int flags = current->flags & PF_MEMALLOC_NOFS; 381 - current->flags |= PF_MEMALLOC_NOFS; 382 - return flags; 355 + return memalloc_flags_save(PF_MEMALLOC_NOFS); 383 356 } 384 357 385 358 /** ··· 390 367 */ 391 368 static inline void memalloc_nofs_restore(unsigned int flags) 392 369 { 393 - current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; 370 + memalloc_flags_restore(flags); 394 371 } 395 372 396 373 /** ··· 418 395 */ 419 396 static inline unsigned int memalloc_noreclaim_save(void) 420 397 { 421 - unsigned int flags = current->flags & PF_MEMALLOC; 422 - current->flags |= PF_MEMALLOC; 423 - return flags; 398 + return memalloc_flags_save(PF_MEMALLOC); 424 399 } 425 400 426 401 /** ··· 431 410 */ 432 411 static inline void memalloc_noreclaim_restore(unsigned int flags) 433 412 { 434 - current->flags = (current->flags & ~PF_MEMALLOC) | flags; 413 + memalloc_flags_restore(flags); 435 414 } 436 415 437 416 /** ··· 446 425 */ 447 426 static inline unsigned int memalloc_pin_save(void) 448 427 { 449 - unsigned int flags = current->flags & PF_MEMALLOC_PIN; 450 - 451 - current->flags |= PF_MEMALLOC_PIN; 452 - return flags; 428 + return memalloc_flags_save(PF_MEMALLOC_PIN); 453 429 } 454 430 455 431 /** ··· 459 441 */ 460 442 static inline void memalloc_pin_restore(unsigned int flags) 461 443 { 462 - current->flags = (current->flags & ~PF_MEMALLOC_PIN) | 
flags; 444 + memalloc_flags_restore(flags); 463 445 } 464 446 465 447 #ifdef CONFIG_MEMCG
+1
kernel/hung_task.c
··· 43 43 * Zero means infinite timeout - no checking done: 44 44 */ 45 45 unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; 46 + EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); 46 47 47 48 /* 48 49 * Zero (default value) means use sysctl_hung_task_timeout_secs:
+12 -23
lib/generic-radix-tree.c
··· 5 5 #include <linux/gfp.h> 6 6 #include <linux/kmemleak.h> 7 7 8 - #define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *)) 8 + #define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) 9 9 #define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) 10 10 11 11 struct genradix_node { ··· 14 14 struct genradix_node *children[GENRADIX_ARY]; 15 15 16 16 /* Leaf: */ 17 - u8 data[PAGE_SIZE]; 17 + u8 data[GENRADIX_NODE_SIZE]; 18 18 }; 19 19 }; 20 20 21 21 static inline int genradix_depth_shift(unsigned depth) 22 22 { 23 - return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth; 23 + return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; 24 24 } 25 25 26 26 /* ··· 33 33 34 34 /* depth that's needed for a genradix that can address up to ULONG_MAX: */ 35 35 #define GENRADIX_MAX_DEPTH \ 36 - DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT) 36 + DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) 37 37 38 38 #define GENRADIX_DEPTH_MASK \ 39 39 ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) ··· 79 79 80 80 static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) 81 81 { 82 - struct genradix_node *node; 83 - 84 - node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO); 85 - 86 - /* 87 - * We're using pages (not slab allocations) directly for kernel data 88 - * structures, so we need to explicitly inform kmemleak of them in order 89 - * to avoid false positive memory leak reports. 
90 - */ 91 - kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask); 92 - return node; 82 + return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); 93 83 } 94 84 95 85 static inline void genradix_free_node(struct genradix_node *node) 96 86 { 97 - kmemleak_free(node); 98 - free_page((unsigned long)node); 87 + kfree(node); 99 88 } 100 89 101 90 /* ··· 189 200 i++; 190 201 iter->offset = round_down(iter->offset + objs_per_ptr, 191 202 objs_per_ptr); 192 - iter->pos = (iter->offset >> PAGE_SHIFT) * 203 + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * 193 204 objs_per_page; 194 205 if (i == GENRADIX_ARY) 195 206 goto restart; ··· 198 209 n = n->children[i]; 199 210 } 200 211 201 - return &n->data[iter->offset & (PAGE_SIZE - 1)]; 212 + return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; 202 213 } 203 214 EXPORT_SYMBOL(__genradix_iter_peek); 204 215 ··· 224 235 225 236 if (ilog2(iter->offset) >= genradix_depth_shift(level)) { 226 237 iter->offset = genradix_depth_size(level); 227 - iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; 238 + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; 228 239 229 240 iter->offset -= obj_size_plus_page_remainder; 230 241 iter->pos--; ··· 240 251 size_t objs_per_ptr = genradix_depth_size(level); 241 252 242 253 iter->offset = round_down(iter->offset, objs_per_ptr); 243 - iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; 254 + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; 244 255 245 256 if (!iter->offset) 246 257 return NULL; ··· 256 267 n = n->children[i]; 257 268 } 258 269 259 - return &n->data[iter->offset & (PAGE_SIZE - 1)]; 270 + return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; 260 271 } 261 272 EXPORT_SYMBOL(__genradix_iter_peek_prev); 262 273 ··· 278 289 { 279 290 size_t offset; 280 291 281 - for (offset = 0; offset < size; offset += PAGE_SIZE) 292 + for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) 282 293 if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) 283 294 
return -ENOMEM; 284 295
+13
mm/mempool.c
··· 590 590 } 591 591 EXPORT_SYMBOL(mempool_kfree); 592 592 593 + void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) 594 + { 595 + size_t size = (size_t)pool_data; 596 + return kvmalloc(size, gfp_mask); 597 + } 598 + EXPORT_SYMBOL(mempool_kvmalloc); 599 + 600 + void mempool_kvfree(void *element, void *pool_data) 601 + { 602 + kvfree(element); 603 + } 604 + EXPORT_SYMBOL(mempool_kvfree); 605 + 593 606 /* 594 607 * A simple mempool-backed page allocator that allocates pages 595 608 * of the order specified by pool_data.