Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: bch_sb_field_errors

Add a new superblock section to keep counts of errors seen since
filesystem creation: we'll be adding counters for every distinct fsck
error.

The new superblock section has entries of the form [ id, count,
time_of_last_error ]; this is intended to let us see what errors are
occurring - and getting fixed - via show-super output.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+270 -23
+1
fs/bcachefs/Makefile
··· 70 70 reflink.o \ 71 71 replicas.o \ 72 72 sb-clean.o \ 73 + sb-errors.o \ 73 74 sb-members.o \ 74 75 siphash.o \ 75 76 six.o \
+9 -5
fs/bcachefs/bcachefs.h
··· 209 209 #include "nocow_locking_types.h" 210 210 #include "opts.h" 211 211 #include "recovery_types.h" 212 + #include "sb-errors_types.h" 212 213 #include "seqmutex.h" 213 214 #include "util.h" 214 215 ··· 993 992 struct bio_set dio_read_bioset; 994 993 struct bio_set nocow_flush_bioset; 995 994 996 - /* ERRORS */ 997 - struct list_head fsck_errors; 998 - struct mutex fsck_error_lock; 999 - bool fsck_alloc_err; 1000 - 1001 995 /* QUOTAS */ 1002 996 struct bch_memquota_type quotas[QTYP_NR]; 1003 997 ··· 1041 1045 struct bch2_time_stats times[BCH_TIME_STAT_NR]; 1042 1046 1043 1047 struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; 1048 + 1049 + /* ERRORS */ 1050 + struct list_head fsck_error_msgs; 1051 + struct mutex fsck_error_msgs_lock; 1052 + bool fsck_alloc_msgs_err; 1053 + 1054 + bch_sb_errors_cpu fsck_error_counts; 1055 + struct mutex fsck_error_counts_lock; 1044 1056 }; 1045 1057 1046 1058 extern struct wait_queue_head bch2_read_only_wait;
+13 -1
fs/bcachefs/bcachefs_format.h
··· 1218 1218 x(journal_seq_blacklist, 8) \ 1219 1219 x(journal_v2, 9) \ 1220 1220 x(counters, 10) \ 1221 - x(members_v2, 11) 1221 + x(members_v2, 11) \ 1222 + x(errors, 12) 1222 1223 1223 1224 enum bch_sb_field_type { 1224 1225 #define x(f, nr) BCH_SB_FIELD_##f = nr, ··· 1621 1620 struct journal_seq_blacklist_entry start[0]; 1622 1621 __u64 _data[]; 1623 1622 }; 1623 + 1624 + struct bch_sb_field_errors { 1625 + struct bch_sb_field field; 1626 + struct bch_sb_field_error_entry { 1627 + __le64 v; 1628 + __le64 last_error_time; 1629 + } entries[]; 1630 + }; 1631 + 1632 + LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); 1633 + LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); 1624 1634 1625 1635 /* Superblock: */ 1626 1636
+1
fs/bcachefs/errcode.h
··· 213 213 x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ 214 214 x(BCH_ERR_invalid_sb, invalid_sb_clean) \ 215 215 x(BCH_ERR_invalid_sb, invalid_sb_quota) \ 216 + x(BCH_ERR_invalid_sb, invalid_sb_errors) \ 216 217 x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ 217 218 x(BCH_ERR_invalid, invalid_bkey) \ 218 219 x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
+11 -11
fs/bcachefs/error.c
··· 117 117 if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) 118 118 return NULL; 119 119 120 - list_for_each_entry(s, &c->fsck_errors, list) 120 + list_for_each_entry(s, &c->fsck_error_msgs, list) 121 121 if (s->fmt == fmt) { 122 122 /* 123 123 * move it to the head of the list: repeated fsck errors 124 124 * are common 125 125 */ 126 - list_move(&s->list, &c->fsck_errors); 126 + list_move(&s->list, &c->fsck_error_msgs); 127 127 return s; 128 128 } 129 129 130 130 s = kzalloc(sizeof(*s), GFP_NOFS); 131 131 if (!s) { 132 - if (!c->fsck_alloc_err) 132 + if (!c->fsck_alloc_msgs_err) 133 133 bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); 134 - c->fsck_alloc_err = true; 134 + c->fsck_alloc_msgs_err = true; 135 135 return NULL; 136 136 } 137 137 138 138 INIT_LIST_HEAD(&s->list); 139 139 s->fmt = fmt; 140 - list_add(&s->list, &c->fsck_errors); 140 + list_add(&s->list, &c->fsck_error_msgs); 141 141 return s; 142 142 } 143 143 ··· 153 153 prt_vprintf(out, fmt, args); 154 154 va_end(args); 155 155 156 - mutex_lock(&c->fsck_error_lock); 156 + mutex_lock(&c->fsck_error_msgs_lock); 157 157 s = fsck_err_get(c, fmt); 158 158 if (s) { 159 159 /* ··· 163 163 */ 164 164 if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { 165 165 ret = s->ret; 166 - mutex_unlock(&c->fsck_error_lock); 166 + mutex_unlock(&c->fsck_error_msgs_lock); 167 167 printbuf_exit(&buf); 168 168 return ret; 169 169 } ··· 258 258 if (s) 259 259 s->ret = ret; 260 260 261 - mutex_unlock(&c->fsck_error_lock); 261 + mutex_unlock(&c->fsck_error_msgs_lock); 262 262 263 263 printbuf_exit(&buf); 264 264 ··· 279 279 { 280 280 struct fsck_err_state *s, *n; 281 281 282 - mutex_lock(&c->fsck_error_lock); 282 + mutex_lock(&c->fsck_error_msgs_lock); 283 283 284 - list_for_each_entry_safe(s, n, &c->fsck_errors, list) { 284 + list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { 285 285 if (s->ratelimited && s->last_msg) 286 286 bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); 287 287 ··· 290 290 kfree(s); 
291 291 } 292 292 293 - mutex_unlock(&c->fsck_error_lock); 293 + mutex_unlock(&c->fsck_error_msgs_lock); 294 294 }
+175
fs/bcachefs/sb-errors.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "bcachefs.h" 4 + #include "sb-errors.h" 5 + #include "super-io.h" 6 + 7 + static const char * const bch2_sb_error_strs[] = { 8 + #define x(t, n, ...) [n] = #t, 9 + BCH_SB_ERRS() 10 + NULL 11 + }; 12 + 13 + static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) 14 + { 15 + if (id < BCH_SB_ERR_MAX) 16 + prt_str(out, bch2_sb_error_strs[id]); 17 + else 18 + prt_printf(out, "(unknown error %u)", id); 19 + } 20 + 21 + static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e) 22 + { 23 + return e 24 + ? (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0]) 25 + : 0; 26 + } 27 + 28 + static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) 29 + { 30 + return (sizeof(struct bch_sb_field_errors) + 31 + sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64); 32 + } 33 + 34 + static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, 35 + struct printbuf *err) 36 + { 37 + struct bch_sb_field_errors *e = field_to_type(f, errors); 38 + unsigned i, nr = bch2_sb_field_errors_nr_entries(e); 39 + 40 + for (i = 0; i < nr; i++) { 41 + if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) { 42 + prt_printf(err, "entry with count 0 (id "); 43 + bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); 44 + prt_printf(err, ")"); 45 + return -BCH_ERR_invalid_sb_errors; 46 + } 47 + 48 + if (i + 1 < nr && 49 + BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >= 50 + BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) { 51 + prt_printf(err, "entries out of order"); 52 + return -BCH_ERR_invalid_sb_errors; 53 + } 54 + } 55 + 56 + return 0; 57 + } 58 + 59 + static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, 60 + struct bch_sb_field *f) 61 + { 62 + struct bch_sb_field_errors *e = field_to_type(f, errors); 63 + unsigned i, nr = bch2_sb_field_errors_nr_entries(e); 64 + u64 now = ktime_get_real_seconds(); 65 + 66 + if 
(out->nr_tabstops <= 1) 67 + printbuf_tabstop_push(out, 16); 68 + 69 + for (i = 0; i < nr; i++) { 70 + bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); 71 + prt_tab(out); 72 + prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); 73 + prt_tab(out); 74 + bch2_pr_time_units(out, (now - le64_to_cpu(e->entries[i].last_error_time)) * 75 + NSEC_PER_SEC); 76 + prt_str(out, " ago"); 77 + prt_newline(out); 78 + } 79 + } 80 + 81 + const struct bch_sb_field_ops bch_sb_field_ops_errors = { 82 + .validate = bch2_sb_errors_validate, 83 + .to_text = bch2_sb_errors_to_text, 84 + }; 85 + 86 + void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) 87 + { 88 + bch_sb_errors_cpu *e = &c->fsck_error_counts; 89 + struct bch_sb_error_entry_cpu n = { 90 + .id = err, 91 + .nr = 1, 92 + .last_error_time = ktime_get_real_seconds() 93 + }; 94 + unsigned i; 95 + 96 + mutex_lock(&c->fsck_error_counts_lock); 97 + for (i = 0; i < e->nr; i++) { 98 + if (err == e->data[i].id) { 99 + e->data[i].nr++; 100 + e->data[i].last_error_time = n.last_error_time; 101 + goto out; 102 + } 103 + if (err < e->data[i].id) 104 + break; 105 + } 106 + 107 + if (darray_make_room(e, 1)) 108 + goto out; 109 + 110 + darray_insert_item(e, i, n); 111 + out: 112 + mutex_unlock(&c->fsck_error_counts_lock); 113 + } 114 + 115 + void bch2_sb_errors_from_cpu(struct bch_fs *c) 116 + { 117 + bch_sb_errors_cpu *src = &c->fsck_error_counts; 118 + struct bch_sb_field_errors *dst = 119 + bch2_sb_field_resize(&c->disk_sb, errors, 120 + bch2_sb_field_errors_u64s(src->nr)); 121 + unsigned i; 122 + 123 + if (!dst) 124 + return; 125 + 126 + for (i = 0; i < src->nr; i++) { 127 + SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); 128 + SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); 129 + dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); 130 + } 131 + } 132 + 133 + static int bch2_sb_errors_to_cpu(struct bch_fs *c) 134 + { 135 + struct bch_sb_field_errors 
*src = bch2_sb_field_get(c->disk_sb.sb, errors); 136 + bch_sb_errors_cpu *dst = &c->fsck_error_counts; 137 + unsigned i, nr = bch2_sb_field_errors_nr_entries(src); 138 + int ret; 139 + 140 + if (!nr) 141 + return 0; 142 + 143 + mutex_lock(&c->fsck_error_counts_lock); 144 + ret = darray_make_room(dst, nr); 145 + if (ret) 146 + goto err; 147 + 148 + dst->nr = nr; 149 + 150 + for (i = 0; i < nr; i++) { 151 + dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]); 152 + dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]); 153 + dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time); 154 + } 155 + err: 156 + mutex_unlock(&c->fsck_error_counts_lock); 157 + 158 + return ret; 159 + } 160 + 161 + void bch2_fs_sb_errors_exit(struct bch_fs *c) 162 + { 163 + darray_exit(&c->fsck_error_counts); 164 + } 165 + 166 + void bch2_fs_sb_errors_init_early(struct bch_fs *c) 167 + { 168 + mutex_init(&c->fsck_error_counts_lock); 169 + darray_init(&c->fsck_error_counts); 170 + } 171 + 172 + int bch2_fs_sb_errors_init(struct bch_fs *c) 173 + { 174 + return bch2_sb_errors_to_cpu(c); 175 + }
+26
fs/bcachefs/sb-errors.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _BCACHEFS_SB_ERRORS_H 3 + #define _BCACHEFS_SB_ERRORS_H 4 + 5 + #include "sb-errors_types.h" 6 + 7 + #define BCH_SB_ERRS() 8 + 9 + enum bch_sb_error_id { 10 + #define x(t, n) BCH_FSCK_ERR_##t = n, 11 + BCH_SB_ERRS() 12 + #undef x 13 + BCH_SB_ERR_MAX 14 + }; 15 + 16 + extern const struct bch_sb_field_ops bch_sb_field_ops_errors; 17 + 18 + void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); 19 + 20 + void bch2_sb_errors_from_cpu(struct bch_fs *); 21 + 22 + void bch2_fs_sb_errors_exit(struct bch_fs *); 23 + void bch2_fs_sb_errors_init_early(struct bch_fs *); 24 + int bch2_fs_sb_errors_init(struct bch_fs *); 25 + 26 + #endif /* _BCACHEFS_SB_ERRORS_H */
+16
fs/bcachefs/sb-errors_types.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _BCACHEFS_SB_ERRORS_TYPES_H 3 + #define _BCACHEFS_SB_ERRORS_TYPES_H 4 + 5 + #include "darray.h" 6 + 7 + struct bch_sb_error_entry_cpu { 8 + u64 id:16, 9 + nr:48; 10 + u64 last_error_time; 11 + }; 12 + 13 + typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; 14 + 15 + #endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ 16 +
+1 -1
fs/bcachefs/sb-members.c
··· 84 84 return 0; 85 85 } 86 86 87 - int bch2_members_v2_init(struct bch_fs *c) 87 + int bch2_sb_members_v2_init(struct bch_fs *c) 88 88 { 89 89 struct bch_sb_field_members_v1 *mi1; 90 90 struct bch_sb_field_members_v2 *mi2;
+1 -1
fs/bcachefs/sb-members.h
··· 4 4 5 5 extern char * const bch2_member_error_strs[]; 6 6 7 - int bch2_members_v2_init(struct bch_fs *c); 7 + int bch2_sb_members_v2_init(struct bch_fs *c); 8 8 int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); 9 9 struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); 10 10 struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
+3
fs/bcachefs/super-io.c
··· 13 13 #include "replicas.h" 14 14 #include "quota.h" 15 15 #include "sb-clean.h" 16 + #include "sb-errors.h" 16 17 #include "sb-members.h" 17 18 #include "super-io.h" 18 19 #include "super.h" ··· 898 897 SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); 899 898 900 899 bch2_sb_counters_from_cpu(c); 900 + bch2_sb_members_from_cpu(c); 901 901 bch2_sb_members_cpy_v2_v1(&c->disk_sb); 902 + bch2_sb_errors_from_cpu(c); 902 903 903 904 for_each_online_member(ca, c, i) 904 905 bch2_sb_from_fs(c, ca);
+5
fs/bcachefs/super-io.h
··· 23 23 unsigned, 24 24 unsigned); 25 25 26 + static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) 27 + { 28 + return le32_to_cpu(f->u64s) * sizeof(u64); 29 + } 30 + 26 31 #define field_to_type(_f, _name) \ 27 32 container_of_or_null(_f, struct bch_sb_field_##_name, field) 28 33
+8 -4
fs/bcachefs/super.c
··· 49 49 #include "recovery.h" 50 50 #include "replicas.h" 51 51 #include "sb-clean.h" 52 + #include "sb-errors.h" 52 53 #include "sb-members.h" 53 54 #include "snapshot.h" 54 55 #include "subvolume.h" ··· 401 400 402 401 bch_info(c, "going read-write"); 403 402 404 - ret = bch2_members_v2_init(c); 403 + ret = bch2_sb_members_v2_init(c); 405 404 if (ret) 406 405 goto err; 407 406 ··· 482 481 bch2_time_stats_exit(&c->times[i]); 483 482 484 483 bch2_free_pending_node_rewrites(c); 484 + bch2_fs_sb_errors_exit(c); 485 485 bch2_fs_counters_exit(c); 486 486 bch2_fs_snapshots_exit(c); 487 487 bch2_fs_quota_exit(c); ··· 715 713 bch2_fs_quota_init(c); 716 714 bch2_fs_ec_init_early(c); 717 715 bch2_fs_move_init(c); 716 + bch2_fs_sb_errors_init_early(c); 718 717 719 718 INIT_LIST_HEAD(&c->list); 720 719 ··· 732 729 733 730 INIT_LIST_HEAD(&c->journal_iters); 734 731 735 - INIT_LIST_HEAD(&c->fsck_errors); 736 - mutex_init(&c->fsck_error_lock); 732 + INIT_LIST_HEAD(&c->fsck_error_msgs); 733 + mutex_init(&c->fsck_error_msgs_lock); 737 734 738 735 seqcount_init(&c->gc_pos_lock); 739 736 ··· 843 840 } 844 841 845 842 ret = bch2_fs_counters_init(c) ?: 843 + bch2_fs_sb_errors_init(c) ?: 846 844 bch2_io_clock_init(&c->io_clock[READ]) ?: 847 845 bch2_io_clock_init(&c->io_clock[WRITE]) ?: 848 846 bch2_fs_journal_init(&c->journal) ?: ··· 946 942 947 943 mutex_lock(&c->sb_lock); 948 944 949 - ret = bch2_members_v2_init(c); 945 + ret = bch2_sb_members_v2_init(c); 950 946 if (ret) { 951 947 mutex_unlock(&c->sb_lock); 952 948 goto err;