Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0
2#include "bcachefs.h"
3#include "alloc_background.h"
4#include "alloc_foreground.h"
5#include "backpointers.h"
6#include "bkey_buf.h"
7#include "btree_cache.h"
8#include "btree_io.h"
9#include "btree_key_cache.h"
10#include "btree_update.h"
11#include "btree_update_interior.h"
12#include "btree_gc.h"
13#include "btree_write_buffer.h"
14#include "buckets.h"
15#include "buckets_waiting_for_journal.h"
16#include "clock.h"
17#include "debug.h"
18#include "ec.h"
19#include "error.h"
20#include "lru.h"
21#include "recovery.h"
22#include "trace.h"
23#include "varint.h"
24
25#include <linux/kthread.h>
26#include <linux/math64.h>
27#include <linux/random.h>
28#include <linux/rculist.h>
29#include <linux/rcupdate.h>
30#include <linux/sched/task.h>
31#include <linux/sort.h>
32
33static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
34
35/* Persistent alloc info: */
36
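/*
 * Byte widths of the optional v1 alloc fields, generated from
 * BCH_ALLOC_FIELDS_V1(); which fields are actually present in a given key is
 * recorded in the bch_alloc->fields bitmask.
 */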
37static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
38#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
39 BCH_ALLOC_FIELDS_V1()
40#undef x
41};
42
43struct bkey_alloc_unpacked {
44 u64 journal_seq;
45 u8 gen;
46 u8 oldest_gen;
47 u8 data_type;
48 bool need_discard:1;
49 bool need_inc_gen:1;
50#define x(_name, _bits) u##_bits _name;
51 BCH_ALLOC_FIELDS_V2()
52#undef x
53};
54
55static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
56 const void **p, unsigned field)
57{
58 unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
59 u64 v;
60
61 if (!(a->fields & (1 << field)))
62 return 0;
63
64 switch (bytes) {
65 case 1:
66 v = *((const u8 *) *p);
67 break;
68 case 2:
69 v = le16_to_cpup(*p);
70 break;
71 case 4:
72 v = le32_to_cpup(*p);
73 break;
74 case 8:
75 v = le64_to_cpup(*p);
76 break;
77 default:
78 BUG();
79 }
80
81 *p += bytes;
82 return v;
83}
84
85static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
86 struct bkey_s_c k)
87{
88 const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
89 const void *d = in->data;
90 unsigned idx = 0;
91
92 out->gen = in->gen;
93
94#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
95 BCH_ALLOC_FIELDS_V1()
96#undef x
97}
98
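/*
 * v2 and v3 alloc keys encode their fields as a sequence of varints; fields
 * past nr_fields decode as zero, and a value that doesn't fit the unpacked
 * struct's field width is treated as an unpack error.
 */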
99static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
100 struct bkey_s_c k)
101{
102 struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
103 const u8 *in = a.v->data;
104 const u8 *end = bkey_val_end(a);
105 unsigned fieldnr = 0;
106 int ret;
107 u64 v;
108
109 out->gen = a.v->gen;
110 out->oldest_gen = a.v->oldest_gen;
111 out->data_type = a.v->data_type;
112
113#define x(_name, _bits) \
114 if (fieldnr < a.v->nr_fields) { \
115 ret = bch2_varint_decode_fast(in, end, &v); \
116 if (ret < 0) \
117 return ret; \
118 in += ret; \
119 } else { \
120 v = 0; \
121 } \
122 out->_name = v; \
123 if (v != out->_name) \
124 return -1; \
125 fieldnr++;
126
127 BCH_ALLOC_FIELDS_V2()
128#undef x
129 return 0;
130}
131
132static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
133 struct bkey_s_c k)
134{
135 struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
136 const u8 *in = a.v->data;
137 const u8 *end = bkey_val_end(a);
138 unsigned fieldnr = 0;
139 int ret;
140 u64 v;
141
142 out->gen = a.v->gen;
143 out->oldest_gen = a.v->oldest_gen;
144 out->data_type = a.v->data_type;
145 out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
146 out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
147 out->journal_seq = le64_to_cpu(a.v->journal_seq);
148
149#define x(_name, _bits) \
150 if (fieldnr < a.v->nr_fields) { \
151 ret = bch2_varint_decode_fast(in, end, &v); \
152 if (ret < 0) \
153 return ret; \
154 in += ret; \
155 } else { \
156 v = 0; \
157 } \
158 out->_name = v; \
159 if (v != out->_name) \
160 return -1; \
161 fieldnr++;
162
163 BCH_ALLOC_FIELDS_V2()
164#undef x
165 return 0;
166}
167
168static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
169{
170 struct bkey_alloc_unpacked ret = { .gen = 0 };
171
172 switch (k.k->type) {
173 case KEY_TYPE_alloc:
174 bch2_alloc_unpack_v1(&ret, k);
175 break;
176 case KEY_TYPE_alloc_v2:
177 bch2_alloc_unpack_v2(&ret, k);
178 break;
179 case KEY_TYPE_alloc_v3:
180 bch2_alloc_unpack_v3(&ret, k);
181 break;
182 }
183
184 return ret;
185}
186
187static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
188{
189 unsigned i, bytes = offsetof(struct bch_alloc, data);
190
191 for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
192 if (a->fields & (1 << i))
193 bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
194
195 return DIV_ROUND_UP(bytes, sizeof(u64));
196}
197
198int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
199 enum bch_validate_flags flags,
200 struct printbuf *err)
201{
202 struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
203 int ret = 0;
204
205 /* allow for unknown fields */
206 bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
207 alloc_v1_val_size_bad,
208 "incorrect value size (%zu < %u)",
209 bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
210fsck_err:
211 return ret;
212}
213
214int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
215 enum bch_validate_flags flags,
216 struct printbuf *err)
217{
218 struct bkey_alloc_unpacked u;
219 int ret = 0;
220
221 bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
222 alloc_v2_unpack_error,
223 "unpack error");
224fsck_err:
225 return ret;
226}
227
228int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
229 enum bch_validate_flags flags,
230 struct printbuf *err)
231{
232 struct bkey_alloc_unpacked u;
233 int ret = 0;
234
235 bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
236 alloc_v2_unpack_error,
237 "unpack error");
238fsck_err:
239 return ret;
240}
241
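/*
 * v4 validation: check value size, backpointers start, data_type consistency,
 * io_time bounds, and that the sector counts make sense for the data type.
 */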
242int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
243 enum bch_validate_flags flags, struct printbuf *err)
244{
245 struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
246 int ret = 0;
247
248 bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err,
249 alloc_v4_val_size_bad,
250 "bad val size (%u > %zu)",
251 alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
252
253 bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
254 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
255 alloc_v4_backpointers_start_bad,
256 "invalid backpointers_start");
257
258 bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
259 alloc_key_data_type_bad,
260 "invalid data type (got %u should be %u)",
261 a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
262
263 for (unsigned i = 0; i < 2; i++)
264 bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
265 c, err,
266 alloc_key_io_time_bad,
267 "invalid io_time[%s]: %llu, max %llu",
268 i == READ ? "read" : "write",
269 a.v->io_time[i], LRU_TIME_MAX);
270
271 switch (a.v->data_type) {
272 case BCH_DATA_free:
273 case BCH_DATA_need_gc_gens:
274 case BCH_DATA_need_discard:
275 bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe,
276 c, err, alloc_key_empty_but_have_data,
277 "empty data type free but have data");
278 break;
279 case BCH_DATA_sb:
280 case BCH_DATA_journal:
281 case BCH_DATA_btree:
282 case BCH_DATA_user:
283 case BCH_DATA_parity:
284 bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
285 c, err, alloc_key_dirty_sectors_0,
286 "data_type %s but dirty_sectors==0",
287 bch2_data_type_str(a.v->data_type));
288 break;
289 case BCH_DATA_cached:
290 bkey_fsck_err_on(!a.v->cached_sectors ||
291 bch2_bucket_sectors_dirty(*a.v) ||
292 a.v->stripe,
293 c, err, alloc_key_cached_inconsistency,
294 "data type inconsistency");
295
296 bkey_fsck_err_on(!a.v->io_time[READ] &&
297 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
298 c, err, alloc_key_cached_but_read_time_zero,
299 "cached bucket with read_time == 0");
300 break;
301 case BCH_DATA_stripe:
302 break;
303 }
304fsck_err:
305 return ret;
306}
307
308void bch2_alloc_v4_swab(struct bkey_s k)
309{
310 struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
311 struct bch_backpointer *bp, *bps;
312
313 a->journal_seq = swab64(a->journal_seq);
314 a->flags = swab32(a->flags);
315 a->dirty_sectors = swab32(a->dirty_sectors);
316 a->cached_sectors = swab32(a->cached_sectors);
317 a->io_time[0] = swab64(a->io_time[0]);
318 a->io_time[1] = swab64(a->io_time[1]);
319 a->stripe = swab32(a->stripe);
320 a->nr_external_backpointers = swab32(a->nr_external_backpointers);
321 a->fragmentation_lru = swab64(a->fragmentation_lru);
322
323 bps = alloc_v4_backpointers(a);
324 for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
325 bp->bucket_offset = swab40(bp->bucket_offset);
326 bp->bucket_len = swab32(bp->bucket_len);
327 bch2_bpos_swab(&bp->pos);
328 }
329}
330
331void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
332{
333 struct bch_alloc_v4 _a;
334 const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
335
336 prt_newline(out);
337 printbuf_indent_add(out, 2);
338
339 prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
340 bch2_prt_data_type(out, a->data_type);
341 prt_newline(out);
342 prt_printf(out, "journal_seq %llu\n", a->journal_seq);
343 prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
344 prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
345 prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
346 prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
347 prt_printf(out, "stripe %u\n", a->stripe);
348 prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
349 prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
350 prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
351 prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru);
352 prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
353 printbuf_indent_sub(out, 2);
354}
355
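/*
 * Convert an alloc key of any version to the in-memory v4 representation; for
 * v4 keys any inline backpointers are dropped from the copy.
 */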
356void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
357{
358 if (k.k->type == KEY_TYPE_alloc_v4) {
359 void *src, *dst;
360
361 *out = *bkey_s_c_to_alloc_v4(k).v;
362
363 src = alloc_v4_backpointers(out);
364 SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
365 dst = alloc_v4_backpointers(out);
366
367 if (src < dst)
368 memset(src, 0, dst - src);
369
370 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
371 } else {
372 struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
373
374 *out = (struct bch_alloc_v4) {
375 .journal_seq = u.journal_seq,
376 .flags = u.need_discard,
377 .gen = u.gen,
378 .oldest_gen = u.oldest_gen,
379 .data_type = u.data_type,
380 .stripe_redundancy = u.stripe_redundancy,
381 .dirty_sectors = u.dirty_sectors,
382 .cached_sectors = u.cached_sectors,
383 .io_time[READ] = u.read_time,
384 .io_time[WRITE] = u.write_time,
385 .stripe = u.stripe,
386 };
387
388 SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
389 }
390}
391
392static noinline struct bkey_i_alloc_v4 *
393__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
394{
395 struct bkey_i_alloc_v4 *ret;
396
397 ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
398 if (IS_ERR(ret))
399 return ret;
400
401 if (k.k->type == KEY_TYPE_alloc_v4) {
402 void *src, *dst;
403
404 bkey_reassemble(&ret->k_i, k);
405
406 src = alloc_v4_backpointers(&ret->v);
407 SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
408 dst = alloc_v4_backpointers(&ret->v);
409
410 if (src < dst)
411 memset(src, 0, dst - src);
412
413 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
414 set_alloc_v4_u64s(ret);
415 } else {
416 bkey_alloc_v4_init(&ret->k_i);
417 ret->k.p = k.k->p;
418 bch2_alloc_to_v4(k, &ret->v);
419 }
420 return ret;
421}
422
423static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
424{
425 struct bkey_s_c_alloc_v4 a;
426
427 if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
428 ((a = bkey_s_c_to_alloc_v4(k), true) &&
429 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
430 return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
431
432 return __bch2_alloc_to_v4_mut(trans, k);
433}
434
435struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
436{
437 return bch2_alloc_to_v4_mut_inlined(trans, k);
438}
439
440struct bkey_i_alloc_v4 *
441bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
442 struct bpos pos)
443{
444 struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
445 BTREE_ITER_with_updates|
446 BTREE_ITER_cached|
447 BTREE_ITER_intent);
448 int ret = bkey_err(k);
449 if (unlikely(ret))
450 return ERR_PTR(ret);
451
452 struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
453 ret = PTR_ERR_OR_ZERO(a);
454 if (unlikely(ret))
455 goto err;
456 return a;
457err:
458 bch2_trans_iter_exit(trans, iter);
459 return ERR_PTR(ret);
460}
461
462__flatten
463struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
464{
465 struct btree_iter iter;
466 struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
467 int ret = PTR_ERR_OR_ZERO(a);
468 if (ret)
469 return ERR_PTR(ret);
470
471 ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
472 bch2_trans_iter_exit(trans, &iter);
473 return unlikely(ret) ? ERR_PTR(ret) : a;
474}
475
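/*
 * Each bucket_gens key packs the generation numbers of
 * KEY_TYPE_BUCKET_GENS_NR consecutive buckets: bucket (dev, b) maps to key
 * position (dev, b >> KEY_TYPE_BUCKET_GENS_BITS), with its gen stored at
 * array index (b & KEY_TYPE_BUCKET_GENS_MASK).
 */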
476static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
477{
478 *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
479
480 pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
481 return pos;
482}
483
484static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
485{
486 pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
487 pos.offset += offset;
488 return pos;
489}
490
491static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
492{
493 return k.k->type == KEY_TYPE_bucket_gens
494 ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
495 : 0;
496}
497
498int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
499 enum bch_validate_flags flags,
500 struct printbuf *err)
501{
502 int ret = 0;
503
504 bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
505 bucket_gens_val_size_bad,
506 "bad val size (%zu != %zu)",
507 bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
508fsck_err:
509 return ret;
510}
511
512void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
513{
514 struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
515 unsigned i;
516
517 for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
518 if (i)
519 prt_char(out, ' ');
520 prt_printf(out, "%u", g.v->gens[i]);
521 }
522}
523
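/*
 * Build the bucket_gens btree from the alloc btree, accumulating gens into a
 * full bucket_gens key before each insert/commit:
 */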
524int bch2_bucket_gens_init(struct bch_fs *c)
525{
526 struct btree_trans *trans = bch2_trans_get(c);
527 struct bkey_i_bucket_gens g;
528 bool have_bucket_gens_key = false;
529 int ret;
530
531 ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
532 BTREE_ITER_prefetch, k, ({
533 /*
534 * Not a fsck error because this is checked/repaired by
535 * bch2_check_alloc_key() which runs later:
536 */
537 if (!bch2_dev_bucket_exists(c, k.k->p))
538 continue;
539
540 struct bch_alloc_v4 a;
541 u8 gen = bch2_alloc_to_v4(k, &a)->gen;
542 unsigned offset;
543 struct bpos pos = alloc_gens_pos(iter.pos, &offset);
544 int ret2 = 0;
545
546 if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
547 ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
548 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
549 if (ret2)
550 goto iter_err;
551 have_bucket_gens_key = false;
552 }
553
554 if (!have_bucket_gens_key) {
555 bkey_bucket_gens_init(&g.k_i);
556 g.k.p = pos;
557 have_bucket_gens_key = true;
558 }
559
560 g.v.gens[offset] = gen;
561iter_err:
562 ret2;
563 }));
564
565 if (have_bucket_gens_key && !ret)
566 ret = commit_do(trans, NULL, NULL,
567 BCH_TRANS_COMMIT_no_enospc,
568 bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
569
570 bch2_trans_put(trans);
571
572 bch_err_fn(c, ret);
573 return ret;
574}
575
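/*
 * At startup, populate the in-memory bucket generation numbers from the
 * bucket_gens btree if the filesystem is new enough, otherwise from the
 * alloc btree:
 */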
576int bch2_alloc_read(struct bch_fs *c)
577{
578 struct btree_trans *trans = bch2_trans_get(c);
579 struct bch_dev *ca = NULL;
580 int ret;
581
582 down_read(&c->gc_lock);
583
584 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
585 ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
586 BTREE_ITER_prefetch, k, ({
587 u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
588 u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
589
590 if (k.k->type != KEY_TYPE_bucket_gens)
591 continue;
592
593 ca = bch2_dev_iterate(c, ca, k.k->p.inode);
594 /*
595 * Not a fsck error because this is checked/repaired by
596 * bch2_check_alloc_key() which runs later:
597 */
598 if (!ca) {
599 bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
600 continue;
601 }
602
603 const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
604
605 for (u64 b = max_t(u64, ca->mi.first_bucket, start);
606 b < min_t(u64, ca->mi.nbuckets, end);
607 b++)
608 *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
609 0;
610 }));
611 } else {
612 ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
613 BTREE_ITER_prefetch, k, ({
614 ca = bch2_dev_iterate(c, ca, k.k->p.inode);
615 /*
616 * Not a fsck error because this is checked/repaired by
617 * bch2_check_alloc_key() which runs later:
618 */
619 if (!ca) {
620 bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
621 continue;
622 }
623
624 struct bch_alloc_v4 a;
625 *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
626 0;
627 }));
628 }
629
630 bch2_dev_put(ca);
631 bch2_trans_put(trans);
632 up_read(&c->gc_lock);
633
634 bch_err_fn(c, ret);
635 return ret;
636}
637
638/* Free space/discard btree: */
639
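/*
 * Keep the freespace and need_discard btrees in sync with an alloc key: set
 * or clear the corresponding index entry, flagging an inconsistency if the
 * existing entry isn't the type we expect.
 */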
640static int bch2_bucket_do_index(struct btree_trans *trans,
641 struct bch_dev *ca,
642 struct bkey_s_c alloc_k,
643 const struct bch_alloc_v4 *a,
644 bool set)
645{
646 struct bch_fs *c = trans->c;
647 struct btree_iter iter;
648 struct bkey_s_c old;
649 struct bkey_i *k;
650 enum btree_id btree;
651 enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
652 enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
653 struct printbuf buf = PRINTBUF;
654 int ret;
655
656 if (a->data_type != BCH_DATA_free &&
657 a->data_type != BCH_DATA_need_discard)
658 return 0;
659
660 k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
661 if (IS_ERR(k))
662 return PTR_ERR(k);
663
664 bkey_init(&k->k);
665 k->k.type = new_type;
666
667 switch (a->data_type) {
668 case BCH_DATA_free:
669 btree = BTREE_ID_freespace;
670 k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
671 bch2_key_resize(&k->k, 1);
672 break;
673 case BCH_DATA_need_discard:
674 btree = BTREE_ID_need_discard;
675 k->k.p = alloc_k.k->p;
676 break;
677 default:
678 return 0;
679 }
680
681 old = bch2_bkey_get_iter(trans, &iter, btree,
682 bkey_start_pos(&k->k),
683 BTREE_ITER_intent);
684 ret = bkey_err(old);
685 if (ret)
686 return ret;
687
688 if (ca->mi.freespace_initialized &&
689 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
690 bch2_trans_inconsistent_on(old.k->type != old_type, trans,
691 "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
692 " for %s",
693 set ? "setting" : "clearing",
694 bch2_btree_id_str(btree),
695 iter.pos.inode,
696 iter.pos.offset,
697 bch2_bkey_types[old.k->type],
698 bch2_bkey_types[old_type],
699 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
700 ret = -EIO;
701 goto err;
702 }
703
704 ret = bch2_trans_update(trans, &iter, k, 0);
705err:
706 bch2_trans_iter_exit(trans, &iter);
707 printbuf_exit(&buf);
708 return ret;
709}
710
711static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
712 struct bpos bucket, u8 gen)
713{
714 struct btree_iter iter;
715 unsigned offset;
716 struct bpos pos = alloc_gens_pos(bucket, &offset);
717 struct bkey_i_bucket_gens *g;
718 struct bkey_s_c k;
719 int ret;
720
721 g = bch2_trans_kmalloc(trans, sizeof(*g));
722 ret = PTR_ERR_OR_ZERO(g);
723 if (ret)
724 return ret;
725
726 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
727 BTREE_ITER_intent|
728 BTREE_ITER_with_updates);
729 ret = bkey_err(k);
730 if (ret)
731 return ret;
732
733 if (k.k->type != KEY_TYPE_bucket_gens) {
734 bkey_bucket_gens_init(&g->k_i);
735 g->k.p = iter.pos;
736 } else {
737 bkey_reassemble(&g->k_i, k);
738 }
739
740 g->v.gens[offset] = gen;
741
742 ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
743 bch2_trans_iter_exit(trans, &iter);
744 return ret;
745}
746
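/*
 * Trigger for alloc key updates: the transactional phase keeps the
 * freespace, need_discard, bucket_gens and lru btrees in sync with the new
 * alloc state; the atomic phase updates in-memory state and kicks off
 * discards, invalidates and gc_gens work as needed; the gc phase updates the
 * gc copy of bucket state.
 */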
747int bch2_trigger_alloc(struct btree_trans *trans,
748 enum btree_id btree, unsigned level,
749 struct bkey_s_c old, struct bkey_s new,
750 enum btree_iter_update_trigger_flags flags)
751{
752 struct bch_fs *c = trans->c;
753 struct printbuf buf = PRINTBUF;
754 int ret = 0;
755
756 struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
757 if (!ca)
758 return -EIO;
759
760 struct bch_alloc_v4 old_a_convert;
761 const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
762
763 if (flags & BTREE_TRIGGER_transactional) {
764 struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
765
766 alloc_data_type_set(new_a, new_a->data_type);
767
768 if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
769 new_a->io_time[READ] = bch2_current_io_time(c, READ);
770 new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
771 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
772 SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
773 }
774
775 if (data_type_is_empty(new_a->data_type) &&
776 BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
777 !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
778 new_a->gen++;
779 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
780 alloc_data_type_set(new_a, new_a->data_type);
781 }
782
783 if (old_a->data_type != new_a->data_type ||
784 (new_a->data_type == BCH_DATA_free &&
785 alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
786 ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
787 bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
788 if (ret)
789 goto err;
790 }
791
792 if (new_a->data_type == BCH_DATA_cached &&
793 !new_a->io_time[READ])
794 new_a->io_time[READ] = bch2_current_io_time(c, READ);
795
796 u64 old_lru = alloc_lru_idx_read(*old_a);
797 u64 new_lru = alloc_lru_idx_read(*new_a);
798 if (old_lru != new_lru) {
799 ret = bch2_lru_change(trans, new.k->p.inode,
800 bucket_to_u64(new.k->p),
801 old_lru, new_lru);
802 if (ret)
803 goto err;
804 }
805
806 new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
807 if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
808 ret = bch2_lru_change(trans,
809 BCH_LRU_FRAGMENTATION_START,
810 bucket_to_u64(new.k->p),
811 old_a->fragmentation_lru, new_a->fragmentation_lru);
812 if (ret)
813 goto err;
814 }
815
816 if (old_a->gen != new_a->gen) {
817 ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
818 if (ret)
819 goto err;
820 }
821
822 /*
823 * need to know if we're getting called from the invalidate path or
824 * not:
825 */
826
827 if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
828 old_a->cached_sectors) {
829 ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
830 -((s64) old_a->cached_sectors));
831 if (ret)
832 goto err;
833 }
834 }
835
836 if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
837 struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
838 u64 journal_seq = trans->journal_res.seq;
839 u64 bucket_journal_seq = new_a->journal_seq;
840
841 if ((flags & BTREE_TRIGGER_insert) &&
842 data_type_is_empty(old_a->data_type) !=
843 data_type_is_empty(new_a->data_type) &&
844 new.k->type == KEY_TYPE_alloc_v4) {
845 struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
846
847 /*
848 * If the btree updates referring to a bucket weren't flushed
 849	 * before the bucket became empty again, then we don't have
850 * to wait on a journal flush before we can reuse the bucket:
851 */
852 v->journal_seq = bucket_journal_seq =
853 data_type_is_empty(new_a->data_type) &&
854 (journal_seq == v->journal_seq ||
855 bch2_journal_noflush_seq(&c->journal, v->journal_seq))
856 ? 0 : journal_seq;
857 }
858
859 if (!data_type_is_empty(old_a->data_type) &&
860 data_type_is_empty(new_a->data_type) &&
861 bucket_journal_seq) {
862 ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
863 c->journal.flushed_seq_ondisk,
864 new.k->p.inode, new.k->p.offset,
865 bucket_journal_seq);
866 if (ret) {
867 bch2_fs_fatal_error(c,
868 "setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
869 goto err;
870 }
871 }
872
873 percpu_down_read(&c->mark_lock);
874 if (new_a->gen != old_a->gen) {
875 u8 *gen = bucket_gen(ca, new.k->p.offset);
876 if (unlikely(!gen)) {
877 percpu_up_read(&c->mark_lock);
878 goto invalid_bucket;
879 }
880 *gen = new_a->gen;
881 }
882
883 bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
884 percpu_up_read(&c->mark_lock);
885
886#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
887#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
888#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
889
890 if (statechange(a->data_type == BCH_DATA_free) &&
891 bucket_flushed(new_a))
892 closure_wake_up(&c->freelist_wait);
893
894 if (statechange(a->data_type == BCH_DATA_need_discard) &&
895 !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
896 bucket_flushed(new_a))
897 bch2_discard_one_bucket_fast(ca, new.k->p.offset);
898
899 if (statechange(a->data_type == BCH_DATA_cached) &&
900 !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
901 should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
902 bch2_dev_do_invalidates(ca);
903
904 if (statechange(a->data_type == BCH_DATA_need_gc_gens))
905 bch2_gc_gens_async(c);
906 }
907
908 if ((flags & BTREE_TRIGGER_gc) &&
909 (flags & BTREE_TRIGGER_bucket_invalidate)) {
910 struct bch_alloc_v4 new_a_convert;
911 const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
912
913 percpu_down_read(&c->mark_lock);
914 struct bucket *g = gc_bucket(ca, new.k->p.offset);
915 if (unlikely(!g)) {
916 percpu_up_read(&c->mark_lock);
917 goto invalid_bucket;
918 }
919 g->gen_valid = 1;
920
921 bucket_lock(g);
922
923 g->gen_valid = 1;
924 g->gen = new_a->gen;
925 g->data_type = new_a->data_type;
926 g->stripe = new_a->stripe;
927 g->stripe_redundancy = new_a->stripe_redundancy;
928 g->dirty_sectors = new_a->dirty_sectors;
929 g->cached_sectors = new_a->cached_sectors;
930
931 bucket_unlock(g);
932 percpu_up_read(&c->mark_lock);
933 }
934err:
935 printbuf_exit(&buf);
936 bch2_dev_put(ca);
937 return ret;
938invalid_bucket:
939 bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
940 (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
941 ret = -EIO;
942 goto err;
943}
944
945/*
946 * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
947 * extents style btrees, but works on non-extents btrees:
948 */
949static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
950{
951 struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
952
953 if (bkey_err(k))
954 return k;
955
956 if (k.k->type) {
957 return k;
958 } else {
959 struct btree_iter iter2;
960 struct bpos next;
961
962 bch2_trans_copy_iter(&iter2, iter);
963
964 struct btree_path *path = btree_iter_path(iter->trans, iter);
965 if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
966 end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
967
968 end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
969
970 /*
 971	 * btree node min/max is a closed interval; peek_upto() takes a
 972	 * half-open interval:
973 */
974 k = bch2_btree_iter_peek_upto(&iter2, end);
975 next = iter2.pos;
976 bch2_trans_iter_exit(iter->trans, &iter2);
977
978 BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
979
980 if (bkey_err(k))
981 return k;
982
983 bkey_init(hole);
984 hole->p = iter->pos;
985
986 bch2_key_resize(hole, next.offset - iter->pos.offset);
987 return (struct bkey_s_c) { hole, NULL };
988 }
989}
990
991static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
992{
993 if (*ca) {
994 if (bucket->offset < (*ca)->mi.first_bucket)
995 bucket->offset = (*ca)->mi.first_bucket;
996
997 if (bucket->offset < (*ca)->mi.nbuckets)
998 return true;
999
1000 bch2_dev_put(*ca);
1001 *ca = NULL;
1002 bucket->inode++;
1003 bucket->offset = 0;
1004 }
1005
1006 rcu_read_lock();
1007 *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
1008 if (*ca) {
1009 *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
1010 bch2_dev_get(*ca);
1011 }
1012 rcu_read_unlock();
1013
1014 return *ca != NULL;
1015}
1016
1017static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
1018 struct bch_dev **ca, struct bkey *hole)
1019{
1020 struct bch_fs *c = iter->trans->c;
1021 struct bkey_s_c k;
1022again:
1023 k = bch2_get_key_or_hole(iter, POS_MAX, hole);
1024 if (bkey_err(k))
1025 return k;
1026
1027 *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
1028
1029 if (!k.k->type) {
1030 struct bpos hole_start = bkey_start_pos(k.k);
1031
1032 if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
1033 if (!next_bucket(c, ca, &hole_start))
1034 return bkey_s_c_null;
1035
1036 bch2_btree_iter_set_pos(iter, hole_start);
1037 goto again;
1038 }
1039
1040 if (k.k->p.offset > (*ca)->mi.nbuckets)
1041 bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
1042 }
1043
1044 return k;
1045}
1046
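/*
 * fsck: check that an alloc key is consistent with the corresponding entries
 * in the need_discard, freespace and bucket_gens btrees, repairing them if
 * not:
 */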
1047static noinline_for_stack
1048int bch2_check_alloc_key(struct btree_trans *trans,
1049 struct bkey_s_c alloc_k,
1050 struct btree_iter *alloc_iter,
1051 struct btree_iter *discard_iter,
1052 struct btree_iter *freespace_iter,
1053 struct btree_iter *bucket_gens_iter)
1054{
1055 struct bch_fs *c = trans->c;
1056 struct bch_alloc_v4 a_convert;
1057 const struct bch_alloc_v4 *a;
1058 unsigned discard_key_type, freespace_key_type;
1059 unsigned gens_offset;
1060 struct bkey_s_c k;
1061 struct printbuf buf = PRINTBUF;
1062 int ret = 0;
1063
1064 struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
1065 if (fsck_err_on(!ca,
1066 c, alloc_key_to_missing_dev_bucket,
1067 "alloc key for invalid device:bucket %llu:%llu",
1068 alloc_k.k->p.inode, alloc_k.k->p.offset))
1069 ret = bch2_btree_delete_at(trans, alloc_iter, 0);
1070 if (!ca)
1071 return ret;
1072
1073 if (!ca->mi.freespace_initialized)
1074 goto out;
1075
1076 a = bch2_alloc_to_v4(alloc_k, &a_convert);
1077
1078 discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
1079 bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
1080 k = bch2_btree_iter_peek_slot(discard_iter);
1081 ret = bkey_err(k);
1082 if (ret)
1083 goto err;
1084
1085 if (fsck_err_on(k.k->type != discard_key_type,
1086 c, need_discard_key_wrong,
1087 "incorrect key in need_discard btree (got %s should be %s)\n"
1088 " %s",
1089 bch2_bkey_types[k.k->type],
1090 bch2_bkey_types[discard_key_type],
1091 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1092 struct bkey_i *update =
1093 bch2_trans_kmalloc(trans, sizeof(*update));
1094
1095 ret = PTR_ERR_OR_ZERO(update);
1096 if (ret)
1097 goto err;
1098
1099 bkey_init(&update->k);
1100 update->k.type = discard_key_type;
1101 update->k.p = discard_iter->pos;
1102
1103 ret = bch2_trans_update(trans, discard_iter, update, 0);
1104 if (ret)
1105 goto err;
1106 }
1107
1108 freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
1109 bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
1110 k = bch2_btree_iter_peek_slot(freespace_iter);
1111 ret = bkey_err(k);
1112 if (ret)
1113 goto err;
1114
1115 if (fsck_err_on(k.k->type != freespace_key_type,
1116 c, freespace_key_wrong,
1117 "incorrect key in freespace btree (got %s should be %s)\n"
1118 " %s",
1119 bch2_bkey_types[k.k->type],
1120 bch2_bkey_types[freespace_key_type],
1121 (printbuf_reset(&buf),
1122 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1123 struct bkey_i *update =
1124 bch2_trans_kmalloc(trans, sizeof(*update));
1125
1126 ret = PTR_ERR_OR_ZERO(update);
1127 if (ret)
1128 goto err;
1129
1130 bkey_init(&update->k);
1131 update->k.type = freespace_key_type;
1132 update->k.p = freespace_iter->pos;
1133 bch2_key_resize(&update->k, 1);
1134
1135 ret = bch2_trans_update(trans, freespace_iter, update, 0);
1136 if (ret)
1137 goto err;
1138 }
1139
1140 bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
1141 k = bch2_btree_iter_peek_slot(bucket_gens_iter);
1142 ret = bkey_err(k);
1143 if (ret)
1144 goto err;
1145
1146 if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
1147 c, bucket_gens_key_wrong,
1148 "incorrect gen in bucket_gens btree (got %u should be %u)\n"
1149 " %s",
1150 alloc_gen(k, gens_offset), a->gen,
1151 (printbuf_reset(&buf),
1152 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1153 struct bkey_i_bucket_gens *g =
1154 bch2_trans_kmalloc(trans, sizeof(*g));
1155
1156 ret = PTR_ERR_OR_ZERO(g);
1157 if (ret)
1158 goto err;
1159
1160 if (k.k->type == KEY_TYPE_bucket_gens) {
1161 bkey_reassemble(&g->k_i, k);
1162 } else {
1163 bkey_bucket_gens_init(&g->k_i);
1164 g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
1165 }
1166
1167 g->v.gens[gens_offset] = a->gen;
1168
1169 ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
1170 if (ret)
1171 goto err;
1172 }
1173out:
1174err:
1175fsck_err:
1176 bch2_dev_put(ca);
1177 printbuf_exit(&buf);
1178 return ret;
1179}
1180
1181static noinline_for_stack
1182int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
1183 struct bch_dev *ca,
1184 struct bpos start,
1185 struct bpos *end,
1186 struct btree_iter *freespace_iter)
1187{
1188 struct bch_fs *c = trans->c;
1189 struct bkey_s_c k;
1190 struct printbuf buf = PRINTBUF;
1191 int ret;
1192
1193 if (!ca->mi.freespace_initialized)
1194 return 0;
1195
1196 bch2_btree_iter_set_pos(freespace_iter, start);
1197
1198 k = bch2_btree_iter_peek_slot(freespace_iter);
1199 ret = bkey_err(k);
1200 if (ret)
1201 goto err;
1202
1203 *end = bkey_min(k.k->p, *end);
1204
1205 if (fsck_err_on(k.k->type != KEY_TYPE_set,
1206 c, freespace_hole_missing,
1207 "hole in alloc btree missing in freespace btree\n"
1208 " device %llu buckets %llu-%llu",
1209 freespace_iter->pos.inode,
1210 freespace_iter->pos.offset,
1211 end->offset)) {
1212 struct bkey_i *update =
1213 bch2_trans_kmalloc(trans, sizeof(*update));
1214
1215 ret = PTR_ERR_OR_ZERO(update);
1216 if (ret)
1217 goto err;
1218
1219 bkey_init(&update->k);
1220 update->k.type = KEY_TYPE_set;
1221 update->k.p = freespace_iter->pos;
1222 bch2_key_resize(&update->k,
1223 min_t(u64, U32_MAX, end->offset -
1224 freespace_iter->pos.offset));
1225
1226 ret = bch2_trans_update(trans, freespace_iter, update, 0);
1227 if (ret)
1228 goto err;
1229 }
1230err:
1231fsck_err:
1232 printbuf_exit(&buf);
1233 return ret;
1234}
1235
1236static noinline_for_stack
1237int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
1238 struct bpos start,
1239 struct bpos *end,
1240 struct btree_iter *bucket_gens_iter)
1241{
1242 struct bch_fs *c = trans->c;
1243 struct bkey_s_c k;
1244 struct printbuf buf = PRINTBUF;
1245 unsigned i, gens_offset, gens_end_offset;
1246 int ret;
1247
1248 bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
1249
1250 k = bch2_btree_iter_peek_slot(bucket_gens_iter);
1251 ret = bkey_err(k);
1252 if (ret)
1253 goto err;
1254
1255 if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
1256 alloc_gens_pos(*end, &gens_end_offset)))
1257 gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
1258
1259 if (k.k->type == KEY_TYPE_bucket_gens) {
1260 struct bkey_i_bucket_gens g;
1261 bool need_update = false;
1262
1263 bkey_reassemble(&g.k_i, k);
1264
1265 for (i = gens_offset; i < gens_end_offset; i++) {
1266 if (fsck_err_on(g.v.gens[i], c,
1267 bucket_gens_hole_wrong,
1268 "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
1269 bucket_gens_pos_to_alloc(k.k->p, i).inode,
1270 bucket_gens_pos_to_alloc(k.k->p, i).offset,
1271 g.v.gens[i])) {
1272 g.v.gens[i] = 0;
1273 need_update = true;
1274 }
1275 }
1276
1277 if (need_update) {
1278 struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1279
1280 ret = PTR_ERR_OR_ZERO(u);
1281 if (ret)
1282 goto err;
1283
1284 memcpy(u, &g, sizeof(g));
1285
1286 ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
1287 if (ret)
1288 goto err;
1289 }
1290 }
1291
1292 *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
1293err:
1294fsck_err:
1295 printbuf_exit(&buf);
1296 return ret;
1297}
1298
1299static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
1300 struct btree_iter *iter)
1301{
1302 struct bch_fs *c = trans->c;
1303 struct btree_iter alloc_iter;
1304 struct bkey_s_c alloc_k;
1305 struct bch_alloc_v4 a_convert;
1306 const struct bch_alloc_v4 *a;
1307 u64 genbits;
1308 struct bpos pos;
1309 enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
1310 ? BCH_DATA_need_discard
1311 : BCH_DATA_free;
1312 struct printbuf buf = PRINTBUF;
1313 int ret;
1314
1315 pos = iter->pos;
1316 pos.offset &= ~(~0ULL << 56);
1317 genbits = iter->pos.offset & (~0ULL << 56);
1318
1319 alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
1320 ret = bkey_err(alloc_k);
1321 if (ret)
1322 return ret;
1323
1324 if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
1325 need_discard_freespace_key_to_invalid_dev_bucket,
 1326			"entry in %s btree for nonexistent dev:bucket %llu:%llu",
1327 bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
1328 goto delete;
1329
1330 a = bch2_alloc_to_v4(alloc_k, &a_convert);
1331
1332 if (fsck_err_on(a->data_type != state ||
1333 (state == BCH_DATA_free &&
1334 genbits != alloc_freespace_genbits(*a)), c,
1335 need_discard_freespace_key_bad,
1336 "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
1337 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
1338 bch2_btree_id_str(iter->btree_id),
1339 iter->pos.inode,
1340 iter->pos.offset,
1341 a->data_type == state,
1342 genbits >> 56, alloc_freespace_genbits(*a) >> 56))
1343 goto delete;
1344out:
1345fsck_err:
1346 bch2_set_btree_iter_dontneed(&alloc_iter);
1347 bch2_trans_iter_exit(trans, &alloc_iter);
1348 printbuf_exit(&buf);
1349 return ret;
1350delete:
1351 ret = bch2_btree_delete_extent_at(trans, iter,
1352 iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
1353 bch2_trans_commit(trans, NULL, NULL,
1354 BCH_TRANS_COMMIT_no_enospc);
1355 goto out;
1356}
1357
1358/*
1359 * We've already checked that generation numbers in the bucket_gens btree are
1360 * valid for buckets that exist; this just checks for keys for nonexistent
1361 * buckets.
1362 */
1363static noinline_for_stack
1364int bch2_check_bucket_gens_key(struct btree_trans *trans,
1365 struct btree_iter *iter,
1366 struct bkey_s_c k)
1367{
1368 struct bch_fs *c = trans->c;
1369 struct bkey_i_bucket_gens g;
1370 u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
1371 u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
1372 u64 b;
1373 bool need_update = false;
1374 struct printbuf buf = PRINTBUF;
1375 int ret = 0;
1376
1377 BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
1378 bkey_reassemble(&g.k_i, k);
1379
1380 struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
1381 if (!ca) {
1382 if (fsck_err(c, bucket_gens_to_invalid_dev,
1383 "bucket_gens key for invalid device:\n %s",
1384 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1385 ret = bch2_btree_delete_at(trans, iter, 0);
1386 goto out;
1387 }
1388
1389 if (fsck_err_on(end <= ca->mi.first_bucket ||
1390 start >= ca->mi.nbuckets, c,
1391 bucket_gens_to_invalid_buckets,
1392 "bucket_gens key for invalid buckets:\n %s",
1393 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1394 ret = bch2_btree_delete_at(trans, iter, 0);
1395 goto out;
1396 }
1397
1398 for (b = start; b < ca->mi.first_bucket; b++)
1399 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
1400 bucket_gens_nonzero_for_invalid_buckets,
1401 "bucket_gens key has nonzero gen for invalid bucket")) {
1402 g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1403 need_update = true;
1404 }
1405
1406 for (b = ca->mi.nbuckets; b < end; b++)
1407 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
1408 bucket_gens_nonzero_for_invalid_buckets,
1409 "bucket_gens key has nonzero gen for invalid bucket")) {
1410 g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1411 need_update = true;
1412 }
1413
1414 if (need_update) {
1415 struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1416
1417 ret = PTR_ERR_OR_ZERO(u);
1418 if (ret)
1419 goto out;
1420
1421 memcpy(u, &g, sizeof(g));
1422 ret = bch2_trans_update(trans, iter, u, 0);
1423 }
1424out:
1425fsck_err:
1426 bch2_dev_put(ca);
1427 printbuf_exit(&buf);
1428 return ret;
1429}
1430
1431int bch2_check_alloc_info(struct bch_fs *c)
1432{
1433 struct btree_trans *trans = bch2_trans_get(c);
1434 struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
1435 struct bch_dev *ca = NULL;
1436 struct bkey hole;
1437 struct bkey_s_c k;
1438 int ret = 0;
1439
1440 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
1441 BTREE_ITER_prefetch);
1442 bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
1443 BTREE_ITER_prefetch);
1444 bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
1445 BTREE_ITER_prefetch);
1446 bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
1447 BTREE_ITER_prefetch);
1448
1449 while (1) {
1450 struct bpos next;
1451
1452 bch2_trans_begin(trans);
1453
1454 k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
1455 ret = bkey_err(k);
1456 if (ret)
1457 goto bkey_err;
1458
1459 if (!k.k)
1460 break;
1461
1462 if (k.k->type) {
1463 next = bpos_nosnap_successor(k.k->p);
1464
1465 ret = bch2_check_alloc_key(trans,
1466 k, &iter,
1467 &discard_iter,
1468 &freespace_iter,
1469 &bucket_gens_iter);
1470 if (ret)
1471 goto bkey_err;
1472 } else {
1473 next = k.k->p;
1474
1475 ret = bch2_check_alloc_hole_freespace(trans, ca,
1476 bkey_start_pos(k.k),
1477 &next,
1478 &freespace_iter) ?:
1479 bch2_check_alloc_hole_bucket_gens(trans,
1480 bkey_start_pos(k.k),
1481 &next,
1482 &bucket_gens_iter);
1483 if (ret)
1484 goto bkey_err;
1485 }
1486
1487 ret = bch2_trans_commit(trans, NULL, NULL,
1488 BCH_TRANS_COMMIT_no_enospc);
1489 if (ret)
1490 goto bkey_err;
1491
1492 bch2_btree_iter_set_pos(&iter, next);
1493bkey_err:
1494 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1495 continue;
1496 if (ret)
1497 break;
1498 }
1499 bch2_trans_iter_exit(trans, &bucket_gens_iter);
1500 bch2_trans_iter_exit(trans, &freespace_iter);
1501 bch2_trans_iter_exit(trans, &discard_iter);
1502 bch2_trans_iter_exit(trans, &iter);
1503 bch2_dev_put(ca);
1504 ca = NULL;
1505
1506 if (ret < 0)
1507 goto err;
1508
1509 ret = for_each_btree_key(trans, iter,
1510 BTREE_ID_need_discard, POS_MIN,
1511 BTREE_ITER_prefetch, k,
1512 bch2_check_discard_freespace_key(trans, &iter));
1513 if (ret)
1514 goto err;
1515
1516 bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
1517 BTREE_ITER_prefetch);
1518 while (1) {
1519 bch2_trans_begin(trans);
1520 k = bch2_btree_iter_peek(&iter);
1521 if (!k.k)
1522 break;
1523
1524 ret = bkey_err(k) ?:
1525 bch2_check_discard_freespace_key(trans, &iter);
1526 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1527 ret = 0;
1528 continue;
1529 }
1530 if (ret) {
1531 struct printbuf buf = PRINTBUF;
1532 bch2_bkey_val_to_text(&buf, c, k);
1533
1534 bch_err(c, "while checking %s", buf.buf);
1535 printbuf_exit(&buf);
1536 break;
1537 }
1538
1539 bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
1540 }
1541 bch2_trans_iter_exit(trans, &iter);
1542 if (ret)
1543 goto err;
1544
1545 ret = for_each_btree_key_commit(trans, iter,
1546 BTREE_ID_bucket_gens, POS_MIN,
1547 BTREE_ITER_prefetch, k,
1548 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1549 bch2_check_bucket_gens_key(trans, &iter, k));
1550err:
1551 bch2_trans_put(trans);
1552 bch_err_fn(c, ret);
1553 return ret;
1554}
1555
1556static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
1557 struct btree_iter *alloc_iter,
1558 struct bkey_buf *last_flushed)
1559{
1560 struct bch_fs *c = trans->c;
1561 struct bch_alloc_v4 a_convert;
1562 const struct bch_alloc_v4 *a;
1563 struct bkey_s_c alloc_k;
1564 struct printbuf buf = PRINTBUF;
1565 int ret;
1566
1567 alloc_k = bch2_btree_iter_peek(alloc_iter);
1568 if (!alloc_k.k)
1569 return 0;
1570
1571 ret = bkey_err(alloc_k);
1572 if (ret)
1573 return ret;
1574
1575 a = bch2_alloc_to_v4(alloc_k, &a_convert);
1576
1577 if (a->fragmentation_lru) {
1578 ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
1579 a->fragmentation_lru,
1580 alloc_k, last_flushed);
1581 if (ret)
1582 return ret;
1583 }
1584
1585 if (a->data_type != BCH_DATA_cached)
1586 return 0;
1587
1588 if (fsck_err_on(!a->io_time[READ], c,
1589 alloc_key_cached_but_read_time_zero,
1590 "cached bucket with read_time 0\n"
1591 " %s",
1592 (printbuf_reset(&buf),
1593 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1594 struct bkey_i_alloc_v4 *a_mut =
1595 bch2_alloc_to_v4_mut(trans, alloc_k);
1596 ret = PTR_ERR_OR_ZERO(a_mut);
1597 if (ret)
1598 goto err;
1599
1600 a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
1601 ret = bch2_trans_update(trans, alloc_iter,
1602 &a_mut->k_i, BTREE_TRIGGER_norun);
1603 if (ret)
1604 goto err;
1605
1606 a = &a_mut->v;
1607 }
1608
1609 ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
1610 alloc_k, last_flushed);
1611 if (ret)
1612 goto err;
1613err:
1614fsck_err:
1615 printbuf_exit(&buf);
1616 return ret;
1617}
1618
1619int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
1620{
1621 struct bkey_buf last_flushed;
1622
1623 bch2_bkey_buf_init(&last_flushed);
1624 bkey_init(&last_flushed.k->k);
1625
1626 int ret = bch2_trans_run(c,
1627 for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1628 POS_MIN, BTREE_ITER_prefetch, k,
1629 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1630 bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
1631
1632 bch2_bkey_buf_exit(&last_flushed, c);
1633 bch_err_fn(c, ret);
1634 return ret;
1635}
1636
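/*
 * Track buckets with a discard in flight so the same bucket isn't queued
 * twice; returns -BCH_ERR_EEXIST_discard_in_flight_add if it's already
 * queued.
 */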
1637static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
1638{
1639 int ret;
1640
1641 mutex_lock(&ca->discard_buckets_in_flight_lock);
1642 darray_for_each(ca->discard_buckets_in_flight, i)
1643 if (i->bucket == bucket) {
1644 ret = -BCH_ERR_EEXIST_discard_in_flight_add;
1645 goto out;
1646 }
1647
1648 ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
1649 .in_progress = in_progress,
1650 .bucket = bucket,
1651 }));
1652out:
1653 mutex_unlock(&ca->discard_buckets_in_flight_lock);
1654 return ret;
1655}
1656
1657static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
1658{
1659 mutex_lock(&ca->discard_buckets_in_flight_lock);
1660 darray_for_each(ca->discard_buckets_in_flight, i)
1661 if (i->bucket == bucket) {
1662 BUG_ON(!i->in_progress);
1663 darray_remove_item(&ca->discard_buckets_in_flight, i);
1664 goto found;
1665 }
1666 BUG();
1667found:
1668 mutex_unlock(&ca->discard_buckets_in_flight_lock);
1669}
1670
1671struct discard_buckets_state {
1672 u64 seen;
1673 u64 open;
1674 u64 need_journal_commit;
1675 u64 discarded;
1676 u64 need_journal_commit_this_dev;
1677};
1678
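/*
 * Process one need_discard btree entry: issue the discard (unless it was
 * already issued before a transaction restart) and clear the bucket's
 * need_discard state:
 */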
1679static int bch2_discard_one_bucket(struct btree_trans *trans,
1680 struct bch_dev *ca,
1681 struct btree_iter *need_discard_iter,
1682 struct bpos *discard_pos_done,
1683 struct discard_buckets_state *s)
1684{
1685 struct bch_fs *c = trans->c;
1686 struct bpos pos = need_discard_iter->pos;
1687 struct btree_iter iter = { NULL };
1688 struct bkey_s_c k;
1689 struct bkey_i_alloc_v4 *a;
1690 struct printbuf buf = PRINTBUF;
1691 bool discard_locked = false;
1692 int ret = 0;
1693
1694 if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
1695 s->open++;
1696 goto out;
1697 }
1698
1699 if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
1700 c->journal.flushed_seq_ondisk,
1701 pos.inode, pos.offset)) {
1702 s->need_journal_commit++;
1703 s->need_journal_commit_this_dev++;
1704 goto out;
1705 }
1706
1707 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
1708 need_discard_iter->pos,
1709 BTREE_ITER_cached);
1710 ret = bkey_err(k);
1711 if (ret)
1712 goto out;
1713
1714 a = bch2_alloc_to_v4_mut(trans, k);
1715 ret = PTR_ERR_OR_ZERO(a);
1716 if (ret)
1717 goto out;
1718
1719 if (bch2_bucket_sectors_total(a->v)) {
1720 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1721 trans, "attempting to discard bucket with dirty data\n%s",
1722 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1723 ret = -EIO;
1724 goto out;
1725 }
1726
1727 if (a->v.data_type != BCH_DATA_need_discard) {
1728 if (data_type_is_empty(a->v.data_type) &&
1729 BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
1730 a->v.gen++;
1731 SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1732 goto write;
1733 }
1734
1735 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1736 trans, "bucket incorrectly set in need_discard btree\n"
1737 "%s",
1738 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1739 ret = -EIO;
1740 goto out;
1741 }
1742
1743 if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
1744 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1745 trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
1746 a->v.journal_seq,
1747 c->journal.flushed_seq_ondisk,
1748 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1749 ret = -EIO;
1750 goto out;
1751 }
1752
1753 if (discard_in_flight_add(ca, iter.pos.offset, true))
1754 goto out;
1755
1756 discard_locked = true;
1757
1758 if (!bkey_eq(*discard_pos_done, iter.pos) &&
1759 ca->mi.discard && !c->opts.nochanges) {
1760 /*
1761 * This works without any other locks because this is the only
1762 * thread that removes items from the need_discard tree
1763 */
1764 bch2_trans_unlock_long(trans);
1765 blkdev_issue_discard(ca->disk_sb.bdev,
1766 k.k->p.offset * ca->mi.bucket_size,
1767 ca->mi.bucket_size,
1768 GFP_KERNEL);
1769 *discard_pos_done = iter.pos;
1770
1771 ret = bch2_trans_relock_notrace(trans);
1772 if (ret)
1773 goto out;
1774 }
1775
1776 SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1777write:
1778 alloc_data_type_set(&a->v, a->v.data_type);
1779
1780 ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
1781 bch2_trans_commit(trans, NULL, NULL,
1782 BCH_WATERMARK_btree|
1783 BCH_TRANS_COMMIT_no_enospc);
1784 if (ret)
1785 goto out;
1786
1787 count_event(c, bucket_discard);
1788 s->discarded++;
1789out:
1790 if (discard_locked)
1791 discard_in_flight_remove(ca, iter.pos.offset);
1792 s->seen++;
1793 bch2_trans_iter_exit(trans, &iter);
1794 printbuf_exit(&buf);
1795 return ret;
1796}
1797
1798static void bch2_do_discards_work(struct work_struct *work)
1799{
1800 struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
1801 struct bch_fs *c = ca->fs;
1802 struct discard_buckets_state s = {};
1803 struct bpos discard_pos_done = POS_MAX;
1804 int ret;
1805
1806 /*
1807 * We're doing the commit in bch2_discard_one_bucket instead of using
1808 * for_each_btree_key_commit() so that we can increment counters after
1809 * successful commit:
1810 */
1811 ret = bch2_trans_run(c,
1812 for_each_btree_key_upto(trans, iter,
1813 BTREE_ID_need_discard,
1814 POS(ca->dev_idx, 0),
1815 POS(ca->dev_idx, U64_MAX), 0, k,
1816 bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));
1817
1818 trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
1819 bch2_err_str(ret));
1820
1821 bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1822 percpu_ref_put(&ca->io_ref);
1823}
1824
1825void bch2_dev_do_discards(struct bch_dev *ca)
1826{
1827 struct bch_fs *c = ca->fs;
1828
1829 if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
1830 return;
1831
1832 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
1833 goto put_ioref;
1834
1835 if (queue_work(c->write_ref_wq, &ca->discard_work))
1836 return;
1837
1838 bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1839put_ioref:
1840 percpu_ref_put(&ca->io_ref);
1841}
1842
1843void bch2_do_discards(struct bch_fs *c)
1844{
1845 for_each_member_device(c, ca)
1846 bch2_dev_do_discards(ca);
1847}
1848
1849static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
1850{
1851 struct btree_iter iter;
1852 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
1853 struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
1854 int ret = bkey_err(k);
1855 if (ret)
1856 goto err;
1857
1858 struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
1859 ret = PTR_ERR_OR_ZERO(a);
1860 if (ret)
1861 goto err;
1862
1863 BUG_ON(a->v.dirty_sectors);
1864 SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1865 alloc_data_type_set(&a->v, a->v.data_type);
1866
1867 ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
1868err:
1869 bch2_trans_iter_exit(trans, &iter);
1870 return ret;
1871}
1872
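/*
 * Fast discard path: process buckets queued by
 * bch2_discard_one_bucket_fast(), issuing the discard and then clearing
 * need_discard in the alloc btree:
 */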
1873static void bch2_do_discards_fast_work(struct work_struct *work)
1874{
1875 struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
1876 struct bch_fs *c = ca->fs;
1877
1878 while (1) {
1879 bool got_bucket = false;
1880 u64 bucket;
1881
1882 mutex_lock(&ca->discard_buckets_in_flight_lock);
1883 darray_for_each(ca->discard_buckets_in_flight, i) {
1884 if (i->in_progress)
1885 continue;
1886
1887 got_bucket = true;
1888 bucket = i->bucket;
1889 i->in_progress = true;
1890 break;
1891 }
1892 mutex_unlock(&ca->discard_buckets_in_flight_lock);
1893
1894 if (!got_bucket)
1895 break;
1896
1897 if (ca->mi.discard && !c->opts.nochanges)
1898 blkdev_issue_discard(ca->disk_sb.bdev,
1899 bucket_to_sector(ca, bucket),
1900 ca->mi.bucket_size,
1901 GFP_KERNEL);
1902
1903 int ret = bch2_trans_do(c, NULL, NULL,
1904 BCH_WATERMARK_btree|
1905 BCH_TRANS_COMMIT_no_enospc,
1906 bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
1907 bch_err_fn(c, ret);
1908
1909 discard_in_flight_remove(ca, bucket);
1910
1911 if (ret)
1912 break;
1913 }
1914
1915 bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
1916 percpu_ref_put(&ca->io_ref);
1917}
1918
1919static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
1920{
1921 struct bch_fs *c = ca->fs;
1922
1923 if (discard_in_flight_add(ca, bucket, false))
1924 return;
1925
1926 if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
1927 return;
1928
1929 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
1930 goto put_ioref;
1931
1932 if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
1933 return;
1934
1935 bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
1936put_ioref:
1937 percpu_ref_put(&ca->io_ref);
1938}
1939
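/*
 * Invalidate one cached-data bucket picked from the LRU: bump its gen and
 * reset it to empty so it can be reallocated.
 */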
1940static int invalidate_one_bucket(struct btree_trans *trans,
1941 struct btree_iter *lru_iter,
1942 struct bkey_s_c lru_k,
1943 s64 *nr_to_invalidate)
1944{
1945 struct bch_fs *c = trans->c;
1946 struct bkey_i_alloc_v4 *a = NULL;
1947 struct printbuf buf = PRINTBUF;
1948 struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
1949 unsigned cached_sectors;
1950 int ret = 0;
1951
1952 if (*nr_to_invalidate <= 0)
1953 return 1;
1954
1955 if (!bch2_dev_bucket_exists(c, bucket)) {
1956 prt_str(&buf, "lru entry points to invalid bucket");
1957 goto err;
1958 }
1959
1960 if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
1961 return 0;
1962
1963 a = bch2_trans_start_alloc_update(trans, bucket);
1964 ret = PTR_ERR_OR_ZERO(a);
1965 if (ret)
1966 goto out;
1967
1968 /* We expect harmless races here due to the btree write buffer: */
1969 if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
1970 goto out;
1971
1972 BUG_ON(a->v.data_type != BCH_DATA_cached);
1973 BUG_ON(a->v.dirty_sectors);
1974
1975 if (!a->v.cached_sectors)
1976 bch_err(c, "invalidating empty bucket, confused");
1977
1978 cached_sectors = a->v.cached_sectors;
1979
1980 SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1981 a->v.gen++;
1982 a->v.data_type = 0;
1983 a->v.dirty_sectors = 0;
1984 a->v.cached_sectors = 0;
1985 a->v.io_time[READ] = bch2_current_io_time(c, READ);
1986 a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
1987
1988 ret = bch2_trans_commit(trans, NULL, NULL,
1989 BCH_WATERMARK_btree|
1990 BCH_TRANS_COMMIT_no_enospc);
1991 if (ret)
1992 goto out;
1993
1994 trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
1995 --*nr_to_invalidate;
1996out:
1997 printbuf_exit(&buf);
1998 return ret;
1999err:
2000 prt_str(&buf, "\n lru key: ");
2001 bch2_bkey_val_to_text(&buf, c, lru_k);
2002
2003 prt_str(&buf, "\n lru entry: ");
2004 bch2_lru_pos_to_text(&buf, lru_iter->pos);
2005
2006 prt_str(&buf, "\n alloc key: ");
2007 if (!a)
2008 bch2_bpos_to_text(&buf, bucket);
2009 else
2010 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
2011
2012 bch_err(c, "%s", buf.buf);
2013 if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
2014 bch2_inconsistent_error(c);
2015 ret = -EINVAL;
2016 }
2017
2018 goto out;
2019}
2020
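/*
 * Peek the next LRU entry for this device, wrapping around to the start of
 * the device's LRU keyspace once if we began partway through.
 */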
2021static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
2022 struct bch_dev *ca, bool *wrapped)
2023{
2024 struct bkey_s_c k;
2025again:
2026 k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
2027 if (!k.k && !*wrapped) {
2028 bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
2029 *wrapped = true;
2030 goto again;
2031 }
2032
2033 return k;
2034}
2035
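/*
 * Invalidate worker: walk the LRU btree for this device and invalidate
 * cached-data buckets until should_invalidate_buckets() is satisfied. The
 * btree write buffer is flushed first, presumably so the LRU is reasonably
 * up to date before we start.
 */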
2036static void bch2_do_invalidates_work(struct work_struct *work)
2037{
2038 struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
2039 struct bch_fs *c = ca->fs;
2040 struct btree_trans *trans = bch2_trans_get(c);
2041 int ret = 0;
2042
2043 ret = bch2_btree_write_buffer_tryflush(trans);
2044 if (ret)
2045 goto err;
2046
2047 s64 nr_to_invalidate =
2048 should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
2049 struct btree_iter iter;
2050 bool wrapped = false;
2051
2052 bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
2053 lru_pos(ca->dev_idx, 0,
2054 ((bch2_current_io_time(c, READ) + U32_MAX) &
2055 LRU_TIME_MAX)), 0);
2056
2057 while (true) {
2058 bch2_trans_begin(trans);
2059
2060 struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
2061 ret = bkey_err(k);
2062 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2063 continue;
2064 if (ret)
2065 break;
2066 if (!k.k)
2067 break;
2068
2069 ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
2070 if (ret)
2071 break;
2072
2073 bch2_btree_iter_advance(&iter);
2074 }
2075 bch2_trans_iter_exit(trans, &iter);
2076err:
2077 bch2_trans_put(trans);
2078 bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
2079 percpu_ref_put(&ca->io_ref);
2080}
2081
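/*
 * Kick off the invalidate worker for one device; the io ref and invalidate
 * write ref taken here are dropped by the worker, or here if the work item
 * was already queued.
 */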
2082void bch2_dev_do_invalidates(struct bch_dev *ca)
2083{
2084 struct bch_fs *c = ca->fs;
2085
2086 if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
2087 return;
2088
2089 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
2090 goto put_ioref;
2091
2092 if (queue_work(c->write_ref_wq, &ca->invalidate_work))
2093 return;
2094
2095 bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
2096put_ioref:
2097 percpu_ref_put(&ca->io_ref);
2098}
2099
2100void bch2_do_invalidates(struct bch_fs *c)
2101{
2102 for_each_member_device(c, ca)
2103 bch2_dev_do_invalidates(ca);
2104}
2105
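/*
 * Populate the freespace (and need_discard/need_gc_gens) btrees for a range
 * of buckets on @ca by walking the alloc btree: live alloc keys are indexed
 * one at a time via bch2_bucket_do_index(), and holes become KEY_TYPE_set
 * extents in the freespace btree. On success the member's
 * freespace_initialized bit is set; writing the superblock is left to the
 * caller.
 */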
2106int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
2107 u64 bucket_start, u64 bucket_end)
2108{
2109 struct btree_trans *trans = bch2_trans_get(c);
2110 struct btree_iter iter;
2111 struct bkey_s_c k;
2112 struct bkey hole;
2113 struct bpos end = POS(ca->dev_idx, bucket_end);
2114 struct bch_member *m;
2115 unsigned long last_updated = jiffies;
2116 int ret;
2117
2118 BUG_ON(bucket_start > bucket_end);
2119 BUG_ON(bucket_end > ca->mi.nbuckets);
2120
2121 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
2122 POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
2123 BTREE_ITER_prefetch);
2124 /*
2125 * Scan the alloc btree for every bucket on @ca, and add buckets to the
2126 * freespace/need_discard/need_gc_gens btrees as needed:
2127 */
2128 while (1) {
2129 if (last_updated + HZ * 10 < jiffies) {
2130 bch_info(ca, "%s: currently at %llu/%llu",
2131 __func__, iter.pos.offset, ca->mi.nbuckets);
2132 last_updated = jiffies;
2133 }
2134
2135 bch2_trans_begin(trans);
2136
2137 if (bkey_ge(iter.pos, end)) {
2138 ret = 0;
2139 break;
2140 }
2141
2142 k = bch2_get_key_or_hole(&iter, end, &hole);
2143 ret = bkey_err(k);
2144 if (ret)
2145 goto bkey_err;
2146
2147 if (k.k->type) {
2148 /*
2149 * We process live keys in the alloc btree one at a
2150 * time:
2151 */
2152 struct bch_alloc_v4 a_convert;
2153 const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
2154
2155 ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
2156 bch2_trans_commit(trans, NULL, NULL,
2157 BCH_TRANS_COMMIT_no_enospc);
2158 if (ret)
2159 goto bkey_err;
2160
2161 bch2_btree_iter_advance(&iter);
2162 } else {
2163 struct bkey_i *freespace;
2164
2165 freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
2166 ret = PTR_ERR_OR_ZERO(freespace);
2167 if (ret)
2168 goto bkey_err;
2169
2170 bkey_init(&freespace->k);
2171 freespace->k.type = KEY_TYPE_set;
2172 freespace->k.p = k.k->p;
2173 freespace->k.size = k.k->size;
2174
2175 ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
2176 bch2_trans_commit(trans, NULL, NULL,
2177 BCH_TRANS_COMMIT_no_enospc);
2178 if (ret)
2179 goto bkey_err;
2180
2181 bch2_btree_iter_set_pos(&iter, k.k->p);
2182 }
2183bkey_err:
2184 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2185 continue;
2186 if (ret)
2187 break;
2188 }
2189
2190 bch2_trans_iter_exit(trans, &iter);
2191 bch2_trans_put(trans);
2192
2193 if (ret < 0) {
2194 bch_err_msg(ca, ret, "initializing free space");
2195 return ret;
2196 }
2197
2198 mutex_lock(&c->sb_lock);
2199 m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
2200 SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
2201 mutex_unlock(&c->sb_lock);
2202
2203 return 0;
2204}
2205
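/*
 * Run freespace init for any device that hasn't had it done yet (e.g. after
 * a crash partway through device add), then write the superblock once if any
 * work was done.
 */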
2206int bch2_fs_freespace_init(struct bch_fs *c)
2207{
2208 int ret = 0;
2209 bool doing_init = false;
2210
2211 /*
2212 * We can crash during the device add path, so we need to check this on
2213 * every mount:
2214 */
2215
2216 for_each_member_device(c, ca) {
2217 if (ca->mi.freespace_initialized)
2218 continue;
2219
2220 if (!doing_init) {
2221 bch_info(c, "initializing freespace");
2222 doing_init = true;
2223 }
2224
2225 ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
2226 if (ret) {
2227 bch2_dev_put(ca);
2228 bch_err_fn(c, ret);
2229 return ret;
2230 }
2231 }
2232
2233 if (doing_init) {
2234 mutex_lock(&c->sb_lock);
2235 bch2_write_super(c);
2236 mutex_unlock(&c->sb_lock);
2237 bch_verbose(c, "done initializing freespace");
2238 }
2239
2240 return 0;
2241}
2242
2243/* Bucket IO clocks: */
2244
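/*
 * Record the current IO time for a bucket in its alloc key, committing only
 * when the stored value actually changes; the read time feeds the cached-data
 * LRU index (see alloc_lru_idx_read()).
 */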
2245int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
2246 size_t bucket_nr, int rw)
2247{
2248 struct bch_fs *c = trans->c;
2249 struct btree_iter iter;
2250 struct bkey_i_alloc_v4 *a;
2251 u64 now;
2252 int ret = 0;
2253
2254 if (bch2_trans_relock(trans))
2255 bch2_trans_begin(trans);
2256
2257 a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
2258 ret = PTR_ERR_OR_ZERO(a);
2259 if (ret)
2260 return ret;
2261
2262 now = bch2_current_io_time(c, rw);
2263 if (a->v.io_time[rw] == now)
2264 goto out;
2265
2266 a->v.io_time[rw] = now;
2267
2268 ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
2269 bch2_trans_commit(trans, NULL, NULL, 0);
2270out:
2271 bch2_trans_iter_exit(trans, &iter);
2272 return ret;
2273}
2274
2275/* Startup/shutdown (ro/rw): */
2276
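/*
 * Recompute c->capacity: sum the usable sectors of every rw member, then
 * withhold the larger of the summed per-device allocator reserves and the
 * configured GC reserve. Also recomputes readahead pages and the maximum
 * bucket size; called with c->state_lock held.
 */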
2277void bch2_recalc_capacity(struct bch_fs *c)
2278{
2279 u64 capacity = 0, reserved_sectors = 0, gc_reserve;
2280 unsigned bucket_size_max = 0;
2281 unsigned long ra_pages = 0;
2282
2283 lockdep_assert_held(&c->state_lock);
2284
2285 for_each_online_member(c, ca) {
2286 struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
2287
2288 ra_pages += bdi->ra_pages;
2289 }
2290
2291 bch2_set_ra_pages(c, ra_pages);
2292
2293 for_each_rw_member(c, ca) {
2294 u64 dev_reserve = 0;
2295
2296 /*
2297		 * We need to reserve buckets (from the number
2298		 * of currently available buckets) against
2299		 * foreground writes, mainly so that copygc can
2300		 * make forward progress.
2301		 *
2302		 * We need enough to refill the various reserves
2303		 * from scratch - copygc will use its entire
2304		 * reserve all at once, then run again once
2305		 * its reserve has been refilled (from the formerly
2306		 * available buckets).
2307		 *
2308		 * This reserve is only used when deciding whether
2309		 * allocations for foreground writes must wait -
2310		 * it is not used for -ENOSPC calculations.
2311 */
2312
2313 dev_reserve += ca->nr_btree_reserve * 2;
2314 dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
2315
2316 dev_reserve += 1; /* btree write point */
2317 dev_reserve += 1; /* copygc write point */
2318 dev_reserve += 1; /* rebalance write point */
2319
2320 dev_reserve *= ca->mi.bucket_size;
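		/*
		 * Purely illustrative, with made-up numbers: for a device with
		 * nr_btree_reserve = 8, nbuckets = 1 << 20 and 1024-sector
		 * buckets, dev_reserve = (16 + 16384 + 3) * 1024 ~= 16.8M
		 * sectors, i.e. roughly 8 GiB withheld from this device's
		 * foreground-write headroom (and doubled below).
		 */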
2321
2322 capacity += bucket_to_sector(ca, ca->mi.nbuckets -
2323 ca->mi.first_bucket);
2324
2325 reserved_sectors += dev_reserve * 2;
2326
2327 bucket_size_max = max_t(unsigned, bucket_size_max,
2328 ca->mi.bucket_size);
2329 }
2330
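	/*
	 * The GC reserve is either an explicit byte count (gc_reserve_bytes,
	 * converted to 512-byte sectors) or gc_reserve_percent of total
	 * capacity; whichever of this and the summed per-device reserves is
	 * larger is what actually gets withheld from c->capacity.
	 */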
2331 gc_reserve = c->opts.gc_reserve_bytes
2332 ? c->opts.gc_reserve_bytes >> 9
2333 : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
2334
2335 reserved_sectors = max(gc_reserve, reserved_sectors);
2336
2337 reserved_sectors = min(reserved_sectors, capacity);
2338
2339 c->capacity = capacity - reserved_sectors;
2340
2341 c->bucket_size_max = bucket_size_max;
2342
2343	/* Wake up in case someone was waiting for buckets */
2344 closure_wake_up(&c->freelist_wait);
2345}
2346
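/* Smallest capacity, in sectors, of any rw member device (U64_MAX if none). */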
2347u64 bch2_min_rw_member_capacity(struct bch_fs *c)
2348{
2349 u64 ret = U64_MAX;
2350
2351 for_each_rw_member(c, ca)
2352 ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
2353 return ret;
2354}
2355
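/*
 * Does any open_bucket still point at this device? Used below to wait for
 * in-flight writes to drain when a device goes read-only.
 */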
2356static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
2357{
2358 struct open_bucket *ob;
2359 bool ret = false;
2360
2361 for (ob = c->open_buckets;
2362 ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
2363 ob++) {
2364 spin_lock(&ob->lock);
2365 if (ob->valid && !ob->on_partial_list &&
2366 ob->dev == ca->dev_idx)
2367 ret = true;
2368 spin_unlock(&ob->lock);
2369 }
2370
2371 return ret;
2372}
2373
2374/* device goes ro: */
2375void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
2376{
2377 unsigned i;
2378
2379 /* First, remove device from allocation groups: */
2380
2381 for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2382 clear_bit(ca->dev_idx, c->rw_devs[i].d);
2383
2384 /*
2385	 * Capacity is calculated based on the devices in allocation groups:
2386 */
2387 bch2_recalc_capacity(c);
2388
2389 bch2_open_buckets_stop(c, ca, false);
2390
2391 /*
2392 * Wake up threads that were blocked on allocation, so they can notice
2393 * the device can no longer be removed and the capacity has changed:
2394 */
2395 closure_wake_up(&c->freelist_wait);
2396
2397 /*
2398 * journal_res_get() can block waiting for free space in the journal -
2399 * it needs to notice there may not be devices to allocate from anymore:
2400 */
2401 wake_up(&c->journal.wait);
2402
2403 /* Now wait for any in flight writes: */
2404
2405 closure_wait_event(&c->open_buckets_wait,
2406 !bch2_dev_has_open_write_point(c, ca));
2407}
2408
2409/* device goes rw: */
2410void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
2411{
2412 unsigned i;
2413
2414 for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2415 if (ca->mi.data_allowed & (1 << i))
2416 set_bit(ca->dev_idx, c->rw_devs[i].d);
2417}
2418
2419void bch2_dev_allocator_background_exit(struct bch_dev *ca)
2420{
2421 darray_exit(&ca->discard_buckets_in_flight);
2422}
2423
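/* Per-device allocator setup: the discard/invalidate workers and their state. */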
2424void bch2_dev_allocator_background_init(struct bch_dev *ca)
2425{
2426 mutex_init(&ca->discard_buckets_in_flight_lock);
2427 INIT_WORK(&ca->discard_work, bch2_do_discards_work);
2428 INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
2429 INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
2430}
2431
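/* Filesystem-wide allocator setup; currently just the freelist lock. */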
2432void bch2_fs_allocator_background_init(struct bch_fs *c)
2433{
2434 spin_lock_init(&c->freelist_lock);
2435}