Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4 *
5 * Code for managing the extent btree and dynamically updating the writeback
6 * dirty sector count.
7 */
8
9#include "bcachefs.h"
10#include "bkey_methods.h"
11#include "btree_cache.h"
12#include "btree_gc.h"
13#include "btree_io.h"
14#include "btree_iter.h"
15#include "buckets.h"
16#include "checksum.h"
17#include "compress.h"
18#include "debug.h"
19#include "disk_groups.h"
20#include "error.h"
21#include "extents.h"
22#include "inode.h"
23#include "journal.h"
24#include "rebalance.h"
25#include "replicas.h"
26#include "super.h"
27#include "super-io.h"
28#include "trace.h"
29#include "util.h"
30
/* Human-readable names for extent flags, indexed by flag bit (x-macro). */
static const char * const bch2_extent_flags_strs[] = {
#define x(n, v)	[BCH_EXTENT_FLAG_##n] = #n,
	BCH_EXTENT_FLAGS()
#undef x
	NULL,
};
37
/*
 * Max uncompressed size representable by each on-disk crc entry variant,
 * indexed by entry type; used to decide if two crc entries can be merged.
 */
static unsigned bch2_crc_field_size_max[] = {
	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
	[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
};
43
44static void bch2_extent_crc_pack(union bch_extent_crc *,
45 struct bch_extent_crc_unpacked,
46 enum bch_extent_entry_type);
47
48struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
49 unsigned dev)
50{
51 struct bch_dev_io_failures *i;
52
53 for (i = f->devs; i < f->devs + f->nr; i++)
54 if (i->dev == dev)
55 return i;
56
57 return NULL;
58}
59
60void bch2_mark_io_failure(struct bch_io_failures *failed,
61 struct extent_ptr_decoded *p,
62 bool csum_error)
63{
64 struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
65
66 if (!f) {
67 BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
68
69 f = &failed->devs[failed->nr++];
70 memset(f, 0, sizeof(*f));
71 f->dev = p->ptr.dev;
72 }
73
74 if (p->do_ec_reconstruct)
75 f->failed_ec = true;
76 else if (!csum_error)
77 f->failed_io = true;
78 else
79 f->failed_csum_nr++;
80}
81
/*
 * Current read latency for @ca; a missing device reports the worst
 * possible latency so it loses any latency comparison.
 */
static inline u64 dev_latency(struct bch_dev *ca)
{
	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}
86
/* A missing device or one in the failed member state counts as failed. */
static inline int dev_failed(struct bch_dev *ca)
{
	return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}
91
/*
 * returns true if p1 is better than p2:
 *
 * Comparison criteria, in priority order: non-failed device wins, then
 * (under force_reconstruct_read) preferring EC reconstruction, then
 * avoiding EC reconstruction, then fewer checksum-error retries; finally
 * a latency-weighted random choice.
 */
static inline bool ptr_better(struct bch_fs *c,
			      const struct extent_ptr_decoded p1,
			      u64 p1_latency,
			      struct bch_dev *ca1,
			      const struct extent_ptr_decoded p2,
			      u64 p2_latency)
{
	struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);

	int failed_delta = dev_failed(ca1) - dev_failed(ca2);
	if (unlikely(failed_delta))
		return failed_delta < 0;

	/* Debug knob: deliberately exercise the EC reconstruct path */
	if (unlikely(bch2_force_reconstruct_read))
		return p1.do_ec_reconstruct > p2.do_ec_reconstruct;

	if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
		return p1.do_ec_reconstruct < p2.do_ec_reconstruct;

	int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
	if (unlikely(crc_retry_delta))
		return crc_retry_delta < 0;

	/* Pick at random, biased in favor of the faster device: */

	return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}
122
/*
 * Pick the best pointer in @k to read from, storing it in @pick.
 *
 * If @dev is >= 0, only pointers on that device are considered.  Stale
 * cached pointers are skipped, and previously recorded failures in
 * @failed (may be NULL) demote or disqualify pointers.
 *
 * Returns 1 if a pointer was picked, 0 for an unwritten extent or a key
 * with no dirty pointers (caller treats it as a hole and returns zeroes),
 * or a negative error code describing why nothing was readable.
 */
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
			       struct bch_io_failures *failed,
			       struct extent_ptr_decoded *pick,
			       int dev)
{
	bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
	bool have_dirty_ptrs = false, have_pick = false;

	if (k.k->type == KEY_TYPE_error)
		return -BCH_ERR_key_type_error;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return -BCH_ERR_extent_poisoned;

	rcu_read_lock();
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 pick_latency;	/* only meaningful once have_pick is set */

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		have_dirty_ptrs |= !p.ptr.cached;

		/*
		 * Unwritten extent: no need to actually read, treat it as a
		 * hole and return 0s:
		 */
		if (p.ptr.unwritten) {
			rcu_read_unlock();
			return 0;
		}

		/* Are we being asked to read from a specific device? */
		if (dev >= 0 && p.ptr.dev != dev)
			continue;

		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);

		/* Stale cached pointers no longer point at valid data: */
		if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
			continue;

		struct bch_dev_io_failures *f =
			unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
		if (unlikely(f)) {
			p.crc_retry_nr = f->failed_csum_nr;
			p.has_ec &= ~f->failed_ec;

			if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
				have_io_errors |= f->failed_io;
				have_io_errors |= f->failed_ec;
			}
			have_csum_errors |= !!f->failed_csum_nr;

			/* Fall back to erasure coding if this device keeps failing */
			if (p.has_ec && (f->failed_io || f->failed_csum_nr))
				p.do_ec_reconstruct = true;
			else if (f->failed_io ||
				 f->failed_csum_nr > c->opts.checksum_err_retry_nr)
				continue;
		}

		have_missing_devs |= ca && !bch2_dev_is_online(ca);

		if (!ca || !bch2_dev_is_online(ca)) {
			if (!p.has_ec)
				continue;
			p.do_ec_reconstruct = true;
		}

		if (bch2_force_reconstruct_read && p.has_ec)
			p.do_ec_reconstruct = true;

		u64 p_latency = dev_latency(ca);
		/*
		 * Square the latencies, to bias more in favor of the faster
		 * device - we never want to stop issuing reads to the slower
		 * device altogether, so that we can update our latency numbers:
		 */
		p_latency *= p_latency;

		if (!have_pick ||
		    ptr_better(c,
			       p, p_latency, ca,
			       *pick, pick_latency)) {
			*pick = p;
			pick_latency = p_latency;
			have_pick = true;
		}
	}
	rcu_read_unlock();

	if (have_pick)
		return 1;
	if (!have_dirty_ptrs)
		return 0;
	if (have_missing_devs)
		return -BCH_ERR_no_device_to_read_from;
	if (have_csum_errors)
		return -BCH_ERR_data_read_csum_err;
	if (have_io_errors)
		return -BCH_ERR_data_read_io_err;

	/*
	 * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
	 * but they don't point to valid devices:
	 */
	return -BCH_ERR_no_devices_valid;
}
236
237/* KEY_TYPE_btree_ptr: */
238
/*
 * Validate a KEY_TYPE_btree_ptr key: bound the value size, then run the
 * generic pointer checks.  Returns 0 or a fsck error code.
 */
int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
			    struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
			 c, btree_ptr_val_too_big,
			 "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);

	ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
	return ret;
}
252
/* Print a KEY_TYPE_btree_ptr key: just its pointer list. */
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
			    struct bkey_s_c k)
{
	bch2_bkey_ptrs_to_text(out, c, k);
}
258
/*
 * Validate a KEY_TYPE_btree_ptr_v2 key: size bound, min_key ordering, and
 * (on write, for new enough filesystems) a nonzero sectors_written, then
 * the generic pointer checks.
 */
int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
			       struct bkey_validate_context from)
{
	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
	int ret = 0;

	bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
			 c, btree_ptr_v2_val_too_big,
			 "value too big (%zu > %zu)",
			 bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);

	bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
			 c, btree_ptr_v2_min_key_bad,
			 "min_key > key");

	if ((from.flags & BCH_VALIDATE_write) &&
	    c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
		bkey_fsck_err_on(!bp.v->sectors_written,
				 c, btree_ptr_v2_written_0,
				 "sectors_written == 0");

	ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
	return ret;
}
284
/* Print a btree_ptr_v2 key: seq, sectors written, min_key, pointers. */
void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
			       struct bkey_s_c k)
{
	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);

	prt_printf(out, "seq %llx written %u min_key %s",
		   le64_to_cpu(bp.v->seq),
		   le16_to_cpu(bp.v->sectors_written),
		   BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");

	bch2_bpos_to_text(out, bp.v->min_key);
	prt_printf(out, " ");
	bch2_bkey_ptrs_to_text(out, c, k);
}
299
/*
 * On-disk format compatibility for btree_ptr_v2: byteswap/convert min_key,
 * and shift it by one for extents btrees written before the
 * inode_btree_change format change (old format used exclusive min_key).
 */
void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
			      unsigned big_endian, int write,
			      struct bkey_s k)
{
	struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);

	compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);

	if (version < bcachefs_metadata_version_inode_btree_change &&
	    btree_id_is_extents(btree_id) &&
	    !bkey_eq(bp.v->min_key, POS_MIN))
		bp.v->min_key = write
			? bpos_nosnap_predecessor(bp.v->min_key)
			: bpos_nosnap_successor(bp.v->min_key);
}
315
316/* KEY_TYPE_extent: */
317
/*
 * Attempt to merge extent @r into @l (l is the left/earlier extent).
 *
 * Three passes over the two keys' entry lists:
 *  1. check the entry lists are structurally identical and that each pair
 *     of pointers is physically contiguous on disk (same device/gen/bucket,
 *     r starting where l ends) with mergeable crc entries;
 *  2. check merged checksum entries won't overflow their on-disk field;
 *  3. rewrite l's entries in place, merging or replacing crc entries and
 *     taking r's pointer where r's crc entry was kept.
 *
 * Returns true and resizes @l on success; false leaves @l untouched
 * (passes 1-2 only read, pass 3 is the only mutation).
 */
bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
	struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
	struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
	union bch_extent_entry *en_l;
	const union bch_extent_entry *en_r;
	struct extent_ptr_decoded lp, rp;
	bool use_right_ptr;

	/* Both keys must have the same sequence of entry types: */
	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
		if (extent_entry_type(en_l) != extent_entry_type(en_r))
			return false;

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	/* ... and the same number of entries: */
	if (en_l < l_ptrs.end || en_r < r_ptrs.end)
		return false;

	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	lp.crc = bch2_extent_crc_unpack(l.k, NULL);
	rp.crc = bch2_extent_crc_unpack(r.k, NULL);

	while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
	       __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
		/* r must start exactly where l's live data ends, on disk: */
		if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
		    rp.ptr.offset + rp.crc.offset ||
		    lp.ptr.dev			!= rp.ptr.dev ||
		    lp.ptr.gen			!= rp.ptr.gen ||
		    lp.ptr.unwritten		!= rp.ptr.unwritten ||
		    lp.has_ec			!= rp.has_ec)
			return false;

		/* Extents may not straddle buckets: */
		rcu_read_lock();
		struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
		bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
		rcu_read_unlock();

		if (!same_bucket)
			return false;

		/* Stripe pointers must reference the same stripe block: */
		if (lp.has_ec != rp.has_ec ||
		    (lp.has_ec &&
		     (lp.ec.block		!= rp.ec.block ||
		      lp.ec.redundancy	!= rp.ec.redundancy ||
		      lp.ec.idx		!= rp.ec.idx)))
			return false;

		if (lp.crc.compression_type	!= rp.crc.compression_type ||
		    lp.crc.nonce		!= rp.crc.nonce)
			return false;

		if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
		    lp.crc.uncompressed_size) {
			/* can use left extent's crc entry */
		} else if (lp.crc.live_size <= rp.crc.offset) {
			/* can use right extent's crc entry */
		} else {
			/* check if checksums can be merged: */
			if (lp.crc.csum_type != rp.crc.csum_type ||
			    lp.crc.nonce != rp.crc.nonce ||
			    crc_is_compressed(lp.crc) ||
			    !bch2_checksum_mergeable(lp.crc.csum_type))
				return false;

			if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
			    rp.crc.offset)
				return false;

			if (lp.crc.csum_type &&
			    lp.crc.uncompressed_size +
			    rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
				return false;
		}

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	/* Merged crc sizes must still fit the on-disk crc field widths: */
	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
		if (extent_entry_is_crc(en_l)) {
			struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
			struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));

			if (crc_l.uncompressed_size + crc_r.uncompressed_size >
			    bch2_crc_field_size_max[extent_entry_type(en_l)])
				return false;
		}

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	/* All checks passed - now mutate l in place: */
	use_right_ptr = false;
	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	while (en_l < l_ptrs.end) {
		/* use_right_ptr was decided by the preceding crc entry: */
		if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
		    use_right_ptr)
			en_l->ptr = en_r->ptr;

		if (extent_entry_is_crc(en_l)) {
			struct bch_extent_crc_unpacked crc_l =
				bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
			struct bch_extent_crc_unpacked crc_r =
				bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));

			use_right_ptr = false;

			if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
			    crc_l.uncompressed_size) {
				/* can use left extent's crc entry */
			} else if (crc_l.live_size <= crc_r.offset) {
				/* can use right extent's crc entry */
				crc_r.offset -= crc_l.live_size;
				bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
						     extent_entry_type(en_l));
				use_right_ptr = true;
			} else {
				/* merge the two checksums into one entry: */
				crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
								 crc_l.csum,
								 crc_r.csum,
								 crc_r.uncompressed_size << 9);

				crc_l.uncompressed_size	+= crc_r.uncompressed_size;
				crc_l.compressed_size	+= crc_r.compressed_size;
				bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
						     extent_entry_type(en_l));
			}
		}

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	bch2_key_resize(l.k, l.k->size + r.k->size);
	return true;
}
463
464/* KEY_TYPE_reservation: */
465
/*
 * Validate a KEY_TYPE_reservation key: nr_replicas must be in
 * [1, BCH_REPLICAS_MAX].
 */
int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
			      struct bkey_validate_context from)
{
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
	int ret = 0;

	bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
			 c, reservation_key_nr_replicas_invalid,
			 "invalid nr_replicas (%u)", r.v->nr_replicas);
fsck_err:
	return ret;
}
478
/* Print a reservation key: its generation and replica count. */
void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
			      struct bkey_s_c k)
{
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);

	prt_printf(out, "generation %u replicas %u",
		   le32_to_cpu(r.v->generation),
		   r.v->nr_replicas);
}
488
489bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
490{
491 struct bkey_s_reservation l = bkey_s_to_reservation(_l);
492 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
493
494 if (l.v->generation != r.v->generation ||
495 l.v->nr_replicas != r.v->nr_replicas)
496 return false;
497
498 bch2_key_resize(l.k, l.k->size + r.k->size);
499 return true;
500}
501
502/* Extent checksum entries: */
503
504/* returns true if not equal */
505static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
506 struct bch_extent_crc_unpacked r)
507{
508 return (l.csum_type != r.csum_type ||
509 l.compression_type != r.compression_type ||
510 l.compressed_size != r.compressed_size ||
511 l.uncompressed_size != r.uncompressed_size ||
512 l.offset != r.offset ||
513 l.live_size != r.live_size ||
514 l.nonce != r.nonce ||
515 bch2_crc_cmp(l.csum, r.csum));
516}
517
/*
 * Can crc entry @u be narrowed to cover only its live data, using a new
 * checksum of type @n.csum_type?  Requires @u to be uncompressed,
 * checksummed, covering more than its live range, and encryption-
 * compatible with the new checksum type.
 */
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
				  struct bch_extent_crc_unpacked n)
{
	return !crc_is_compressed(u) &&
		u.csum_type &&
		u.uncompressed_size > u.live_size &&
		bch2_csum_type_is_encryption(u.csum_type) ==
		bch2_csum_type_is_encryption(n.csum_type);
}
527
/*
 * True if any crc entry in @k could be narrowed given a new checksum @n
 * (see bch2_bkey_narrow_crcs() below).
 */
bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
				 struct bch_extent_crc_unpacked n)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;

	if (!n.csum_type)
		return false;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (can_narrow_crc(crc, n))
			return true;

	return false;
}
544
/*
 * We're writing another replica for this extent, so while we've got the data in
 * memory we'll be computing a new checksum for the currently live data.
 *
 * If there are other replicas we aren't moving, and they are checksummed but
 * not compressed, we can modify them to point to only the data that is
 * currently live (so that readers won't have to bounce) while we've got the
 * checksum we need:
 *
 * If @n has no csum_type, an existing crc entry that already covers exactly
 * the live data is used instead.  Returns true if any pointer was rewritten.
 */
bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	struct bch_extent_crc_unpacked u;
	struct extent_ptr_decoded p;
	union bch_extent_entry *i;
	bool ret = false;

	/* Find a checksum entry that covers only live data: */
	if (!n.csum_type) {
		bkey_for_each_crc(&k->k, ptrs, u, i)
			if (!crc_is_compressed(u) &&
			    u.csum_type &&
			    u.live_size == u.uncompressed_size) {
				n = u;
				goto found;
			}
		return false;
	}
found:
	BUG_ON(crc_is_compressed(n));
	BUG_ON(n.offset);
	BUG_ON(n.live_size != k->k.size);

restart_narrow_pointers:
	/* dropping/appending entries invalidates ptrs - refetch each pass */
	ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));

	bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
		if (can_narrow_crc(p.crc, n)) {
			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
			p.ptr.offset += p.crc.offset;
			p.crc = n;
			bch2_extent_ptr_decoded_append(k, &p);
			ret = true;
			goto restart_narrow_pointers;
		}

	return ret;
}
593
594static void bch2_extent_crc_pack(union bch_extent_crc *dst,
595 struct bch_extent_crc_unpacked src,
596 enum bch_extent_entry_type type)
597{
598#define common_fields(_src) \
599 .type = BIT(type), \
600 .csum_type = _src.csum_type, \
601 .compression_type = _src.compression_type, \
602 ._compressed_size = _src.compressed_size - 1, \
603 ._uncompressed_size = _src.uncompressed_size - 1, \
604 .offset = _src.offset
605
606 switch (type) {
607 case BCH_EXTENT_ENTRY_crc32:
608 dst->crc32 = (struct bch_extent_crc32) {
609 common_fields(src),
610 .csum = (u32 __force) *((__le32 *) &src.csum.lo),
611 };
612 break;
613 case BCH_EXTENT_ENTRY_crc64:
614 dst->crc64 = (struct bch_extent_crc64) {
615 common_fields(src),
616 .nonce = src.nonce,
617 .csum_lo = (u64 __force) src.csum.lo,
618 .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi),
619 };
620 break;
621 case BCH_EXTENT_ENTRY_crc128:
622 dst->crc128 = (struct bch_extent_crc128) {
623 common_fields(src),
624 .nonce = src.nonce,
625 .csum = src.csum,
626 };
627 break;
628 default:
629 BUG();
630 }
631#undef set_common_fields
632}
633
/*
 * Append crc entry @new to the end of @k's value, picking the smallest
 * on-disk crc variant that can hold its checksum bytes, size and nonce.
 */
void bch2_extent_crc_append(struct bkey_i *k,
			    struct bch_extent_crc_unpacked new)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	union bch_extent_crc *crc = (void *) ptrs.end;
	enum bch_extent_entry_type type;

	if (bch_crc_bytes[new.csum_type]	<= 4 &&
	    new.uncompressed_size		<= CRC32_SIZE_MAX &&
	    new.nonce				<= CRC32_NONCE_MAX)
		type = BCH_EXTENT_ENTRY_crc32;
	else if (bch_crc_bytes[new.csum_type]	<= 10 &&
		 new.uncompressed_size		<= CRC64_SIZE_MAX &&
		 new.nonce			<= CRC64_NONCE_MAX)
		type = BCH_EXTENT_ENTRY_crc64;
	else if (bch_crc_bytes[new.csum_type]	<= 16 &&
		 new.uncompressed_size		<= CRC128_SIZE_MAX &&
		 new.nonce			<= CRC128_NONCE_MAX)
		type = BCH_EXTENT_ENTRY_crc128;
	else
		BUG();

	bch2_extent_crc_pack(crc, new, type);

	/* grow the key's value to include the new entry */
	k->k.u64s += extent_entry_u64s(ptrs.end);

	EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
}
662
663/* Generic code for keys with pointers: */
664
/* Number of pointers in @k (cached and dirty alike). */
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
{
	return bch2_bkey_devs(k).nr;
}
669
670unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
671{
672 return k.k->type == KEY_TYPE_reservation
673 ? bkey_s_c_to_reservation(k).v->nr_replicas
674 : bch2_bkey_dirty_devs(k).nr;
675}
676
/*
 * Number of fully allocated replicas: dirty, uncompressed pointers (or the
 * reserved replica count for a reservation key).
 */
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
{
	unsigned ret = 0;

	if (k.k->type == KEY_TYPE_reservation) {
		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
	} else {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			ret += !p.ptr.cached && !crc_is_compressed(p.crc);
	}

	return ret;
}
694
/* Total on-disk (compressed) sectors of @k's dirty, compressed pointers. */
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned ret = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (!p.ptr.cached && crc_is_compressed(p.crc))
			ret += p.crc.compressed_size;

	return ret;
}
708
/* True if any crc entry in @k is marked incompressible. */
bool bch2_bkey_is_incompressible(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct bch_extent_crc_unpacked crc;

	bkey_for_each_crc(k.k, ptrs, crc, entry)
		if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
			return true;
	return false;
}
720
721unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
722{
723 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
724 const union bch_extent_entry *entry;
725 struct extent_ptr_decoded p = { 0 };
726 unsigned replicas = 0;
727
728 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
729 if (p.ptr.cached)
730 continue;
731
732 if (p.has_ec)
733 replicas += p.ec.redundancy;
734
735 replicas++;
736
737 }
738
739 return replicas;
740}
741
/*
 * Durability contributed by pointer @p on device @ca: zero if cached,
 * stripe redundancy + 1 if erasure coded, else the device's durability.
 */
static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
{
	if (p->ptr.cached)
		return 0;

	return p->has_ec
		? p->ec.redundancy + 1
		: ca->mi.durability;
}
751
/*
 * Durability of @p counting failed devices as still contributing
 * (only a missing device counts as 0).  Caller holds the RCU read lock.
 */
unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);

	return ca ? __extent_ptr_durability(ca, p) : 0;
}
758
/*
 * Actual durability of @p: zero if the device is missing or in the failed
 * state.  Caller holds the RCU read lock.
 */
unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);

	if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
		return 0;

	return __extent_ptr_durability(ca, p);
}
768
/* Sum of per-pointer durability over all of @k's pointers. */
unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned durability = 0;

	rcu_read_lock();
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		durability += bch2_extent_ptr_durability(c, &p);
	rcu_read_unlock();

	return durability;
}
783
/*
 * Like bch2_bkey_durability(), but tolerates pointers to devices that
 * don't exist (skips them) - safe to call on unvalidated keys.
 */
static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned durability = 0;

	rcu_read_lock();
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
			durability += bch2_extent_ptr_durability(c, &p);
	rcu_read_unlock();

	return durability;
}
799
/*
 * Remove @entry from @k's value, shifting the following entries down and
 * shrinking the key accordingly.
 */
void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
{
	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
	union bch_extent_entry *next = extent_entry_next(entry);

	memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
	k->k.u64s -= extent_entry_u64s(entry);
}
808
/*
 * Append decoded pointer @p to @k, reusing an existing crc entry that
 * matches @p->crc if one exists, otherwise appending a new crc entry
 * first.  The pointer (and stripe pointer, if any) is inserted right
 * after the crc entry that governs it.
 */
void bch2_extent_ptr_decoded_append(struct bkey_i *k,
				    struct extent_ptr_decoded *p)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	struct bch_extent_crc_unpacked crc =
		bch2_extent_crc_unpack(&k->k, NULL);
	union bch_extent_entry *pos;

	/* Matches the implicit "no crc" entry at the start of the key? */
	if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
		pos = ptrs.start;
		goto found;
	}

	bkey_for_each_crc(&k->k, ptrs, crc, pos)
		if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
			pos = extent_entry_next(pos);
			goto found;
		}

	/* No matching crc entry - append a new one at the end: */
	bch2_extent_crc_append(k, p->crc);
	pos = bkey_val_end(bkey_i_to_s(k));
found:
	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
	__extent_entry_insert(k, pos, to_entry(&p->ptr));

	if (p->has_ec) {
		p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
		__extent_entry_insert(k, pos, to_entry(&p->ec));
	}
}
839
840static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
841 union bch_extent_entry *entry)
842{
843 union bch_extent_entry *i = ptrs.start;
844
845 if (i == entry)
846 return NULL;
847
848 while (extent_entry_next(i) != entry)
849 i = extent_entry_next(i);
850 return i;
851}
852
/*
 * Drop @ptr from @k without any error-key conversion.
 *
 * For stripe keys the pointer is only invalidated, not removed (stripe
 * layout is fixed).  Otherwise the pointer entry is dropped, along with
 * any preceding crc/stripe_ptr entries that no longer govern a pointer.
 *
 * (The old comment claimed this returns the next entry; it returns void.)
 */
void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
	union bch_extent_entry *entry = to_entry(ptr), *next;
	bool drop_crc = true;

	if (k.k->type == KEY_TYPE_stripe) {
		ptr->dev = BCH_SB_MEMBER_INVALID;
		return;
	}

	EBUG_ON(ptr < &ptrs.start->ptr ||
		ptr >= &ptrs.end->ptr);
	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);

	/*
	 * If another pointer follows before the next crc entry, that crc
	 * entry is still in use and must be kept:
	 */
	for (next = extent_entry_next(entry);
	     next != ptrs.end;
	     next = extent_entry_next(next)) {
		if (extent_entry_is_crc(next)) {
			break;
		} else if (extent_entry_is_ptr(next)) {
			drop_crc = false;
			break;
		}
	}

	extent_entry_drop(k, entry);

	/* Drop now-orphaned crc/stripe_ptr entries preceding the pointer: */
	while ((entry = extent_entry_prev(ptrs, entry))) {
		if (extent_entry_is_ptr(entry))
			break;

		if ((extent_entry_is_crc(entry) && drop_crc) ||
		    extent_entry_is_stripe_ptr(entry))
			extent_entry_drop(k, entry);
	}
}
893
/*
 * Drop @ptr from @k, converting the key to an error key if that removed
 * the last dirty pointer, or to a deleted key if no pointers remain.
 *
 * Erasure coded pointers on non-stripe keys are only invalidated, not
 * removed, so the stripe's reference stays consistent.
 */
void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
	if (k.k->type != KEY_TYPE_stripe) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (p.ptr.dev == ptr->dev && p.has_ec) {
				ptr->dev = BCH_SB_MEMBER_INVALID;
				return;
			}
	}

	bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;

	bch2_bkey_drop_ptr_noerror(k, ptr);

	/*
	 * If we deleted all the dirty pointers and there's still cached
	 * pointers, we could set the cached pointers to dirty if they're not
	 * stale - but to do that correctly we'd need to grab an open_bucket
	 * reference so that we don't race with bucket reuse:
	 */
	if (have_dirty &&
	    !bch2_bkey_dirty_devs(k.s_c).nr) {
		k.k->type = KEY_TYPE_error;
		set_bkey_val_u64s(k.k, 0);
	} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
		k.k->type = KEY_TYPE_deleted;
		set_bkey_val_u64s(k.k, 0);
	}
}
927
/* Drop all of @k's pointers on device @dev (with error-key conversion). */
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
{
	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
932
/* Drop all of @k's pointers on device @dev, without error-key conversion. */
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
	bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
937
/* Return @k's pointer on device @dev, or NULL if it has none. */
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr)
		if (ptr->dev == dev)
			return ptr;

	return NULL;
}
948
/*
 * True if @k has a usable pointer on a device within @target (dirty, or
 * cached but not stale).
 */
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_dev *ca;
	bool ret = false;

	rcu_read_lock();
	bkey_for_each_ptr(ptrs, ptr)
		if (bch2_dev_in_target(c, ptr->dev, target) &&
		    (ca = bch2_dev_rcu(c, ptr->dev)) &&
		    (!ptr->cached ||
		     !dev_ptr_stale_rcu(ca, ptr))) {
			ret = true;
			break;
		}
	rcu_read_unlock();

	return ret;
}
968
/*
 * True if @k has a pointer referring to the same data as @m, where @m was
 * taken from an extent starting at logical @offset - compensates for the
 * extents having been trimmed differently.
 */
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
			   struct bch_extent_ptr m, u64 offset)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (p.ptr.dev	== m.dev &&
		    p.ptr.gen	== m.gen &&
		    (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
		    (s64) m.offset  - offset)
			return true;

	return false;
}
985
/*
 * Returns true if two extents refer to the same data:
 */
bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
{
	if (k1.k->type != k2.k->type)
		return false;

	if (bkey_extent_is_direct_data(k1.k)) {
		struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
		struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
		const union bch_extent_entry *entry1, *entry2;
		struct extent_ptr_decoded p1, p2;

		if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
			return false;

		/* Any one matching pointer pair means the data matches: */
		bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
				if (p1.ptr.dev		== p2.ptr.dev &&
				    p1.ptr.gen		== p2.ptr.gen &&

				    /*
				     * This checks that the two pointers point
				     * to the same region on disk - adjusting
				     * for the difference in where the extents
				     * start, since one may have been trimmed:
				     */
				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&

				    /*
				     * This additionally checks that the
				     * extents overlap on disk, since the
				     * previous check may trigger spuriously
				     * when one extent is immediately partially
				     * overwritten with another extent (so that
				     * on disk they are adjacent) and
				     * compression is in use:
				     */
				    ((p1.ptr.offset >= p2.ptr.offset &&
				      p1.ptr.offset  < p2.ptr.offset + p2.crc.compressed_size) ||
				     (p2.ptr.offset >= p1.ptr.offset &&
				      p2.ptr.offset  < p1.ptr.offset + p1.crc.compressed_size)))
					return true;

		return false;
	} else {
		/* KEY_TYPE_deleted, etc. */
		return true;
	}
}
1038
/*
 * Find the pointer in @k2 that refers to the same data as @p1 from @k1
 * (same device/gen, same disk region adjusted for extent start), or NULL.
 */
struct bch_extent_ptr *
bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
{
	struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
	union bch_extent_entry *entry2;
	struct extent_ptr_decoded p2;

	bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
		if (p1.ptr.dev		== p2.ptr.dev &&
		    p1.ptr.gen		== p2.ptr.gen &&
		    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
		    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
			return &entry2->ptr;

	return NULL;
}
1055
/*
 * Should a cached copy of this pointer's data be kept?  Requires a promote
 * target containing the device, and a healthy device with a non-stale
 * pointer.  Caller holds the RCU read lock.
 */
static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
			    struct bch_extent_ptr *ptr)
{
	if (!opts->promote_target ||
	    !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
		return false;

	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);

	return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
}
1067
/*
 * Demote @ptr in @k to a cached pointer; drops it instead when a cached
 * copy isn't wanted (see want_cached_ptr()) or when the pointer belongs
 * to an erasure coded stripe.
 */
void bch2_extent_ptr_set_cached(struct bch_fs *c,
				struct bch_io_opts *opts,
				struct bkey_s k,
				struct bch_extent_ptr *ptr)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
	union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	rcu_read_lock();
	if (!want_cached_ptr(c, opts, ptr)) {
		bch2_bkey_drop_ptr_noerror(k, ptr);
		goto out;
	}

	/*
	 * Stripes can't contain cached data, for - reasons.
	 *
	 * Possibly something we can fix in the future?
	 */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (&entry->ptr == ptr) {
			if (p.has_ec)
				bch2_bkey_drop_ptr_noerror(k, ptr);
			else
				ptr->cached = true;
			goto out;
		}

	/* @ptr must have been a pointer within @k: */
	BUG();
out:
	rcu_read_unlock();
}
1101
/*
 * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
 *
 * Returns true if @k should be dropped entirely
 *
 * For existing keys, only called when btree nodes are being rewritten, not when
 * they're merely being compacted/resorted in memory.
 */
bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
	struct bch_dev *ca;

	rcu_read_lock();
	/* drop cached pointers to missing devices and stale cached pointers */
	bch2_bkey_drop_ptrs(k, ptr,
		ptr->cached &&
		(!(ca = bch2_dev_rcu(c, ptr->dev)) ||
		 dev_ptr_stale_rcu(ca, ptr) > 0));
	rcu_read_unlock();

	return bkey_deleted(k.k);
}
1123
1124/*
1125 * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
1126 *
1127 * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
1128 * the promote target.
1129 */
1130bool bch2_extent_normalize_by_opts(struct bch_fs *c,
1131 struct bch_io_opts *opts,
1132 struct bkey_s k)
1133{
1134 struct bkey_ptrs ptrs;
1135 bool have_cached_ptr;
1136
1137 rcu_read_lock();
1138restart_drop_ptrs:
1139 ptrs = bch2_bkey_ptrs(k);
1140 have_cached_ptr = false;
1141
1142 bkey_for_each_ptr(ptrs, ptr)
1143 if (ptr->cached) {
1144 if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
1145 bch2_bkey_drop_ptr(k, ptr);
1146 goto restart_drop_ptrs;
1147 }
1148 have_cached_ptr = true;
1149 }
1150 rcu_read_unlock();
1151
1152 return bkey_deleted(k.k);
1153}
1154
1155void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
1156{
1157 out->atomic++;
1158 rcu_read_lock();
1159 struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
1160 if (!ca) {
1161 prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
1162 (u64) ptr->offset, ptr->gen,
1163 ptr->cached ? " cached" : "");
1164 } else {
1165 u32 offset;
1166 u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
1167
1168 prt_printf(out, "ptr: %u:%llu:%u gen %u",
1169 ptr->dev, b, offset, ptr->gen);
1170 if (ca->mi.durability != 1)
1171 prt_printf(out, " d=%u", ca->mi.durability);
1172 if (ptr->cached)
1173 prt_str(out, " cached");
1174 if (ptr->unwritten)
1175 prt_str(out, " unwritten");
1176 int stale = dev_ptr_stale_rcu(ca, ptr);
1177 if (stale > 0)
1178 prt_printf(out, " stale");
1179 else if (stale)
1180 prt_printf(out, " invalid");
1181 }
1182 rcu_read_unlock();
1183 --out->atomic;
1184}
1185
1186void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
1187{
1188 prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
1189 crc->compressed_size,
1190 crc->uncompressed_size,
1191 crc->offset, crc->nonce);
1192 bch2_prt_csum_type(out, crc->csum_type);
1193 prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
1194 prt_str(out, " compress ");
1195 bch2_prt_compression_type(out, crc->compression_type);
1196}
1197
1198static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
1199 const struct bch_extent_rebalance *r)
1200{
1201 prt_str(out, "rebalance:");
1202
1203 prt_printf(out, " replicas=%u", r->data_replicas);
1204 if (r->data_replicas_from_inode)
1205 prt_str(out, " (inode)");
1206
1207 prt_str(out, " checksum=");
1208 bch2_prt_csum_opt(out, r->data_checksum);
1209 if (r->data_checksum_from_inode)
1210 prt_str(out, " (inode)");
1211
1212 if (r->background_compression || r->background_compression_from_inode) {
1213 prt_str(out, " background_compression=");
1214 bch2_compression_opt_to_text(out, r->background_compression);
1215
1216 if (r->background_compression_from_inode)
1217 prt_str(out, " (inode)");
1218 }
1219
1220 if (r->background_target || r->background_target_from_inode) {
1221 prt_str(out, " background_target=");
1222 if (c)
1223 bch2_target_to_text(out, c, r->background_target);
1224 else
1225 prt_printf(out, "%u", r->background_target);
1226
1227 if (r->background_target_from_inode)
1228 prt_str(out, " (inode)");
1229 }
1230
1231 if (r->promote_target || r->promote_target_from_inode) {
1232 prt_str(out, " promote_target=");
1233 if (c)
1234 bch2_target_to_text(out, c, r->promote_target);
1235 else
1236 prt_printf(out, "%u", r->promote_target);
1237
1238 if (r->promote_target_from_inode)
1239 prt_str(out, " (inode)");
1240 }
1241
1242 if (r->erasure_code || r->erasure_code_from_inode) {
1243 prt_printf(out, " ec=%u", r->erasure_code);
1244 if (r->erasure_code_from_inode)
1245 prt_str(out, " (inode)");
1246 }
1247}
1248
/*
 * Print every entry of an extent-style bkey (pointers, crc entries, stripe
 * pointers, rebalance options, flags), space separated, in value order.
 *
 * Stops at the first entry with an unrecognized type: it's printed as raw hex
 * and no further entries are emitted.
 */
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			    struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	bool first = true;

	/* Durability can only be computed with a filesystem: */
	if (c)
		prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));

	bkey_extent_entry_for_each(ptrs, entry) {
		if (!first)
			prt_printf(out, " ");

		switch (__extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
			break;

		case BCH_EXTENT_ENTRY_crc32:
		case BCH_EXTENT_ENTRY_crc64:
		case BCH_EXTENT_ENTRY_crc128: {
			/* Unpack to a common representation before printing: */
			struct bch_extent_crc_unpacked crc =
				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));

			bch2_extent_crc_unpacked_to_text(out, &crc);
			break;
		}
		case BCH_EXTENT_ENTRY_stripe_ptr: {
			const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;

			prt_printf(out, "ec: idx %llu block %u",
				   (u64) ec->idx, ec->block);
			break;
		}
		case BCH_EXTENT_ENTRY_rebalance:
			bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
			break;

		case BCH_EXTENT_ENTRY_flags:
			prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
			break;

		default:
			/* Unknown entry type: dump first word as hex and bail */
			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
			return;
		}

		first = false;
	}
}
1300
/*
 * Validate a single extent pointer within @k: no two pointers in the key may
 * reference the same device, and the pointed-to range must lie within a single
 * valid bucket on its device.
 *
 * Returns 0 (not an error) when the pointer's device is unknown - repair of
 * such pointers is check_fix_ptrs()'s job, not validation's.
 *
 * The bkey_fsck_err*() macros log/repair per fsck policy and may jump to the
 * fsck_err label with an error set.
 */
static int extent_ptr_validate(struct bch_fs *c,
			       struct bkey_s_c k,
			       struct bkey_validate_context from,
			       const struct bch_extent_ptr *ptr,
			       unsigned size_ondisk,
			       bool metadata)
{
	int ret = 0;

	/* A device may appear at most once per key: */
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr(ptrs, ptr2)
		bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
				 c, ptr_to_duplicate_device,
				 "multiple pointers to same device (%u)", ptr->dev);

	/* bad pointers are repaired by check_fix_ptrs(): */
	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
	if (!ca) {
		rcu_read_unlock();
		return 0;
	}
	/*
	 * Snapshot the member info we need while holding the RCU read lock;
	 * the range checks below then run unlocked:
	 */
	u32 bucket_offset;
	u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
	unsigned first_bucket = ca->mi.first_bucket;
	u64 nbuckets = ca->mi.nbuckets;
	unsigned bucket_size = ca->mi.bucket_size;
	rcu_read_unlock();

	bkey_fsck_err_on(bucket >= nbuckets,
			 c, ptr_after_last_bucket,
			 "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
	bkey_fsck_err_on(bucket < first_bucket,
			 c, ptr_before_first_bucket,
			 "pointer before first bucket (%llu < %u)", bucket, first_bucket);
	/* The on-disk extent must not cross a bucket boundary: */
	bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
			 c, ptr_spans_multiple_buckets,
			 "pointer spans multiple buckets (%u + %u > %u)",
			 bucket_offset, size_ondisk, bucket_size);
fsck_err:
	return ret;
}
1343
/*
 * Validate the entry list of an extent-style bkey: per-entry checks plus the
 * invariants between entries - a crc entry must be followed by at least one
 * pointer before the next crc entry (crc_since_last_ptr), a stripe entry by a
 * pointer (have_ec), flags must be the first entry, and encrypted extents must
 * use consistent nonces.
 */
int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
			    struct bkey_validate_context from)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct bch_extent_crc_unpacked crc;
	unsigned size_ondisk = k.k->size;
	unsigned nonce = UINT_MAX;	/* UINT_MAX == not yet seen */
	unsigned nr_ptrs = 0;
	bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
	int ret = 0;

	/* Btree node pointers always cover a full node, not k.k->size: */
	if (bkey_is_btree_ptr(k.k))
		size_ondisk = btree_sectors(c);

	bkey_extent_entry_for_each(ptrs, entry) {
		bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
				 c, extent_ptrs_invalid_entry,
				 "invalid extent entry type (got %u, max %u)",
				 __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);

		/* Btree pointers may only contain plain pointer entries: */
		bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
				 !extent_entry_is_ptr(entry),
				 c, btree_ptr_has_non_ptr,
				 "has non ptr field");

		switch (extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
			if (ret)
				return ret;

			bkey_fsck_err_on(entry->ptr.cached && have_ec,
					 c, ptr_cached_and_erasure_coded,
					 "cached, erasure coded ptr");

			if (!entry->ptr.unwritten)
				have_written = true;
			else
				have_unwritten = true;

			/* A pointer consumes any pending crc/stripe entry: */
			have_ec = false;
			crc_since_last_ptr = false;
			nr_ptrs++;
			break;
		case BCH_EXTENT_ENTRY_crc32:
		case BCH_EXTENT_ENTRY_crc64:
		case BCH_EXTENT_ENTRY_crc128:
			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));

			bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
					 c, ptr_crc_csum_type_unknown,
					 "invalid checksum type");
			bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
					 c, ptr_crc_compression_type_unknown,
					 "invalid compression type");

			bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
					 c, ptr_crc_uncompressed_size_too_small,
					 "checksum offset + key size > uncompressed size");
			/*
			 * Size limit on encoded extents is only enforced on
			 * write/commit - existing keys may predate the limit:
			 */
			bkey_fsck_err_on(crc_is_encoded(crc) &&
					 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
					 (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
					 c, ptr_crc_uncompressed_size_too_big,
					 "too large encoded extent");
			bkey_fsck_err_on(!crc_is_compressed(crc) &&
					 crc.compressed_size != crc.uncompressed_size,
					 c, ptr_crc_uncompressed_size_mismatch,
					 "not compressed but compressed != uncompressed size");

			/*
			 * All crc entries of an encrypted extent must agree on
			 * the extent's nonce (offset + per-entry nonce):
			 */
			if (bch2_csum_type_is_encryption(crc.csum_type)) {
				if (nonce == UINT_MAX)
					nonce = crc.offset + crc.nonce;
				else if (nonce != crc.offset + crc.nonce)
					bkey_fsck_err(c, ptr_crc_nonce_mismatch,
						      "incorrect nonce");
			}

			/* Two crc entries with no pointer between them: */
			bkey_fsck_err_on(crc_since_last_ptr,
					 c, ptr_crc_redundant,
					 "redundant crc entry");
			crc_since_last_ptr = true;

			/* Subsequent pointers cover the compressed size: */
			size_ondisk = crc.compressed_size;
			break;
		case BCH_EXTENT_ENTRY_stripe_ptr:
			/* Two stripe entries with no pointer between them: */
			bkey_fsck_err_on(have_ec,
					 c, ptr_stripe_redundant,
					 "redundant stripe entry");
			have_ec = true;
			break;
		case BCH_EXTENT_ENTRY_rebalance: {
			/*
			 * this shouldn't be a fsck error, for forward
			 * compatibility; the rebalance code should just refetch
			 * the compression opt if it's unknown
			 */
#if 0
			const struct bch_extent_rebalance *r = &entry->rebalance;

			if (!bch2_compression_opt_valid(r->compression)) {
				struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
				prt_printf(err, "invalid compression opt %u:%u",
					   opt.type, opt.level);
				return -BCH_ERR_invalid_bkey;
			}
#endif
			break;
		}
		case BCH_EXTENT_ENTRY_flags:
			bkey_fsck_err_on(entry != ptrs.start,
					 c, extent_flags_not_at_start,
					 "extent flags entry not at start");
			break;
		}
	}

	/* Whole-key invariants: */
	bkey_fsck_err_on(!nr_ptrs,
			 c, extent_ptrs_no_ptrs,
			 "no ptrs");
	bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
			 c, extent_ptrs_too_many_ptrs,
			 "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
	bkey_fsck_err_on(have_written && have_unwritten,
			 c, extent_ptrs_written_and_unwritten,
			 "extent with unwritten and written ptrs");
	bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
			 c, extent_ptrs_unwritten,
			 "has unwritten ptrs");
	/* A trailing crc/stripe entry with no pointer after it: */
	bkey_fsck_err_on(crc_since_last_ptr,
			 c, extent_ptrs_redundant_crc,
			 "redundant crc entry");
	bkey_fsck_err_on(have_ec,
			 c, extent_ptrs_redundant_stripe,
			 "redundant stripe entry");
fsck_err:
	return ret;
}
1482
/*
 * Byte-swap an extent value (endianness conversion).
 *
 * First pass: every 64 bit word of the value is swabbed. Second pass:
 * checksum fields that aren't plain 64 bit words (crc32's 32 bit csum,
 * crc64's split hi/lo, crc128's two halves) get an additional corrective
 * swab at their own width.
 */
void bch2_ptr_swab(struct bkey_s k)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
	union bch_extent_entry *entry;
	u64 *d;

	/* Pass 1: swab all 64 bit words of the value: */
	for (d = (u64 *) ptrs.start;
	     d != (u64 *) ptrs.end;
	     d++)
		*d = swab64(*d);

	/* Pass 2: fix up checksum fields per entry type: */
	for (entry = ptrs.start;
	     entry < ptrs.end;
	     entry = extent_entry_next(entry)) {
		switch (__extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			break;
		case BCH_EXTENT_ENTRY_crc32:
			entry->crc32.csum = swab32(entry->crc32.csum);
			break;
		case BCH_EXTENT_ENTRY_crc64:
			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
			break;
		case BCH_EXTENT_ENTRY_crc128:
			entry->crc128.csum.hi = (__force __le64)
				swab64((__force u64) entry->crc128.csum.hi);
			entry->crc128.csum.lo = (__force __le64)
				swab64((__force u64) entry->crc128.csum.lo);
			break;
		case BCH_EXTENT_ENTRY_stripe_ptr:
			break;
		case BCH_EXTENT_ENTRY_rebalance:
			break;
		default:
			/* Bad entry type: will be caught by validate() */
			return;
		}
	}
}
1523
1524int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
1525{
1526 int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
1527 if (ret)
1528 return ret;
1529
1530 struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
1531
1532 if (ptrs.start != ptrs.end &&
1533 extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
1534 ptrs.start->flags.flags = flags;
1535 } else {
1536 struct bch_extent_flags f = {
1537 .type = BIT(BCH_EXTENT_ENTRY_flags),
1538 .flags = flags,
1539 };
1540 __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
1541 }
1542
1543 return 0;
1544}
1545
1546/* Generic extent code: */
1547
/*
 * bch2_cut_front_s - chop off the front of an extent, moving its start forward
 * to @where.
 *
 * Pointer/crc offsets (and reflink indices, inline data) are adjusted so the
 * remaining portion of the key still refers to the correct data; if the value
 * shrinks (inline data), the freed tail is zeroed.
 *
 * Returns the negation of the change in value size, in u64s (i.e. <= 0).
 */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
{
	unsigned new_val_u64s = bkey_val_u64s(k.k);
	int val_u64s_delta;
	u64 sub;

	/* Nothing to cut if @where is at or before the start: */
	if (bkey_le(where, bkey_start_pos(k.k)))
		return 0;

	EBUG_ON(bkey_gt(where, k.k->p));

	/* Number of sectors being removed from the front: */
	sub = where.offset - bkey_start_offset(k.k);

	k.k->size -= sub;

	if (!k.k->size) {
		k.k->type = KEY_TYPE_deleted;
		new_val_u64s = 0;
	}

	switch (k.k->type) {
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v: {
		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
		union bch_extent_entry *entry;
		bool seen_crc = false;

		bkey_extent_entry_for_each(ptrs, entry) {
			switch (extent_entry_type(entry)) {
			case BCH_EXTENT_ENTRY_ptr:
				/*
				 * Pointers after a crc entry are left alone:
				 * the shift is absorbed by that crc entry's
				 * offset field instead:
				 */
				if (!seen_crc)
					entry->ptr.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_crc32:
				entry->crc32.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_crc64:
				entry->crc64.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_crc128:
				entry->crc128.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_stripe_ptr:
			case BCH_EXTENT_ENTRY_rebalance:
			case BCH_EXTENT_ENTRY_flags:
				/* No offset to adjust */
				break;
			}

			if (extent_entry_is_crc(entry))
				seen_crc = true;
		}

		break;
	}
	case KEY_TYPE_reflink_p: {
		struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);

		/* Advance the index into the indirect extent: */
		SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
		break;
	}
	case KEY_TYPE_inline_data:
	case KEY_TYPE_indirect_inline_data: {
		void *p = bkey_inline_data_p(k);
		unsigned bytes = bkey_inline_data_bytes(k.k);

		/* Convert sectors to bytes, clamped to the data we have: */
		sub = min_t(u64, sub << 9, bytes);

		memmove(p, p + sub, bytes - sub);

		/* Value shrinks by the removed bytes, in u64s: */
		new_val_u64s -= sub >> 3;
		break;
	}
	}

	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
	BUG_ON(val_u64s_delta < 0);

	set_bkey_val_u64s(k.k, new_val_u64s);
	/* Zero the tail we just freed: */
	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
	return -val_u64s_delta;
}
1629
1630int bch2_cut_back_s(struct bpos where, struct bkey_s k)
1631{
1632 unsigned new_val_u64s = bkey_val_u64s(k.k);
1633 int val_u64s_delta;
1634 u64 len = 0;
1635
1636 if (bkey_ge(where, k.k->p))
1637 return 0;
1638
1639 EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
1640
1641 len = where.offset - bkey_start_offset(k.k);
1642
1643 k.k->p.offset = where.offset;
1644 k.k->size = len;
1645
1646 if (!len) {
1647 k.k->type = KEY_TYPE_deleted;
1648 new_val_u64s = 0;
1649 }
1650
1651 switch (k.k->type) {
1652 case KEY_TYPE_inline_data:
1653 case KEY_TYPE_indirect_inline_data:
1654 new_val_u64s = (bkey_inline_data_offset(k.k) +
1655 min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
1656 break;
1657 }
1658
1659 val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
1660 BUG_ON(val_u64s_delta < 0);
1661
1662 set_bkey_val_u64s(k.k, new_val_u64s);
1663 memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
1664 return -val_u64s_delta;
1665}