1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright 2012 Google, Inc.
4 *
5 * Foreground allocator code: allocate buckets from freelist, and allocate in
6 * sector granularity from writepoints.
7 *
8 * bch2_bucket_alloc() allocates a single bucket from a specific device.
9 *
10 * bch2_bucket_alloc_set() allocates one or more buckets from different devices
11 * in a given filesystem.
12 */
13
14#include "bcachefs.h"
15#include "alloc_background.h"
16#include "alloc_foreground.h"
17#include "backpointers.h"
18#include "btree_iter.h"
19#include "btree_update.h"
20#include "btree_gc.h"
21#include "buckets.h"
22#include "buckets_waiting_for_journal.h"
23#include "clock.h"
24#include "debug.h"
25#include "disk_groups.h"
26#include "ec.h"
27#include "error.h"
28#include "io_write.h"
29#include "journal.h"
30#include "movinggc.h"
31#include "nocow_locking.h"
32#include "trace.h"
33
34#include <linux/math64.h>
35#include <linux/rculist.h>
36#include <linux/rcupdate.h>
37
38static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
39 struct mutex *lock)
40{
41 if (!mutex_trylock(lock)) {
42 bch2_trans_unlock(trans);
43 mutex_lock(lock);
44 }
45}
46
47const char * const bch2_watermarks[] = {
48#define x(t) #t,
49 BCH_WATERMARKS()
50#undef x
51 NULL
52};
53
54/*
55 * Open buckets represent a bucket that's currently being allocated from. They
56 * serve two purposes:
57 *
58 * - They track buckets that have been partially allocated, allowing for
59 * sub-bucket sized allocations - they're used by the sector allocator below
60 *
61 * - They provide a reference to the buckets they own that mark and sweep GC
62 * can find, until the new allocation has a pointer to it inserted into the
63 * btree
64 *
65 * When allocating some space with the sector allocator, the allocation comes
66 * with a reference to an open bucket - the caller is required to put that
67 * reference _after_ doing the index update that makes its allocation reachable.
68 */
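/*
 * Illustrative sketch of that rule (not code from this file; the write itself
 * and error handling are elided):
 *
 *	ob = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
 *	if (!IS_ERR(ob)) {
 *		... write to the bucket, then do the btree update that makes
 *		... the new allocation reachable
 *		bch2_open_bucket_put(c, ob);	// only after the index update
 *	}
 */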
69
70void bch2_reset_alloc_cursors(struct bch_fs *c)
71{
72 struct bch_dev *ca;
73 unsigned i;
74
75 rcu_read_lock();
76 for_each_member_device_rcu(ca, c, i, NULL)
77 ca->alloc_cursor = 0;
78 rcu_read_unlock();
79}
80
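/*
 * Open buckets are kept in a hash table keyed on (device, bucket), chained
 * through open_bucket->hash, so that bch2_bucket_is_open() can cheaply tell
 * whether a given bucket is currently being allocated from:
 */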
81static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
82{
83 open_bucket_idx_t idx = ob - c->open_buckets;
84 open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
85
86 ob->hash = *slot;
87 *slot = idx;
88}
89
90static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
91{
92 open_bucket_idx_t idx = ob - c->open_buckets;
93 open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
94
95 while (*slot != idx) {
96 BUG_ON(!*slot);
97 slot = &c->open_buckets[*slot].hash;
98 }
99
100 *slot = ob->hash;
101 ob->hash = 0;
102}
103
104void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
105{
106 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
107
108 if (ob->ec) {
109 ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
110 return;
111 }
112
113 percpu_down_read(&c->mark_lock);
114 spin_lock(&ob->lock);
115
116 ob->valid = false;
117 ob->data_type = 0;
118
119 spin_unlock(&ob->lock);
120 percpu_up_read(&c->mark_lock);
121
122 spin_lock(&c->freelist_lock);
123 bch2_open_bucket_hash_remove(c, ob);
124
125 ob->freelist = c->open_buckets_freelist;
126 c->open_buckets_freelist = ob - c->open_buckets;
127
128 c->open_buckets_nr_free++;
129 ca->nr_open_buckets--;
130 spin_unlock(&c->freelist_lock);
131
132 closure_wake_up(&c->open_buckets_wait);
133}
134
135void bch2_open_bucket_write_error(struct bch_fs *c,
136 struct open_buckets *obs,
137 unsigned dev)
138{
139 struct open_bucket *ob;
140 unsigned i;
141
142 open_bucket_for_each(c, obs, ob, i)
143 if (ob->dev == dev && ob->ec)
144 bch2_ec_bucket_cancel(c, ob);
145}
146
147static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
148{
149 struct open_bucket *ob;
150
151 BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
152
153 ob = c->open_buckets + c->open_buckets_freelist;
154 c->open_buckets_freelist = ob->freelist;
155 atomic_set(&ob->pin, 1);
156 ob->data_type = 0;
157
158 c->open_buckets_nr_free--;
159 return ob;
160}
161
162static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
163{
164 BUG_ON(c->open_buckets_partial_nr >=
165 ARRAY_SIZE(c->open_buckets_partial));
166
167 spin_lock(&c->freelist_lock);
168 ob->on_partial_list = true;
169 c->open_buckets_partial[c->open_buckets_partial_nr++] =
170 ob - c->open_buckets;
171 spin_unlock(&c->freelist_lock);
172
173 closure_wake_up(&c->open_buckets_wait);
174 closure_wake_up(&c->freelist_wait);
175}
176
177/* _only_ for allocating the journal on a new device: */
178long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
179{
180 while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
181 u64 b = ca->new_fs_bucket_idx++;
182
183 if (!is_superblock_bucket(ca, b) &&
184 (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
185 return b;
186 }
187
188 return -1;
189}
190
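/*
 * Each watermark reserves part of the open_buckets table for more important
 * allocations: a normal allocation fails with -BCH_ERR_open_buckets_empty once
 * at most half the table is still free, copygc once at most a third is free,
 * btree/btree_copygc once at most a quarter is free, while reclaim may use the
 * table right down to the last open bucket:
 */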
191static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
192{
193 switch (watermark) {
194 case BCH_WATERMARK_reclaim:
195 return 0;
196 case BCH_WATERMARK_btree:
197 case BCH_WATERMARK_btree_copygc:
198 return OPEN_BUCKETS_COUNT / 4;
199 case BCH_WATERMARK_copygc:
200 return OPEN_BUCKETS_COUNT / 3;
201 default:
202 return OPEN_BUCKETS_COUNT / 2;
203 }
204}
205
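/*
 * Final, device-level checks on a bucket we believe to be free: skip it if
 * it's marked nouse, already open, still waiting on a journal commit, or
 * nocow locked; otherwise turn it into an open_bucket under freelist_lock:
 */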
206static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
207 u64 bucket,
208 enum bch_watermark watermark,
209 const struct bch_alloc_v4 *a,
210 struct bucket_alloc_state *s,
211 struct closure *cl)
212{
213 struct open_bucket *ob;
214
215 if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
216 s->skipped_nouse++;
217 return NULL;
218 }
219
220 if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
221 s->skipped_open++;
222 return NULL;
223 }
224
225 if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
226 c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
227 s->skipped_need_journal_commit++;
228 return NULL;
229 }
230
231 if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
232 s->skipped_nocow++;
233 return NULL;
234 }
235
236 spin_lock(&c->freelist_lock);
237
238 if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
239 if (cl)
240 closure_wait(&c->open_buckets_wait, cl);
241
242 if (!c->blocked_allocate_open_bucket)
243 c->blocked_allocate_open_bucket = local_clock();
244
245 spin_unlock(&c->freelist_lock);
246 return ERR_PTR(-BCH_ERR_open_buckets_empty);
247 }
248
249 /* Recheck under lock: */
250 if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
251 spin_unlock(&c->freelist_lock);
252 s->skipped_open++;
253 return NULL;
254 }
255
256 ob = bch2_open_bucket_alloc(c);
257
258 spin_lock(&ob->lock);
259
260 ob->valid = true;
261 ob->sectors_free = ca->mi.bucket_size;
262 ob->dev = ca->dev_idx;
263 ob->gen = a->gen;
264 ob->bucket = bucket;
265 spin_unlock(&ob->lock);
266
267 ca->nr_open_buckets++;
268 bch2_open_bucket_hash_add(c, ob);
269
270 if (c->blocked_allocate_open_bucket) {
271 bch2_time_stats_update(
272 &c->times[BCH_TIME_blocked_allocate_open_bucket],
273 c->blocked_allocate_open_bucket);
274 c->blocked_allocate_open_bucket = 0;
275 }
276
277 if (c->blocked_allocate) {
278 bch2_time_stats_update(
279 &c->times[BCH_TIME_blocked_allocate],
280 c->blocked_allocate);
281 c->blocked_allocate = 0;
282 }
283
284 spin_unlock(&c->freelist_lock);
285 return ob;
286}
287
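/*
 * Freespace-btree path: given an entry from BTREE_ID_freespace, cross-check
 * the corresponding alloc key (bucket within the device's range, still
 * BCH_DATA_free, matching generation bits, and, during early recovery, no
 * remaining backpointers) before handing the bucket to __try_alloc_bucket():
 */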
288static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
289 enum bch_watermark watermark, u64 free_entry,
290 struct bucket_alloc_state *s,
291 struct bkey_s_c freespace_k,
292 struct closure *cl)
293{
294 struct bch_fs *c = trans->c;
295 struct btree_iter iter = { NULL };
296 struct bkey_s_c k;
297 struct open_bucket *ob;
298 struct bch_alloc_v4 a_convert;
299 const struct bch_alloc_v4 *a;
300 u64 b = free_entry & ~(~0ULL << 56);
301 unsigned genbits = free_entry >> 56;
302 struct printbuf buf = PRINTBUF;
303 int ret;
304
305 if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
306 prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
307 " freespace key ",
308 ca->mi.first_bucket, ca->mi.nbuckets);
309 bch2_bkey_val_to_text(&buf, c, freespace_k);
310 bch2_trans_inconsistent(trans, "%s", buf.buf);
311 ob = ERR_PTR(-EIO);
312 goto err;
313 }
314
315 k = bch2_bkey_get_iter(trans, &iter,
316 BTREE_ID_alloc, POS(ca->dev_idx, b),
317 BTREE_ITER_CACHED);
318 ret = bkey_err(k);
319 if (ret) {
320 ob = ERR_PTR(ret);
321 goto err;
322 }
323
324 a = bch2_alloc_to_v4(k, &a_convert);
325
326 if (a->data_type != BCH_DATA_free) {
327 if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
328 ob = NULL;
329 goto err;
330 }
331
332 prt_printf(&buf, "non free bucket in freespace btree\n"
333 " freespace key ");
334 bch2_bkey_val_to_text(&buf, c, freespace_k);
335 prt_printf(&buf, "\n ");
336 bch2_bkey_val_to_text(&buf, c, k);
337 bch2_trans_inconsistent(trans, "%s", buf.buf);
338 ob = ERR_PTR(-EIO);
339 goto err;
340 }
341
342 if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
343 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
344 prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
345 " freespace key ",
346 genbits, alloc_freespace_genbits(*a) >> 56);
347 bch2_bkey_val_to_text(&buf, c, freespace_k);
348 prt_printf(&buf, "\n ");
349 bch2_bkey_val_to_text(&buf, c, k);
350 bch2_trans_inconsistent(trans, "%s", buf.buf);
351 ob = ERR_PTR(-EIO);
352 goto err;
353 }
354
355 if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
356 struct bch_backpointer bp;
357 struct bpos bp_pos = POS_MIN;
358
359 ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
360 &bp_pos, &bp,
361 BTREE_ITER_NOPRESERVE);
362 if (ret) {
363 ob = ERR_PTR(ret);
364 goto err;
365 }
366
367 if (!bkey_eq(bp_pos, POS_MAX)) {
368 /*
369 * Bucket may have data in it - we don't call
 370 * bch2_trans_inconsistent() because fsck hasn't
371 * finished yet
372 */
373 ob = NULL;
374 goto err;
375 }
376 }
377
378 ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
379 if (!ob)
380 iter.path->preserve = false;
381err:
382 if (iter.trans && iter.path)
383 set_btree_iter_dontneed(&iter);
384 bch2_trans_iter_exit(trans, &iter);
385 printbuf_exit(&buf);
386 return ob;
387}
388
389/*
 390 * This path is used before the freespace btree is initialized:
391 *
392 * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
393 * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
394 */
395static noinline struct open_bucket *
396bch2_bucket_alloc_early(struct btree_trans *trans,
397 struct bch_dev *ca,
398 enum bch_watermark watermark,
399 struct bucket_alloc_state *s,
400 struct closure *cl)
401{
402 struct btree_iter iter, citer;
403 struct bkey_s_c k, ck;
404 struct open_bucket *ob = NULL;
405 u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
406 u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
407 u64 alloc_cursor = alloc_start;
408 int ret;
409
410 /*
411 * Scan with an uncached iterator to avoid polluting the key cache. An
412 * uncached iter will return a cached key if one exists, but if not
413 * there is no other underlying protection for the associated key cache
414 * slot. To avoid racing bucket allocations, look up the cached key slot
415 * of any likely allocation candidate before attempting to proceed with
416 * the allocation. This provides proper exclusion on the associated
417 * bucket.
418 */
419again:
420 for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
421 BTREE_ITER_SLOTS, k, ret) {
422 struct bch_alloc_v4 a_convert;
423 const struct bch_alloc_v4 *a;
424
425 if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
426 break;
427
428 if (ca->new_fs_bucket_idx &&
429 is_superblock_bucket(ca, k.k->p.offset))
430 continue;
431
432 a = bch2_alloc_to_v4(k, &a_convert);
433 if (a->data_type != BCH_DATA_free)
434 continue;
435
436 /* now check the cached key to serialize concurrent allocs of the bucket */
437 ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
438 ret = bkey_err(ck);
439 if (ret)
440 break;
441
442 a = bch2_alloc_to_v4(ck, &a_convert);
443 if (a->data_type != BCH_DATA_free)
444 goto next;
445
446 s->buckets_seen++;
447
448 ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
449next:
450 citer.path->preserve = false;
451 bch2_trans_iter_exit(trans, &citer);
452 if (ob)
453 break;
454 }
455 bch2_trans_iter_exit(trans, &iter);
456
457 alloc_cursor = iter.pos.offset;
458 ca->alloc_cursor = alloc_cursor;
459
460 if (!ob && ret)
461 ob = ERR_PTR(ret);
462
463 if (!ob && alloc_start > first_bucket) {
464 alloc_cursor = alloc_start = first_bucket;
465 goto again;
466 }
467
468 return ob;
469}
470
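/*
 * Normal allocation path once the freespace btree has been initialized: scan
 * BTREE_ID_freespace starting from the device's allocation cursor, and wrap
 * around to the first bucket once if nothing was found past the cursor:
 */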
471static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
472 struct bch_dev *ca,
473 enum bch_watermark watermark,
474 struct bucket_alloc_state *s,
475 struct closure *cl)
476{
477 struct btree_iter iter;
478 struct bkey_s_c k;
479 struct open_bucket *ob = NULL;
480 u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
481 u64 alloc_cursor = alloc_start;
482 int ret;
483
484 BUG_ON(ca->new_fs_bucket_idx);
485again:
486 for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
487 POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
488 if (k.k->p.inode != ca->dev_idx)
489 break;
490
491 for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
492 alloc_cursor < k.k->p.offset;
493 alloc_cursor++) {
494 ret = btree_trans_too_many_iters(trans);
495 if (ret) {
496 ob = ERR_PTR(ret);
497 break;
498 }
499
500 s->buckets_seen++;
501
502 ob = try_alloc_bucket(trans, ca, watermark,
503 alloc_cursor, s, k, cl);
504 if (ob) {
505 iter.path->preserve = false;
506 break;
507 }
508 }
509
510 if (ob || ret)
511 break;
512 }
513 bch2_trans_iter_exit(trans, &iter);
514
515 ca->alloc_cursor = alloc_cursor;
516
517 if (!ob && ret)
518 ob = ERR_PTR(ret);
519
520 if (!ob && alloc_start > ca->mi.first_bucket) {
521 alloc_cursor = alloc_start = ca->mi.first_bucket;
522 goto again;
523 }
524
525 return ob;
526}
527
528/**
529 * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
530 * @trans: transaction object
531 * @ca: device to allocate from
532 * @watermark: how important is this allocation?
533 * @cl: if not NULL, closure to be used to wait if buckets not available
 534 * @usage: filled in with the current device usage
535 *
536 * Returns: an open_bucket on success, or an ERR_PTR() on failure.
537 */
538static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
539 struct bch_dev *ca,
540 enum bch_watermark watermark,
541 struct closure *cl,
542 struct bch_dev_usage *usage)
543{
544 struct bch_fs *c = trans->c;
545 struct open_bucket *ob = NULL;
546 bool freespace = READ_ONCE(ca->mi.freespace_initialized);
547 u64 avail;
548 struct bucket_alloc_state s = { 0 };
549 bool waiting = false;
550again:
551 bch2_dev_usage_read_fast(ca, usage);
552 avail = dev_buckets_free(ca, *usage, watermark);
553
554 if (usage->d[BCH_DATA_need_discard].buckets > avail)
555 bch2_do_discards(c);
556
557 if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
558 bch2_do_gc_gens(c);
559
560 if (should_invalidate_buckets(ca, *usage))
561 bch2_do_invalidates(c);
562
563 if (!avail) {
564 if (cl && !waiting) {
565 closure_wait(&c->freelist_wait, cl);
566 waiting = true;
567 goto again;
568 }
569
570 if (!c->blocked_allocate)
571 c->blocked_allocate = local_clock();
572
573 ob = ERR_PTR(-BCH_ERR_freelist_empty);
574 goto err;
575 }
576
577 if (waiting)
578 closure_wake_up(&c->freelist_wait);
579alloc:
580 ob = likely(freespace)
581 ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
582 : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
583
584 if (s.skipped_need_journal_commit * 2 > avail)
585 bch2_journal_flush_async(&c->journal, NULL);
586
587 if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
588 freespace = false;
589 goto alloc;
590 }
591err:
592 if (!ob)
593 ob = ERR_PTR(-BCH_ERR_no_buckets_found);
594
595 if (!IS_ERR(ob))
596 trace_and_count(c, bucket_alloc, ca,
597 bch2_watermarks[watermark],
598 ob->bucket,
599 usage->d[BCH_DATA_free].buckets,
600 avail,
601 bch2_copygc_wait_amount(c),
602 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
603 &s,
604 cl == NULL,
605 "");
606 else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
607 trace_and_count(c, bucket_alloc_fail, ca,
608 bch2_watermarks[watermark],
609 0,
610 usage->d[BCH_DATA_free].buckets,
611 avail,
612 bch2_copygc_wait_amount(c),
613 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
614 &s,
615 cl == NULL,
616 bch2_err_str(PTR_ERR(ob)));
617
618 return ob;
619}
620
621struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
622 enum bch_watermark watermark,
623 struct closure *cl)
624{
625 struct bch_dev_usage usage;
626 struct open_bucket *ob;
627
628 bch2_trans_do(c, NULL, NULL, 0,
629 PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
630 cl, &usage)));
631 return ob;
632}
633
634static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
635 unsigned l, unsigned r)
636{
637 return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
638 (stripe->next_alloc[l] < stripe->next_alloc[r]));
639}
640
641#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
642
643struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
644 struct dev_stripe_state *stripe,
645 struct bch_devs_mask *devs)
646{
647 struct dev_alloc_list ret = { .nr = 0 };
648 unsigned i;
649
650 for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
651 ret.devs[ret.nr++] = i;
652
653 bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
654 return ret;
655}
656
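/*
 * Device striping: next_alloc[] is a running "cost" per device; every
 * allocation from a device adds roughly 2^48 / free_buckets to it, so devices
 * with more free space accumulate cost more slowly and sort earlier in
 * bch2_dev_alloc_list(). As a rough worked example (numbers illustrative): a
 * device with 1000 free buckets gains ~2.8e11 per allocation while one with
 * 100 free buckets gains ~2.8e12, so writes land on the fuller device about
 * ten times less often. The trailing loop rescales every entry downwards so
 * the counters stay bounded:
 */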
657static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
658 struct dev_stripe_state *stripe,
659 struct bch_dev_usage *usage)
660{
661 u64 *v = stripe->next_alloc + ca->dev_idx;
662 u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
663 u64 free_space_inv = free_space
664 ? div64_u64(1ULL << 48, free_space)
665 : 1ULL << 48;
666 u64 scale = *v / 4;
667
668 if (*v + free_space_inv >= *v)
669 *v += free_space_inv;
670 else
671 *v = U64_MAX;
672
673 for (v = stripe->next_alloc;
674 v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
675 *v = *v < scale ? 0 : *v - scale;
676}
677
678void bch2_dev_stripe_increment(struct bch_dev *ca,
679 struct dev_stripe_state *stripe)
680{
681 struct bch_dev_usage usage;
682
683 bch2_dev_usage_read_fast(ca, &usage);
684 bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
685}
686
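/*
 * Record @ob in @ptrs, clear its device from the set of allocation candidates
 * and count it towards *nr_effective; returns nonzero once we have enough
 * replicas, or as soon as we've picked up an erasure coded bucket:
 */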
687static int add_new_bucket(struct bch_fs *c,
688 struct open_buckets *ptrs,
689 struct bch_devs_mask *devs_may_alloc,
690 unsigned nr_replicas,
691 unsigned *nr_effective,
692 bool *have_cache,
693 unsigned flags,
694 struct open_bucket *ob)
695{
696 unsigned durability =
697 bch_dev_bkey_exists(c, ob->dev)->mi.durability;
698
699 BUG_ON(*nr_effective >= nr_replicas);
700 BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
701
702 __clear_bit(ob->dev, devs_may_alloc->d);
703 *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
704 ? durability : 1;
705 *have_cache |= !durability;
706
707 ob_push(c, ptrs, ob);
708
709 if (*nr_effective >= nr_replicas)
710 return 1;
711 if (ob->ec)
712 return 1;
713 return 0;
714}
715
716int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
717 struct open_buckets *ptrs,
718 struct dev_stripe_state *stripe,
719 struct bch_devs_mask *devs_may_alloc,
720 unsigned nr_replicas,
721 unsigned *nr_effective,
722 bool *have_cache,
723 unsigned flags,
724 enum bch_data_type data_type,
725 enum bch_watermark watermark,
726 struct closure *cl)
727{
728 struct bch_fs *c = trans->c;
729 struct dev_alloc_list devs_sorted =
730 bch2_dev_alloc_list(c, stripe, devs_may_alloc);
731 unsigned dev;
732 struct bch_dev *ca;
733 int ret = -BCH_ERR_insufficient_devices;
734 unsigned i;
735
736 BUG_ON(*nr_effective >= nr_replicas);
737
738 for (i = 0; i < devs_sorted.nr; i++) {
739 struct bch_dev_usage usage;
740 struct open_bucket *ob;
741
742 dev = devs_sorted.devs[i];
743
744 rcu_read_lock();
745 ca = rcu_dereference(c->devs[dev]);
746 if (ca)
747 percpu_ref_get(&ca->ref);
748 rcu_read_unlock();
749
750 if (!ca)
751 continue;
752
753 if (!ca->mi.durability && *have_cache) {
754 percpu_ref_put(&ca->ref);
755 continue;
756 }
757
758 ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
759 if (!IS_ERR(ob))
760 bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
761 percpu_ref_put(&ca->ref);
762
763 if (IS_ERR(ob)) {
764 ret = PTR_ERR(ob);
765 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
766 break;
767 continue;
768 }
769
770 ob->data_type = data_type;
771
772 if (add_new_bucket(c, ptrs, devs_may_alloc,
773 nr_replicas, nr_effective,
774 have_cache, flags, ob)) {
775 ret = 0;
776 break;
777 }
778 }
779
780 return ret;
781}
782
783/* Allocate from stripes: */
784
785/*
786 * if we can't allocate a new stripe because there are already too many
787 * partially filled stripes, force allocating from an existing stripe even when
788 * it's to a device we don't want:
789 */
790
791static int bucket_alloc_from_stripe(struct btree_trans *trans,
792 struct open_buckets *ptrs,
793 struct write_point *wp,
794 struct bch_devs_mask *devs_may_alloc,
795 u16 target,
796 unsigned nr_replicas,
797 unsigned *nr_effective,
798 bool *have_cache,
799 enum bch_watermark watermark,
800 unsigned flags,
801 struct closure *cl)
802{
803 struct bch_fs *c = trans->c;
804 struct dev_alloc_list devs_sorted;
805 struct ec_stripe_head *h;
806 struct open_bucket *ob;
807 unsigned i, ec_idx;
808 int ret = 0;
809
810 if (nr_replicas < 2)
811 return 0;
812
813 if (ec_open_bucket(c, ptrs))
814 return 0;
815
816 h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
817 if (IS_ERR(h))
818 return PTR_ERR(h);
819 if (!h)
820 return 0;
821
822 devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
823
824 for (i = 0; i < devs_sorted.nr; i++)
825 for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
826 if (!h->s->blocks[ec_idx])
827 continue;
828
829 ob = c->open_buckets + h->s->blocks[ec_idx];
830 if (ob->dev == devs_sorted.devs[i] &&
831 !test_and_set_bit(ec_idx, h->s->blocks_allocated))
832 goto got_bucket;
833 }
834 goto out_put_head;
835got_bucket:
836 ob->ec_idx = ec_idx;
837 ob->ec = h->s;
838 ec_stripe_new_get(h->s, STRIPE_REF_io);
839
840 ret = add_new_bucket(c, ptrs, devs_may_alloc,
841 nr_replicas, nr_effective,
842 have_cache, flags, ob);
843out_put_head:
844 bch2_ec_stripe_head_put(c, h);
845 return ret;
846}
847
848/* Sector allocator */
849
850static bool want_bucket(struct bch_fs *c,
851 struct write_point *wp,
852 struct bch_devs_mask *devs_may_alloc,
853 bool *have_cache, bool ec,
854 struct open_bucket *ob)
855{
856 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
857
858 if (!test_bit(ob->dev, devs_may_alloc->d))
859 return false;
860
861 if (ob->data_type != wp->data_type)
862 return false;
863
864 if (!ca->mi.durability &&
865 (wp->data_type == BCH_DATA_btree || ec || *have_cache))
866 return false;
867
868 if (ec != (ob->ec != NULL))
869 return false;
870
871 return true;
872}
873
874static int bucket_alloc_set_writepoint(struct bch_fs *c,
875 struct open_buckets *ptrs,
876 struct write_point *wp,
877 struct bch_devs_mask *devs_may_alloc,
878 unsigned nr_replicas,
879 unsigned *nr_effective,
880 bool *have_cache,
881 bool ec, unsigned flags)
882{
883 struct open_buckets ptrs_skip = { .nr = 0 };
884 struct open_bucket *ob;
885 unsigned i;
886 int ret = 0;
887
888 open_bucket_for_each(c, &wp->ptrs, ob, i) {
889 if (!ret && want_bucket(c, wp, devs_may_alloc,
890 have_cache, ec, ob))
891 ret = add_new_bucket(c, ptrs, devs_may_alloc,
892 nr_replicas, nr_effective,
893 have_cache, flags, ob);
894 else
895 ob_push(c, &ptrs_skip, ob);
896 }
897 wp->ptrs = ptrs_skip;
898
899 return ret;
900}
901
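/*
 * Try to reuse buckets from c->open_buckets_partial - buckets a previous
 * write point only partially filled and handed back via
 * open_bucket_free_unused(). The list is walked backwards so that
 * array_remove_item() never shifts entries we haven't examined yet, and
 * devices with no buckets free at this watermark are skipped:
 */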
902static int bucket_alloc_set_partial(struct bch_fs *c,
903 struct open_buckets *ptrs,
904 struct write_point *wp,
905 struct bch_devs_mask *devs_may_alloc,
906 unsigned nr_replicas,
907 unsigned *nr_effective,
908 bool *have_cache, bool ec,
909 enum bch_watermark watermark,
910 unsigned flags)
911{
912 int i, ret = 0;
913
914 if (!c->open_buckets_partial_nr)
915 return 0;
916
917 spin_lock(&c->freelist_lock);
918
919 if (!c->open_buckets_partial_nr)
920 goto unlock;
921
922 for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
923 struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
924
925 if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
926 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
927 struct bch_dev_usage usage;
928 u64 avail;
929
930 bch2_dev_usage_read_fast(ca, &usage);
931 avail = dev_buckets_free(ca, usage, watermark);
932 if (!avail)
933 continue;
934
935 array_remove_item(c->open_buckets_partial,
936 c->open_buckets_partial_nr,
937 i);
938 ob->on_partial_list = false;
939
940 ret = add_new_bucket(c, ptrs, devs_may_alloc,
941 nr_replicas, nr_effective,
942 have_cache, flags, ob);
943 if (ret)
944 break;
945 }
946 }
947unlock:
948 spin_unlock(&c->freelist_lock);
949 return ret;
950}
951
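/*
 * One pass of adding buckets to a write point, in order of preference:
 * buckets already attached to the write point, then the partial list, then
 * either an erasure coded stripe (if requested) or fresh buckets from
 * bch2_bucket_alloc_set_trans() - retried in blocking mode only when the
 * nonblocking attempt came up short:
 */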
952static int __open_bucket_add_buckets(struct btree_trans *trans,
953 struct open_buckets *ptrs,
954 struct write_point *wp,
955 struct bch_devs_list *devs_have,
956 u16 target,
957 bool erasure_code,
958 unsigned nr_replicas,
959 unsigned *nr_effective,
960 bool *have_cache,
961 enum bch_watermark watermark,
962 unsigned flags,
963 struct closure *_cl)
964{
965 struct bch_fs *c = trans->c;
966 struct bch_devs_mask devs;
967 struct open_bucket *ob;
968 struct closure *cl = NULL;
969 unsigned i;
970 int ret;
971
972 devs = target_rw_devs(c, wp->data_type, target);
973
974 /* Don't allocate from devices we already have pointers to: */
975 for (i = 0; i < devs_have->nr; i++)
976 __clear_bit(devs_have->devs[i], devs.d);
977
978 open_bucket_for_each(c, ptrs, ob, i)
979 __clear_bit(ob->dev, devs.d);
980
981 if (erasure_code && ec_open_bucket(c, ptrs))
982 return 0;
983
984 ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
985 nr_replicas, nr_effective,
986 have_cache, erasure_code, flags);
987 if (ret)
988 return ret;
989
990 ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
991 nr_replicas, nr_effective,
992 have_cache, erasure_code, watermark, flags);
993 if (ret)
994 return ret;
995
996 if (erasure_code) {
997 ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
998 target,
999 nr_replicas, nr_effective,
1000 have_cache,
1001 watermark, flags, _cl);
1002 } else {
1003retry_blocking:
1004 /*
1005 * Try nonblocking first, so that if one device is full we'll try from
1006 * other devices:
1007 */
1008 ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
1009 nr_replicas, nr_effective, have_cache,
1010 flags, wp->data_type, watermark, cl);
1011 if (ret &&
1012 !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
1013 !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
1014 !cl && _cl) {
1015 cl = _cl;
1016 goto retry_blocking;
1017 }
1018 }
1019
1020 return ret;
1021}
1022
1023static int open_bucket_add_buckets(struct btree_trans *trans,
1024 struct open_buckets *ptrs,
1025 struct write_point *wp,
1026 struct bch_devs_list *devs_have,
1027 u16 target,
1028 unsigned erasure_code,
1029 unsigned nr_replicas,
1030 unsigned *nr_effective,
1031 bool *have_cache,
1032 enum bch_watermark watermark,
1033 unsigned flags,
1034 struct closure *cl)
1035{
1036 int ret;
1037
1038 if (erasure_code) {
1039 ret = __open_bucket_add_buckets(trans, ptrs, wp,
1040 devs_have, target, erasure_code,
1041 nr_replicas, nr_effective, have_cache,
1042 watermark, flags, cl);
1043 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1044 bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
1045 bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
1046 bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1047 return ret;
1048 if (*nr_effective >= nr_replicas)
1049 return 0;
1050 }
1051
1052 ret = __open_bucket_add_buckets(trans, ptrs, wp,
1053 devs_have, target, false,
1054 nr_replicas, nr_effective, have_cache,
1055 watermark, flags, cl);
1056 return ret < 0 ? ret : 0;
1057}
1058
1059/**
 1060 * should_drop_bucket - check if this open_bucket should go away
 1061 * @ob: open_bucket to check
 1062 * @c: filesystem handle
 1063 * @ca: if set, we're killing buckets for a particular device
 1064 * @ec: if true, we're shutting down erasure coding and killing all ec
 1065 * open_buckets
 1066 * (if neither @ca nor @ec is set, every open_bucket should be dropped)
 1067 * Returns: true if we should kill this open_bucket
1068 *
1069 * We're killing open_buckets because we're shutting down a device, erasure
1070 * coding, or the entire filesystem - check if this open_bucket matches:
1071 */
1072static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
1073 struct bch_dev *ca, bool ec)
1074{
1075 if (ec) {
1076 return ob->ec != NULL;
1077 } else if (ca) {
1078 bool drop = ob->dev == ca->dev_idx;
1079 struct open_bucket *ob2;
1080 unsigned i;
1081
1082 if (!drop && ob->ec) {
1083 unsigned nr_blocks;
1084
1085 mutex_lock(&ob->ec->lock);
1086 nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
1087
1088 for (i = 0; i < nr_blocks; i++) {
1089 if (!ob->ec->blocks[i])
1090 continue;
1091
1092 ob2 = c->open_buckets + ob->ec->blocks[i];
1093 drop |= ob2->dev == ca->dev_idx;
1094 }
1095 mutex_unlock(&ob->ec->lock);
1096 }
1097
1098 return drop;
1099 } else {
1100 return true;
1101 }
1102}
1103
1104static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
1105 bool ec, struct write_point *wp)
1106{
1107 struct open_buckets ptrs = { .nr = 0 };
1108 struct open_bucket *ob;
1109 unsigned i;
1110
1111 mutex_lock(&wp->lock);
1112 open_bucket_for_each(c, &wp->ptrs, ob, i)
1113 if (should_drop_bucket(ob, c, ca, ec))
1114 bch2_open_bucket_put(c, ob);
1115 else
1116 ob_push(c, &ptrs, ob);
1117 wp->ptrs = ptrs;
1118 mutex_unlock(&wp->lock);
1119}
1120
1121void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
1122 bool ec)
1123{
1124 unsigned i;
1125
1126 /* Next, close write points that point to this device... */
1127 for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1128 bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
1129
1130 bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
1131 bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
1132 bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
1133
1134 mutex_lock(&c->btree_reserve_cache_lock);
1135 while (c->btree_reserve_cache_nr) {
1136 struct btree_alloc *a =
1137 &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1138
1139 bch2_open_buckets_put(c, &a->ob);
1140 }
1141 mutex_unlock(&c->btree_reserve_cache_lock);
1142
1143 spin_lock(&c->freelist_lock);
1144 i = 0;
1145 while (i < c->open_buckets_partial_nr) {
1146 struct open_bucket *ob =
1147 c->open_buckets + c->open_buckets_partial[i];
1148
1149 if (should_drop_bucket(ob, c, ca, ec)) {
1150 --c->open_buckets_partial_nr;
1151 swap(c->open_buckets_partial[i],
1152 c->open_buckets_partial[c->open_buckets_partial_nr]);
1153 ob->on_partial_list = false;
1154 spin_unlock(&c->freelist_lock);
1155 bch2_open_bucket_put(c, ob);
1156 spin_lock(&c->freelist_lock);
1157 } else {
1158 i++;
1159 }
1160 }
1161 spin_unlock(&c->freelist_lock);
1162
1163 bch2_ec_stop_dev(c, ca);
1164}
1165
1166static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
1167 unsigned long write_point)
1168{
1169 unsigned hash =
1170 hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
1171
1172 return &c->write_points_hash[hash];
1173}
1174
1175static struct write_point *__writepoint_find(struct hlist_head *head,
1176 unsigned long write_point)
1177{
1178 struct write_point *wp;
1179
1180 rcu_read_lock();
1181 hlist_for_each_entry_rcu(wp, head, node)
1182 if (wp->write_point == write_point)
1183 goto out;
1184 wp = NULL;
1185out:
1186 rcu_read_unlock();
1187 return wp;
1188}
1189
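/*
 * Heuristics for sizing the write point pool: each write point can strand up
 * to bucket_size_max worth of space in partially written buckets, so we
 * refuse to add write points once that potential (write_points_nr *
 * bucket_size_max) exceeds 1/32 of free space, and only tear them down once
 * it exceeds 1/8:
 */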
1190static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
1191{
1192 u64 stranded = c->write_points_nr * c->bucket_size_max;
1193 u64 free = bch2_fs_usage_read_short(c).free;
1194
1195 return stranded * factor > free;
1196}
1197
1198static bool try_increase_writepoints(struct bch_fs *c)
1199{
1200 struct write_point *wp;
1201
1202 if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
1203 too_many_writepoints(c, 32))
1204 return false;
1205
1206 wp = c->write_points + c->write_points_nr++;
1207 hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
1208 return true;
1209}
1210
1211static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
1212{
1213 struct bch_fs *c = trans->c;
1214 struct write_point *wp;
1215 struct open_bucket *ob;
1216 unsigned i;
1217
1218 mutex_lock(&c->write_points_hash_lock);
1219 if (c->write_points_nr < old_nr) {
1220 mutex_unlock(&c->write_points_hash_lock);
1221 return true;
1222 }
1223
1224 if (c->write_points_nr == 1 ||
1225 !too_many_writepoints(c, 8)) {
1226 mutex_unlock(&c->write_points_hash_lock);
1227 return false;
1228 }
1229
1230 wp = c->write_points + --c->write_points_nr;
1231
1232 hlist_del_rcu(&wp->node);
1233 mutex_unlock(&c->write_points_hash_lock);
1234
1235 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1236 open_bucket_for_each(c, &wp->ptrs, ob, i)
1237 open_bucket_free_unused(c, ob);
1238 wp->ptrs.nr = 0;
1239 mutex_unlock(&wp->lock);
1240 return true;
1241}
1242
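/*
 * Resolve a write point specifier: an even value is a direct pointer to a
 * struct write_point (the fixed btree/copygc/rebalance write points), an odd
 * value is a hashed identifier looked up in write_points_hash - stealing the
 * least recently used write point (or growing the pool) when the identifier
 * isn't found:
 */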
1243static struct write_point *writepoint_find(struct btree_trans *trans,
1244 unsigned long write_point)
1245{
1246 struct bch_fs *c = trans->c;
1247 struct write_point *wp, *oldest;
1248 struct hlist_head *head;
1249
1250 if (!(write_point & 1UL)) {
1251 wp = (struct write_point *) write_point;
1252 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1253 return wp;
1254 }
1255
1256 head = writepoint_hash(c, write_point);
1257restart_find:
1258 wp = __writepoint_find(head, write_point);
1259 if (wp) {
1260lock_wp:
1261 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1262 if (wp->write_point == write_point)
1263 goto out;
1264 mutex_unlock(&wp->lock);
1265 goto restart_find;
1266 }
1267restart_find_oldest:
1268 oldest = NULL;
1269 for (wp = c->write_points;
1270 wp < c->write_points + c->write_points_nr; wp++)
1271 if (!oldest || time_before64(wp->last_used, oldest->last_used))
1272 oldest = wp;
1273
1274 bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
1275 bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
1276 if (oldest >= c->write_points + c->write_points_nr ||
1277 try_increase_writepoints(c)) {
1278 mutex_unlock(&c->write_points_hash_lock);
1279 mutex_unlock(&oldest->lock);
1280 goto restart_find_oldest;
1281 }
1282
1283 wp = __writepoint_find(head, write_point);
1284 if (wp && wp != oldest) {
1285 mutex_unlock(&c->write_points_hash_lock);
1286 mutex_unlock(&oldest->lock);
1287 goto lock_wp;
1288 }
1289
1290 wp = oldest;
1291 hlist_del_rcu(&wp->node);
1292 wp->write_point = write_point;
1293 hlist_add_head_rcu(&wp->node, head);
1294 mutex_unlock(&c->write_points_hash_lock);
1295out:
1296 wp->last_used = local_clock();
1297 return wp;
1298}
1299
1300static noinline void
1301deallocate_extra_replicas(struct bch_fs *c,
1302 struct open_buckets *ptrs,
1303 struct open_buckets *ptrs_no_use,
1304 unsigned extra_replicas)
1305{
1306 struct open_buckets ptrs2 = { 0 };
1307 struct open_bucket *ob;
1308 unsigned i;
1309
1310 open_bucket_for_each(c, ptrs, ob, i) {
1311 unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
1312
1313 if (d && d <= extra_replicas) {
1314 extra_replicas -= d;
1315 ob_push(c, ptrs_no_use, ob);
1316 } else {
1317 ob_push(c, &ptrs2, ob);
1318 }
1319 }
1320
1321 *ptrs = ptrs2;
1322}
1323
1324/*
1325 * Get us an open_bucket we can allocate from, return with it locked:
1326 */
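/*
 * Illustrative call sequence (a sketch only - wp_spec, devs_have, k, sectors
 * and cl stand in for caller state, and error handling is elided):
 *
 *	ret = bch2_alloc_sectors_start_trans(trans, target, false, wp_spec,
 *					&devs_have, nr_replicas, nr_replicas,
 *					BCH_WATERMARK_normal, 0, cl, &wp);
 *	if (!ret) {
 *		bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);
 *		bch2_alloc_sectors_done(c, wp);
 *	}
 */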
1327int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
1328 unsigned target,
1329 unsigned erasure_code,
1330 struct write_point_specifier write_point,
1331 struct bch_devs_list *devs_have,
1332 unsigned nr_replicas,
1333 unsigned nr_replicas_required,
1334 enum bch_watermark watermark,
1335 unsigned flags,
1336 struct closure *cl,
1337 struct write_point **wp_ret)
1338{
1339 struct bch_fs *c = trans->c;
1340 struct write_point *wp;
1341 struct open_bucket *ob;
1342 struct open_buckets ptrs;
1343 unsigned nr_effective, write_points_nr;
1344 bool have_cache;
1345 int ret;
1346 int i;
1347
1348 if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
1349 erasure_code = false;
1350
1351 BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
1352
1353 BUG_ON(!nr_replicas || !nr_replicas_required);
1354retry:
1355 ptrs.nr = 0;
1356 nr_effective = 0;
1357 write_points_nr = c->write_points_nr;
1358 have_cache = false;
1359
1360 *wp_ret = wp = writepoint_find(trans, write_point.v);
1361
1362 /* metadata may not allocate on cache devices: */
1363 if (wp->data_type != BCH_DATA_user)
1364 have_cache = true;
1365
1366 if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
1367 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1368 target, erasure_code,
1369 nr_replicas, &nr_effective,
1370 &have_cache, watermark,
1371 flags, NULL);
1372 if (!ret ||
1373 bch2_err_matches(ret, BCH_ERR_transaction_restart))
1374 goto alloc_done;
1375
1376 /* Don't retry from all devices if we're out of open buckets: */
1377 if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
1378 int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1379 target, erasure_code,
1380 nr_replicas, &nr_effective,
1381 &have_cache, watermark,
1382 flags, cl);
1383 if (!ret ||
1384 bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1385 bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1386 goto alloc_done;
1387 }
1388
1389 /*
1390 * Only try to allocate cache (durability = 0 devices) from the
1391 * specified target:
1392 */
1393 have_cache = true;
1394
1395 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1396 0, erasure_code,
1397 nr_replicas, &nr_effective,
1398 &have_cache, watermark,
1399 flags, cl);
1400 } else {
1401 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1402 target, erasure_code,
1403 nr_replicas, &nr_effective,
1404 &have_cache, watermark,
1405 flags, cl);
1406 }
1407alloc_done:
1408 BUG_ON(!ret && nr_effective < nr_replicas);
1409
1410 if (erasure_code && !ec_open_bucket(c, &ptrs))
1411 pr_debug("failed to get ec bucket: ret %u", ret);
1412
1413 if (ret == -BCH_ERR_insufficient_devices &&
1414 nr_effective >= nr_replicas_required)
1415 ret = 0;
1416
1417 if (ret)
1418 goto err;
1419
1420 if (nr_effective > nr_replicas)
1421 deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
1422
1423 /* Free buckets we didn't use: */
1424 open_bucket_for_each(c, &wp->ptrs, ob, i)
1425 open_bucket_free_unused(c, ob);
1426
1427 wp->ptrs = ptrs;
1428
1429 wp->sectors_free = UINT_MAX;
1430
1431 open_bucket_for_each(c, &wp->ptrs, ob, i)
1432 wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
1433
1434 BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
1435
1436 return 0;
1437err:
1438 open_bucket_for_each(c, &wp->ptrs, ob, i)
1439 if (ptrs.nr < ARRAY_SIZE(ptrs.v))
1440 ob_push(c, &ptrs, ob);
1441 else
1442 open_bucket_free_unused(c, ob);
1443 wp->ptrs = ptrs;
1444
1445 mutex_unlock(&wp->lock);
1446
1447 if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
1448 try_decrease_writepoints(trans, write_points_nr))
1449 goto retry;
1450
1451 if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
1452 bch2_err_matches(ret, BCH_ERR_freelist_empty))
1453 return cl
1454 ? -BCH_ERR_bucket_alloc_blocked
1455 : -BCH_ERR_ENOSPC_bucket_alloc;
1456
1457 return ret;
1458}
1459
1460struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
1461{
1462 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1463
1464 return (struct bch_extent_ptr) {
1465 .type = 1 << BCH_EXTENT_ENTRY_ptr,
1466 .gen = ob->gen,
1467 .dev = ob->dev,
1468 .offset = bucket_to_sector(ca, ob->bucket) +
1469 ca->mi.bucket_size -
1470 ob->sectors_free,
1471 };
1472}
1473
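/*
 * Append pointers for the space we just allocated to @k, and mark @sectors as
 * allocated out of @wp (wrapper around the inlined version):
 */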
1474void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
1475 struct bkey_i *k, unsigned sectors,
1476 bool cached)
1477{
1478 bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
1479}
1480
 1481/*
 1482 * bch2_alloc_sectors_done - finish the current allocation from @wp: drop our
 1483 * references to the open_buckets we've filled and release the write point
 1484 */
1485void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
1486{
1487 bch2_alloc_sectors_done_inlined(c, wp);
1488}
1489
1490static inline void writepoint_init(struct write_point *wp,
1491 enum bch_data_type type)
1492{
1493 mutex_init(&wp->lock);
1494 wp->data_type = type;
1495
1496 INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
1497 INIT_LIST_HEAD(&wp->writes);
1498 spin_lock_init(&wp->writes_lock);
1499}
1500
1501void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1502{
1503 struct open_bucket *ob;
1504 struct write_point *wp;
1505
1506 mutex_init(&c->write_points_hash_lock);
1507 c->write_points_nr = ARRAY_SIZE(c->write_points);
1508
 1509 /* open bucket 0 is a sentinel NULL: */
1510 spin_lock_init(&c->open_buckets[0].lock);
1511
1512 for (ob = c->open_buckets + 1;
1513 ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1514 spin_lock_init(&ob->lock);
1515 c->open_buckets_nr_free++;
1516
1517 ob->freelist = c->open_buckets_freelist;
1518 c->open_buckets_freelist = ob - c->open_buckets;
1519 }
1520
1521 writepoint_init(&c->btree_write_point, BCH_DATA_btree);
1522 writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
1523 writepoint_init(&c->copygc_write_point, BCH_DATA_user);
1524
1525 for (wp = c->write_points;
1526 wp < c->write_points + c->write_points_nr; wp++) {
1527 writepoint_init(wp, BCH_DATA_user);
1528
1529 wp->last_used = local_clock();
1530 wp->write_point = (unsigned long) wp;
1531 hlist_add_head_rcu(&wp->node,
1532 writepoint_hash(c, wp->write_point));
1533 }
1534}
1535
1536static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
1537{
1538 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1539 unsigned data_type = ob->data_type;
1540 barrier(); /* READ_ONCE() doesn't work on bitfields */
1541
1542 prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
1543 ob - c->open_buckets,
1544 atomic_read(&ob->pin),
1545 data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
1546 ob->dev, ob->bucket, ob->gen,
1547 ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
1548 if (ob->ec)
1549 prt_printf(out, " ec idx %llu", ob->ec->idx);
1550 if (ob->on_partial_list)
1551 prt_str(out, " partial");
1552 prt_newline(out);
1553}
1554
1555void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
1556{
1557 struct open_bucket *ob;
1558
1559 out->atomic++;
1560
1561 for (ob = c->open_buckets;
1562 ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1563 ob++) {
1564 spin_lock(&ob->lock);
1565 if (ob->valid && !ob->on_partial_list)
1566 bch2_open_bucket_to_text(out, c, ob);
1567 spin_unlock(&ob->lock);
1568 }
1569
1570 --out->atomic;
1571}
1572
1573void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
1574{
1575 unsigned i;
1576
1577 out->atomic++;
1578 spin_lock(&c->freelist_lock);
1579
1580 for (i = 0; i < c->open_buckets_partial_nr; i++)
1581 bch2_open_bucket_to_text(out, c,
1582 c->open_buckets + c->open_buckets_partial[i]);
1583
1584 spin_unlock(&c->freelist_lock);
1585 --out->atomic;
1586}
1587
1588static const char * const bch2_write_point_states[] = {
1589#define x(n) #n,
1590 WRITE_POINT_STATES()
1591#undef x
1592 NULL
1593};
1594
1595static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
1596 struct write_point *wp)
1597{
1598 struct open_bucket *ob;
1599 unsigned i;
1600
1601 prt_printf(out, "%lu: ", wp->write_point);
1602 prt_human_readable_u64(out, wp->sectors_allocated);
1603
1604 prt_printf(out, " last wrote: ");
1605 bch2_pr_time_units(out, sched_clock() - wp->last_used);
1606
1607 for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
1608 prt_printf(out, " %s: ", bch2_write_point_states[i]);
1609 bch2_pr_time_units(out, wp->time[i]);
1610 }
1611
1612 prt_newline(out);
1613
1614 printbuf_indent_add(out, 2);
1615 open_bucket_for_each(c, &wp->ptrs, ob, i)
1616 bch2_open_bucket_to_text(out, c, ob);
1617 printbuf_indent_sub(out, 2);
1618}
1619
1620void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
1621{
1622 struct write_point *wp;
1623
1624 prt_str(out, "Foreground write points\n");
1625 for (wp = c->write_points;
1626 wp < c->write_points + ARRAY_SIZE(c->write_points);
1627 wp++)
1628 bch2_write_point_to_text(out, c, wp);
1629
1630 prt_str(out, "Copygc write point\n");
1631 bch2_write_point_to_text(out, c, &c->copygc_write_point);
1632
1633 prt_str(out, "Rebalance write point\n");
1634 bch2_write_point_to_text(out, c, &c->rebalance_write_point);
1635
1636 prt_str(out, "Btree write point\n");
1637 bch2_write_point_to_text(out, c, &c->btree_write_point);
1638}