// SPDX-License-Identifier: GPL-2.0
/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>

/* Rate limiting */

static void __update_writeback_rate(struct cached_dev *dc)
{
        struct cache_set *c = dc->disk.c;
        uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
                                bcache_flash_devs_sectors_dirty(c);
        uint64_t cache_dirty_target =
                div_u64(cache_sectors * dc->writeback_percent, 100);
        int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
                                   c->cached_dev_sectors);

        /*
         * PI controller:
         * Figures out the amount that should be written per second.
         *
         * First, the error (number of sectors that are dirty beyond our
         * target) is calculated.  The error is accumulated (numerically
         * integrated).
         *
         * Then, the proportional value and integral value are scaled
         * based on configured values.  These are stored as inverses to
         * avoid fixed point math and to make configuration easy -- e.g.
         * the default value of 40 for writeback_rate_p_term_inverse
         * attempts to write at a rate that would retire all the dirty
         * blocks in 40 seconds.
         *
         * The writeback_rate_i_term_inverse value of 10000 means that
         * 1/10000th of the error is accumulated in the integral term per
         * second.  This acts as a slow, long-term average that is not
         * subject to variations in usage like the p term.
         */
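        /*
         * Worked example with made-up numbers: if the target is 4 TiB of
         * dirty data and 400 MiB (819200 sectors) more than that is
         * currently dirty, the error is 819200 sectors.  With the default
         * p_term_inverse of 40, the proportional term alone asks for
         * 819200 / 40 = 20480 sectors per second (10 MiB/s), i.e. it tries
         * to retire the excess in roughly 40 seconds.  If the error
         * persists across update intervals, the integral term slowly adds
         * further rate on top of that.
         */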
        int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
        int64_t error = dirty - target;
        int64_t proportional_scaled =
                div_s64(error, dc->writeback_rate_p_term_inverse);
        int64_t integral_scaled;
        uint32_t new_rate;

        if ((error < 0 && dc->writeback_rate_integral > 0) ||
            (error > 0 && time_before64(local_clock(),
                         dc->writeback_rate.next + NSEC_PER_MSEC))) {
                /*
                 * Only decrease the integral term if it's more than
                 * zero.  Only increase the integral term if the device
                 * is keeping up.  (Don't wind up the integral
                 * ineffectively in either case).
                 *
                 * It's necessary to scale this by
                 * writeback_rate_update_seconds to keep the integral
                 * term dimensioned properly.
                 */
                dc->writeback_rate_integral += error *
                        dc->writeback_rate_update_seconds;
        }

        integral_scaled = div_s64(dc->writeback_rate_integral,
                        dc->writeback_rate_i_term_inverse);

        new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
                        dc->writeback_rate_minimum, NSEC_PER_SEC);

        dc->writeback_rate_proportional = proportional_scaled;
        dc->writeback_rate_integral_scaled = integral_scaled;
        dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
        dc->writeback_rate.rate = new_rate;
        dc->writeback_rate_target = target;
}

static void update_writeback_rate(struct work_struct *work)
{
        struct cached_dev *dc = container_of(to_delayed_work(work),
                                             struct cached_dev,
                                             writeback_rate_update);

        down_read(&dc->writeback_lock);

        if (atomic_read(&dc->has_dirty) &&
            dc->writeback_percent)
                __update_writeback_rate(dc);

        up_read(&dc->writeback_lock);

        schedule_delayed_work(&dc->writeback_rate_update,
                              dc->writeback_rate_update_seconds * HZ);
}

static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
        if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
            !dc->writeback_percent)
                return 0;

        return bch_next_delay(&dc->writeback_rate, sectors);
}

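/*
 * Writeback of a single dirty key is driven by a dirty_io and the closure
 * chain below: read_dirty_submit() reads the dirty data from the cache
 * device, write_dirty() writes it out to the backing device, and
 * write_dirty_finish() clears the dirty bit on the key in the btree before
 * the dirty_io is freed by dirty_io_destructor().
 */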
struct dirty_io {
        struct closure cl;
        struct cached_dev *dc;
        struct bio bio;
};

static void dirty_init(struct keybuf_key *w)
{
        struct dirty_io *io = w->private;
        struct bio *bio = &io->bio;

        bio_init(bio, bio->bi_inline_vecs,
                 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
        if (!io->dc->writeback_percent)
                bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

        bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
        bio->bi_private = w;
        bch_bio_map(bio, NULL);
}

static void dirty_io_destructor(struct closure *cl)
{
        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
        kfree(io);
}

static void write_dirty_finish(struct closure *cl)
{
        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
        struct keybuf_key *w = io->bio.bi_private;
        struct cached_dev *dc = io->dc;

        bio_free_pages(&io->bio);

        /* This is kind of a dumb way of signalling errors. */
        if (KEY_DIRTY(&w->key)) {
                int ret;
                unsigned i;
                struct keylist keys;

                bch_keylist_init(&keys);

                bkey_copy(keys.top, &w->key);
                SET_KEY_DIRTY(keys.top, false);
                bch_keylist_push(&keys);

                for (i = 0; i < KEY_PTRS(&w->key); i++)
                        atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

                ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

                if (ret)
                        trace_bcache_writeback_collision(&w->key);

                atomic_long_inc(ret
                                ? &dc->disk.c->writeback_keys_failed
                                : &dc->disk.c->writeback_keys_done);
        }

        bch_keybuf_del(&dc->writeback_keys, w);
        up(&dc->in_flight);

        closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio)
{
        struct keybuf_key *w = bio->bi_private;
        struct dirty_io *io = w->private;

        if (bio->bi_status)
                SET_KEY_DIRTY(&w->key, false);

        closure_put(&io->cl);
}

static void write_dirty(struct closure *cl)
{
        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
        struct keybuf_key *w = io->bio.bi_private;

        /*
         * IO errors are signalled using the dirty bit on the key.
         * If we failed to read, we should not attempt to write to the
         * backing device.  Instead, immediately go to write_dirty_finish
         * to clean up.
         */
        if (KEY_DIRTY(&w->key)) {
                dirty_init(w);
                bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
                io->bio.bi_iter.bi_sector = KEY_START(&w->key);
                bio_set_dev(&io->bio, io->dc->bdev);
                io->bio.bi_end_io = dirty_endio;

                closure_bio_submit(&io->bio, cl);
        }

        continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}

static void read_dirty_endio(struct bio *bio)
{
        struct keybuf_key *w = bio->bi_private;
        struct dirty_io *io = w->private;

        bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
                            bio->bi_status, "reading dirty data from cache");

        dirty_endio(bio);
}

static void read_dirty_submit(struct closure *cl)
{
        struct dirty_io *io = container_of(cl, struct dirty_io, cl);

        closure_bio_submit(&io->bio, cl);

        continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}

static void read_dirty(struct cached_dev *dc)
{
        unsigned delay = 0;
        struct keybuf_key *w;
        struct dirty_io *io;
        struct closure cl;

        closure_init_stack(&cl);

        /*
         * XXX: if we error, background writeback just spins. Should use some
         * mempools.
         */

        while (!kthread_should_stop()) {

                w = bch_keybuf_next(&dc->writeback_keys);
                if (!w)
                        break;

                BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

                if (KEY_START(&w->key) != dc->last_read ||
                    jiffies_to_msecs(delay) > 50)
                        while (!kthread_should_stop() && delay)
                                delay = schedule_timeout_interruptible(delay);

                dc->last_read = KEY_OFFSET(&w->key);

                io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
                             * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
                             GFP_KERNEL);
                if (!io)
                        goto err;

                w->private = io;
                io->dc = dc;

                dirty_init(w);
                bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
                io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
                bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
                io->bio.bi_end_io = read_dirty_endio;

                if (bio_alloc_pages(&io->bio, GFP_KERNEL))
                        goto err_free;

                trace_bcache_writeback(&w->key);

                down(&dc->in_flight);
                closure_call(&io->cl, read_dirty_submit, NULL, &cl);

                delay = writeback_delay(dc, KEY_SIZE(&w->key));
        }

        if (0) {
err_free:
                kfree(w->private);
err:
                bch_keybuf_del(&dc->writeback_keys, w);
        }

        /*
         * Wait for outstanding writeback IOs to finish (and keybuf slots to be
         * freed) before refilling again
         */
        closure_sync(&cl);
}

/* Scan for dirty data */

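/*
 * Dirty sectors are tracked per stripe so that completely dirty stripes can
 * be found cheaply via the full_dirty_stripes bitmap.  The stripe offset is
 * computed as "offset & (d->stripe_size - 1)", which relies on stripe_size
 * being a power of two.  Worked example with made-up numbers: for a
 * stripe_size of 2048 sectors, adding 8 dirty sectors at offset 4090 puts
 * 6 sectors (4090-4095) into stripe 1 and the remaining 2 sectors
 * (4096-4097) into stripe 2; a stripe's bit in full_dirty_stripes is set
 * only once all 2048 of its sectors are dirty.
 */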
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
                                  uint64_t offset, int nr_sectors)
{
        struct bcache_device *d = c->devices[inode];
        unsigned stripe_offset, stripe, sectors_dirty;

        if (!d)
                return;

        stripe = offset_to_stripe(d, offset);
        stripe_offset = offset & (d->stripe_size - 1);

        while (nr_sectors) {
                int s = min_t(unsigned, abs(nr_sectors),
                              d->stripe_size - stripe_offset);

                if (nr_sectors < 0)
                        s = -s;

                if (stripe >= d->nr_stripes)
                        return;

                sectors_dirty = atomic_add_return(s,
                                        d->stripe_sectors_dirty + stripe);
                if (sectors_dirty == d->stripe_size)
                        set_bit(stripe, d->full_dirty_stripes);
                else
                        clear_bit(stripe, d->full_dirty_stripes);

                nr_sectors -= s;
                stripe_offset = 0;
                stripe++;
        }
}

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
        struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);

        BUG_ON(KEY_INODE(k) != dc->disk.id);

        return KEY_DIRTY(k);
}

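/*
 * Used when partial stripe writes are expensive for the backing device
 * (dc->partial_stripes_expensive, see refill_dirty() below): scan the
 * full_dirty_stripes bitmap starting from the last scanned position,
 * wrapping around at most once, and refill the writeback keybuf only from
 * stripes that are completely dirty.
 */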
static void refill_full_stripes(struct cached_dev *dc)
{
        struct keybuf *buf = &dc->writeback_keys;
        unsigned start_stripe, stripe, next_stripe;
        bool wrapped = false;

        stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));

        if (stripe >= dc->disk.nr_stripes)
                stripe = 0;

        start_stripe = stripe;

        while (1) {
                stripe = find_next_bit(dc->disk.full_dirty_stripes,
                                       dc->disk.nr_stripes, stripe);

                if (stripe == dc->disk.nr_stripes)
                        goto next;

                next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
                                                 dc->disk.nr_stripes, stripe);

                buf->last_scanned = KEY(dc->disk.id,
                                        stripe * dc->disk.stripe_size, 0);

                bch_refill_keybuf(dc->disk.c, buf,
                                  &KEY(dc->disk.id,
                                       next_stripe * dc->disk.stripe_size, 0),
                                  dirty_pred);

                if (array_freelist_empty(&buf->freelist))
                        return;

                stripe = next_stripe;
next:
                if (wrapped && stripe > start_stripe)
                        return;

                if (stripe == dc->disk.nr_stripes) {
                        stripe = 0;
                        wrapped = true;
                }
        }
}

/*
 * Returns true if we scanned the entire disk
 */
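/*
 * The refill is done in two passes: first from buf->last_scanned to the end
 * of the device, then, if the keybuf still has room, from the start of the
 * device back up to where this refill began.  Only when both passes complete
 * has the entire disk been scanned.
 */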
static bool refill_dirty(struct cached_dev *dc)
{
        struct keybuf *buf = &dc->writeback_keys;
        struct bkey start = KEY(dc->disk.id, 0, 0);
        struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
        struct bkey start_pos;

        /*
         * Make sure the keybuf position is inside the range for this disk:
         * at bringup we might not be attached yet, so this disk's inode
         * number isn't initialized yet.
         */
        if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
            bkey_cmp(&buf->last_scanned, &end) > 0)
                buf->last_scanned = start;

        if (dc->partial_stripes_expensive) {
                refill_full_stripes(dc);
                if (array_freelist_empty(&buf->freelist))
                        return false;
        }

        start_pos = buf->last_scanned;
        bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

        if (bkey_cmp(&buf->last_scanned, &end) < 0)
                return false;

        /*
         * If we get to the end start scanning again from the beginning, and
         * only scan up to where we initially started scanning from:
         */
        buf->last_scanned = start;
        bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

        return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
}

static int bch_writeback_thread(void *arg)
{
        struct cached_dev *dc = arg;
        bool searched_full_index;

        bch_ratelimit_reset(&dc->writeback_rate);

        while (!kthread_should_stop()) {
                down_write(&dc->writeback_lock);
                if (!atomic_read(&dc->has_dirty) ||
                    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
                     !dc->writeback_running)) {
                        up_write(&dc->writeback_lock);
                        set_current_state(TASK_INTERRUPTIBLE);

                        if (kthread_should_stop())
                                return 0;

                        schedule();
                        continue;
                }

                searched_full_index = refill_dirty(dc);

                if (searched_full_index &&
                    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
                        atomic_set(&dc->has_dirty, 0);
                        cached_dev_put(dc);
                        SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
                        bch_write_bdev_super(dc, NULL);
                }

                up_write(&dc->writeback_lock);

                read_dirty(dc);

                if (searched_full_index) {
                        unsigned delay = dc->writeback_delay * HZ;

                        while (delay &&
                               !kthread_should_stop() &&
                               !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
                                delay = schedule_timeout_interruptible(delay);

                        bch_ratelimit_reset(&dc->writeback_rate);
                }
        }

        return 0;
}

/* Init */

struct sectors_dirty_init {
        struct btree_op op;
        unsigned inode;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
                                 struct bkey *k)
{
        struct sectors_dirty_init *op = container_of(_op,
                                        struct sectors_dirty_init, op);
        if (KEY_INODE(k) > op->inode)
                return MAP_DONE;

        if (KEY_DIRTY(k))
                bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
                                             KEY_START(k), KEY_SIZE(k));

        return MAP_CONTINUE;
}

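/*
 * Rebuild the in-memory per-stripe dirty sector counts for a device by
 * walking its keys in the btree and re-adding every key that is still
 * marked dirty (typically done once, when the device is brought up).
 */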
void bch_sectors_dirty_init(struct bcache_device *d)
{
        struct sectors_dirty_init op;

        bch_btree_op_init(&op.op, -1);
        op.inode = d->id;

        bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
                           sectors_dirty_init_fn, 0);
}

void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
        sema_init(&dc->in_flight, 64);
        init_rwsem(&dc->writeback_lock);
        bch_keybuf_init(&dc->writeback_keys);

        dc->writeback_metadata = true;
        dc->writeback_running = true;
        dc->writeback_percent = 10;
        dc->writeback_delay = 30;
        dc->writeback_rate.rate = 1024;
        dc->writeback_rate_minimum = 8;

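        /*
         * Defaults for the PI controller described in
         * __update_writeback_rate(): re-evaluate the rate every 5 seconds,
         * aim to retire the excess dirty data in about 40 seconds with the
         * proportional term, and feed 1/10000th of the error into the
         * integral term each second.
         */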
        dc->writeback_rate_update_seconds = 5;
        dc->writeback_rate_p_term_inverse = 40;
        dc->writeback_rate_i_term_inverse = 10000;

        INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}

int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
        dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
                                                 WQ_MEM_RECLAIM, 0);
        if (!dc->writeback_write_wq)
                return -ENOMEM;

        dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
                                              "bcache_writeback");
        if (IS_ERR(dc->writeback_thread))
                return PTR_ERR(dc->writeback_thread);

        schedule_delayed_work(&dc->writeback_rate_update,
                              dc->writeback_rate_update_seconds * HZ);

        bch_writeback_queue(dc);

        return 0;
}