drivers/md/bcache/request.c at v3.11-rc6 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / md / bcache / request.c
at v3.11-rc6 1372 lines 34 kB view raw
   1/*
   2 * Main bcache entry point - handle a read or a write request and decide what to
   3 * do with it; the make_request functions are called by the block layer.
   4 *
   5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
   6 * Copyright 2012 Google, Inc.
   7 */
   8
   9#include "bcache.h"
  10#include "btree.h"
  11#include "debug.h"
  12#include "request.h"
  13#include "writeback.h"
  14
  15#include <linux/cgroup.h>
  16#include <linux/module.h>
  17#include <linux/hash.h>
  18#include <linux/random.h>
  19#include "blk-cgroup.h"
  20
  21#include <trace/events/bcache.h>
  22
  23#define CUTOFF_CACHE_ADD	95
  24#define CUTOFF_CACHE_READA	90
  25
  26struct kmem_cache *bch_search_cache;
  27
  28static void check_should_skip(struct cached_dev *, struct search *);
  29
  30/* Cgroup interface */
  31
  32#ifdef CONFIG_CGROUP_BCACHE
  33static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
  34
  35static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
  36{
  37	struct cgroup_subsys_state *css;
  38	return cgroup &&
  39		(css = cgroup_subsys_state(cgroup, bcache_subsys_id))
  40		? container_of(css, struct bch_cgroup, css)
  41		: &bcache_default_cgroup;
  42}
  43
  44struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
  45{
  46	struct cgroup_subsys_state *css = bio->bi_css
  47		? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
  48		: task_subsys_state(current, bcache_subsys_id);
  49
  50	return css
  51		? container_of(css, struct bch_cgroup, css)
  52		: &bcache_default_cgroup;
  53}
  54
  55static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
  56			struct file *file,
  57			char __user *buf, size_t nbytes, loff_t *ppos)
  58{
  59	char tmp[1024];
  60	int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
  61					  cgroup_to_bcache(cgrp)->cache_mode + 1);
  62
  63	if (len < 0)
  64		return len;
  65
  66	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  67}
  68
  69static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
  70			    const char *buf)
  71{
  72	int v = bch_read_string_list(buf, bch_cache_modes);
  73	if (v < 0)
  74		return v;
  75
  76	cgroup_to_bcache(cgrp)->cache_mode = v - 1;
  77	return 0;
  78}
  79
  80static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
  81{
  82	return cgroup_to_bcache(cgrp)->verify;
  83}
  84
  85static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
  86{
  87	cgroup_to_bcache(cgrp)->verify = val;
  88	return 0;
  89}
  90
  91static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
  92{
  93	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
  94	return atomic_read(&bcachecg->stats.cache_hits);
  95}
  96
  97static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
  98{
  99	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
 100	return atomic_read(&bcachecg->stats.cache_misses);
 101}
 102
 103static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
 104					 struct cftype *cft)
 105{
 106	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
 107	return atomic_read(&bcachecg->stats.cache_bypass_hits);
 108}
 109
 110static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
 111					   struct cftype *cft)
 112{
 113	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
 114	return atomic_read(&bcachecg->stats.cache_bypass_misses);
 115}
 116
 117static struct cftype bch_files[] = {
 118	{
 119		.name		= "cache_mode",
 120		.read		= cache_mode_read,
 121		.write_string	= cache_mode_write,
 122	},
 123	{
 124		.name		= "verify",
 125		.read_u64	= bch_verify_read,
 126		.write_u64	= bch_verify_write,
 127	},
 128	{
 129		.name		= "cache_hits",
 130		.read_u64	= bch_cache_hits_read,
 131	},
 132	{
 133		.name		= "cache_misses",
 134		.read_u64	= bch_cache_misses_read,
 135	},
 136	{
 137		.name		= "cache_bypass_hits",
 138		.read_u64	= bch_cache_bypass_hits_read,
 139	},
 140	{
 141		.name		= "cache_bypass_misses",
 142		.read_u64	= bch_cache_bypass_misses_read,
 143	},
 144	{ }	/* terminate */
 145};
 146
 147static void init_bch_cgroup(struct bch_cgroup *cg)
 148{
 149	cg->cache_mode = -1;
 150}
 151
 152static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
 153{
 154	struct bch_cgroup *cg;
 155
 156	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
 157	if (!cg)
 158		return ERR_PTR(-ENOMEM);
 159	init_bch_cgroup(cg);
 160	return &cg->css;
 161}
 162
 163static void bcachecg_destroy(struct cgroup *cgroup)
 164{
 165	struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
 166	free_css_id(&bcache_subsys, &cg->css);
 167	kfree(cg);
 168}
 169
 170struct cgroup_subsys bcache_subsys = {
 171	.create		= bcachecg_create,
 172	.destroy	= bcachecg_destroy,
 173	.subsys_id	= bcache_subsys_id,
 174	.name		= "bcache",
 175	.module		= THIS_MODULE,
 176};
 177EXPORT_SYMBOL_GPL(bcache_subsys);
 178#endif
 179
 180static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
 181{
 182#ifdef CONFIG_CGROUP_BCACHE
 183	int r = bch_bio_to_cgroup(bio)->cache_mode;
 184	if (r >= 0)
 185		return r;
 186#endif
 187	return BDEV_CACHE_MODE(&dc->sb);
 188}
 189
 190static bool verify(struct cached_dev *dc, struct bio *bio)
 191{
 192#ifdef CONFIG_CGROUP_BCACHE
 193	if (bch_bio_to_cgroup(bio)->verify)
 194		return true;
 195#endif
 196	return dc->verify;
 197}
 198
 199static void bio_csum(struct bio *bio, struct bkey *k)
 200{
 201	struct bio_vec *bv;
 202	uint64_t csum = 0;
 203	int i;
 204
 205	bio_for_each_segment(bv, bio, i) {
 206		void *d = kmap(bv->bv_page) + bv->bv_offset;
 207		csum = bch_crc64_update(csum, d, bv->bv_len);
 208		kunmap(bv->bv_page);
 209	}
 210
 211	k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
 212}
 213
 214/* Insert data into cache */
 215
 216static void bio_invalidate(struct closure *cl)
 217{
 218	struct btree_op *op = container_of(cl, struct btree_op, cl);
 219	struct bio *bio = op->cache_bio;
 220
 221	pr_debug("invalidating %i sectors from %llu",
 222		 bio_sectors(bio), (uint64_t) bio->bi_sector);
 223
 224	while (bio_sectors(bio)) {
 225		unsigned len = min(bio_sectors(bio), 1U << 14);
 226
 227		if (bch_keylist_realloc(&op->keys, 0, op->c))
 228			goto out;
 229
 230		bio->bi_sector	+= len;
 231		bio->bi_size	-= len << 9;
 232
 233		bch_keylist_add(&op->keys,
 234				&KEY(op->inode, bio->bi_sector, len));
 235	}
 236
 237	op->insert_data_done = true;
 238	bio_put(bio);
 239out:
 240	continue_at(cl, bch_journal, bcache_wq);
 241}
 242
 243struct open_bucket {
 244	struct list_head	list;
 245	struct task_struct	*last;
 246	unsigned		sectors_free;
 247	BKEY_PADDED(key);
 248};
 249
 250void bch_open_buckets_free(struct cache_set *c)
 251{
 252	struct open_bucket *b;
 253
 254	while (!list_empty(&c->data_buckets)) {
 255		b = list_first_entry(&c->data_buckets,
 256				     struct open_bucket, list);
 257		list_del(&b->list);
 258		kfree(b);
 259	}
 260}
 261
 262int bch_open_buckets_alloc(struct cache_set *c)
 263{
 264	int i;
 265
 266	spin_lock_init(&c->data_bucket_lock);
 267
 268	for (i = 0; i < 6; i++) {
 269		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
 270		if (!b)
 271			return -ENOMEM;
 272
 273		list_add(&b->list, &c->data_buckets);
 274	}
 275
 276	return 0;
 277}
 278
 279/*
 280 * We keep multiple buckets open for writes, and try to segregate different
 281 * write streams for better cache utilization: first we look for a bucket where
 282 * the last write to it was sequential with the current write, and failing that
 283 * we look for a bucket that was last used by the same task.
 284 *
 285 * The idea is that if you've got multiple tasks pulling data into the cache at the
 286 * same time, you'll get better cache utilization if you try to segregate their
 287 * data and preserve locality.
 288 *
 289 * For example, say you're starting Firefox at the same time you're copying a
 290 * bunch of files. Firefox will likely end up being fairly hot and stay in the
 291 * cache awhile, but the data you copied might not be; if you wrote all that
 292 * data to the same buckets it'd get invalidated at the same time.
 293 *
 294 * Both of those tasks will be doing fairly random IO so we can't rely on
 295 * detecting sequential IO to segregate their data, but going off of the task
 296 * should be a sane heuristic.
 297 */
 298static struct open_bucket *pick_data_bucket(struct cache_set *c,
 299					    const struct bkey *search,
 300					    struct task_struct *task,
 301					    struct bkey *alloc)
 302{
 303	struct open_bucket *ret, *ret_task = NULL;
 304
 305	list_for_each_entry_reverse(ret, &c->data_buckets, list)
 306		if (!bkey_cmp(&ret->key, search))
 307			goto found;
 308		else if (ret->last == task)
 309			ret_task = ret;
 310
 311	ret = ret_task ?: list_first_entry(&c->data_buckets,
 312					   struct open_bucket, list);
 313found:
 314	if (!ret->sectors_free && KEY_PTRS(alloc)) {
 315		ret->sectors_free = c->sb.bucket_size;
 316		bkey_copy(&ret->key, alloc);
 317		bkey_init(alloc);
 318	}
 319
 320	if (!ret->sectors_free)
 321		ret = NULL;
 322
 323	return ret;
 324}
 325
 326/*
 327 * Allocates some space in the cache to write to, and k to point to the newly
 328 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
 329 * end of the newly allocated space).
 330 *
 331 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
 332 * sectors were actually allocated.
 333 *
 334 * If s->writeback is true, will not fail.
 335 */
 336static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 337			      struct search *s)
 338{
 339	struct cache_set *c = s->op.c;
 340	struct open_bucket *b;
 341	BKEY_PADDED(key) alloc;
 342	struct closure cl, *w = NULL;
 343	unsigned i;
 344
 345	if (s->writeback) {
 346		closure_init_stack(&cl);
 347		w = &cl;
 348	}
 349
 350	/*
 351	 * We might have to allocate a new bucket, which we can't do with a
 352	 * spinlock held. So if we have to allocate, we drop the lock, allocate
 353	 * and then retry. KEY_PTRS() indicates whether alloc points to
 354	 * allocated bucket(s).
 355	 */
 356
 357	bkey_init(&alloc.key);
 358	spin_lock(&c->data_bucket_lock);
 359
 360	while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
 361		unsigned watermark = s->op.write_prio
 362			? WATERMARK_MOVINGGC
 363			: WATERMARK_NONE;
 364
 365		spin_unlock(&c->data_bucket_lock);
 366
 367		if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
 368			return false;
 369
 370		spin_lock(&c->data_bucket_lock);
 371	}
 372
 373	/*
 374	 * If we had to allocate, we might race and not need to allocate the
 375	 * second time we call find_data_bucket(). If we allocated a bucket but
 376	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
 377	 */
 378	if (KEY_PTRS(&alloc.key))
 379		__bkey_put(c, &alloc.key);
 380
 381	for (i = 0; i < KEY_PTRS(&b->key); i++)
 382		EBUG_ON(ptr_stale(c, &b->key, i));
 383
 384	/* Set up the pointer to the space we're allocating: */
 385
 386	for (i = 0; i < KEY_PTRS(&b->key); i++)
 387		k->ptr[i] = b->key.ptr[i];
 388
 389	sectors = min(sectors, b->sectors_free);
 390
 391	SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
 392	SET_KEY_SIZE(k, sectors);
 393	SET_KEY_PTRS(k, KEY_PTRS(&b->key));
 394
 395	/*
 396	 * Move b to the end of the lru, and keep track of what this bucket was
 397	 * last used for:
 398	 */
 399	list_move_tail(&b->list, &c->data_buckets);
 400	bkey_copy_key(&b->key, k);
 401	b->last = s->task;
 402
 403	b->sectors_free	-= sectors;
 404
 405	for (i = 0; i < KEY_PTRS(&b->key); i++) {
 406		SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
 407
 408		atomic_long_add(sectors,
 409				&PTR_CACHE(c, &b->key, i)->sectors_written);
 410	}
 411
 412	if (b->sectors_free < c->sb.block_size)
 413		b->sectors_free = 0;
 414
 415	/*
 416	 * k takes refcounts on the buckets it points to until it's inserted
 417	 * into the btree, but if we're done with this bucket we just transfer
 418	 * get_data_bucket()'s refcount.
 419	 */
 420	if (b->sectors_free)
 421		for (i = 0; i < KEY_PTRS(&b->key); i++)
 422			atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
 423
 424	spin_unlock(&c->data_bucket_lock);
 425	return true;
 426}
 427
 428static void bch_insert_data_error(struct closure *cl)
 429{
 430	struct btree_op *op = container_of(cl, struct btree_op, cl);
 431
 432	/*
 433	 * Our data write just errored, which means we've got a bunch of keys to
 434	 * insert that point to data that wasn't succesfully written.
 435	 *
 436	 * We don't have to insert those keys but we still have to invalidate
 437	 * that region of the cache - so, if we just strip off all the pointers
 438	 * from the keys we'll accomplish just that.
 439	 */
 440
 441	struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
 442
 443	while (src != op->keys.top) {
 444		struct bkey *n = bkey_next(src);
 445
 446		SET_KEY_PTRS(src, 0);
 447		bkey_copy(dst, src);
 448
 449		dst = bkey_next(dst);
 450		src = n;
 451	}
 452
 453	op->keys.top = dst;
 454
 455	bch_journal(cl);
 456}
 457
 458static void bch_insert_data_endio(struct bio *bio, int error)
 459{
 460	struct closure *cl = bio->bi_private;
 461	struct btree_op *op = container_of(cl, struct btree_op, cl);
 462	struct search *s = container_of(op, struct search, op);
 463
 464	if (error) {
 465		/* TODO: We could try to recover from this. */
 466		if (s->writeback)
 467			s->error = error;
 468		else if (s->write)
 469			set_closure_fn(cl, bch_insert_data_error, bcache_wq);
 470		else
 471			set_closure_fn(cl, NULL, NULL);
 472	}
 473
 474	bch_bbio_endio(op->c, bio, error, "writing data to cache");
 475}
 476
 477static void bch_insert_data_loop(struct closure *cl)
 478{
 479	struct btree_op *op = container_of(cl, struct btree_op, cl);
 480	struct search *s = container_of(op, struct search, op);
 481	struct bio *bio = op->cache_bio, *n;
 482
 483	if (op->skip)
 484		return bio_invalidate(cl);
 485
 486	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
 487		set_gc_sectors(op->c);
 488		bch_queue_gc(op->c);
 489	}
 490
 491	/*
 492	 * Journal writes are marked REQ_FLUSH; if the original write was a
 493	 * flush, it'll wait on the journal write.
 494	 */
 495	bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
 496
 497	do {
 498		unsigned i;
 499		struct bkey *k;
 500		struct bio_set *split = s->d
 501			? s->d->bio_split : op->c->bio_split;
 502
 503		/* 1 for the device pointer and 1 for the chksum */
 504		if (bch_keylist_realloc(&op->keys,
 505					1 + (op->csum ? 1 : 0),
 506					op->c))
 507			continue_at(cl, bch_journal, bcache_wq);
 508
 509		k = op->keys.top;
 510		bkey_init(k);
 511		SET_KEY_INODE(k, op->inode);
 512		SET_KEY_OFFSET(k, bio->bi_sector);
 513
 514		if (!bch_alloc_sectors(k, bio_sectors(bio), s))
 515			goto err;
 516
 517		n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
 518
 519		n->bi_end_io	= bch_insert_data_endio;
 520		n->bi_private	= cl;
 521
 522		if (s->writeback) {
 523			SET_KEY_DIRTY(k, true);
 524
 525			for (i = 0; i < KEY_PTRS(k); i++)
 526				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
 527					    GC_MARK_DIRTY);
 528		}
 529
 530		SET_KEY_CSUM(k, op->csum);
 531		if (KEY_CSUM(k))
 532			bio_csum(n, k);
 533
 534		trace_bcache_cache_insert(k);
 535		bch_keylist_push(&op->keys);
 536
 537		n->bi_rw |= REQ_WRITE;
 538		bch_submit_bbio(n, op->c, k, 0);
 539	} while (n != bio);
 540
 541	op->insert_data_done = true;
 542	continue_at(cl, bch_journal, bcache_wq);
 543err:
 544	/* bch_alloc_sectors() blocks if s->writeback = true */
 545	BUG_ON(s->writeback);
 546
 547	/*
 548	 * But if it's not a writeback write we'd rather just bail out if
 549	 * there aren't any buckets ready to write to - it might take awhile and
 550	 * we might be starving btree writes for gc or something.
 551	 */
 552
 553	if (s->write) {
 554		/*
 555		 * Writethrough write: We can't complete the write until we've
 556		 * updated the index. But we don't want to delay the write while
 557		 * we wait for buckets to be freed up, so just invalidate the
 558		 * rest of the write.
 559		 */
 560		op->skip = true;
 561		return bio_invalidate(cl);
 562	} else {
 563		/*
 564		 * From a cache miss, we can just insert the keys for the data
 565		 * we have written or bail out if we didn't do anything.
 566		 */
 567		op->insert_data_done = true;
 568		bio_put(bio);
 569
 570		if (!bch_keylist_empty(&op->keys))
 571			continue_at(cl, bch_journal, bcache_wq);
 572		else
 573			closure_return(cl);
 574	}
 575}
 576
 577/**
 578 * bch_insert_data - stick some data in the cache
 579 *
 580 * This is the starting point for any data to end up in a cache device; it could
 581 * be from a normal write, or a writeback write, or a write to a flash only
 582 * volume - it's also used by the moving garbage collector to compact data in
 583 * mostly empty buckets.
 584 *
 585 * It first writes the data to the cache, creating a list of keys to be inserted
 586 * (if the data had to be fragmented there will be multiple keys); after the
 587 * data is written it calls bch_journal, and after the keys have been added to
 588 * the next journal write they're inserted into the btree.
 589 *
 590 * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
 591 * and op->inode is used for the key inode.
 592 *
 593 * If op->skip is true, instead of inserting the data it invalidates the region
 594 * of the cache represented by op->cache_bio and op->inode.
 595 */
 596void bch_insert_data(struct closure *cl)
 597{
 598	struct btree_op *op = container_of(cl, struct btree_op, cl);
 599
 600	bch_keylist_init(&op->keys);
 601	bio_get(op->cache_bio);
 602	bch_insert_data_loop(cl);
 603}
 604
 605void bch_btree_insert_async(struct closure *cl)
 606{
 607	struct btree_op *op = container_of(cl, struct btree_op, cl);
 608	struct search *s = container_of(op, struct search, op);
 609
 610	if (bch_btree_insert(op, op->c)) {
 611		s->error		= -ENOMEM;
 612		op->insert_data_done	= true;
 613	}
 614
 615	if (op->insert_data_done) {
 616		bch_keylist_free(&op->keys);
 617		closure_return(cl);
 618	} else
 619		continue_at(cl, bch_insert_data_loop, bcache_wq);
 620}
 621
 622/* Common code for the make_request functions */
 623
 624static void request_endio(struct bio *bio, int error)
 625{
 626	struct closure *cl = bio->bi_private;
 627
 628	if (error) {
 629		struct search *s = container_of(cl, struct search, cl);
 630		s->error = error;
 631		/* Only cache read errors are recoverable */
 632		s->recoverable = false;
 633	}
 634
 635	bio_put(bio);
 636	closure_put(cl);
 637}
 638
 639void bch_cache_read_endio(struct bio *bio, int error)
 640{
 641	struct bbio *b = container_of(bio, struct bbio, bio);
 642	struct closure *cl = bio->bi_private;
 643	struct search *s = container_of(cl, struct search, cl);
 644
 645	/*
 646	 * If the bucket was reused while our bio was in flight, we might have
 647	 * read the wrong data. Set s->error but not error so it doesn't get
 648	 * counted against the cache device, but we'll still reread the data
 649	 * from the backing device.
 650	 */
 651
 652	if (error)
 653		s->error = error;
 654	else if (ptr_stale(s->op.c, &b->key, 0)) {
 655		atomic_long_inc(&s->op.c->cache_read_races);
 656		s->error = -EINTR;
 657	}
 658
 659	bch_bbio_endio(s->op.c, bio, error, "reading from cache");
 660}
 661
 662static void bio_complete(struct search *s)
 663{
 664	if (s->orig_bio) {
 665		int cpu, rw = bio_data_dir(s->orig_bio);
 666		unsigned long duration = jiffies - s->start_time;
 667
 668		cpu = part_stat_lock();
 669		part_round_stats(cpu, &s->d->disk->part0);
 670		part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
 671		part_stat_unlock();
 672
 673		trace_bcache_request_end(s, s->orig_bio);
 674		bio_endio(s->orig_bio, s->error);
 675		s->orig_bio = NULL;
 676	}
 677}
 678
 679static void do_bio_hook(struct search *s)
 680{
 681	struct bio *bio = &s->bio.bio;
 682	memcpy(bio, s->orig_bio, sizeof(struct bio));
 683
 684	bio->bi_end_io		= request_endio;
 685	bio->bi_private		= &s->cl;
 686	atomic_set(&bio->bi_cnt, 3);
 687}
 688
 689static void search_free(struct closure *cl)
 690{
 691	struct search *s = container_of(cl, struct search, cl);
 692	bio_complete(s);
 693
 694	if (s->op.cache_bio)
 695		bio_put(s->op.cache_bio);
 696
 697	if (s->unaligned_bvec)
 698		mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
 699
 700	closure_debug_destroy(cl);
 701	mempool_free(s, s->d->c->search);
 702}
 703
 704static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
 705{
 706	struct bio_vec *bv;
 707	struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
 708	memset(s, 0, offsetof(struct search, op.keys));
 709
 710	__closure_init(&s->cl, NULL);
 711
 712	s->op.inode		= d->id;
 713	s->op.c			= d->c;
 714	s->d			= d;
 715	s->op.lock		= -1;
 716	s->task			= current;
 717	s->orig_bio		= bio;
 718	s->write		= (bio->bi_rw & REQ_WRITE) != 0;
 719	s->op.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
 720	s->op.skip		= (bio->bi_rw & REQ_DISCARD) != 0;
 721	s->recoverable		= 1;
 722	s->start_time		= jiffies;
 723	do_bio_hook(s);
 724
 725	if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
 726		bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
 727		memcpy(bv, bio_iovec(bio),
 728		       sizeof(struct bio_vec) * bio_segments(bio));
 729
 730		s->bio.bio.bi_io_vec	= bv;
 731		s->unaligned_bvec	= 1;
 732	}
 733
 734	return s;
 735}
 736
 737static void btree_read_async(struct closure *cl)
 738{
 739	struct btree_op *op = container_of(cl, struct btree_op, cl);
 740
 741	int ret = btree_root(search_recurse, op->c, op);
 742
 743	if (ret == -EAGAIN)
 744		continue_at(cl, btree_read_async, bcache_wq);
 745
 746	closure_return(cl);
 747}
 748
 749/* Cached devices */
 750
 751static void cached_dev_bio_complete(struct closure *cl)
 752{
 753	struct search *s = container_of(cl, struct search, cl);
 754	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 755
 756	search_free(cl);
 757	cached_dev_put(dc);
 758}
 759
 760/* Process reads */
 761
 762static void cached_dev_read_complete(struct closure *cl)
 763{
 764	struct search *s = container_of(cl, struct search, cl);
 765
 766	if (s->op.insert_collision)
 767		bch_mark_cache_miss_collision(s);
 768
 769	if (s->op.cache_bio) {
 770		int i;
 771		struct bio_vec *bv;
 772
 773		__bio_for_each_segment(bv, s->op.cache_bio, i, 0)
 774			__free_page(bv->bv_page);
 775	}
 776
 777	cached_dev_bio_complete(cl);
 778}
 779
 780static void request_read_error(struct closure *cl)
 781{
 782	struct search *s = container_of(cl, struct search, cl);
 783	struct bio_vec *bv;
 784	int i;
 785
 786	if (s->recoverable) {
 787		/* Retry from the backing device: */
 788		trace_bcache_read_retry(s->orig_bio);
 789
 790		s->error = 0;
 791		bv = s->bio.bio.bi_io_vec;
 792		do_bio_hook(s);
 793		s->bio.bio.bi_io_vec = bv;
 794
 795		if (!s->unaligned_bvec)
 796			bio_for_each_segment(bv, s->orig_bio, i)
 797				bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
 798		else
 799			memcpy(s->bio.bio.bi_io_vec,
 800			       bio_iovec(s->orig_bio),
 801			       sizeof(struct bio_vec) *
 802			       bio_segments(s->orig_bio));
 803
 804		/* XXX: invalidate cache */
 805
 806		closure_bio_submit(&s->bio.bio, &s->cl, s->d);
 807	}
 808
 809	continue_at(cl, cached_dev_read_complete, NULL);
 810}
 811
 812static void request_read_done(struct closure *cl)
 813{
 814	struct search *s = container_of(cl, struct search, cl);
 815	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 816
 817	/*
 818	 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now
 819	 * contains data ready to be inserted into the cache.
 820	 *
 821	 * First, we copy the data we just read from cache_bio's bounce buffers
 822	 * to the buffers the original bio pointed to:
 823	 */
 824
 825	if (s->op.cache_bio) {
 826		bio_reset(s->op.cache_bio);
 827		s->op.cache_bio->bi_sector	= s->cache_miss->bi_sector;
 828		s->op.cache_bio->bi_bdev	= s->cache_miss->bi_bdev;
 829		s->op.cache_bio->bi_size	= s->cache_bio_sectors << 9;
 830		bch_bio_map(s->op.cache_bio, NULL);
 831
 832		bio_copy_data(s->cache_miss, s->op.cache_bio);
 833
 834		bio_put(s->cache_miss);
 835		s->cache_miss = NULL;
 836	}
 837
 838	if (verify(dc, &s->bio.bio) && s->recoverable)
 839		bch_data_verify(s);
 840
 841	bio_complete(s);
 842
 843	if (s->op.cache_bio &&
 844	    !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
 845		s->op.type = BTREE_REPLACE;
 846		closure_call(&s->op.cl, bch_insert_data, NULL, cl);
 847	}
 848
 849	continue_at(cl, cached_dev_read_complete, NULL);
 850}
 851
 852static void request_read_done_bh(struct closure *cl)
 853{
 854	struct search *s = container_of(cl, struct search, cl);
 855	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 856
 857	bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
 858	trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
 859
 860	if (s->error)
 861		continue_at_nobarrier(cl, request_read_error, bcache_wq);
 862	else if (s->op.cache_bio || verify(dc, &s->bio.bio))
 863		continue_at_nobarrier(cl, request_read_done, bcache_wq);
 864	else
 865		continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
 866}
 867
 868static int cached_dev_cache_miss(struct btree *b, struct search *s,
 869				 struct bio *bio, unsigned sectors)
 870{
 871	int ret = 0;
 872	unsigned reada;
 873	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 874	struct bio *miss;
 875
 876	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
 877	if (miss == bio)
 878		s->op.lookup_done = true;
 879
 880	miss->bi_end_io		= request_endio;
 881	miss->bi_private	= &s->cl;
 882
 883	if (s->cache_miss || s->op.skip)
 884		goto out_submit;
 885
 886	if (miss != bio ||
 887	    (bio->bi_rw & REQ_RAHEAD) ||
 888	    (bio->bi_rw & REQ_META) ||
 889	    s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
 890		reada = 0;
 891	else {
 892		reada = min(dc->readahead >> 9,
 893			    sectors - bio_sectors(miss));
 894
 895		if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
 896			reada = bdev_sectors(miss->bi_bdev) -
 897				bio_end_sector(miss);
 898	}
 899
 900	s->cache_bio_sectors = bio_sectors(miss) + reada;
 901	s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
 902			DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
 903			dc->disk.bio_split);
 904
 905	if (!s->op.cache_bio)
 906		goto out_submit;
 907
 908	s->op.cache_bio->bi_sector	= miss->bi_sector;
 909	s->op.cache_bio->bi_bdev	= miss->bi_bdev;
 910	s->op.cache_bio->bi_size	= s->cache_bio_sectors << 9;
 911
 912	s->op.cache_bio->bi_end_io	= request_endio;
 913	s->op.cache_bio->bi_private	= &s->cl;
 914
 915	/* btree_search_recurse()'s btree iterator is no good anymore */
 916	ret = -EINTR;
 917	if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
 918		goto out_put;
 919
 920	bch_bio_map(s->op.cache_bio, NULL);
 921	if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
 922		goto out_put;
 923
 924	s->cache_miss = miss;
 925	bio_get(s->op.cache_bio);
 926
 927	closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
 928
 929	return ret;
 930out_put:
 931	bio_put(s->op.cache_bio);
 932	s->op.cache_bio = NULL;
 933out_submit:
 934	closure_bio_submit(miss, &s->cl, s->d);
 935	return ret;
 936}
 937
 938static void request_read(struct cached_dev *dc, struct search *s)
 939{
 940	struct closure *cl = &s->cl;
 941
 942	check_should_skip(dc, s);
 943	closure_call(&s->op.cl, btree_read_async, NULL, cl);
 944
 945	continue_at(cl, request_read_done_bh, NULL);
 946}
 947
 948/* Process writes */
 949
 950static void cached_dev_write_complete(struct closure *cl)
 951{
 952	struct search *s = container_of(cl, struct search, cl);
 953	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 954
 955	up_read_non_owner(&dc->writeback_lock);
 956	cached_dev_bio_complete(cl);
 957}
 958
 959static void request_write(struct cached_dev *dc, struct search *s)
 960{
 961	struct closure *cl = &s->cl;
 962	struct bio *bio = &s->bio.bio;
 963	struct bkey start, end;
 964	start = KEY(dc->disk.id, bio->bi_sector, 0);
 965	end = KEY(dc->disk.id, bio_end_sector(bio), 0);
 966
 967	bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
 968
 969	check_should_skip(dc, s);
 970	down_read_non_owner(&dc->writeback_lock);
 971
 972	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
 973		s->op.skip	= false;
 974		s->writeback	= true;
 975	}
 976
 977	if (bio->bi_rw & REQ_DISCARD)
 978		goto skip;
 979
 980	if (should_writeback(dc, s->orig_bio,
 981			     cache_mode(dc, bio),
 982			     s->op.skip)) {
 983		s->op.skip = false;
 984		s->writeback = true;
 985	}
 986
 987	if (s->op.skip)
 988		goto skip;
 989
 990	trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
 991
 992	if (!s->writeback) {
 993		s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
 994						   dc->disk.bio_split);
 995
 996		closure_bio_submit(bio, cl, s->d);
 997	} else {
 998		bch_writeback_add(dc);
 999
1000		if (s->op.flush_journal) {
1001			/* Also need to send a flush to the backing device */
1002			s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1003							   dc->disk.bio_split);
1004
1005			bio->bi_size = 0;
1006			bio->bi_vcnt = 0;
1007			closure_bio_submit(bio, cl, s->d);
1008		} else {
1009			s->op.cache_bio = bio;
1010		}
1011	}
1012out:
1013	closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1014	continue_at(cl, cached_dev_write_complete, NULL);
1015skip:
1016	s->op.skip = true;
1017	s->op.cache_bio = s->orig_bio;
1018	bio_get(s->op.cache_bio);
1019
1020	if ((bio->bi_rw & REQ_DISCARD) &&
1021	    !blk_queue_discard(bdev_get_queue(dc->bdev)))
1022		goto out;
1023
1024	closure_bio_submit(bio, cl, s->d);
1025	goto out;
1026}
1027
1028static void request_nodata(struct cached_dev *dc, struct search *s)
1029{
1030	struct closure *cl = &s->cl;
1031	struct bio *bio = &s->bio.bio;
1032
1033	if (bio->bi_rw & REQ_DISCARD) {
1034		request_write(dc, s);
1035		return;
1036	}
1037
1038	if (s->op.flush_journal)
1039		bch_journal_meta(s->op.c, cl);
1040
1041	closure_bio_submit(bio, cl, s->d);
1042
1043	continue_at(cl, cached_dev_bio_complete, NULL);
1044}
1045
1046/* Cached devices - read & write stuff */
1047
1048unsigned bch_get_congested(struct cache_set *c)
1049{
1050	int i;
1051	long rand;
1052
1053	if (!c->congested_read_threshold_us &&
1054	    !c->congested_write_threshold_us)
1055		return 0;
1056
1057	i = (local_clock_us() - c->congested_last_us) / 1024;
1058	if (i < 0)
1059		return 0;
1060
1061	i += atomic_read(&c->congested);
1062	if (i >= 0)
1063		return 0;
1064
1065	i += CONGESTED_MAX;
1066
1067	if (i > 0)
1068		i = fract_exp_two(i, 6);
1069
1070	rand = get_random_int();
1071	i -= bitmap_weight(&rand, BITS_PER_LONG);
1072
1073	return i > 0 ? i : 1;
1074}
1075
1076static void add_sequential(struct task_struct *t)
1077{
1078	ewma_add(t->sequential_io_avg,
1079		 t->sequential_io, 8, 0);
1080
1081	t->sequential_io = 0;
1082}
1083
1084static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
1085{
1086	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
1087}
1088
1089static void check_should_skip(struct cached_dev *dc, struct search *s)
1090{
1091	struct cache_set *c = s->op.c;
1092	struct bio *bio = &s->bio.bio;
1093	unsigned mode = cache_mode(dc, bio);
1094	unsigned sectors, congested = bch_get_congested(c);
1095
1096	if (atomic_read(&dc->disk.detaching) ||
1097	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
1098	    (bio->bi_rw & REQ_DISCARD))
1099		goto skip;
1100
1101	if (mode == CACHE_MODE_NONE ||
1102	    (mode == CACHE_MODE_WRITEAROUND &&
1103	     (bio->bi_rw & REQ_WRITE)))
1104		goto skip;
1105
1106	if (bio->bi_sector   & (c->sb.block_size - 1) ||
1107	    bio_sectors(bio) & (c->sb.block_size - 1)) {
1108		pr_debug("skipping unaligned io");
1109		goto skip;
1110	}
1111
1112	if (!congested && !dc->sequential_cutoff)
1113		goto rescale;
1114
1115	if (!congested &&
1116	    mode == CACHE_MODE_WRITEBACK &&
1117	    (bio->bi_rw & REQ_WRITE) &&
1118	    (bio->bi_rw & REQ_SYNC))
1119		goto rescale;
1120
1121	if (dc->sequential_merge) {
1122		struct io *i;
1123
1124		spin_lock(&dc->io_lock);
1125
1126		hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
1127			if (i->last == bio->bi_sector &&
1128			    time_before(jiffies, i->jiffies))
1129				goto found;
1130
1131		i = list_first_entry(&dc->io_lru, struct io, lru);
1132
1133		add_sequential(s->task);
1134		i->sequential = 0;
1135found:
1136		if (i->sequential + bio->bi_size > i->sequential)
1137			i->sequential	+= bio->bi_size;
1138
1139		i->last			 = bio_end_sector(bio);
1140		i->jiffies		 = jiffies + msecs_to_jiffies(5000);
1141		s->task->sequential_io	 = i->sequential;
1142
1143		hlist_del(&i->hash);
1144		hlist_add_head(&i->hash, iohash(dc, i->last));
1145		list_move_tail(&i->lru, &dc->io_lru);
1146
1147		spin_unlock(&dc->io_lock);
1148	} else {
1149		s->task->sequential_io = bio->bi_size;
1150
1151		add_sequential(s->task);
1152	}
1153
1154	sectors = max(s->task->sequential_io,
1155		      s->task->sequential_io_avg) >> 9;
1156
1157	if (dc->sequential_cutoff &&
1158	    sectors >= dc->sequential_cutoff >> 9) {
1159		trace_bcache_bypass_sequential(s->orig_bio);
1160		goto skip;
1161	}
1162
1163	if (congested && sectors >= congested) {
1164		trace_bcache_bypass_congested(s->orig_bio);
1165		goto skip;
1166	}
1167
1168rescale:
1169	bch_rescale_priorities(c, bio_sectors(bio));
1170	return;
1171skip:
1172	bch_mark_sectors_bypassed(s, bio_sectors(bio));
1173	s->op.skip = true;
1174}
1175
1176static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1177{
1178	struct search *s;
1179	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1180	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1181	int cpu, rw = bio_data_dir(bio);
1182
1183	cpu = part_stat_lock();
1184	part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1185	part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1186	part_stat_unlock();
1187
1188	bio->bi_bdev = dc->bdev;
1189	bio->bi_sector += dc->sb.data_offset;
1190
1191	if (cached_dev_get(dc)) {
1192		s = search_alloc(bio, d);
1193		trace_bcache_request_start(s, bio);
1194
1195		if (!bio_has_data(bio))
1196			request_nodata(dc, s);
1197		else if (rw)
1198			request_write(dc, s);
1199		else
1200			request_read(dc, s);
1201	} else {
1202		if ((bio->bi_rw & REQ_DISCARD) &&
1203		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
1204			bio_endio(bio, 0);
1205		else
1206			bch_generic_make_request(bio, &d->bio_split_hook);
1207	}
1208}
1209
1210static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
1211			    unsigned int cmd, unsigned long arg)
1212{
1213	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1214	return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
1215}
1216
1217static int cached_dev_congested(void *data, int bits)
1218{
1219	struct bcache_device *d = data;
1220	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1221	struct request_queue *q = bdev_get_queue(dc->bdev);
1222	int ret = 0;
1223
1224	if (bdi_congested(&q->backing_dev_info, bits))
1225		return 1;
1226
1227	if (cached_dev_get(dc)) {
1228		unsigned i;
1229		struct cache *ca;
1230
1231		for_each_cache(ca, d->c, i) {
1232			q = bdev_get_queue(ca->bdev);
1233			ret |= bdi_congested(&q->backing_dev_info, bits);
1234		}
1235
1236		cached_dev_put(dc);
1237	}
1238
1239	return ret;
1240}
1241
1242void bch_cached_dev_request_init(struct cached_dev *dc)
1243{
1244	struct gendisk *g = dc->disk.disk;
1245
1246	g->queue->make_request_fn		= cached_dev_make_request;
1247	g->queue->backing_dev_info.congested_fn = cached_dev_congested;
1248	dc->disk.cache_miss			= cached_dev_cache_miss;
1249	dc->disk.ioctl				= cached_dev_ioctl;
1250}
1251
1252/* Flash backed devices */
1253
1254static int flash_dev_cache_miss(struct btree *b, struct search *s,
1255				struct bio *bio, unsigned sectors)
1256{
1257	struct bio_vec *bv;
1258	int i;
1259
1260	/* Zero fill bio */
1261
1262	bio_for_each_segment(bv, bio, i) {
1263		unsigned j = min(bv->bv_len >> 9, sectors);
1264
1265		void *p = kmap(bv->bv_page);
1266		memset(p + bv->bv_offset, 0, j << 9);
1267		kunmap(bv->bv_page);
1268
1269		sectors	-= j;
1270	}
1271
1272	bio_advance(bio, min(sectors << 9, bio->bi_size));
1273
1274	if (!bio->bi_size)
1275		s->op.lookup_done = true;
1276
1277	return 0;
1278}
1279
1280static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1281{
1282	struct search *s;
1283	struct closure *cl;
1284	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1285	int cpu, rw = bio_data_dir(bio);
1286
1287	cpu = part_stat_lock();
1288	part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1289	part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1290	part_stat_unlock();
1291
1292	s = search_alloc(bio, d);
1293	cl = &s->cl;
1294	bio = &s->bio.bio;
1295
1296	trace_bcache_request_start(s, bio);
1297
1298	if (bio_has_data(bio) && !rw) {
1299		closure_call(&s->op.cl, btree_read_async, NULL, cl);
1300	} else if (bio_has_data(bio) || s->op.skip) {
1301		bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1302					&KEY(d->id, bio->bi_sector, 0),
1303					&KEY(d->id, bio_end_sector(bio), 0));
1304
1305		s->writeback	= true;
1306		s->op.cache_bio	= bio;
1307
1308		closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1309	} else {
1310		/* No data - probably a cache flush */
1311		if (s->op.flush_journal)
1312			bch_journal_meta(s->op.c, cl);
1313	}
1314
1315	continue_at(cl, search_free, NULL);
1316}
1317
1318static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
1319			   unsigned int cmd, unsigned long arg)
1320{
1321	return -ENOTTY;
1322}
1323
1324static int flash_dev_congested(void *data, int bits)
1325{
1326	struct bcache_device *d = data;
1327	struct request_queue *q;
1328	struct cache *ca;
1329	unsigned i;
1330	int ret = 0;
1331
1332	for_each_cache(ca, d->c, i) {
1333		q = bdev_get_queue(ca->bdev);
1334		ret |= bdi_congested(&q->backing_dev_info, bits);
1335	}
1336
1337	return ret;
1338}
1339
1340void bch_flash_dev_request_init(struct bcache_device *d)
1341{
1342	struct gendisk *g = d->disk;
1343
1344	g->queue->make_request_fn		= flash_dev_make_request;
1345	g->queue->backing_dev_info.congested_fn = flash_dev_congested;
1346	d->cache_miss				= flash_dev_cache_miss;
1347	d->ioctl				= flash_dev_ioctl;
1348}
1349
1350void bch_request_exit(void)
1351{
1352#ifdef CONFIG_CGROUP_BCACHE
1353	cgroup_unload_subsys(&bcache_subsys);
1354#endif
1355	if (bch_search_cache)
1356		kmem_cache_destroy(bch_search_cache);
1357}
1358
1359int __init bch_request_init(void)
1360{
1361	bch_search_cache = KMEM_CACHE(search, 0);
1362	if (!bch_search_cache)
1363		return -ENOMEM;
1364
1365#ifdef CONFIG_CGROUP_BCACHE
1366	cgroup_load_subsys(&bcache_subsys);
1367	init_bch_cgroup(&bcache_default_cgroup);
1368
1369	cgroup_add_cftypes(&bcache_subsys, bch_files);
1370#endif
1371	return 0;
1372}