Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'bcache' (bcache fixes from Kent Overstreet)

Merge bcache fixes from Kent Overstreet:
"There's fixes for _three_ different data corruption bugs, all of which
were found by users hitting them in the wild.

The first one isn't bcache specific - in 3.11 bcache was switched to
the bio_copy_data in fs/bio.c, and that's when the bug in that code
was discovered, but it's also used by raid1 and pktcdvd. (That was my
code too, so the bug's doubly embarrassing given that it was or
should've been just a cut and paste from bcache code. Dunno what
happened there).

Most of these (all the non data corruption bugs, actually) were ready
before the merge window and have been sitting in Jens' tree, but I
don't know what's been up with him lately..."

* emailed patches from Kent Overstreet <kmo@daterainc.com>:
bcache: Fix flushes in writeback mode
bcache: Fix for handling overlapping extents when reading in a btree node
bcache: Fix a shrinker deadlock
bcache: Fix a dumb CPU spinning bug in writeback
bcache: Fix a flush/fua performance bug
bcache: Fix a writeback performance regression
bcache: Correct printf()-style format length modifier
bcache: Fix for when no journal entries are found
bcache: Strip endline when writing the label through sysfs
bcache: Fix a dumb journal discard bug
block: Fix bio_copy_data()

+110 -66
+3 -4
drivers/md/bcache/bcache.h
··· 498 498 */ 499 499 atomic_t has_dirty; 500 500 501 - struct ratelimit writeback_rate; 501 + struct bch_ratelimit writeback_rate; 502 502 struct delayed_work writeback_rate_update; 503 503 504 504 /* ··· 507 507 */ 508 508 sector_t last_read; 509 509 510 - /* Number of writeback bios in flight */ 511 - atomic_t in_flight; 510 + /* Limit number of writeback bios in flight */ 511 + struct semaphore in_flight; 512 512 struct closure_with_timer writeback; 513 - struct closure_waitlist writeback_wait; 514 513 515 514 struct keybuf writeback_keys; 516 515
+28 -11
drivers/md/bcache/bset.c
··· 926 926 927 927 /* Mergesort */ 928 928 929 + static void sort_key_next(struct btree_iter *iter, 930 + struct btree_iter_set *i) 931 + { 932 + i->k = bkey_next(i->k); 933 + 934 + if (i->k == i->end) 935 + *i = iter->data[--iter->used]; 936 + } 937 + 929 938 static void btree_sort_fixup(struct btree_iter *iter) 930 939 { 931 940 while (iter->used > 1) { 932 941 struct btree_iter_set *top = iter->data, *i = top + 1; 933 - struct bkey *k; 934 942 935 943 if (iter->used > 2 && 936 944 btree_iter_cmp(i[0], i[1])) 937 945 i++; 938 946 939 - for (k = i->k; 940 - k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0; 941 - k = bkey_next(k)) 942 - if (top->k > i->k) 943 - __bch_cut_front(top->k, k); 944 - else if (KEY_SIZE(k)) 945 - bch_cut_back(&START_KEY(k), top->k); 946 - 947 - if (top->k < i->k || k == i->k) 947 + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) 948 948 break; 949 949 950 - heap_sift(iter, i - top, btree_iter_cmp); 950 + if (!KEY_SIZE(i->k)) { 951 + sort_key_next(iter, i); 952 + heap_sift(iter, i - top, btree_iter_cmp); 953 + continue; 954 + } 955 + 956 + if (top->k > i->k) { 957 + if (bkey_cmp(top->k, i->k) >= 0) 958 + sort_key_next(iter, i); 959 + else 960 + bch_cut_front(top->k, i->k); 961 + 962 + heap_sift(iter, i - top, btree_iter_cmp); 963 + } else { 964 + /* can't happen because of comparison func */ 965 + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); 966 + bch_cut_back(&START_KEY(i->k), top->k); 967 + } 951 968 } 952 969 } 953 970
+2 -2
drivers/md/bcache/btree.c
··· 255 255 256 256 return; 257 257 err: 258 - bch_cache_set_error(b->c, "io error reading bucket %lu", 258 + bch_cache_set_error(b->c, "io error reading bucket %zu", 259 259 PTR_BUCKET_NR(b->c, &b->key, 0)); 260 260 } 261 261 ··· 612 612 return SHRINK_STOP; 613 613 614 614 /* Return -1 if we can't do anything right now */ 615 - if (sc->gfp_mask & __GFP_WAIT) 615 + if (sc->gfp_mask & __GFP_IO) 616 616 mutex_lock(&c->bucket_lock); 617 617 else if (!mutex_trylock(&c->bucket_lock)) 618 618 return -1;
+20 -13
drivers/md/bcache/journal.c
··· 153 153 bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); 154 154 pr_debug("%u journal buckets", ca->sb.njournal_buckets); 155 155 156 - /* Read journal buckets ordered by golden ratio hash to quickly 156 + /* 157 + * Read journal buckets ordered by golden ratio hash to quickly 157 158 * find a sequence of buckets with valid journal entries 158 159 */ 159 160 for (i = 0; i < ca->sb.njournal_buckets; i++) { ··· 167 166 goto bsearch; 168 167 } 169 168 170 - /* If that fails, check all the buckets we haven't checked 169 + /* 170 + * If that fails, check all the buckets we haven't checked 171 171 * already 172 172 */ 173 173 pr_debug("falling back to linear search"); 174 174 175 - for (l = 0; l < ca->sb.njournal_buckets; l++) { 176 - if (test_bit(l, bitmap)) 177 - continue; 178 - 175 + for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); 176 + l < ca->sb.njournal_buckets; 177 + l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1)) 179 178 if (read_bucket(l)) 180 179 goto bsearch; 181 - } 180 + 181 + if (list_empty(list)) 182 + continue; 182 183 bsearch: 183 184 /* Binary search */ 184 185 m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); ··· 200 197 r = m; 201 198 } 202 199 203 - /* Read buckets in reverse order until we stop finding more 200 + /* 201 + * Read buckets in reverse order until we stop finding more 204 202 * journal entries 205 203 */ 206 - pr_debug("finishing up"); 204 + pr_debug("finishing up: m %u njournal_buckets %u", 205 + m, ca->sb.njournal_buckets); 207 206 l = m; 208 207 209 208 while (1) { ··· 233 228 } 234 229 } 235 230 236 - c->journal.seq = list_entry(list->prev, 237 - struct journal_replay, 238 - list)->j.seq; 231 + if (!list_empty(list)) 232 + c->journal.seq = list_entry(list->prev, 233 + struct journal_replay, 234 + list)->j.seq; 239 235 240 236 return 0; 241 237 #undef read_bucket ··· 434 428 return; 435 429 } 436 430 437 - switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { 431 + switch 
(atomic_read(&ja->discard_in_flight)) { 438 432 case DISCARD_IN_FLIGHT: 439 433 return; 440 434 ··· 695 689 if (cl) 696 690 BUG_ON(!closure_wait(&w->wait, cl)); 697 691 692 + closure_flush(&c->journal.io); 698 693 __journal_try_write(c, true); 699 694 } 700 695 }
+9 -6
drivers/md/bcache/request.c
··· 997 997 } else { 998 998 bch_writeback_add(dc); 999 999 1000 - if (s->op.flush_journal) { 1000 + if (bio->bi_rw & REQ_FLUSH) { 1001 1001 /* Also need to send a flush to the backing device */ 1002 - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 1003 - dc->disk.bio_split); 1002 + struct bio *flush = bio_alloc_bioset(0, GFP_NOIO, 1003 + dc->disk.bio_split); 1004 1004 1005 - bio->bi_size = 0; 1006 - bio->bi_vcnt = 0; 1007 - closure_bio_submit(bio, cl, s->d); 1005 + flush->bi_rw = WRITE_FLUSH; 1006 + flush->bi_bdev = bio->bi_bdev; 1007 + flush->bi_end_io = request_endio; 1008 + flush->bi_private = cl; 1009 + 1010 + closure_bio_submit(flush, cl, s->d); 1008 1011 } else { 1009 1012 s->op.cache_bio = bio; 1010 1013 }
+7 -2
drivers/md/bcache/sysfs.c
··· 223 223 } 224 224 225 225 if (attr == &sysfs_label) { 226 - /* note: endlines are preserved */ 227 - memcpy(dc->sb.label, buf, SB_LABEL_SIZE); 226 + if (size > SB_LABEL_SIZE) 227 + return -EINVAL; 228 + memcpy(dc->sb.label, buf, size); 229 + if (size < SB_LABEL_SIZE) 230 + dc->sb.label[size] = '\0'; 231 + if (size && dc->sb.label[size - 1] == '\n') 232 + dc->sb.label[size - 1] = '\0'; 228 233 bch_write_bdev_super(dc, NULL); 229 234 if (dc->disk.c) { 230 235 memcpy(dc->disk.c->uuids[dc->disk.id].label,
+10 -1
drivers/md/bcache/util.c
··· 190 190 stats->last = now ?: 1; 191 191 } 192 192 193 - unsigned bch_next_delay(struct ratelimit *d, uint64_t done) 193 + /** 194 + * bch_next_delay() - increment @d by the amount of work done, and return how 195 + * long to delay until the next time to do some work. 196 + * 197 + * @d - the struct bch_ratelimit to update 198 + * @done - the amount of work done, in arbitrary units 199 + * 200 + * Returns the amount of time to delay by, in jiffies 201 + */ 202 + uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) 194 203 { 195 204 uint64_t now = local_clock(); 196 205
+9 -3
drivers/md/bcache/util.h
··· 450 450 (ewma) >> factor; \ 451 451 }) 452 452 453 - struct ratelimit { 453 + struct bch_ratelimit { 454 + /* Next time we want to do some work, in nanoseconds */ 454 455 uint64_t next; 456 + 457 + /* 458 + * Rate at which we want to do work, in units per nanosecond 459 + * The units here correspond to the units passed to bch_next_delay() 460 + */ 455 461 unsigned rate; 456 462 }; 457 463 458 - static inline void ratelimit_reset(struct ratelimit *d) 464 + static inline void bch_ratelimit_reset(struct bch_ratelimit *d) 459 465 { 460 466 d->next = local_clock(); 461 467 } 462 468 463 - unsigned bch_next_delay(struct ratelimit *d, uint64_t done); 469 + uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done); 464 470 465 471 #define __DIV_SAFE(n, d, zero) \ 466 472 ({ \
+20 -22
drivers/md/bcache/writeback.c
··· 94 94 95 95 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) 96 96 { 97 + uint64_t ret; 98 + 97 99 if (atomic_read(&dc->disk.detaching) || 98 100 !dc->writeback_percent) 99 101 return 0; 100 102 101 - return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); 103 + ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); 104 + 105 + return min_t(uint64_t, ret, HZ); 102 106 } 103 107 104 108 /* Background writeback */ ··· 212 208 213 209 up_write(&dc->writeback_lock); 214 210 215 - ratelimit_reset(&dc->writeback_rate); 211 + bch_ratelimit_reset(&dc->writeback_rate); 216 212 217 213 /* Punt to workqueue only so we don't recurse and blow the stack */ 218 214 continue_at(cl, read_dirty, dirty_wq); ··· 322 318 } 323 319 324 320 bch_keybuf_del(&dc->writeback_keys, w); 325 - atomic_dec_bug(&dc->in_flight); 326 - 327 - closure_wake_up(&dc->writeback_wait); 321 + up(&dc->in_flight); 328 322 329 323 closure_return_with_destructor(cl, dirty_io_destructor); 330 324 } ··· 351 349 352 350 closure_bio_submit(&io->bio, cl, &io->dc->disk); 353 351 354 - continue_at(cl, write_dirty_finish, dirty_wq); 352 + continue_at(cl, write_dirty_finish, system_wq); 355 353 } 356 354 357 355 static void read_dirty_endio(struct bio *bio, int error) ··· 371 369 372 370 closure_bio_submit(&io->bio, cl, &io->dc->disk); 373 371 374 - continue_at(cl, write_dirty, dirty_wq); 372 + continue_at(cl, write_dirty, system_wq); 375 373 } 376 374 377 375 static void read_dirty(struct closure *cl) ··· 396 394 397 395 if (delay > 0 && 398 396 (KEY_START(&w->key) != dc->last_read || 399 - jiffies_to_msecs(delay) > 50)) { 400 - w->private = NULL; 401 - 402 - closure_delay(&dc->writeback, delay); 403 - continue_at(cl, read_dirty, dirty_wq); 404 - } 397 + jiffies_to_msecs(delay) > 50)) 398 + delay = schedule_timeout_uninterruptible(delay); 405 399 406 400 dc->last_read = KEY_OFFSET(&w->key); 407 401 ··· 422 424 423 425 trace_bcache_writeback(&w->key); 424 426 425 - 
closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); 427 + down(&dc->in_flight); 428 + closure_call(&io->cl, read_dirty_submit, NULL, cl); 426 429 427 430 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 428 - 429 - atomic_inc(&dc->in_flight); 430 - 431 - if (!closure_wait_event(&dc->writeback_wait, cl, 432 - atomic_read(&dc->in_flight) < 64)) 433 - continue_at(cl, read_dirty, dirty_wq); 434 431 } 435 432 436 433 if (0) { ··· 435 442 bch_keybuf_del(&dc->writeback_keys, w); 436 443 } 437 444 438 - refill_dirty(cl); 445 + /* 446 + * Wait for outstanding writeback IOs to finish (and keybuf slots to be 447 + * freed) before refilling again 448 + */ 449 + continue_at(cl, refill_dirty, dirty_wq); 439 450 } 440 451 441 452 /* Init */ ··· 481 484 482 485 void bch_cached_dev_writeback_init(struct cached_dev *dc) 483 486 { 487 + sema_init(&dc->in_flight, 64); 484 488 closure_init_unlocked(&dc->writeback); 485 489 init_rwsem(&dc->writeback_lock); 486 490 ··· 511 513 512 514 int __init bch_writeback_init(void) 513 515 { 514 - dirty_wq = create_singlethread_workqueue("bcache_writeback"); 516 + dirty_wq = create_workqueue("bcache_writeback"); 515 517 if (!dirty_wq) 516 518 return -ENOMEM; 517 519
+2 -2
fs/bio.c
··· 917 917 src_p = kmap_atomic(src_bv->bv_page); 918 918 dst_p = kmap_atomic(dst_bv->bv_page); 919 919 920 - memcpy(dst_p + dst_bv->bv_offset, 921 - src_p + src_bv->bv_offset, 920 + memcpy(dst_p + dst_offset, 921 + src_p + src_offset, 922 922 bytes); 923 923 924 924 kunmap_atomic(dst_p);