Linux kernel mirror: git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git (drivers/lightnvm/pblk-write.c at v4.16-rc1)
/*
 * Copyright (C) 2016 CNEX Labs
 * Initial release: Javier Gonzalez <javier@cnexlabs.com>
 *                  Matias Bjorling <matias@cnexlabs.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * pblk-write.c - pblk's write path from write buffer to media
 */

#include "pblk.h"

static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
                                    struct pblk_c_ctx *c_ctx)
{
        struct bio *original_bio;
        struct pblk_rb *rwb = &pblk->rwb;
        unsigned long ret;
        int i;

        for (i = 0; i < c_ctx->nr_valid; i++) {
                struct pblk_w_ctx *w_ctx;
                int pos = c_ctx->sentry + i;
                int flags;

                w_ctx = pblk_rb_w_ctx(rwb, pos);
                flags = READ_ONCE(w_ctx->flags);

                if (flags & PBLK_FLUSH_ENTRY) {
                        flags &= ~PBLK_FLUSH_ENTRY;
                        /* Release flags on context. Protect from writes */
                        smp_store_release(&w_ctx->flags, flags);

#ifdef CONFIG_NVM_DEBUG
                        atomic_dec(&rwb->inflight_flush_point);
#endif
                }

                while ((original_bio = bio_list_pop(&w_ctx->bios)))
                        bio_endio(original_bio);
        }

        if (c_ctx->nr_padded)
                pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
                                                        c_ctx->nr_padded);

#ifdef CONFIG_NVM_DEBUG
        atomic_long_add(rqd->nr_ppas, &pblk->sync_writes);
#endif

        ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);

        bio_put(rqd->bio);
        pblk_free_rqd(pblk, rqd, PBLK_WRITE);

        return ret;
}

static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
                                           struct nvm_rq *rqd,
                                           struct pblk_c_ctx *c_ctx)
{
        list_del(&c_ctx->list);
        return pblk_end_w_bio(pblk, rqd, c_ctx);
}

static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
                                struct pblk_c_ctx *c_ctx)
{
        struct pblk_c_ctx *c, *r;
        unsigned long flags;
        unsigned long pos;

#ifdef CONFIG_NVM_DEBUG
        atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
#endif

        pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);

        pos = pblk_rb_sync_init(&pblk->rwb, &flags);
        if (pos == c_ctx->sentry) {
                pos = pblk_end_w_bio(pblk, rqd, c_ctx);

retry:
                list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
                        rqd = nvm_rq_from_c_ctx(c);
                        if (c->sentry == pos) {
                                pos = pblk_end_queued_w_bio(pblk, rqd, c);
                                goto retry;
                        }
                }
        } else {
                WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
                list_add_tail(&c_ctx->list, &pblk->compl_list);
        }
        pblk_rb_sync_end(&pblk->rwb, &flags);
}
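/*
 * Completion-ordering note (illustrative example, not from the original
 * source): the write buffer can only be released contiguously, so
 * pblk_complete_write() above ends a request only when its sentry matches
 * the buffer's sync position. If requests covering sentries 0, 64 and 128
 * complete in the order 64, 128, 0, the first two are parked on
 * pblk->compl_list; once sentry 0 completes, the retry loop drains 64 and
 * then 128, advancing the sync pointer in order.
 */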
/* When a write fails, we are not sure whether the block has grown bad or a
 * page range is more susceptible to write errors. If a high number of pages
 * fail, we assume that the block is bad and we mark it accordingly. In all
 * cases, we remap and resubmit the failed entries as fast as possible; if
 * a flush is waiting on a completion, the whole stack would stall otherwise.
 */
static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
{
        void *comp_bits = &rqd->ppa_status;
        struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
        struct pblk_rec_ctx *recovery;
        struct ppa_addr *ppa_list = rqd->ppa_list;
        int nr_ppas = rqd->nr_ppas;
        unsigned int c_entries;
        int bit, ret;

        if (unlikely(nr_ppas == 1))
                ppa_list = &rqd->ppa_addr;

        recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);

        INIT_LIST_HEAD(&recovery->failed);

        bit = -1;
        while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
                struct pblk_rb_entry *entry;
                struct ppa_addr ppa;

                /* Logic error */
                if (bit > c_ctx->nr_valid) {
                        WARN_ONCE(1, "pblk: corrupted write request\n");
                        mempool_free(recovery, pblk->rec_pool);
                        goto out;
                }

                ppa = ppa_list[bit];
                entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
                if (!entry) {
                        pr_err("pblk: could not scan entry on write failure\n");
                        mempool_free(recovery, pblk->rec_pool);
                        goto out;
                }

                /* The list is filled first and emptied afterwards. No need for
                 * protecting it with a lock
                 */
                list_add_tail(&entry->index, &recovery->failed);
        }

        c_entries = find_first_bit(comp_bits, nr_ppas);
        ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
        if (ret) {
                pr_err("pblk: could not recover from write failure\n");
                mempool_free(recovery, pblk->rec_pool);
                goto out;
        }

        INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
        queue_work(pblk->close_wq, &recovery->ws_rec);

out:
        pblk_complete_write(pblk, rqd, c_ctx);
}

static void pblk_end_io_write(struct nvm_rq *rqd)
{
        struct pblk *pblk = rqd->private;
        struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);

        if (rqd->error) {
                pblk_log_write_err(pblk, rqd);
                return pblk_end_w_fail(pblk, rqd);
        }
#ifdef CONFIG_NVM_DEBUG
        else
                WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
#endif

        pblk_complete_write(pblk, rqd, c_ctx);
        atomic_dec(&pblk->inflight_io);
}
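/*
 * Accounting sketch for the metadata completion below (the numbers are
 * hypothetical): emeta for a line is written in min_write_pgs-sized chunks,
 * so several requests can be in flight at once. With emeta->nr_entries = 256
 * and rqd->nr_ppas = 64, the four completions return 64, 128, 192 and
 * finally 256 from atomic_add_return(), in whatever order they land; only
 * the request that takes the counter to 256 schedules pblk_line_close_ws.
 */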
static void pblk_end_io_write_meta(struct nvm_rq *rqd)
{
        struct pblk *pblk = rqd->private;
        struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd);
        struct pblk_line *line = m_ctx->private;
        struct pblk_emeta *emeta = line->emeta;
        int sync;

        pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);

        if (rqd->error) {
                pblk_log_write_err(pblk, rqd);
                pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
        }

        sync = atomic_add_return(rqd->nr_ppas, &emeta->sync);
        if (sync == emeta->nr_entries)
                pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws,
                                                GFP_ATOMIC, pblk->close_wq);

        pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);

        atomic_dec(&pblk->inflight_io);
}

static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
                           unsigned int nr_secs,
                           nvm_end_io_fn(*end_io))
{
        struct nvm_tgt_dev *dev = pblk->dev;

        /* Setup write request */
        rqd->opcode = NVM_OP_PWRITE;
        rqd->nr_ppas = nr_secs;
        rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
        rqd->private = pblk;
        rqd->end_io = end_io;

        rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
                                                        &rqd->dma_meta_list);
        if (!rqd->meta_list)
                return -ENOMEM;

        rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
        rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;

        return 0;
}

static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
                           struct ppa_addr *erase_ppa)
{
        struct pblk_line_meta *lm = &pblk->lm;
        struct pblk_line *e_line = pblk_line_get_erase(pblk);
        struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
        unsigned int valid = c_ctx->nr_valid;
        unsigned int padded = c_ctx->nr_padded;
        unsigned int nr_secs = valid + padded;
        unsigned long *lun_bitmap;
        int ret;

        lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
        if (!lun_bitmap)
                return -ENOMEM;
        c_ctx->lun_bitmap = lun_bitmap;

        ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write);
        if (ret) {
                kfree(lun_bitmap);
                return ret;
        }

        if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
                pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
        else
                pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
                                                        valid, erase_ppa);

        return 0;
}

int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
                        struct pblk_c_ctx *c_ctx)
{
        struct pblk_line_meta *lm = &pblk->lm;
        unsigned long *lun_bitmap;
        int ret;

        lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
        if (!lun_bitmap)
                return -ENOMEM;

        c_ctx->lun_bitmap = lun_bitmap;

        ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas, pblk_end_io_write);
        if (ret)
                return ret;

        pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);

        rqd->ppa_status = (u64)0;
        rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);

        return ret;
}

static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
                                  unsigned int secs_to_flush)
{
        int secs_to_sync;

        secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);

#ifdef CONFIG_NVM_DEBUG
        if ((!secs_to_sync && secs_to_flush)
                        || (secs_to_sync < 0)
                        || (secs_to_sync > secs_avail && !secs_to_flush)) {
                pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
                                secs_avail, secs_to_sync, secs_to_flush);
        }
#endif

        return secs_to_sync;
}
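/*
 * Worked example for the calculation above (hypothetical geometry; the
 * exact rounding lives in pblk_calc_secs()): with min_write_pgs = 8 and
 * max_write_pgs = 64, secs_avail = 70 with no flush point yields a
 * secs_to_sync that is a multiple of 8 no larger than 64, while
 * secs_avail = 5 with secs_to_flush = 5 is rounded up to 8 and the
 * request padded, so the flush can complete.
 */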
int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
{
        struct nvm_tgt_dev *dev = pblk->dev;
        struct nvm_geo *geo = &dev->geo;
        struct pblk_line_mgmt *l_mg = &pblk->l_mg;
        struct pblk_line_meta *lm = &pblk->lm;
        struct pblk_emeta *emeta = meta_line->emeta;
        struct pblk_g_ctx *m_ctx;
        struct bio *bio;
        struct nvm_rq *rqd;
        void *data;
        u64 paddr;
        int rq_ppas = pblk->min_write_pgs;
        int id = meta_line->id;
        int rq_len;
        int i, j;
        int ret;

        rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);

        m_ctx = nvm_rq_to_pdu(rqd);
        m_ctx->private = meta_line;

        rq_len = rq_ppas * geo->sec_size;
        data = ((void *)emeta->buf) + emeta->mem;

        bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
                                        l_mg->emeta_alloc_type, GFP_KERNEL);
        if (IS_ERR(bio)) {
                ret = PTR_ERR(bio);
                goto fail_free_rqd;
        }
        bio->bi_iter.bi_sector = 0; /* internal bio */
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
        rqd->bio = bio;

        ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta);
        if (ret)
                goto fail_free_bio;

        for (i = 0; i < rqd->nr_ppas; ) {
                spin_lock(&meta_line->lock);
                paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas);
                spin_unlock(&meta_line->lock);
                for (j = 0; j < rq_ppas; j++, i++, paddr++)
                        rqd->ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id);
        }

        emeta->mem += rq_len;
        if (emeta->mem >= lm->emeta_len[0]) {
                spin_lock(&l_mg->close_lock);
                list_del(&meta_line->list);
                spin_unlock(&l_mg->close_lock);
        }

        pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas);

        ret = pblk_submit_io(pblk, rqd);
        if (ret) {
                pr_err("pblk: emeta I/O submission failed: %d\n", ret);
                goto fail_rollback;
        }

        return NVM_IO_OK;

fail_rollback:
        pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
        spin_lock(&l_mg->close_lock);
        pblk_dealloc_page(pblk, meta_line, rq_ppas);
        /* Re-queue the line so subsequent emeta writes can pick it up */
        list_add(&meta_line->list, &l_mg->emeta_list);
        spin_unlock(&l_mg->close_lock);
fail_free_bio:
        bio_put(bio);
fail_free_rqd:
        pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
        return ret;
}
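/*
 * Illustration for the placement check below (hypothetical numbers): with
 * 8 LUNs and data_line->meta_distance = 4, a metadata candidate at LUN 2
 * has its "optimal" companion position at LUN 6. The metadata I/O is only
 * submitted when that companion position is already covered by the data
 * request's lun_bitmap (or sits on a bad block), which keeps the two I/Os
 * from contending for the same LUN.
 */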
static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
                                       struct pblk_line *meta_line,
                                       struct nvm_rq *data_rqd)
{
        struct nvm_tgt_dev *dev = pblk->dev;
        struct nvm_geo *geo = &dev->geo;
        struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd);
        struct pblk_line *data_line = pblk_line_get_data(pblk);
        struct ppa_addr ppa, ppa_opt;
        u64 paddr;
        int pos_opt;

        /* Schedule a metadata I/O that is half the distance from the data I/O
         * with regards to the number of LUNs forming the pblk instance. This
         * balances LUN conflicts across every I/O.
         *
         * When the LUN configuration changes (e.g., due to GC), this distance
         * can align, which would result in metadata and data I/Os colliding.
         * In this case, modify the distance so that it is no longer optimal,
         * but moves the optimal position in the right direction.
         */
        paddr = pblk_lookup_page(pblk, meta_line);
        ppa = addr_to_gen_ppa(pblk, paddr, 0);
        ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0);
        pos_opt = pblk_ppa_to_pos(geo, ppa_opt);

        if (test_bit(pos_opt, data_c_ctx->lun_bitmap) ||
                                test_bit(pos_opt, data_line->blk_bitmap))
                return true;

        if (unlikely(pblk_ppa_comp(ppa_opt, ppa)))
                data_line->meta_distance--;

        return false;
}

static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
                                                    struct nvm_rq *data_rqd)
{
        struct pblk_line_meta *lm = &pblk->lm;
        struct pblk_line_mgmt *l_mg = &pblk->l_mg;
        struct pblk_line *meta_line;

        spin_lock(&l_mg->close_lock);
retry:
        if (list_empty(&l_mg->emeta_list)) {
                spin_unlock(&l_mg->close_lock);
                return NULL;
        }
        meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list);
        if (meta_line->emeta->mem >= lm->emeta_len[0])
                goto retry;
        spin_unlock(&l_mg->close_lock);

        if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd))
                return NULL;

        return meta_line;
}

static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
{
        struct ppa_addr erase_ppa;
        struct pblk_line *meta_line;
        int err;

        pblk_ppa_set_empty(&erase_ppa);

        /* Assign lbas to ppas and populate request structure */
        err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
        if (err) {
                pr_err("pblk: could not setup write request: %d\n", err);
                return NVM_IO_ERR;
        }

        meta_line = pblk_should_submit_meta_io(pblk, rqd);

        /* Submit data write for current data line */
        err = pblk_submit_io(pblk, rqd);
        if (err) {
                pr_err("pblk: data I/O submission failed: %d\n", err);
                return NVM_IO_ERR;
        }

        if (!pblk_ppa_empty(erase_ppa)) {
                /* Submit erase for next data line */
                if (pblk_blk_erase_async(pblk, erase_ppa)) {
                        struct pblk_line *e_line = pblk_line_get_erase(pblk);
                        struct nvm_tgt_dev *dev = pblk->dev;
                        struct nvm_geo *geo = &dev->geo;
                        int bit;

                        atomic_inc(&e_line->left_eblks);
                        bit = pblk_ppa_to_pos(geo, erase_ppa);
                        WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
                }
        }

        if (meta_line) {
                /* Submit metadata write for previous data line */
                err = pblk_submit_meta_io(pblk, meta_line);
                if (err) {
                        pr_err("pblk: metadata I/O submission failed: %d\n",
                                                                        err);
                        return NVM_IO_ERR;
                }
        }

        return NVM_IO_OK;
}

static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd)
{
        struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
        struct bio *bio = rqd->bio;

        if (c_ctx->nr_padded)
                pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid,
                                                        c_ctx->nr_padded);
}
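/*
 * pblk_submit_write() below is one iteration of the writer thread: it
 * returns 0 when a request was submitted and 1 when there is nothing to
 * do, which pblk_write_ts() uses to decide whether to sleep. For example
 * (assuming min_write_pgs = 8), 5 buffered sectors with no flush point
 * leave the thread idle, while the same 5 sectors behind a flush point
 * are padded and submitted immediately.
 */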
static int pblk_submit_write(struct pblk *pblk)
{
        struct bio *bio;
        struct nvm_rq *rqd;
        unsigned int secs_avail, secs_to_sync, secs_to_com;
        unsigned int secs_to_flush;
        unsigned long pos;

        /* If there are no sectors in the cache, flushes (bios without data)
         * will be cleared on the cache threads
         */
        secs_avail = pblk_rb_read_count(&pblk->rwb);
        if (!secs_avail)
                return 1;

        secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
        if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
                return 1;

        secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
        if (secs_to_sync > pblk->max_write_pgs) {
                pr_err("pblk: bad buffer sync calculation\n");
                return 1;
        }

        secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
        pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);

        bio = bio_alloc(GFP_KERNEL, secs_to_sync);

        bio->bi_iter.bi_sector = 0; /* internal bio */
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

        rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
        rqd->bio = bio;

        if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
                                                                secs_avail)) {
                pr_err("pblk: corrupted write bio\n");
                goto fail_put_bio;
        }

        if (pblk_submit_io_set(pblk, rqd))
                goto fail_free_bio;

#ifdef CONFIG_NVM_DEBUG
        atomic_long_add(secs_to_sync, &pblk->sub_writes);
#endif

        return 0;

fail_free_bio:
        pblk_free_write_rqd(pblk, rqd);
fail_put_bio:
        bio_put(bio);
        pblk_free_rqd(pblk, rqd, PBLK_WRITE);

        return 1;
}

int pblk_write_ts(void *data)
{
        struct pblk *pblk = data;

        while (!kthread_should_stop()) {
                if (!pblk_submit_write(pblk))
                        continue;
                set_current_state(TASK_INTERRUPTIBLE);
                io_schedule();
        }

        return 0;
}
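/*
 * For context (a sketch, not part of this file): the writer thread is
 * created from the target init path, roughly as
 *
 *      pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
 *
 * and is woken both when user data lands in the write buffer and by a
 * periodic timer, so a partially filled buffer still drains eventually.
 * See pblk-init.c for the actual wiring.
 */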