Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v3.12 2322 lines 58 kB view raw
1/* 2 * NVM Express device driver 3 * Copyright (c) 2011, Intel Corporation. 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 * 14 * You should have received a copy of the GNU General Public License along with 15 * this program; if not, write to the Free Software Foundation, Inc., 16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 17 */ 18 19#include <linux/nvme.h> 20#include <linux/bio.h> 21#include <linux/bitops.h> 22#include <linux/blkdev.h> 23#include <linux/delay.h> 24#include <linux/errno.h> 25#include <linux/fs.h> 26#include <linux/genhd.h> 27#include <linux/idr.h> 28#include <linux/init.h> 29#include <linux/interrupt.h> 30#include <linux/io.h> 31#include <linux/kdev_t.h> 32#include <linux/kthread.h> 33#include <linux/kernel.h> 34#include <linux/mm.h> 35#include <linux/module.h> 36#include <linux/moduleparam.h> 37#include <linux/pci.h> 38#include <linux/poison.h> 39#include <linux/ptrace.h> 40#include <linux/sched.h> 41#include <linux/slab.h> 42#include <linux/types.h> 43#include <scsi/sg.h> 44#include <asm-generic/io-64-nonatomic-lo-hi.h> 45 46#define NVME_Q_DEPTH 1024 47#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 48#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) 49#define NVME_MINORS 64 50#define ADMIN_TIMEOUT (60 * HZ) 51 52static int nvme_major; 53module_param(nvme_major, int, 0); 54 55static int use_threaded_interrupts; 56module_param(use_threaded_interrupts, int, 0); 57 58static DEFINE_SPINLOCK(dev_list_lock); 59static LIST_HEAD(dev_list); 60static struct task_struct *nvme_thread; 61 62/* 63 * An NVM Express queue. Each device has at least two (one for admin 64 * commands and one for I/O commands). 65 */ 66struct nvme_queue { 67 struct device *q_dmadev; 68 struct nvme_dev *dev; 69 spinlock_t q_lock; 70 struct nvme_command *sq_cmds; 71 volatile struct nvme_completion *cqes; 72 dma_addr_t sq_dma_addr; 73 dma_addr_t cq_dma_addr; 74 wait_queue_head_t sq_full; 75 wait_queue_t sq_cong_wait; 76 struct bio_list sq_cong; 77 u32 __iomem *q_db; 78 u16 q_depth; 79 u16 cq_vector; 80 u16 sq_head; 81 u16 sq_tail; 82 u16 cq_head; 83 u8 cq_phase; 84 u8 cqe_seen; 85 u8 q_suspended; 86 unsigned long cmdid_data[]; 87}; 88 89/* 90 * Check we didin't inadvertently grow the command struct 91 */ 92static inline void _nvme_check_size(void) 93{ 94 BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); 95 BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); 96 BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); 97 BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); 98 BUILD_BUG_ON(sizeof(struct nvme_features) != 64); 99 BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); 100 BUILD_BUG_ON(sizeof(struct nvme_command) != 64); 101 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); 102 BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); 103 BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); 104 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); 105} 106 107typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, 108 struct nvme_completion *); 109 110struct nvme_cmd_info { 111 nvme_completion_fn fn; 112 void *ctx; 113 unsigned long timeout; 114}; 115 116static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) 117{ 118 return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; 119} 120 121static unsigned nvme_queue_extra(int depth) 122{ 123 return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); 124} 125 126/** 127 * alloc_cmdid() - Allocate a Command ID 128 * @nvmeq: The queue that will be used for this command 129 * @ctx: A pointer that will be passed to the handler 130 * @handler: The function to call on completion 131 * 132 * Allocate a Command ID for a queue. The data passed in will 133 * be passed to the completion handler. This is implemented by using 134 * the bottom two bits of the ctx pointer to store the handler ID. 135 * Passing in a pointer that's not 4-byte aligned will cause a BUG. 136 * We can change this if it becomes a problem. 137 * 138 * May be called with local interrupts disabled and the q_lock held, 139 * or with interrupts enabled and no locks held. 140 */ 141static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, 142 nvme_completion_fn handler, unsigned timeout) 143{ 144 int depth = nvmeq->q_depth - 1; 145 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 146 int cmdid; 147 148 do { 149 cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); 150 if (cmdid >= depth) 151 return -EBUSY; 152 } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); 153 154 info[cmdid].fn = handler; 155 info[cmdid].ctx = ctx; 156 info[cmdid].timeout = jiffies + timeout; 157 return cmdid; 158} 159 160static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, 161 nvme_completion_fn handler, unsigned timeout) 162{ 163 int cmdid; 164 wait_event_killable(nvmeq->sq_full, 165 (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); 166 return (cmdid < 0) ? -EINTR : cmdid; 167} 168 169/* Special values must be less than 0x1000 */ 170#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) 171#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) 172#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) 173#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) 174#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) 175 176static void special_completion(struct nvme_dev *dev, void *ctx, 177 struct nvme_completion *cqe) 178{ 179 if (ctx == CMD_CTX_CANCELLED) 180 return; 181 if (ctx == CMD_CTX_FLUSH) 182 return; 183 if (ctx == CMD_CTX_COMPLETED) { 184 dev_warn(&dev->pci_dev->dev, 185 "completed id %d twice on queue %d\n", 186 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 187 return; 188 } 189 if (ctx == CMD_CTX_INVALID) { 190 dev_warn(&dev->pci_dev->dev, 191 "invalid id %d completed on queue %d\n", 192 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 193 return; 194 } 195 196 dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); 197} 198 199/* 200 * Called with local interrupts disabled and the q_lock held. May not sleep. 201 */ 202static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, 203 nvme_completion_fn *fn) 204{ 205 void *ctx; 206 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 207 208 if (cmdid >= nvmeq->q_depth) { 209 *fn = special_completion; 210 return CMD_CTX_INVALID; 211 } 212 if (fn) 213 *fn = info[cmdid].fn; 214 ctx = info[cmdid].ctx; 215 info[cmdid].fn = special_completion; 216 info[cmdid].ctx = CMD_CTX_COMPLETED; 217 clear_bit(cmdid, nvmeq->cmdid_data); 218 wake_up(&nvmeq->sq_full); 219 return ctx; 220} 221 222static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, 223 nvme_completion_fn *fn) 224{ 225 void *ctx; 226 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 227 if (fn) 228 *fn = info[cmdid].fn; 229 ctx = info[cmdid].ctx; 230 info[cmdid].fn = special_completion; 231 info[cmdid].ctx = CMD_CTX_CANCELLED; 232 return ctx; 233} 234 235struct nvme_queue *get_nvmeq(struct nvme_dev *dev) 236{ 237 return dev->queues[get_cpu() + 1]; 238} 239 240void put_nvmeq(struct nvme_queue *nvmeq) 241{ 242 put_cpu(); 243} 244 245/** 246 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell 247 * @nvmeq: The queue to use 248 * @cmd: The command to send 249 * 250 * Safe to use from interrupt context 251 */ 252static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 253{ 254 unsigned long flags; 255 u16 tail; 256 spin_lock_irqsave(&nvmeq->q_lock, flags); 257 tail = nvmeq->sq_tail; 258 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); 259 if (++tail == nvmeq->q_depth) 260 tail = 0; 261 writel(tail, nvmeq->q_db); 262 nvmeq->sq_tail = tail; 263 spin_unlock_irqrestore(&nvmeq->q_lock, flags); 264 265 return 0; 266} 267 268static __le64 **iod_list(struct nvme_iod *iod) 269{ 270 return ((void *)iod) + iod->offset; 271} 272 273/* 274 * Will slightly overestimate the number of pages needed. This is OK 275 * as it only leads to a small amount of wasted memory for the lifetime of 276 * the I/O. 277 */ 278static int nvme_npages(unsigned size) 279{ 280 unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); 281 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); 282} 283 284static struct nvme_iod * 285nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) 286{ 287 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + 288 sizeof(__le64 *) * nvme_npages(nbytes) + 289 sizeof(struct scatterlist) * nseg, gfp); 290 291 if (iod) { 292 iod->offset = offsetof(struct nvme_iod, sg[nseg]); 293 iod->npages = -1; 294 iod->length = nbytes; 295 iod->nents = 0; 296 iod->start_time = jiffies; 297 } 298 299 return iod; 300} 301 302void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) 303{ 304 const int last_prp = PAGE_SIZE / 8 - 1; 305 int i; 306 __le64 **list = iod_list(iod); 307 dma_addr_t prp_dma = iod->first_dma; 308 309 if (iod->npages == 0) 310 dma_pool_free(dev->prp_small_pool, list[0], prp_dma); 311 for (i = 0; i < iod->npages; i++) { 312 __le64 *prp_list = list[i]; 313 dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); 314 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); 315 prp_dma = next_prp_dma; 316 } 317 kfree(iod); 318} 319 320static void nvme_start_io_acct(struct bio *bio) 321{ 322 struct gendisk *disk = bio->bi_bdev->bd_disk; 323 const int rw = bio_data_dir(bio); 324 int cpu = part_stat_lock(); 325 part_round_stats(cpu, &disk->part0); 326 part_stat_inc(cpu, &disk->part0, ios[rw]); 327 part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio)); 328 part_inc_in_flight(&disk->part0, rw); 329 part_stat_unlock(); 330} 331 332static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) 333{ 334 struct gendisk *disk = bio->bi_bdev->bd_disk; 335 const int rw = bio_data_dir(bio); 336 unsigned long duration = jiffies - start_time; 337 int cpu = part_stat_lock(); 338 part_stat_add(cpu, &disk->part0, ticks[rw], duration); 339 part_round_stats(cpu, &disk->part0); 340 part_dec_in_flight(&disk->part0, rw); 341 part_stat_unlock(); 342} 343 344static void bio_completion(struct nvme_dev *dev, void *ctx, 345 struct nvme_completion *cqe) 346{ 347 struct nvme_iod *iod = ctx; 348 struct bio *bio = iod->private; 349 u16 status = le16_to_cpup(&cqe->status) >> 1; 350 351 if (iod->nents) { 352 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, 353 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 354 nvme_end_io_acct(bio, iod->start_time); 355 } 356 nvme_free_iod(dev, iod); 357 if (status) 358 bio_endio(bio, -EIO); 359 else 360 bio_endio(bio, 0); 361} 362 363/* length is in bytes. gfp flags indicates whether we may sleep. */ 364int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, 365 struct nvme_iod *iod, int total_len, gfp_t gfp) 366{ 367 struct dma_pool *pool; 368 int length = total_len; 369 struct scatterlist *sg = iod->sg; 370 int dma_len = sg_dma_len(sg); 371 u64 dma_addr = sg_dma_address(sg); 372 int offset = offset_in_page(dma_addr); 373 __le64 *prp_list; 374 __le64 **list = iod_list(iod); 375 dma_addr_t prp_dma; 376 int nprps, i; 377 378 cmd->prp1 = cpu_to_le64(dma_addr); 379 length -= (PAGE_SIZE - offset); 380 if (length <= 0) 381 return total_len; 382 383 dma_len -= (PAGE_SIZE - offset); 384 if (dma_len) { 385 dma_addr += (PAGE_SIZE - offset); 386 } else { 387 sg = sg_next(sg); 388 dma_addr = sg_dma_address(sg); 389 dma_len = sg_dma_len(sg); 390 } 391 392 if (length <= PAGE_SIZE) { 393 cmd->prp2 = cpu_to_le64(dma_addr); 394 return total_len; 395 } 396 397 nprps = DIV_ROUND_UP(length, PAGE_SIZE); 398 if (nprps <= (256 / 8)) { 399 pool = dev->prp_small_pool; 400 iod->npages = 0; 401 } else { 402 pool = dev->prp_page_pool; 403 iod->npages = 1; 404 } 405 406 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 407 if (!prp_list) { 408 cmd->prp2 = cpu_to_le64(dma_addr); 409 iod->npages = -1; 410 return (total_len - length) + PAGE_SIZE; 411 } 412 list[0] = prp_list; 413 iod->first_dma = prp_dma; 414 cmd->prp2 = cpu_to_le64(prp_dma); 415 i = 0; 416 for (;;) { 417 if (i == PAGE_SIZE / 8) { 418 __le64 *old_prp_list = prp_list; 419 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 420 if (!prp_list) 421 return total_len - length; 422 list[iod->npages++] = prp_list; 423 prp_list[0] = old_prp_list[i - 1]; 424 old_prp_list[i - 1] = cpu_to_le64(prp_dma); 425 i = 1; 426 } 427 prp_list[i++] = cpu_to_le64(dma_addr); 428 dma_len -= PAGE_SIZE; 429 dma_addr += PAGE_SIZE; 430 length -= PAGE_SIZE; 431 if (length <= 0) 432 break; 433 if (dma_len > 0) 434 continue; 435 BUG_ON(dma_len < 0); 436 sg = sg_next(sg); 437 dma_addr = sg_dma_address(sg); 438 dma_len = sg_dma_len(sg); 439 } 440 441 return total_len; 442} 443 444struct nvme_bio_pair { 445 struct bio b1, b2, *parent; 446 struct bio_vec *bv1, *bv2; 447 int err; 448 atomic_t cnt; 449}; 450 451static void nvme_bio_pair_endio(struct bio *bio, int err) 452{ 453 struct nvme_bio_pair *bp = bio->bi_private; 454 455 if (err) 456 bp->err = err; 457 458 if (atomic_dec_and_test(&bp->cnt)) { 459 bio_endio(bp->parent, bp->err); 460 kfree(bp->bv1); 461 kfree(bp->bv2); 462 kfree(bp); 463 } 464} 465 466static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx, 467 int len, int offset) 468{ 469 struct nvme_bio_pair *bp; 470 471 BUG_ON(len > bio->bi_size); 472 BUG_ON(idx > bio->bi_vcnt); 473 474 bp = kmalloc(sizeof(*bp), GFP_ATOMIC); 475 if (!bp) 476 return NULL; 477 bp->err = 0; 478 479 bp->b1 = *bio; 480 bp->b2 = *bio; 481 482 bp->b1.bi_size = len; 483 bp->b2.bi_size -= len; 484 bp->b1.bi_vcnt = idx; 485 bp->b2.bi_idx = idx; 486 bp->b2.bi_sector += len >> 9; 487 488 if (offset) { 489 bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), 490 GFP_ATOMIC); 491 if (!bp->bv1) 492 goto split_fail_1; 493 494 bp->bv2 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), 495 GFP_ATOMIC); 496 if (!bp->bv2) 497 goto split_fail_2; 498 499 memcpy(bp->bv1, bio->bi_io_vec, 500 bio->bi_max_vecs * sizeof(struct bio_vec)); 501 memcpy(bp->bv2, bio->bi_io_vec, 502 bio->bi_max_vecs * sizeof(struct bio_vec)); 503 504 bp->b1.bi_io_vec = bp->bv1; 505 bp->b2.bi_io_vec = bp->bv2; 506 bp->b2.bi_io_vec[idx].bv_offset += offset; 507 bp->b2.bi_io_vec[idx].bv_len -= offset; 508 bp->b1.bi_io_vec[idx].bv_len = offset; 509 bp->b1.bi_vcnt++; 510 } else 511 bp->bv1 = bp->bv2 = NULL; 512 513 bp->b1.bi_private = bp; 514 bp->b2.bi_private = bp; 515 516 bp->b1.bi_end_io = nvme_bio_pair_endio; 517 bp->b2.bi_end_io = nvme_bio_pair_endio; 518 519 bp->parent = bio; 520 atomic_set(&bp->cnt, 2); 521 522 return bp; 523 524 split_fail_2: 525 kfree(bp->bv1); 526 split_fail_1: 527 kfree(bp); 528 return NULL; 529} 530 531static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, 532 int idx, int len, int offset) 533{ 534 struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset); 535 if (!bp) 536 return -ENOMEM; 537 538 if (bio_list_empty(&nvmeq->sq_cong)) 539 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 540 bio_list_add(&nvmeq->sq_cong, &bp->b1); 541 bio_list_add(&nvmeq->sq_cong, &bp->b2); 542 543 return 0; 544} 545 546/* NVMe scatterlists require no holes in the virtual address */ 547#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ 548 (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) 549 550static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, 551 struct bio *bio, enum dma_data_direction dma_dir, int psegs) 552{ 553 struct bio_vec *bvec, *bvprv = NULL; 554 struct scatterlist *sg = NULL; 555 int i, length = 0, nsegs = 0, split_len = bio->bi_size; 556 557 if (nvmeq->dev->stripe_size) 558 split_len = nvmeq->dev->stripe_size - 559 ((bio->bi_sector << 9) & (nvmeq->dev->stripe_size - 1)); 560 561 sg_init_table(iod->sg, psegs); 562 bio_for_each_segment(bvec, bio, i) { 563 if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { 564 sg->length += bvec->bv_len; 565 } else { 566 if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) 567 return nvme_split_and_submit(bio, nvmeq, i, 568 length, 0); 569 570 sg = sg ? sg + 1 : iod->sg; 571 sg_set_page(sg, bvec->bv_page, bvec->bv_len, 572 bvec->bv_offset); 573 nsegs++; 574 } 575 576 if (split_len - length < bvec->bv_len) 577 return nvme_split_and_submit(bio, nvmeq, i, split_len, 578 split_len - length); 579 length += bvec->bv_len; 580 bvprv = bvec; 581 } 582 iod->nents = nsegs; 583 sg_mark_end(sg); 584 if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) 585 return -ENOMEM; 586 587 BUG_ON(length != bio->bi_size); 588 return length; 589} 590 591/* 592 * We reuse the small pool to allocate the 16-byte range here as it is not 593 * worth having a special pool for these or additional cases to handle freeing 594 * the iod. 595 */ 596static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, 597 struct bio *bio, struct nvme_iod *iod, int cmdid) 598{ 599 struct nvme_dsm_range *range; 600 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 601 602 range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, 603 &iod->first_dma); 604 if (!range) 605 return -ENOMEM; 606 607 iod_list(iod)[0] = (__le64 *)range; 608 iod->npages = 0; 609 610 range->cattr = cpu_to_le32(0); 611 range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift); 612 range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); 613 614 memset(cmnd, 0, sizeof(*cmnd)); 615 cmnd->dsm.opcode = nvme_cmd_dsm; 616 cmnd->dsm.command_id = cmdid; 617 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); 618 cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); 619 cmnd->dsm.nr = 0; 620 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 621 622 if (++nvmeq->sq_tail == nvmeq->q_depth) 623 nvmeq->sq_tail = 0; 624 writel(nvmeq->sq_tail, nvmeq->q_db); 625 626 return 0; 627} 628 629static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, 630 int cmdid) 631{ 632 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 633 634 memset(cmnd, 0, sizeof(*cmnd)); 635 cmnd->common.opcode = nvme_cmd_flush; 636 cmnd->common.command_id = cmdid; 637 cmnd->common.nsid = cpu_to_le32(ns->ns_id); 638 639 if (++nvmeq->sq_tail == nvmeq->q_depth) 640 nvmeq->sq_tail = 0; 641 writel(nvmeq->sq_tail, nvmeq->q_db); 642 643 return 0; 644} 645 646int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) 647{ 648 int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, 649 special_completion, NVME_IO_TIMEOUT); 650 if (unlikely(cmdid < 0)) 651 return cmdid; 652 653 return nvme_submit_flush(nvmeq, ns, cmdid); 654} 655 656/* 657 * Called with local interrupts disabled and the q_lock held. May not sleep. 658 */ 659static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, 660 struct bio *bio) 661{ 662 struct nvme_command *cmnd; 663 struct nvme_iod *iod; 664 enum dma_data_direction dma_dir; 665 int cmdid, length, result; 666 u16 control; 667 u32 dsmgmt; 668 int psegs = bio_phys_segments(ns->queue, bio); 669 670 if ((bio->bi_rw & REQ_FLUSH) && psegs) { 671 result = nvme_submit_flush_data(nvmeq, ns); 672 if (result) 673 return result; 674 } 675 676 result = -ENOMEM; 677 iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); 678 if (!iod) 679 goto nomem; 680 iod->private = bio; 681 682 result = -EBUSY; 683 cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); 684 if (unlikely(cmdid < 0)) 685 goto free_iod; 686 687 if (bio->bi_rw & REQ_DISCARD) { 688 result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); 689 if (result) 690 goto free_cmdid; 691 return result; 692 } 693 if ((bio->bi_rw & REQ_FLUSH) && !psegs) 694 return nvme_submit_flush(nvmeq, ns, cmdid); 695 696 control = 0; 697 if (bio->bi_rw & REQ_FUA) 698 control |= NVME_RW_FUA; 699 if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 700 control |= NVME_RW_LR; 701 702 dsmgmt = 0; 703 if (bio->bi_rw & REQ_RAHEAD) 704 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 705 706 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 707 708 memset(cmnd, 0, sizeof(*cmnd)); 709 if (bio_data_dir(bio)) { 710 cmnd->rw.opcode = nvme_cmd_write; 711 dma_dir = DMA_TO_DEVICE; 712 } else { 713 cmnd->rw.opcode = nvme_cmd_read; 714 dma_dir = DMA_FROM_DEVICE; 715 } 716 717 result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs); 718 if (result <= 0) 719 goto free_cmdid; 720 length = result; 721 722 cmnd->rw.command_id = cmdid; 723 cmnd->rw.nsid = cpu_to_le32(ns->ns_id); 724 length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, 725 GFP_ATOMIC); 726 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); 727 cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); 728 cmnd->rw.control = cpu_to_le16(control); 729 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 730 731 nvme_start_io_acct(bio); 732 if (++nvmeq->sq_tail == nvmeq->q_depth) 733 nvmeq->sq_tail = 0; 734 writel(nvmeq->sq_tail, nvmeq->q_db); 735 736 return 0; 737 738 free_cmdid: 739 free_cmdid(nvmeq, cmdid, NULL); 740 free_iod: 741 nvme_free_iod(nvmeq->dev, iod); 742 nomem: 743 return result; 744} 745 746static int nvme_process_cq(struct nvme_queue *nvmeq) 747{ 748 u16 head, phase; 749 750 head = nvmeq->cq_head; 751 phase = nvmeq->cq_phase; 752 753 for (;;) { 754 void *ctx; 755 nvme_completion_fn fn; 756 struct nvme_completion cqe = nvmeq->cqes[head]; 757 if ((le16_to_cpu(cqe.status) & 1) != phase) 758 break; 759 nvmeq->sq_head = le16_to_cpu(cqe.sq_head); 760 if (++head == nvmeq->q_depth) { 761 head = 0; 762 phase = !phase; 763 } 764 765 ctx = free_cmdid(nvmeq, cqe.command_id, &fn); 766 fn(nvmeq->dev, ctx, &cqe); 767 } 768 769 /* If the controller ignores the cq head doorbell and continuously 770 * writes to the queue, it is theoretically possible to wrap around 771 * the queue twice and mistakenly return IRQ_NONE. Linux only 772 * requires that 0.1% of your interrupts are handled, so this isn't 773 * a big problem. 774 */ 775 if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) 776 return 0; 777 778 writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); 779 nvmeq->cq_head = head; 780 nvmeq->cq_phase = phase; 781 782 nvmeq->cqe_seen = 1; 783 return 1; 784} 785 786static void nvme_make_request(struct request_queue *q, struct bio *bio) 787{ 788 struct nvme_ns *ns = q->queuedata; 789 struct nvme_queue *nvmeq = get_nvmeq(ns->dev); 790 int result = -EBUSY; 791 792 if (!nvmeq) { 793 put_nvmeq(NULL); 794 bio_endio(bio, -EIO); 795 return; 796 } 797 798 spin_lock_irq(&nvmeq->q_lock); 799 if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) 800 result = nvme_submit_bio_queue(nvmeq, ns, bio); 801 if (unlikely(result)) { 802 if (bio_list_empty(&nvmeq->sq_cong)) 803 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 804 bio_list_add(&nvmeq->sq_cong, bio); 805 } 806 807 nvme_process_cq(nvmeq); 808 spin_unlock_irq(&nvmeq->q_lock); 809 put_nvmeq(nvmeq); 810} 811 812static irqreturn_t nvme_irq(int irq, void *data) 813{ 814 irqreturn_t result; 815 struct nvme_queue *nvmeq = data; 816 spin_lock(&nvmeq->q_lock); 817 nvme_process_cq(nvmeq); 818 result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; 819 nvmeq->cqe_seen = 0; 820 spin_unlock(&nvmeq->q_lock); 821 return result; 822} 823 824static irqreturn_t nvme_irq_check(int irq, void *data) 825{ 826 struct nvme_queue *nvmeq = data; 827 struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; 828 if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) 829 return IRQ_NONE; 830 return IRQ_WAKE_THREAD; 831} 832 833static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) 834{ 835 spin_lock_irq(&nvmeq->q_lock); 836 cancel_cmdid(nvmeq, cmdid, NULL); 837 spin_unlock_irq(&nvmeq->q_lock); 838} 839 840struct sync_cmd_info { 841 struct task_struct *task; 842 u32 result; 843 int status; 844}; 845 846static void sync_completion(struct nvme_dev *dev, void *ctx, 847 struct nvme_completion *cqe) 848{ 849 struct sync_cmd_info *cmdinfo = ctx; 850 cmdinfo->result = le32_to_cpup(&cqe->result); 851 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; 852 wake_up_process(cmdinfo->task); 853} 854 855/* 856 * Returns 0 on success. If the result is negative, it's a Linux error code; 857 * if the result is positive, it's an NVM Express status code 858 */ 859int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, 860 u32 *result, unsigned timeout) 861{ 862 int cmdid; 863 struct sync_cmd_info cmdinfo; 864 865 cmdinfo.task = current; 866 cmdinfo.status = -EINTR; 867 868 cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, 869 timeout); 870 if (cmdid < 0) 871 return cmdid; 872 cmd->common.command_id = cmdid; 873 874 set_current_state(TASK_KILLABLE); 875 nvme_submit_cmd(nvmeq, cmd); 876 schedule_timeout(timeout); 877 878 if (cmdinfo.status == -EINTR) { 879 nvme_abort_command(nvmeq, cmdid); 880 return -EINTR; 881 } 882 883 if (result) 884 *result = cmdinfo.result; 885 886 return cmdinfo.status; 887} 888 889int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, 890 u32 *result) 891{ 892 return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); 893} 894 895static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 896{ 897 int status; 898 struct nvme_command c; 899 900 memset(&c, 0, sizeof(c)); 901 c.delete_queue.opcode = opcode; 902 c.delete_queue.qid = cpu_to_le16(id); 903 904 status = nvme_submit_admin_cmd(dev, &c, NULL); 905 if (status) 906 return -EIO; 907 return 0; 908} 909 910static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 911 struct nvme_queue *nvmeq) 912{ 913 int status; 914 struct nvme_command c; 915 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 916 917 memset(&c, 0, sizeof(c)); 918 c.create_cq.opcode = nvme_admin_create_cq; 919 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); 920 c.create_cq.cqid = cpu_to_le16(qid); 921 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 922 c.create_cq.cq_flags = cpu_to_le16(flags); 923 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); 924 925 status = nvme_submit_admin_cmd(dev, &c, NULL); 926 if (status) 927 return -EIO; 928 return 0; 929} 930 931static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 932 struct nvme_queue *nvmeq) 933{ 934 int status; 935 struct nvme_command c; 936 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; 937 938 memset(&c, 0, sizeof(c)); 939 c.create_sq.opcode = nvme_admin_create_sq; 940 c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); 941 c.create_sq.sqid = cpu_to_le16(qid); 942 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 943 c.create_sq.sq_flags = cpu_to_le16(flags); 944 c.create_sq.cqid = cpu_to_le16(qid); 945 946 status = nvme_submit_admin_cmd(dev, &c, NULL); 947 if (status) 948 return -EIO; 949 return 0; 950} 951 952static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 953{ 954 return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); 955} 956 957static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) 958{ 959 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); 960} 961 962int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, 963 dma_addr_t dma_addr) 964{ 965 struct nvme_command c; 966 967 memset(&c, 0, sizeof(c)); 968 c.identify.opcode = nvme_admin_identify; 969 c.identify.nsid = cpu_to_le32(nsid); 970 c.identify.prp1 = cpu_to_le64(dma_addr); 971 c.identify.cns = cpu_to_le32(cns); 972 973 return nvme_submit_admin_cmd(dev, &c, NULL); 974} 975 976int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, 977 dma_addr_t dma_addr, u32 *result) 978{ 979 struct nvme_command c; 980 981 memset(&c, 0, sizeof(c)); 982 c.features.opcode = nvme_admin_get_features; 983 c.features.nsid = cpu_to_le32(nsid); 984 c.features.prp1 = cpu_to_le64(dma_addr); 985 c.features.fid = cpu_to_le32(fid); 986 987 return nvme_submit_admin_cmd(dev, &c, result); 988} 989 990int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, 991 dma_addr_t dma_addr, u32 *result) 992{ 993 struct nvme_command c; 994 995 memset(&c, 0, sizeof(c)); 996 c.features.opcode = nvme_admin_set_features; 997 c.features.prp1 = cpu_to_le64(dma_addr); 998 c.features.fid = cpu_to_le32(fid); 999 c.features.dword11 = cpu_to_le32(dword11); 1000 1001 return nvme_submit_admin_cmd(dev, &c, result); 1002} 1003 1004/** 1005 * nvme_cancel_ios - Cancel outstanding I/Os 1006 * @queue: The queue to cancel I/Os on 1007 * @timeout: True to only cancel I/Os which have timed out 1008 */ 1009static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) 1010{ 1011 int depth = nvmeq->q_depth - 1; 1012 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 1013 unsigned long now = jiffies; 1014 int cmdid; 1015 1016 for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { 1017 void *ctx; 1018 nvme_completion_fn fn; 1019 static struct nvme_completion cqe = { 1020 .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), 1021 }; 1022 1023 if (timeout && !time_after(now, info[cmdid].timeout)) 1024 continue; 1025 if (info[cmdid].ctx == CMD_CTX_CANCELLED) 1026 continue; 1027 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); 1028 ctx = cancel_cmdid(nvmeq, cmdid, &fn); 1029 fn(nvmeq->dev, ctx, &cqe); 1030 } 1031} 1032 1033static void nvme_free_queue(struct nvme_queue *nvmeq) 1034{ 1035 spin_lock_irq(&nvmeq->q_lock); 1036 while (bio_list_peek(&nvmeq->sq_cong)) { 1037 struct bio *bio = bio_list_pop(&nvmeq->sq_cong); 1038 bio_endio(bio, -EIO); 1039 } 1040 spin_unlock_irq(&nvmeq->q_lock); 1041 1042 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1043 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1044 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1045 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1046 kfree(nvmeq); 1047} 1048 1049static void nvme_free_queues(struct nvme_dev *dev) 1050{ 1051 int i; 1052 1053 for (i = dev->queue_count - 1; i >= 0; i--) { 1054 nvme_free_queue(dev->queues[i]); 1055 dev->queue_count--; 1056 dev->queues[i] = NULL; 1057 } 1058} 1059 1060static void nvme_disable_queue(struct nvme_dev *dev, int qid) 1061{ 1062 struct nvme_queue *nvmeq = dev->queues[qid]; 1063 int vector = dev->entry[nvmeq->cq_vector].vector; 1064 1065 spin_lock_irq(&nvmeq->q_lock); 1066 if (nvmeq->q_suspended) { 1067 spin_unlock_irq(&nvmeq->q_lock); 1068 return; 1069 } 1070 nvmeq->q_suspended = 1; 1071 spin_unlock_irq(&nvmeq->q_lock); 1072 1073 irq_set_affinity_hint(vector, NULL); 1074 free_irq(vector, nvmeq); 1075 1076 /* Don't tell the adapter to delete the admin queue */ 1077 if (qid) { 1078 adapter_delete_sq(dev, qid); 1079 adapter_delete_cq(dev, qid); 1080 } 1081 1082 spin_lock_irq(&nvmeq->q_lock); 1083 nvme_process_cq(nvmeq); 1084 nvme_cancel_ios(nvmeq, false); 1085 spin_unlock_irq(&nvmeq->q_lock); 1086} 1087 1088static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 1089 int depth, int vector) 1090{ 1091 struct device *dmadev = &dev->pci_dev->dev; 1092 unsigned extra = nvme_queue_extra(depth); 1093 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); 1094 if (!nvmeq) 1095 return NULL; 1096 1097 nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), 1098 &nvmeq->cq_dma_addr, GFP_KERNEL); 1099 if (!nvmeq->cqes) 1100 goto free_nvmeq; 1101 memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); 1102 1103 nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), 1104 &nvmeq->sq_dma_addr, GFP_KERNEL); 1105 if (!nvmeq->sq_cmds) 1106 goto free_cqdma; 1107 1108 nvmeq->q_dmadev = dmadev; 1109 nvmeq->dev = dev; 1110 spin_lock_init(&nvmeq->q_lock); 1111 nvmeq->cq_head = 0; 1112 nvmeq->cq_phase = 1; 1113 init_waitqueue_head(&nvmeq->sq_full); 1114 init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); 1115 bio_list_init(&nvmeq->sq_cong); 1116 nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; 1117 nvmeq->q_depth = depth; 1118 nvmeq->cq_vector = vector; 1119 nvmeq->q_suspended = 1; 1120 dev->queue_count++; 1121 1122 return nvmeq; 1123 1124 free_cqdma: 1125 dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1126 nvmeq->cq_dma_addr); 1127 free_nvmeq: 1128 kfree(nvmeq); 1129 return NULL; 1130} 1131 1132static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1133 const char *name) 1134{ 1135 if (use_threaded_interrupts) 1136 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, 1137 nvme_irq_check, nvme_irq, 1138 IRQF_DISABLED | IRQF_SHARED, 1139 name, nvmeq); 1140 return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, 1141 IRQF_DISABLED | IRQF_SHARED, name, nvmeq); 1142} 1143 1144static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) 1145{ 1146 struct nvme_dev *dev = nvmeq->dev; 1147 unsigned extra = nvme_queue_extra(nvmeq->q_depth); 1148 1149 nvmeq->sq_tail = 0; 1150 nvmeq->cq_head = 0; 1151 nvmeq->cq_phase = 1; 1152 nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; 1153 memset(nvmeq->cmdid_data, 0, extra); 1154 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); 1155 nvme_cancel_ios(nvmeq, false); 1156 nvmeq->q_suspended = 0; 1157} 1158 1159static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) 1160{ 1161 struct nvme_dev *dev = nvmeq->dev; 1162 int result; 1163 1164 result = adapter_alloc_cq(dev, qid, nvmeq); 1165 if (result < 0) 1166 return result; 1167 1168 result = adapter_alloc_sq(dev, qid, nvmeq); 1169 if (result < 0) 1170 goto release_cq; 1171 1172 result = queue_request_irq(dev, nvmeq, "nvme"); 1173 if (result < 0) 1174 goto release_sq; 1175 1176 spin_lock(&nvmeq->q_lock); 1177 nvme_init_queue(nvmeq, qid); 1178 spin_unlock(&nvmeq->q_lock); 1179 1180 return result; 1181 1182 release_sq: 1183 adapter_delete_sq(dev, qid); 1184 release_cq: 1185 adapter_delete_cq(dev, qid); 1186 return result; 1187} 1188 1189static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) 1190{ 1191 unsigned long timeout; 1192 u32 bit = enabled ? NVME_CSTS_RDY : 0; 1193 1194 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 1195 1196 while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { 1197 msleep(100); 1198 if (fatal_signal_pending(current)) 1199 return -EINTR; 1200 if (time_after(jiffies, timeout)) { 1201 dev_err(&dev->pci_dev->dev, 1202 "Device not ready; aborting initialisation\n"); 1203 return -ENODEV; 1204 } 1205 } 1206 1207 return 0; 1208} 1209 1210/* 1211 * If the device has been passed off to us in an enabled state, just clear 1212 * the enabled bit. The spec says we should set the 'shutdown notification 1213 * bits', but doing so may cause the device to complete commands to the 1214 * admin queue ... and we don't know what memory that might be pointing at! 1215 */ 1216static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) 1217{ 1218 u32 cc = readl(&dev->bar->cc); 1219 1220 if (cc & NVME_CC_ENABLE) 1221 writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc); 1222 return nvme_wait_ready(dev, cap, false); 1223} 1224 1225static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) 1226{ 1227 return nvme_wait_ready(dev, cap, true); 1228} 1229 1230static int nvme_shutdown_ctrl(struct nvme_dev *dev) 1231{ 1232 unsigned long timeout; 1233 u32 cc; 1234 1235 cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; 1236 writel(cc, &dev->bar->cc); 1237 1238 timeout = 2 * HZ + jiffies; 1239 while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != 1240 NVME_CSTS_SHST_CMPLT) { 1241 msleep(100); 1242 if (fatal_signal_pending(current)) 1243 return -EINTR; 1244 if (time_after(jiffies, timeout)) { 1245 dev_err(&dev->pci_dev->dev, 1246 "Device shutdown incomplete; abort shutdown\n"); 1247 return -ENODEV; 1248 } 1249 } 1250 1251 return 0; 1252} 1253 1254static int nvme_configure_admin_queue(struct nvme_dev *dev) 1255{ 1256 int result; 1257 u32 aqa; 1258 u64 cap = readq(&dev->bar->cap); 1259 struct nvme_queue *nvmeq; 1260 1261 result = nvme_disable_ctrl(dev, cap); 1262 if (result < 0) 1263 return result; 1264 1265 nvmeq = dev->queues[0]; 1266 if (!nvmeq) { 1267 nvmeq = nvme_alloc_queue(dev, 0, 64, 0); 1268 if (!nvmeq) 1269 return -ENOMEM; 1270 dev->queues[0] = nvmeq; 1271 } 1272 1273 aqa = nvmeq->q_depth - 1; 1274 aqa |= aqa << 16; 1275 1276 dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; 1277 dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; 1278 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; 1279 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 1280 1281 writel(aqa, &dev->bar->aqa); 1282 writeq(nvmeq->sq_dma_addr, &dev->bar->asq); 1283 writeq(nvmeq->cq_dma_addr, &dev->bar->acq); 1284 writel(dev->ctrl_config, &dev->bar->cc); 1285 1286 result = nvme_enable_ctrl(dev, cap); 1287 if (result) 1288 return result; 1289 1290 result = queue_request_irq(dev, nvmeq, "nvme admin"); 1291 if (result) 1292 return result; 1293 1294 spin_lock(&nvmeq->q_lock); 1295 nvme_init_queue(nvmeq, 0); 1296 spin_unlock(&nvmeq->q_lock); 1297 return result; 1298} 1299 1300struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, 1301 unsigned long addr, unsigned length) 1302{ 1303 int i, err, count, nents, offset; 1304 struct scatterlist *sg; 1305 struct page **pages; 1306 struct nvme_iod *iod; 1307 1308 if (addr & 3) 1309 return ERR_PTR(-EINVAL); 1310 if (!length || length > INT_MAX - PAGE_SIZE) 1311 return ERR_PTR(-EINVAL); 1312 1313 offset = offset_in_page(addr); 1314 count = DIV_ROUND_UP(offset + length, PAGE_SIZE); 1315 pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); 1316 if (!pages) 1317 return ERR_PTR(-ENOMEM); 1318 1319 err = get_user_pages_fast(addr, count, 1, pages); 1320 if (err < count) { 1321 count = err; 1322 err = -EFAULT; 1323 goto put_pages; 1324 } 1325 1326 iod = nvme_alloc_iod(count, length, GFP_KERNEL); 1327 sg = iod->sg; 1328 sg_init_table(sg, count); 1329 for (i = 0; i < count; i++) { 1330 sg_set_page(&sg[i], pages[i], 1331 min_t(unsigned, length, PAGE_SIZE - offset), 1332 offset); 1333 length -= (PAGE_SIZE - offset); 1334 offset = 0; 1335 } 1336 sg_mark_end(&sg[i - 1]); 1337 iod->nents = count; 1338 1339 err = -ENOMEM; 1340 nents = dma_map_sg(&dev->pci_dev->dev, sg, count, 1341 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 1342 if (!nents) 1343 goto free_iod; 1344 1345 kfree(pages); 1346 return iod; 1347 1348 free_iod: 1349 kfree(iod); 1350 put_pages: 1351 for (i = 0; i < count; i++) 1352 put_page(pages[i]); 1353 kfree(pages); 1354 return ERR_PTR(err); 1355} 1356 1357void nvme_unmap_user_pages(struct nvme_dev *dev, int write, 1358 struct nvme_iod *iod) 1359{ 1360 int i; 1361 1362 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, 1363 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 1364 1365 for (i = 0; i < iod->nents; i++) 1366 put_page(sg_page(&iod->sg[i])); 1367} 1368 1369static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 1370{ 1371 struct nvme_dev *dev = ns->dev; 1372 struct nvme_queue *nvmeq; 1373 struct nvme_user_io io; 1374 struct nvme_command c; 1375 unsigned length, meta_len; 1376 int status, i; 1377 struct nvme_iod *iod, *meta_iod = NULL; 1378 dma_addr_t meta_dma_addr; 1379 void *meta, *uninitialized_var(meta_mem); 1380 1381 if (copy_from_user(&io, uio, sizeof(io))) 1382 return -EFAULT; 1383 length = (io.nblocks + 1) << ns->lba_shift; 1384 meta_len = (io.nblocks + 1) * ns->ms; 1385 1386 if (meta_len && ((io.metadata & 3) || !io.metadata)) 1387 return -EINVAL; 1388 1389 switch (io.opcode) { 1390 case nvme_cmd_write: 1391 case nvme_cmd_read: 1392 case nvme_cmd_compare: 1393 iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); 1394 break; 1395 default: 1396 return -EINVAL; 1397 } 1398 1399 if (IS_ERR(iod)) 1400 return PTR_ERR(iod); 1401 1402 memset(&c, 0, sizeof(c)); 1403 c.rw.opcode = io.opcode; 1404 c.rw.flags = io.flags; 1405 c.rw.nsid = cpu_to_le32(ns->ns_id); 1406 c.rw.slba = cpu_to_le64(io.slba); 1407 c.rw.length = cpu_to_le16(io.nblocks); 1408 c.rw.control = cpu_to_le16(io.control); 1409 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 1410 c.rw.reftag = cpu_to_le32(io.reftag); 1411 c.rw.apptag = cpu_to_le16(io.apptag); 1412 c.rw.appmask = cpu_to_le16(io.appmask); 1413 1414 if (meta_len) { 1415 meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, 1416 meta_len); 1417 if (IS_ERR(meta_iod)) { 1418 status = PTR_ERR(meta_iod); 1419 meta_iod = NULL; 1420 goto unmap; 1421 } 1422 1423 meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len, 1424 &meta_dma_addr, GFP_KERNEL); 1425 if (!meta_mem) { 1426 status = -ENOMEM; 1427 goto unmap; 1428 } 1429 1430 if (io.opcode & 1) { 1431 int meta_offset = 0; 1432 1433 for (i = 0; i < meta_iod->nents; i++) { 1434 meta = kmap_atomic(sg_page(&meta_iod->sg[i])) + 1435 meta_iod->sg[i].offset; 1436 memcpy(meta_mem + meta_offset, meta, 1437 meta_iod->sg[i].length); 1438 kunmap_atomic(meta); 1439 meta_offset += meta_iod->sg[i].length; 1440 } 1441 } 1442 1443 c.rw.metadata = cpu_to_le64(meta_dma_addr); 1444 } 1445 1446 length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); 1447 1448 nvmeq = get_nvmeq(dev); 1449 /* 1450 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption 1451 * disabled. We may be preempted at any point, and be rescheduled 1452 * to a different CPU. That will cause cacheline bouncing, but no 1453 * additional races since q_lock already protects against other CPUs. 1454 */ 1455 put_nvmeq(nvmeq); 1456 if (length != (io.nblocks + 1) << ns->lba_shift) 1457 status = -ENOMEM; 1458 else if (!nvmeq || nvmeq->q_suspended) 1459 status = -EBUSY; 1460 else 1461 status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); 1462 1463 if (meta_len) { 1464 if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { 1465 int meta_offset = 0; 1466 1467 for (i = 0; i < meta_iod->nents; i++) { 1468 meta = kmap_atomic(sg_page(&meta_iod->sg[i])) + 1469 meta_iod->sg[i].offset; 1470 memcpy(meta, meta_mem + meta_offset, 1471 meta_iod->sg[i].length); 1472 kunmap_atomic(meta); 1473 meta_offset += meta_iod->sg[i].length; 1474 } 1475 } 1476 1477 dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem, 1478 meta_dma_addr); 1479 } 1480 1481 unmap: 1482 nvme_unmap_user_pages(dev, io.opcode & 1, iod); 1483 nvme_free_iod(dev, iod); 1484 1485 if (meta_iod) { 1486 nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod); 1487 nvme_free_iod(dev, meta_iod); 1488 } 1489 1490 return status; 1491} 1492 1493static int nvme_user_admin_cmd(struct nvme_dev *dev, 1494 struct nvme_admin_cmd __user *ucmd) 1495{ 1496 struct nvme_admin_cmd cmd; 1497 struct nvme_command c; 1498 int status, length; 1499 struct nvme_iod *uninitialized_var(iod); 1500 unsigned timeout; 1501 1502 if (!capable(CAP_SYS_ADMIN)) 1503 return -EACCES; 1504 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1505 return -EFAULT; 1506 1507 memset(&c, 0, sizeof(c)); 1508 c.common.opcode = cmd.opcode; 1509 c.common.flags = cmd.flags; 1510 c.common.nsid = cpu_to_le32(cmd.nsid); 1511 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1512 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1513 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); 1514 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); 1515 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); 1516 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); 1517 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); 1518 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); 1519 1520 length = cmd.data_len; 1521 if (cmd.data_len) { 1522 iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, 1523 length); 1524 if (IS_ERR(iod)) 1525 return PTR_ERR(iod); 1526 length = nvme_setup_prps(dev, &c.common, iod, length, 1527 GFP_KERNEL); 1528 } 1529 1530 timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : 1531 ADMIN_TIMEOUT; 1532 if (length != cmd.data_len) 1533 status = -ENOMEM; 1534 else 1535 status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result, 1536 timeout); 1537 1538 if (cmd.data_len) { 1539 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); 1540 nvme_free_iod(dev, iod); 1541 } 1542 1543 if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result, 1544 sizeof(cmd.result))) 1545 status = -EFAULT; 1546 1547 return status; 1548} 1549 1550static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, 1551 unsigned long arg) 1552{ 1553 struct nvme_ns *ns = bdev->bd_disk->private_data; 1554 1555 switch (cmd) { 1556 case NVME_IOCTL_ID: 1557 force_successful_syscall_return(); 1558 return ns->ns_id; 1559 case NVME_IOCTL_ADMIN_CMD: 1560 return nvme_user_admin_cmd(ns->dev, (void __user *)arg); 1561 case NVME_IOCTL_SUBMIT_IO: 1562 return nvme_submit_io(ns, (void __user *)arg); 1563 case SG_GET_VERSION_NUM: 1564 return nvme_sg_get_version_num((void __user *)arg); 1565 case SG_IO: 1566 return nvme_sg_io(ns, (void __user *)arg); 1567 default: 1568 return -ENOTTY; 1569 } 1570} 1571 1572static const struct block_device_operations nvme_fops = { 1573 .owner = THIS_MODULE, 1574 .ioctl = nvme_ioctl, 1575 .compat_ioctl = nvme_ioctl, 1576}; 1577 1578static void nvme_resubmit_bios(struct nvme_queue *nvmeq) 1579{ 1580 while (bio_list_peek(&nvmeq->sq_cong)) { 1581 struct bio *bio = bio_list_pop(&nvmeq->sq_cong); 1582 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; 1583 1584 if (bio_list_empty(&nvmeq->sq_cong)) 1585 remove_wait_queue(&nvmeq->sq_full, 1586 &nvmeq->sq_cong_wait); 1587 if (nvme_submit_bio_queue(nvmeq, ns, bio)) { 1588 if (bio_list_empty(&nvmeq->sq_cong)) 1589 add_wait_queue(&nvmeq->sq_full, 1590 &nvmeq->sq_cong_wait); 1591 bio_list_add_head(&nvmeq->sq_cong, bio); 1592 break; 1593 } 1594 } 1595} 1596 1597static int nvme_kthread(void *data) 1598{ 1599 struct nvme_dev *dev; 1600 1601 while (!kthread_should_stop()) { 1602 set_current_state(TASK_INTERRUPTIBLE); 1603 spin_lock(&dev_list_lock); 1604 list_for_each_entry(dev, &dev_list, node) { 1605 int i; 1606 for (i = 0; i < dev->queue_count; i++) { 1607 struct nvme_queue *nvmeq = dev->queues[i]; 1608 if (!nvmeq) 1609 continue; 1610 spin_lock_irq(&nvmeq->q_lock); 1611 if (nvmeq->q_suspended) 1612 goto unlock; 1613 nvme_process_cq(nvmeq); 1614 nvme_cancel_ios(nvmeq, true); 1615 nvme_resubmit_bios(nvmeq); 1616 unlock: 1617 spin_unlock_irq(&nvmeq->q_lock); 1618 } 1619 } 1620 spin_unlock(&dev_list_lock); 1621 schedule_timeout(round_jiffies_relative(HZ)); 1622 } 1623 return 0; 1624} 1625 1626static DEFINE_IDA(nvme_index_ida); 1627 1628static int nvme_get_ns_idx(void) 1629{ 1630 int index, error; 1631 1632 do { 1633 if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL)) 1634 return -1; 1635 1636 spin_lock(&dev_list_lock); 1637 error = ida_get_new(&nvme_index_ida, &index); 1638 spin_unlock(&dev_list_lock); 1639 } while (error == -EAGAIN); 1640 1641 if (error) 1642 index = -1; 1643 return index; 1644} 1645 1646static void nvme_put_ns_idx(int index) 1647{ 1648 spin_lock(&dev_list_lock); 1649 ida_remove(&nvme_index_ida, index); 1650 spin_unlock(&dev_list_lock); 1651} 1652 1653static void nvme_config_discard(struct nvme_ns *ns) 1654{ 1655 u32 logical_block_size = queue_logical_block_size(ns->queue); 1656 ns->queue->limits.discard_zeroes_data = 0; 1657 ns->queue->limits.discard_alignment = logical_block_size; 1658 ns->queue->limits.discard_granularity = logical_block_size; 1659 ns->queue->limits.max_discard_sectors = 0xffffffff; 1660 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); 1661} 1662 1663static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, 1664 struct nvme_id_ns *id, struct nvme_lba_range_type *rt) 1665{ 1666 struct nvme_ns *ns; 1667 struct gendisk *disk; 1668 int lbaf; 1669 1670 if (rt->attributes & NVME_LBART_ATTRIB_HIDE) 1671 return NULL; 1672 1673 ns = kzalloc(sizeof(*ns), GFP_KERNEL); 1674 if (!ns) 1675 return NULL; 1676 ns->queue = blk_alloc_queue(GFP_KERNEL); 1677 if (!ns->queue) 1678 goto out_free_ns; 1679 ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; 1680 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); 1681 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 1682 blk_queue_make_request(ns->queue, nvme_make_request); 1683 ns->dev = dev; 1684 ns->queue->queuedata = ns; 1685 1686 disk = alloc_disk(NVME_MINORS); 1687 if (!disk) 1688 goto out_free_queue; 1689 ns->ns_id = nsid; 1690 ns->disk = disk; 1691 lbaf = id->flbas & 0xf; 1692 ns->lba_shift = id->lbaf[lbaf].ds; 1693 ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); 1694 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1695 if (dev->max_hw_sectors) 1696 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); 1697 1698 disk->major = nvme_major; 1699 disk->minors = NVME_MINORS; 1700 disk->first_minor = NVME_MINORS * nvme_get_ns_idx(); 1701 disk->fops = &nvme_fops; 1702 disk->private_data = ns; 1703 disk->queue = ns->queue; 1704 disk->driverfs_dev = &dev->pci_dev->dev; 1705 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); 1706 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); 1707 1708 if (dev->oncs & NVME_CTRL_ONCS_DSM) 1709 nvme_config_discard(ns); 1710 1711 return ns; 1712 1713 out_free_queue: 1714 blk_cleanup_queue(ns->queue); 1715 out_free_ns: 1716 kfree(ns); 1717 return NULL; 1718} 1719 1720static void nvme_ns_free(struct nvme_ns *ns) 1721{ 1722 int index = ns->disk->first_minor / NVME_MINORS; 1723 put_disk(ns->disk); 1724 nvme_put_ns_idx(index); 1725 blk_cleanup_queue(ns->queue); 1726 kfree(ns); 1727} 1728 1729static int set_queue_count(struct nvme_dev *dev, int count) 1730{ 1731 int status; 1732 u32 result; 1733 u32 q_count = (count - 1) | ((count - 1) << 16); 1734 1735 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, 1736 &result); 1737 if (status) 1738 return status < 0 ? -EIO : -EBUSY; 1739 return min(result & 0xffff, result >> 16) + 1; 1740} 1741 1742static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) 1743{ 1744 return 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); 1745} 1746 1747static int nvme_setup_io_queues(struct nvme_dev *dev) 1748{ 1749 struct pci_dev *pdev = dev->pci_dev; 1750 int result, cpu, i, vecs, nr_io_queues, size, q_depth; 1751 1752 nr_io_queues = num_online_cpus(); 1753 result = set_queue_count(dev, nr_io_queues); 1754 if (result < 0) 1755 return result; 1756 if (result < nr_io_queues) 1757 nr_io_queues = result; 1758 1759 size = db_bar_size(dev, nr_io_queues); 1760 if (size > 8192) { 1761 iounmap(dev->bar); 1762 do { 1763 dev->bar = ioremap(pci_resource_start(pdev, 0), size); 1764 if (dev->bar) 1765 break; 1766 if (!--nr_io_queues) 1767 return -ENOMEM; 1768 size = db_bar_size(dev, nr_io_queues); 1769 } while (1); 1770 dev->dbs = ((void __iomem *)dev->bar) + 4096; 1771 dev->queues[0]->q_db = dev->dbs; 1772 } 1773 1774 /* Deregister the admin queue's interrupt */ 1775 free_irq(dev->entry[0].vector, dev->queues[0]); 1776 1777 vecs = nr_io_queues; 1778 for (i = 0; i < vecs; i++) 1779 dev->entry[i].entry = i; 1780 for (;;) { 1781 result = pci_enable_msix(pdev, dev->entry, vecs); 1782 if (result <= 0) 1783 break; 1784 vecs = result; 1785 } 1786 1787 if (result < 0) { 1788 vecs = nr_io_queues; 1789 if (vecs > 32) 1790 vecs = 32; 1791 for (;;) { 1792 result = pci_enable_msi_block(pdev, vecs); 1793 if (result == 0) { 1794 for (i = 0; i < vecs; i++) 1795 dev->entry[i].vector = i + pdev->irq; 1796 break; 1797 } else if (result < 0) { 1798 vecs = 1; 1799 break; 1800 } 1801 vecs = result; 1802 } 1803 } 1804 1805 /* 1806 * Should investigate if there's a performance win from allocating 1807 * more queues than interrupt vectors; it might allow the submission 1808 * path to scale better, even if the receive path is limited by the 1809 * number of interrupts. 1810 */ 1811 nr_io_queues = vecs; 1812 1813 result = queue_request_irq(dev, dev->queues[0], "nvme admin"); 1814 if (result) { 1815 dev->queues[0]->q_suspended = 1; 1816 goto free_queues; 1817 } 1818 1819 /* Free previously allocated queues that are no longer usable */ 1820 spin_lock(&dev_list_lock); 1821 for (i = dev->queue_count - 1; i > nr_io_queues; i--) { 1822 struct nvme_queue *nvmeq = dev->queues[i]; 1823 1824 spin_lock(&nvmeq->q_lock); 1825 nvme_cancel_ios(nvmeq, false); 1826 spin_unlock(&nvmeq->q_lock); 1827 1828 nvme_free_queue(nvmeq); 1829 dev->queue_count--; 1830 dev->queues[i] = NULL; 1831 } 1832 spin_unlock(&dev_list_lock); 1833 1834 cpu = cpumask_first(cpu_online_mask); 1835 for (i = 0; i < nr_io_queues; i++) { 1836 irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); 1837 cpu = cpumask_next(cpu, cpu_online_mask); 1838 } 1839 1840 q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, 1841 NVME_Q_DEPTH); 1842 for (i = dev->queue_count - 1; i < nr_io_queues; i++) { 1843 dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i); 1844 if (!dev->queues[i + 1]) { 1845 result = -ENOMEM; 1846 goto free_queues; 1847 } 1848 } 1849 1850 for (; i < num_possible_cpus(); i++) { 1851 int target = i % rounddown_pow_of_two(dev->queue_count - 1); 1852 dev->queues[i + 1] = dev->queues[target + 1]; 1853 } 1854 1855 for (i = 1; i < dev->queue_count; i++) { 1856 result = nvme_create_queue(dev->queues[i], i); 1857 if (result) { 1858 for (--i; i > 0; i--) 1859 nvme_disable_queue(dev, i); 1860 goto free_queues; 1861 } 1862 } 1863 1864 return 0; 1865 1866 free_queues: 1867 nvme_free_queues(dev); 1868 return result; 1869} 1870 1871/* 1872 * Return: error value if an error occurred setting up the queues or calling 1873 * Identify Device. 0 if these succeeded, even if adding some of the 1874 * namespaces failed. At the moment, these failures are silent. TBD which 1875 * failures should be reported. 1876 */ 1877static int nvme_dev_add(struct nvme_dev *dev) 1878{ 1879 int res; 1880 unsigned nn, i; 1881 struct nvme_ns *ns; 1882 struct nvme_id_ctrl *ctrl; 1883 struct nvme_id_ns *id_ns; 1884 void *mem; 1885 dma_addr_t dma_addr; 1886 int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; 1887 1888 mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, 1889 GFP_KERNEL); 1890 if (!mem) 1891 return -ENOMEM; 1892 1893 res = nvme_identify(dev, 0, 1, dma_addr); 1894 if (res) { 1895 res = -EIO; 1896 goto out; 1897 } 1898 1899 ctrl = mem; 1900 nn = le32_to_cpup(&ctrl->nn); 1901 dev->oncs = le16_to_cpup(&ctrl->oncs); 1902 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); 1903 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); 1904 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); 1905 if (ctrl->mdts) 1906 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); 1907 if ((dev->pci_dev->vendor == PCI_VENDOR_ID_INTEL) && 1908 (dev->pci_dev->device == 0x0953) && ctrl->vs[3]) 1909 dev->stripe_size = 1 << (ctrl->vs[3] + shift); 1910 1911 id_ns = mem; 1912 for (i = 1; i <= nn; i++) { 1913 res = nvme_identify(dev, i, 0, dma_addr); 1914 if (res) 1915 continue; 1916 1917 if (id_ns->ncap == 0) 1918 continue; 1919 1920 res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, 1921 dma_addr + 4096, NULL); 1922 if (res) 1923 memset(mem + 4096, 0, 4096); 1924 1925 ns = nvme_alloc_ns(dev, i, mem, mem + 4096); 1926 if (ns) 1927 list_add_tail(&ns->list, &dev->namespaces); 1928 } 1929 list_for_each_entry(ns, &dev->namespaces, list) 1930 add_disk(ns->disk); 1931 res = 0; 1932 1933 out: 1934 dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); 1935 return res; 1936} 1937 1938static int nvme_dev_map(struct nvme_dev *dev) 1939{ 1940 int bars, result = -ENOMEM; 1941 struct pci_dev *pdev = dev->pci_dev; 1942 1943 if (pci_enable_device_mem(pdev)) 1944 return result; 1945 1946 dev->entry[0].vector = pdev->irq; 1947 pci_set_master(pdev); 1948 bars = pci_select_bars(pdev, IORESOURCE_MEM); 1949 if (pci_request_selected_regions(pdev, bars, "nvme")) 1950 goto disable_pci; 1951 1952 if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) 1953 dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); 1954 else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) 1955 dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)); 1956 else 1957 goto disable_pci; 1958 1959 pci_set_drvdata(pdev, dev); 1960 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 1961 if (!dev->bar) 1962 goto disable; 1963 1964 dev->db_stride = NVME_CAP_STRIDE(readq(&dev->bar->cap)); 1965 dev->dbs = ((void __iomem *)dev->bar) + 4096; 1966 1967 return 0; 1968 1969 disable: 1970 pci_release_regions(pdev); 1971 disable_pci: 1972 pci_disable_device(pdev); 1973 return result; 1974} 1975 1976static void nvme_dev_unmap(struct nvme_dev *dev) 1977{ 1978 if (dev->pci_dev->msi_enabled) 1979 pci_disable_msi(dev->pci_dev); 1980 else if (dev->pci_dev->msix_enabled) 1981 pci_disable_msix(dev->pci_dev); 1982 1983 if (dev->bar) { 1984 iounmap(dev->bar); 1985 dev->bar = NULL; 1986 } 1987 1988 pci_release_regions(dev->pci_dev); 1989 if (pci_is_enabled(dev->pci_dev)) 1990 pci_disable_device(dev->pci_dev); 1991} 1992 1993static void nvme_dev_shutdown(struct nvme_dev *dev) 1994{ 1995 int i; 1996 1997 for (i = dev->queue_count - 1; i >= 0; i--) 1998 nvme_disable_queue(dev, i); 1999 2000 spin_lock(&dev_list_lock); 2001 list_del_init(&dev->node); 2002 spin_unlock(&dev_list_lock); 2003 2004 if (dev->bar) 2005 nvme_shutdown_ctrl(dev); 2006 nvme_dev_unmap(dev); 2007} 2008 2009static void nvme_dev_remove(struct nvme_dev *dev) 2010{ 2011 struct nvme_ns *ns, *next; 2012 2013 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 2014 list_del(&ns->list); 2015 del_gendisk(ns->disk); 2016 nvme_ns_free(ns); 2017 } 2018} 2019 2020static int nvme_setup_prp_pools(struct nvme_dev *dev) 2021{ 2022 struct device *dmadev = &dev->pci_dev->dev; 2023 dev->prp_page_pool = dma_pool_create("prp list page", dmadev, 2024 PAGE_SIZE, PAGE_SIZE, 0); 2025 if (!dev->prp_page_pool) 2026 return -ENOMEM; 2027 2028 /* Optimisation for I/Os between 4k and 128k */ 2029 dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, 2030 256, 256, 0); 2031 if (!dev->prp_small_pool) { 2032 dma_pool_destroy(dev->prp_page_pool); 2033 return -ENOMEM; 2034 } 2035 return 0; 2036} 2037 2038static void nvme_release_prp_pools(struct nvme_dev *dev) 2039{ 2040 dma_pool_destroy(dev->prp_page_pool); 2041 dma_pool_destroy(dev->prp_small_pool); 2042} 2043 2044static DEFINE_IDA(nvme_instance_ida); 2045 2046static int nvme_set_instance(struct nvme_dev *dev) 2047{ 2048 int instance, error; 2049 2050 do { 2051 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) 2052 return -ENODEV; 2053 2054 spin_lock(&dev_list_lock); 2055 error = ida_get_new(&nvme_instance_ida, &instance); 2056 spin_unlock(&dev_list_lock); 2057 } while (error == -EAGAIN); 2058 2059 if (error) 2060 return -ENODEV; 2061 2062 dev->instance = instance; 2063 return 0; 2064} 2065 2066static void nvme_release_instance(struct nvme_dev *dev) 2067{ 2068 spin_lock(&dev_list_lock); 2069 ida_remove(&nvme_instance_ida, dev->instance); 2070 spin_unlock(&dev_list_lock); 2071} 2072 2073static void nvme_free_dev(struct kref *kref) 2074{ 2075 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); 2076 nvme_dev_remove(dev); 2077 nvme_dev_shutdown(dev); 2078 nvme_free_queues(dev); 2079 nvme_release_instance(dev); 2080 nvme_release_prp_pools(dev); 2081 kfree(dev->queues); 2082 kfree(dev->entry); 2083 kfree(dev); 2084} 2085 2086static int nvme_dev_open(struct inode *inode, struct file *f) 2087{ 2088 struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev, 2089 miscdev); 2090 kref_get(&dev->kref); 2091 f->private_data = dev; 2092 return 0; 2093} 2094 2095static int nvme_dev_release(struct inode *inode, struct file *f) 2096{ 2097 struct nvme_dev *dev = f->private_data; 2098 kref_put(&dev->kref, nvme_free_dev); 2099 return 0; 2100} 2101 2102static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 2103{ 2104 struct nvme_dev *dev = f->private_data; 2105 switch (cmd) { 2106 case NVME_IOCTL_ADMIN_CMD: 2107 return nvme_user_admin_cmd(dev, (void __user *)arg); 2108 default: 2109 return -ENOTTY; 2110 } 2111} 2112 2113static const struct file_operations nvme_dev_fops = { 2114 .owner = THIS_MODULE, 2115 .open = nvme_dev_open, 2116 .release = nvme_dev_release, 2117 .unlocked_ioctl = nvme_dev_ioctl, 2118 .compat_ioctl = nvme_dev_ioctl, 2119}; 2120 2121static int nvme_dev_start(struct nvme_dev *dev) 2122{ 2123 int result; 2124 2125 result = nvme_dev_map(dev); 2126 if (result) 2127 return result; 2128 2129 result = nvme_configure_admin_queue(dev); 2130 if (result) 2131 goto unmap; 2132 2133 spin_lock(&dev_list_lock); 2134 list_add(&dev->node, &dev_list); 2135 spin_unlock(&dev_list_lock); 2136 2137 result = nvme_setup_io_queues(dev); 2138 if (result && result != -EBUSY) 2139 goto disable; 2140 2141 return result; 2142 2143 disable: 2144 spin_lock(&dev_list_lock); 2145 list_del_init(&dev->node); 2146 spin_unlock(&dev_list_lock); 2147 unmap: 2148 nvme_dev_unmap(dev); 2149 return result; 2150} 2151 2152static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 2153{ 2154 int result = -ENOMEM; 2155 struct nvme_dev *dev; 2156 2157 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2158 if (!dev) 2159 return -ENOMEM; 2160 dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), 2161 GFP_KERNEL); 2162 if (!dev->entry) 2163 goto free; 2164 dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), 2165 GFP_KERNEL); 2166 if (!dev->queues) 2167 goto free; 2168 2169 INIT_LIST_HEAD(&dev->namespaces); 2170 dev->pci_dev = pdev; 2171 result = nvme_set_instance(dev); 2172 if (result) 2173 goto free; 2174 2175 result = nvme_setup_prp_pools(dev); 2176 if (result) 2177 goto release; 2178 2179 result = nvme_dev_start(dev); 2180 if (result) { 2181 if (result == -EBUSY) 2182 goto create_cdev; 2183 goto release_pools; 2184 } 2185 2186 result = nvme_dev_add(dev); 2187 if (result) 2188 goto shutdown; 2189 2190 create_cdev: 2191 scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); 2192 dev->miscdev.minor = MISC_DYNAMIC_MINOR; 2193 dev->miscdev.parent = &pdev->dev; 2194 dev->miscdev.name = dev->name; 2195 dev->miscdev.fops = &nvme_dev_fops; 2196 result = misc_register(&dev->miscdev); 2197 if (result) 2198 goto remove; 2199 2200 kref_init(&dev->kref); 2201 return 0; 2202 2203 remove: 2204 nvme_dev_remove(dev); 2205 shutdown: 2206 nvme_dev_shutdown(dev); 2207 release_pools: 2208 nvme_free_queues(dev); 2209 nvme_release_prp_pools(dev); 2210 release: 2211 nvme_release_instance(dev); 2212 free: 2213 kfree(dev->queues); 2214 kfree(dev->entry); 2215 kfree(dev); 2216 return result; 2217} 2218 2219static void nvme_remove(struct pci_dev *pdev) 2220{ 2221 struct nvme_dev *dev = pci_get_drvdata(pdev); 2222 misc_deregister(&dev->miscdev); 2223 kref_put(&dev->kref, nvme_free_dev); 2224} 2225 2226/* These functions are yet to be implemented */ 2227#define nvme_error_detected NULL 2228#define nvme_dump_registers NULL 2229#define nvme_link_reset NULL 2230#define nvme_slot_reset NULL 2231#define nvme_error_resume NULL 2232 2233static int nvme_suspend(struct device *dev) 2234{ 2235 struct pci_dev *pdev = to_pci_dev(dev); 2236 struct nvme_dev *ndev = pci_get_drvdata(pdev); 2237 2238 nvme_dev_shutdown(ndev); 2239 return 0; 2240} 2241 2242static int nvme_resume(struct device *dev) 2243{ 2244 struct pci_dev *pdev = to_pci_dev(dev); 2245 struct nvme_dev *ndev = pci_get_drvdata(pdev); 2246 int ret; 2247 2248 ret = nvme_dev_start(ndev); 2249 /* XXX: should remove gendisks if resume fails */ 2250 if (ret) 2251 nvme_free_queues(ndev); 2252 return ret; 2253} 2254 2255static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); 2256 2257static const struct pci_error_handlers nvme_err_handler = { 2258 .error_detected = nvme_error_detected, 2259 .mmio_enabled = nvme_dump_registers, 2260 .link_reset = nvme_link_reset, 2261 .slot_reset = nvme_slot_reset, 2262 .resume = nvme_error_resume, 2263}; 2264 2265/* Move to pci_ids.h later */ 2266#define PCI_CLASS_STORAGE_EXPRESS 0x010802 2267 2268static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { 2269 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 2270 { 0, } 2271}; 2272MODULE_DEVICE_TABLE(pci, nvme_id_table); 2273 2274static struct pci_driver nvme_driver = { 2275 .name = "nvme", 2276 .id_table = nvme_id_table, 2277 .probe = nvme_probe, 2278 .remove = nvme_remove, 2279 .driver = { 2280 .pm = &nvme_dev_pm_ops, 2281 }, 2282 .err_handler = &nvme_err_handler, 2283}; 2284 2285static int __init nvme_init(void) 2286{ 2287 int result; 2288 2289 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); 2290 if (IS_ERR(nvme_thread)) 2291 return PTR_ERR(nvme_thread); 2292 2293 result = register_blkdev(nvme_major, "nvme"); 2294 if (result < 0) 2295 goto kill_kthread; 2296 else if (result > 0) 2297 nvme_major = result; 2298 2299 result = pci_register_driver(&nvme_driver); 2300 if (result) 2301 goto unregister_blkdev; 2302 return 0; 2303 2304 unregister_blkdev: 2305 unregister_blkdev(nvme_major, "nvme"); 2306 kill_kthread: 2307 kthread_stop(nvme_thread); 2308 return result; 2309} 2310 2311static void __exit nvme_exit(void) 2312{ 2313 pci_unregister_driver(&nvme_driver); 2314 unregister_blkdev(nvme_major, "nvme"); 2315 kthread_stop(nvme_thread); 2316} 2317 2318MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); 2319MODULE_LICENSE("GPL"); 2320MODULE_VERSION("0.8"); 2321module_init(nvme_init); 2322module_exit(nvme_exit);