Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v3.9-rc8 · 1807 lines (45 kB)
/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>

#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_Q_DEPTH	1024
#define SQ_SIZE(depth)	(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)	(depth * sizeof(struct nvme_completion))
#define NVME_MINORS	64
#define NVME_IO_TIMEOUT	(5 * HZ)
#define ADMIN_TIMEOUT	(60 * HZ)

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	u32 __iomem *dbs;
	struct pci_dev *pci_dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	int instance;
	int queue_count;
	int db_stride;
	u32 ctrl_config;
	struct msix_entry *entry;
	struct nvme_bar __iomem *bar;
	struct list_head namespaces;
	char serial[20];
	char model[40];
	char firmware_rev[8];
	u32 max_hw_sectors;
};

/*
 * An NVM Express namespace is equivalent to a SCSI LUN
 */
struct nvme_ns {
	struct list_head list;

	struct nvme_dev *dev;
	struct request_queue *queue;
	struct gendisk *disk;

	int ns_id;
	int lba_shift;
};

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 cq_phase;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	unsigned long timeout;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The function to call on completion
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ?
-EINTR : cmdid; 196} 197 198/* Special values must be less than 0x1000 */ 199#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) 200#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) 201#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) 202#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) 203#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) 204 205static void special_completion(struct nvme_dev *dev, void *ctx, 206 struct nvme_completion *cqe) 207{ 208 if (ctx == CMD_CTX_CANCELLED) 209 return; 210 if (ctx == CMD_CTX_FLUSH) 211 return; 212 if (ctx == CMD_CTX_COMPLETED) { 213 dev_warn(&dev->pci_dev->dev, 214 "completed id %d twice on queue %d\n", 215 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 216 return; 217 } 218 if (ctx == CMD_CTX_INVALID) { 219 dev_warn(&dev->pci_dev->dev, 220 "invalid id %d completed on queue %d\n", 221 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 222 return; 223 } 224 225 dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); 226} 227 228/* 229 * Called with local interrupts disabled and the q_lock held. May not sleep. 230 */ 231static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, 232 nvme_completion_fn *fn) 233{ 234 void *ctx; 235 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 236 237 if (cmdid >= nvmeq->q_depth) { 238 *fn = special_completion; 239 return CMD_CTX_INVALID; 240 } 241 if (fn) 242 *fn = info[cmdid].fn; 243 ctx = info[cmdid].ctx; 244 info[cmdid].fn = special_completion; 245 info[cmdid].ctx = CMD_CTX_COMPLETED; 246 clear_bit(cmdid, nvmeq->cmdid_data); 247 wake_up(&nvmeq->sq_full); 248 return ctx; 249} 250 251static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, 252 nvme_completion_fn *fn) 253{ 254 void *ctx; 255 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 256 if (fn) 257 *fn = info[cmdid].fn; 258 ctx = info[cmdid].ctx; 259 info[cmdid].fn = special_completion; 260 info[cmdid].ctx = CMD_CTX_CANCELLED; 261 return ctx; 262} 263 264static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) 265{ 266 return dev->queues[get_cpu() + 1]; 267} 268 269static void put_nvmeq(struct nvme_queue *nvmeq) 270{ 271 put_cpu(); 272} 273 274/** 275 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell 276 * @nvmeq: The queue to use 277 * @cmd: The command to send 278 * 279 * Safe to use from interrupt context 280 */ 281static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 282{ 283 unsigned long flags; 284 u16 tail; 285 spin_lock_irqsave(&nvmeq->q_lock, flags); 286 tail = nvmeq->sq_tail; 287 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); 288 if (++tail == nvmeq->q_depth) 289 tail = 0; 290 writel(tail, nvmeq->q_db); 291 nvmeq->sq_tail = tail; 292 spin_unlock_irqrestore(&nvmeq->q_lock, flags); 293 294 return 0; 295} 296 297/* 298 * The nvme_iod describes the data in an I/O, including the list of PRP 299 * entries. You can't see it in this data structure because C doesn't let 300 * me express that. Use nvme_alloc_iod to ensure there's enough space 301 * allocated to store the PRP list. 302 */ 303struct nvme_iod { 304 void *private; /* For the use of the submitter of the I/O */ 305 int npages; /* In the PRP list. 0 means small pool in use */ 306 int offset; /* Of PRP list */ 307 int nents; /* Used in scatterlist */ 308 int length; /* Of data, in bytes */ 309 dma_addr_t first_dma; 310 struct scatterlist sg[0]; 311}; 312 313static __le64 **iod_list(struct nvme_iod *iod) 314{ 315 return ((void *)iod) + iod->offset; 316} 317 318/* 319 * Will slightly overestimate the number of pages needed. 
This is OK 320 * as it only leads to a small amount of wasted memory for the lifetime of 321 * the I/O. 322 */ 323static int nvme_npages(unsigned size) 324{ 325 unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); 326 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); 327} 328 329static struct nvme_iod * 330nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) 331{ 332 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + 333 sizeof(__le64 *) * nvme_npages(nbytes) + 334 sizeof(struct scatterlist) * nseg, gfp); 335 336 if (iod) { 337 iod->offset = offsetof(struct nvme_iod, sg[nseg]); 338 iod->npages = -1; 339 iod->length = nbytes; 340 iod->nents = 0; 341 } 342 343 return iod; 344} 345 346static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) 347{ 348 const int last_prp = PAGE_SIZE / 8 - 1; 349 int i; 350 __le64 **list = iod_list(iod); 351 dma_addr_t prp_dma = iod->first_dma; 352 353 if (iod->npages == 0) 354 dma_pool_free(dev->prp_small_pool, list[0], prp_dma); 355 for (i = 0; i < iod->npages; i++) { 356 __le64 *prp_list = list[i]; 357 dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); 358 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); 359 prp_dma = next_prp_dma; 360 } 361 kfree(iod); 362} 363 364static void requeue_bio(struct nvme_dev *dev, struct bio *bio) 365{ 366 struct nvme_queue *nvmeq = get_nvmeq(dev); 367 if (bio_list_empty(&nvmeq->sq_cong)) 368 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 369 bio_list_add(&nvmeq->sq_cong, bio); 370 put_nvmeq(nvmeq); 371 wake_up_process(nvme_thread); 372} 373 374static void bio_completion(struct nvme_dev *dev, void *ctx, 375 struct nvme_completion *cqe) 376{ 377 struct nvme_iod *iod = ctx; 378 struct bio *bio = iod->private; 379 u16 status = le16_to_cpup(&cqe->status) >> 1; 380 381 if (iod->nents) 382 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, 383 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 384 nvme_free_iod(dev, iod); 385 if (status) { 386 bio_endio(bio, -EIO); 387 } else if (bio->bi_vcnt > bio->bi_idx) { 388 requeue_bio(dev, bio); 389 } else { 390 bio_endio(bio, 0); 391 } 392} 393 394/* length is in bytes. gfp flags indicates whether we may sleep. 
*/ 395static int nvme_setup_prps(struct nvme_dev *dev, 396 struct nvme_common_command *cmd, struct nvme_iod *iod, 397 int total_len, gfp_t gfp) 398{ 399 struct dma_pool *pool; 400 int length = total_len; 401 struct scatterlist *sg = iod->sg; 402 int dma_len = sg_dma_len(sg); 403 u64 dma_addr = sg_dma_address(sg); 404 int offset = offset_in_page(dma_addr); 405 __le64 *prp_list; 406 __le64 **list = iod_list(iod); 407 dma_addr_t prp_dma; 408 int nprps, i; 409 410 cmd->prp1 = cpu_to_le64(dma_addr); 411 length -= (PAGE_SIZE - offset); 412 if (length <= 0) 413 return total_len; 414 415 dma_len -= (PAGE_SIZE - offset); 416 if (dma_len) { 417 dma_addr += (PAGE_SIZE - offset); 418 } else { 419 sg = sg_next(sg); 420 dma_addr = sg_dma_address(sg); 421 dma_len = sg_dma_len(sg); 422 } 423 424 if (length <= PAGE_SIZE) { 425 cmd->prp2 = cpu_to_le64(dma_addr); 426 return total_len; 427 } 428 429 nprps = DIV_ROUND_UP(length, PAGE_SIZE); 430 if (nprps <= (256 / 8)) { 431 pool = dev->prp_small_pool; 432 iod->npages = 0; 433 } else { 434 pool = dev->prp_page_pool; 435 iod->npages = 1; 436 } 437 438 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 439 if (!prp_list) { 440 cmd->prp2 = cpu_to_le64(dma_addr); 441 iod->npages = -1; 442 return (total_len - length) + PAGE_SIZE; 443 } 444 list[0] = prp_list; 445 iod->first_dma = prp_dma; 446 cmd->prp2 = cpu_to_le64(prp_dma); 447 i = 0; 448 for (;;) { 449 if (i == PAGE_SIZE / 8) { 450 __le64 *old_prp_list = prp_list; 451 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 452 if (!prp_list) 453 return total_len - length; 454 list[iod->npages++] = prp_list; 455 prp_list[0] = old_prp_list[i - 1]; 456 old_prp_list[i - 1] = cpu_to_le64(prp_dma); 457 i = 1; 458 } 459 prp_list[i++] = cpu_to_le64(dma_addr); 460 dma_len -= PAGE_SIZE; 461 dma_addr += PAGE_SIZE; 462 length -= PAGE_SIZE; 463 if (length <= 0) 464 break; 465 if (dma_len > 0) 466 continue; 467 BUG_ON(dma_len < 0); 468 sg = sg_next(sg); 469 dma_addr = sg_dma_address(sg); 470 dma_len = sg_dma_len(sg); 471 } 472 473 return total_len; 474} 475 476/* NVMe scatterlists require no holes in the virtual address */ 477#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ 478 (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) 479 480static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, 481 struct bio *bio, enum dma_data_direction dma_dir, int psegs) 482{ 483 struct bio_vec *bvec, *bvprv = NULL; 484 struct scatterlist *sg = NULL; 485 int i, old_idx, length = 0, nsegs = 0; 486 487 sg_init_table(iod->sg, psegs); 488 old_idx = bio->bi_idx; 489 bio_for_each_segment(bvec, bio, i) { 490 if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { 491 sg->length += bvec->bv_len; 492 } else { 493 if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) 494 break; 495 sg = sg ? 
sg + 1 : iod->sg; 496 sg_set_page(sg, bvec->bv_page, bvec->bv_len, 497 bvec->bv_offset); 498 nsegs++; 499 } 500 length += bvec->bv_len; 501 bvprv = bvec; 502 } 503 bio->bi_idx = i; 504 iod->nents = nsegs; 505 sg_mark_end(sg); 506 if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { 507 bio->bi_idx = old_idx; 508 return -ENOMEM; 509 } 510 return length; 511} 512 513static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, 514 int cmdid) 515{ 516 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 517 518 memset(cmnd, 0, sizeof(*cmnd)); 519 cmnd->common.opcode = nvme_cmd_flush; 520 cmnd->common.command_id = cmdid; 521 cmnd->common.nsid = cpu_to_le32(ns->ns_id); 522 523 if (++nvmeq->sq_tail == nvmeq->q_depth) 524 nvmeq->sq_tail = 0; 525 writel(nvmeq->sq_tail, nvmeq->q_db); 526 527 return 0; 528} 529 530static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) 531{ 532 int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, 533 special_completion, NVME_IO_TIMEOUT); 534 if (unlikely(cmdid < 0)) 535 return cmdid; 536 537 return nvme_submit_flush(nvmeq, ns, cmdid); 538} 539 540/* 541 * Called with local interrupts disabled and the q_lock held. May not sleep. 542 */ 543static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, 544 struct bio *bio) 545{ 546 struct nvme_command *cmnd; 547 struct nvme_iod *iod; 548 enum dma_data_direction dma_dir; 549 int cmdid, length, result = -ENOMEM; 550 u16 control; 551 u32 dsmgmt; 552 int psegs = bio_phys_segments(ns->queue, bio); 553 554 if ((bio->bi_rw & REQ_FLUSH) && psegs) { 555 result = nvme_submit_flush_data(nvmeq, ns); 556 if (result) 557 return result; 558 } 559 560 iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); 561 if (!iod) 562 goto nomem; 563 iod->private = bio; 564 565 result = -EBUSY; 566 cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); 567 if (unlikely(cmdid < 0)) 568 goto free_iod; 569 570 if ((bio->bi_rw & REQ_FLUSH) && !psegs) 571 return nvme_submit_flush(nvmeq, ns, cmdid); 572 573 control = 0; 574 if (bio->bi_rw & REQ_FUA) 575 control |= NVME_RW_FUA; 576 if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 577 control |= NVME_RW_LR; 578 579 dsmgmt = 0; 580 if (bio->bi_rw & REQ_RAHEAD) 581 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 582 583 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 584 585 memset(cmnd, 0, sizeof(*cmnd)); 586 if (bio_data_dir(bio)) { 587 cmnd->rw.opcode = nvme_cmd_write; 588 dma_dir = DMA_TO_DEVICE; 589 } else { 590 cmnd->rw.opcode = nvme_cmd_read; 591 dma_dir = DMA_FROM_DEVICE; 592 } 593 594 result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); 595 if (result < 0) 596 goto free_cmdid; 597 length = result; 598 599 cmnd->rw.command_id = cmdid; 600 cmnd->rw.nsid = cpu_to_le32(ns->ns_id); 601 length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, 602 GFP_ATOMIC); 603 cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); 604 cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); 605 cmnd->rw.control = cpu_to_le16(control); 606 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 607 608 bio->bi_sector += length >> 9; 609 610 if (++nvmeq->sq_tail == nvmeq->q_depth) 611 nvmeq->sq_tail = 0; 612 writel(nvmeq->sq_tail, nvmeq->q_db); 613 614 return 0; 615 616 free_cmdid: 617 free_cmdid(nvmeq, cmdid, NULL); 618 free_iod: 619 nvme_free_iod(nvmeq->dev, iod); 620 nomem: 621 return result; 622} 623 624static void nvme_make_request(struct request_queue *q, struct bio *bio) 625{ 626 struct nvme_ns *ns = q->queuedata; 627 struct 
nvme_queue *nvmeq = get_nvmeq(ns->dev); 628 int result = -EBUSY; 629 630 spin_lock_irq(&nvmeq->q_lock); 631 if (bio_list_empty(&nvmeq->sq_cong)) 632 result = nvme_submit_bio_queue(nvmeq, ns, bio); 633 if (unlikely(result)) { 634 if (bio_list_empty(&nvmeq->sq_cong)) 635 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); 636 bio_list_add(&nvmeq->sq_cong, bio); 637 } 638 639 spin_unlock_irq(&nvmeq->q_lock); 640 put_nvmeq(nvmeq); 641} 642 643static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) 644{ 645 u16 head, phase; 646 647 head = nvmeq->cq_head; 648 phase = nvmeq->cq_phase; 649 650 for (;;) { 651 void *ctx; 652 nvme_completion_fn fn; 653 struct nvme_completion cqe = nvmeq->cqes[head]; 654 if ((le16_to_cpu(cqe.status) & 1) != phase) 655 break; 656 nvmeq->sq_head = le16_to_cpu(cqe.sq_head); 657 if (++head == nvmeq->q_depth) { 658 head = 0; 659 phase = !phase; 660 } 661 662 ctx = free_cmdid(nvmeq, cqe.command_id, &fn); 663 fn(nvmeq->dev, ctx, &cqe); 664 } 665 666 /* If the controller ignores the cq head doorbell and continuously 667 * writes to the queue, it is theoretically possible to wrap around 668 * the queue twice and mistakenly return IRQ_NONE. Linux only 669 * requires that 0.1% of your interrupts are handled, so this isn't 670 * a big problem. 671 */ 672 if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) 673 return IRQ_NONE; 674 675 writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); 676 nvmeq->cq_head = head; 677 nvmeq->cq_phase = phase; 678 679 return IRQ_HANDLED; 680} 681 682static irqreturn_t nvme_irq(int irq, void *data) 683{ 684 irqreturn_t result; 685 struct nvme_queue *nvmeq = data; 686 spin_lock(&nvmeq->q_lock); 687 result = nvme_process_cq(nvmeq); 688 spin_unlock(&nvmeq->q_lock); 689 return result; 690} 691 692static irqreturn_t nvme_irq_check(int irq, void *data) 693{ 694 struct nvme_queue *nvmeq = data; 695 struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; 696 if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) 697 return IRQ_NONE; 698 return IRQ_WAKE_THREAD; 699} 700 701static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) 702{ 703 spin_lock_irq(&nvmeq->q_lock); 704 cancel_cmdid(nvmeq, cmdid, NULL); 705 spin_unlock_irq(&nvmeq->q_lock); 706} 707 708struct sync_cmd_info { 709 struct task_struct *task; 710 u32 result; 711 int status; 712}; 713 714static void sync_completion(struct nvme_dev *dev, void *ctx, 715 struct nvme_completion *cqe) 716{ 717 struct sync_cmd_info *cmdinfo = ctx; 718 cmdinfo->result = le32_to_cpup(&cqe->result); 719 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; 720 wake_up_process(cmdinfo->task); 721} 722 723/* 724 * Returns 0 on success. 
If the result is negative, it's a Linux error code; 725 * if the result is positive, it's an NVM Express status code 726 */ 727static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, 728 struct nvme_command *cmd, u32 *result, unsigned timeout) 729{ 730 int cmdid; 731 struct sync_cmd_info cmdinfo; 732 733 cmdinfo.task = current; 734 cmdinfo.status = -EINTR; 735 736 cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, 737 timeout); 738 if (cmdid < 0) 739 return cmdid; 740 cmd->common.command_id = cmdid; 741 742 set_current_state(TASK_KILLABLE); 743 nvme_submit_cmd(nvmeq, cmd); 744 schedule(); 745 746 if (cmdinfo.status == -EINTR) { 747 nvme_abort_command(nvmeq, cmdid); 748 return -EINTR; 749 } 750 751 if (result) 752 *result = cmdinfo.result; 753 754 return cmdinfo.status; 755} 756 757static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, 758 u32 *result) 759{ 760 return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); 761} 762 763static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 764{ 765 int status; 766 struct nvme_command c; 767 768 memset(&c, 0, sizeof(c)); 769 c.delete_queue.opcode = opcode; 770 c.delete_queue.qid = cpu_to_le16(id); 771 772 status = nvme_submit_admin_cmd(dev, &c, NULL); 773 if (status) 774 return -EIO; 775 return 0; 776} 777 778static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 779 struct nvme_queue *nvmeq) 780{ 781 int status; 782 struct nvme_command c; 783 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 784 785 memset(&c, 0, sizeof(c)); 786 c.create_cq.opcode = nvme_admin_create_cq; 787 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); 788 c.create_cq.cqid = cpu_to_le16(qid); 789 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 790 c.create_cq.cq_flags = cpu_to_le16(flags); 791 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); 792 793 status = nvme_submit_admin_cmd(dev, &c, NULL); 794 if (status) 795 return -EIO; 796 return 0; 797} 798 799static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 800 struct nvme_queue *nvmeq) 801{ 802 int status; 803 struct nvme_command c; 804 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; 805 806 memset(&c, 0, sizeof(c)); 807 c.create_sq.opcode = nvme_admin_create_sq; 808 c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); 809 c.create_sq.sqid = cpu_to_le16(qid); 810 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 811 c.create_sq.sq_flags = cpu_to_le16(flags); 812 c.create_sq.cqid = cpu_to_le16(qid); 813 814 status = nvme_submit_admin_cmd(dev, &c, NULL); 815 if (status) 816 return -EIO; 817 return 0; 818} 819 820static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 821{ 822 return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); 823} 824 825static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) 826{ 827 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); 828} 829 830static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, 831 dma_addr_t dma_addr) 832{ 833 struct nvme_command c; 834 835 memset(&c, 0, sizeof(c)); 836 c.identify.opcode = nvme_admin_identify; 837 c.identify.nsid = cpu_to_le32(nsid); 838 c.identify.prp1 = cpu_to_le64(dma_addr); 839 c.identify.cns = cpu_to_le32(cns); 840 841 return nvme_submit_admin_cmd(dev, &c, NULL); 842} 843 844static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, 845 dma_addr_t dma_addr, u32 *result) 846{ 847 struct nvme_command c; 848 849 memset(&c, 0, sizeof(c)); 850 c.features.opcode = 
nvme_admin_get_features; 851 c.features.nsid = cpu_to_le32(nsid); 852 c.features.prp1 = cpu_to_le64(dma_addr); 853 c.features.fid = cpu_to_le32(fid); 854 855 return nvme_submit_admin_cmd(dev, &c, result); 856} 857 858static int nvme_set_features(struct nvme_dev *dev, unsigned fid, 859 unsigned dword11, dma_addr_t dma_addr, u32 *result) 860{ 861 struct nvme_command c; 862 863 memset(&c, 0, sizeof(c)); 864 c.features.opcode = nvme_admin_set_features; 865 c.features.prp1 = cpu_to_le64(dma_addr); 866 c.features.fid = cpu_to_le32(fid); 867 c.features.dword11 = cpu_to_le32(dword11); 868 869 return nvme_submit_admin_cmd(dev, &c, result); 870} 871 872/** 873 * nvme_cancel_ios - Cancel outstanding I/Os 874 * @queue: The queue to cancel I/Os on 875 * @timeout: True to only cancel I/Os which have timed out 876 */ 877static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) 878{ 879 int depth = nvmeq->q_depth - 1; 880 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 881 unsigned long now = jiffies; 882 int cmdid; 883 884 for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { 885 void *ctx; 886 nvme_completion_fn fn; 887 static struct nvme_completion cqe = { 888 .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, 889 }; 890 891 if (timeout && !time_after(now, info[cmdid].timeout)) 892 continue; 893 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); 894 ctx = cancel_cmdid(nvmeq, cmdid, &fn); 895 fn(nvmeq->dev, ctx, &cqe); 896 } 897} 898 899static void nvme_free_queue_mem(struct nvme_queue *nvmeq) 900{ 901 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 902 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 903 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 904 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 905 kfree(nvmeq); 906} 907 908static void nvme_free_queue(struct nvme_dev *dev, int qid) 909{ 910 struct nvme_queue *nvmeq = dev->queues[qid]; 911 int vector = dev->entry[nvmeq->cq_vector].vector; 912 913 spin_lock_irq(&nvmeq->q_lock); 914 nvme_cancel_ios(nvmeq, false); 915 while (bio_list_peek(&nvmeq->sq_cong)) { 916 struct bio *bio = bio_list_pop(&nvmeq->sq_cong); 917 bio_endio(bio, -EIO); 918 } 919 spin_unlock_irq(&nvmeq->q_lock); 920 921 irq_set_affinity_hint(vector, NULL); 922 free_irq(vector, nvmeq); 923 924 /* Don't tell the adapter to delete the admin queue */ 925 if (qid) { 926 adapter_delete_sq(dev, qid); 927 adapter_delete_cq(dev, qid); 928 } 929 930 nvme_free_queue_mem(nvmeq); 931} 932 933static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 934 int depth, int vector) 935{ 936 struct device *dmadev = &dev->pci_dev->dev; 937 unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * 938 sizeof(struct nvme_cmd_info)); 939 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); 940 if (!nvmeq) 941 return NULL; 942 943 nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), 944 &nvmeq->cq_dma_addr, GFP_KERNEL); 945 if (!nvmeq->cqes) 946 goto free_nvmeq; 947 memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); 948 949 nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), 950 &nvmeq->sq_dma_addr, GFP_KERNEL); 951 if (!nvmeq->sq_cmds) 952 goto free_cqdma; 953 954 nvmeq->q_dmadev = dmadev; 955 nvmeq->dev = dev; 956 spin_lock_init(&nvmeq->q_lock); 957 nvmeq->cq_head = 0; 958 nvmeq->cq_phase = 1; 959 init_waitqueue_head(&nvmeq->sq_full); 960 init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); 961 bio_list_init(&nvmeq->sq_cong); 962 nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; 963 nvmeq->q_depth = depth; 964 nvmeq->cq_vector = vector; 965 
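	/*
	 * Note on q_db (set above): the doorbell region starts 4096 bytes
	 * into the BAR, and each queue pair owns two doorbells spaced by the
	 * CAP.DSTRD stride.  The submission queue tail doorbell for this qid
	 * is therefore at index qid << (db_stride + 1) in dev->dbs, and the
	 * matching completion queue head doorbell sits one stride
	 * (1 << db_stride u32s) above it; see the writel() in
	 * nvme_process_cq().
	 */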
966 return nvmeq; 967 968 free_cqdma: 969 dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, 970 nvmeq->cq_dma_addr); 971 free_nvmeq: 972 kfree(nvmeq); 973 return NULL; 974} 975 976static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, 977 const char *name) 978{ 979 if (use_threaded_interrupts) 980 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, 981 nvme_irq_check, nvme_irq, 982 IRQF_DISABLED | IRQF_SHARED, 983 name, nvmeq); 984 return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, 985 IRQF_DISABLED | IRQF_SHARED, name, nvmeq); 986} 987 988static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, 989 int cq_size, int vector) 990{ 991 int result; 992 struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); 993 994 if (!nvmeq) 995 return ERR_PTR(-ENOMEM); 996 997 result = adapter_alloc_cq(dev, qid, nvmeq); 998 if (result < 0) 999 goto free_nvmeq; 1000 1001 result = adapter_alloc_sq(dev, qid, nvmeq); 1002 if (result < 0) 1003 goto release_cq; 1004 1005 result = queue_request_irq(dev, nvmeq, "nvme"); 1006 if (result < 0) 1007 goto release_sq; 1008 1009 return nvmeq; 1010 1011 release_sq: 1012 adapter_delete_sq(dev, qid); 1013 release_cq: 1014 adapter_delete_cq(dev, qid); 1015 free_nvmeq: 1016 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1017 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1018 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1019 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1020 kfree(nvmeq); 1021 return ERR_PTR(result); 1022} 1023 1024static int nvme_configure_admin_queue(struct nvme_dev *dev) 1025{ 1026 int result = 0; 1027 u32 aqa; 1028 u64 cap; 1029 unsigned long timeout; 1030 struct nvme_queue *nvmeq; 1031 1032 dev->dbs = ((void __iomem *)dev->bar) + 4096; 1033 1034 nvmeq = nvme_alloc_queue(dev, 0, 64, 0); 1035 if (!nvmeq) 1036 return -ENOMEM; 1037 1038 aqa = nvmeq->q_depth - 1; 1039 aqa |= aqa << 16; 1040 1041 dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; 1042 dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; 1043 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; 1044 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 1045 1046 writel(0, &dev->bar->cc); 1047 writel(aqa, &dev->bar->aqa); 1048 writeq(nvmeq->sq_dma_addr, &dev->bar->asq); 1049 writeq(nvmeq->cq_dma_addr, &dev->bar->acq); 1050 writel(dev->ctrl_config, &dev->bar->cc); 1051 1052 cap = readq(&dev->bar->cap); 1053 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 1054 dev->db_stride = NVME_CAP_STRIDE(cap); 1055 1056 while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { 1057 msleep(100); 1058 if (fatal_signal_pending(current)) 1059 result = -EINTR; 1060 if (time_after(jiffies, timeout)) { 1061 dev_err(&dev->pci_dev->dev, 1062 "Device not ready; aborting initialisation\n"); 1063 result = -ENODEV; 1064 } 1065 } 1066 1067 if (result) { 1068 nvme_free_queue_mem(nvmeq); 1069 return result; 1070 } 1071 1072 result = queue_request_irq(dev, nvmeq, "nvme admin"); 1073 dev->queues[0] = nvmeq; 1074 return result; 1075} 1076 1077static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, 1078 unsigned long addr, unsigned length) 1079{ 1080 int i, err, count, nents, offset; 1081 struct scatterlist *sg; 1082 struct page **pages; 1083 struct nvme_iod *iod; 1084 1085 if (addr & 3) 1086 return ERR_PTR(-EINVAL); 1087 if (!length) 1088 return ERR_PTR(-EINVAL); 1089 1090 offset = offset_in_page(addr); 1091 count = DIV_ROUND_UP(offset + length, PAGE_SIZE); 1092 
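	/*
	 * count is the number of pages spanned by [addr, addr + length).
	 * They are pinned with get_user_pages_fast() below and released
	 * again in nvme_unmap_user_pages() once the command completes.
	 */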
pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); 1093 if (!pages) 1094 return ERR_PTR(-ENOMEM); 1095 1096 err = get_user_pages_fast(addr, count, 1, pages); 1097 if (err < count) { 1098 count = err; 1099 err = -EFAULT; 1100 goto put_pages; 1101 } 1102 1103 iod = nvme_alloc_iod(count, length, GFP_KERNEL); 1104 sg = iod->sg; 1105 sg_init_table(sg, count); 1106 for (i = 0; i < count; i++) { 1107 sg_set_page(&sg[i], pages[i], 1108 min_t(int, length, PAGE_SIZE - offset), offset); 1109 length -= (PAGE_SIZE - offset); 1110 offset = 0; 1111 } 1112 sg_mark_end(&sg[i - 1]); 1113 iod->nents = count; 1114 1115 err = -ENOMEM; 1116 nents = dma_map_sg(&dev->pci_dev->dev, sg, count, 1117 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 1118 if (!nents) 1119 goto free_iod; 1120 1121 kfree(pages); 1122 return iod; 1123 1124 free_iod: 1125 kfree(iod); 1126 put_pages: 1127 for (i = 0; i < count; i++) 1128 put_page(pages[i]); 1129 kfree(pages); 1130 return ERR_PTR(err); 1131} 1132 1133static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, 1134 struct nvme_iod *iod) 1135{ 1136 int i; 1137 1138 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, 1139 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 1140 1141 for (i = 0; i < iod->nents; i++) 1142 put_page(sg_page(&iod->sg[i])); 1143} 1144 1145static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 1146{ 1147 struct nvme_dev *dev = ns->dev; 1148 struct nvme_queue *nvmeq; 1149 struct nvme_user_io io; 1150 struct nvme_command c; 1151 unsigned length; 1152 int status; 1153 struct nvme_iod *iod; 1154 1155 if (copy_from_user(&io, uio, sizeof(io))) 1156 return -EFAULT; 1157 length = (io.nblocks + 1) << ns->lba_shift; 1158 1159 switch (io.opcode) { 1160 case nvme_cmd_write: 1161 case nvme_cmd_read: 1162 case nvme_cmd_compare: 1163 iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); 1164 break; 1165 default: 1166 return -EINVAL; 1167 } 1168 1169 if (IS_ERR(iod)) 1170 return PTR_ERR(iod); 1171 1172 memset(&c, 0, sizeof(c)); 1173 c.rw.opcode = io.opcode; 1174 c.rw.flags = io.flags; 1175 c.rw.nsid = cpu_to_le32(ns->ns_id); 1176 c.rw.slba = cpu_to_le64(io.slba); 1177 c.rw.length = cpu_to_le16(io.nblocks); 1178 c.rw.control = cpu_to_le16(io.control); 1179 c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); 1180 c.rw.reftag = io.reftag; 1181 c.rw.apptag = io.apptag; 1182 c.rw.appmask = io.appmask; 1183 /* XXX: metadata */ 1184 length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); 1185 1186 nvmeq = get_nvmeq(dev); 1187 /* 1188 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption 1189 * disabled. We may be preempted at any point, and be rescheduled 1190 * to a different CPU. That will cause cacheline bouncing, but no 1191 * additional races since q_lock already protects against other CPUs. 
1192 */ 1193 put_nvmeq(nvmeq); 1194 if (length != (io.nblocks + 1) << ns->lba_shift) 1195 status = -ENOMEM; 1196 else 1197 status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); 1198 1199 nvme_unmap_user_pages(dev, io.opcode & 1, iod); 1200 nvme_free_iod(dev, iod); 1201 return status; 1202} 1203 1204static int nvme_user_admin_cmd(struct nvme_dev *dev, 1205 struct nvme_admin_cmd __user *ucmd) 1206{ 1207 struct nvme_admin_cmd cmd; 1208 struct nvme_command c; 1209 int status, length; 1210 struct nvme_iod *uninitialized_var(iod); 1211 1212 if (!capable(CAP_SYS_ADMIN)) 1213 return -EACCES; 1214 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1215 return -EFAULT; 1216 1217 memset(&c, 0, sizeof(c)); 1218 c.common.opcode = cmd.opcode; 1219 c.common.flags = cmd.flags; 1220 c.common.nsid = cpu_to_le32(cmd.nsid); 1221 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1222 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1223 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); 1224 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); 1225 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); 1226 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); 1227 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); 1228 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); 1229 1230 length = cmd.data_len; 1231 if (cmd.data_len) { 1232 iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, 1233 length); 1234 if (IS_ERR(iod)) 1235 return PTR_ERR(iod); 1236 length = nvme_setup_prps(dev, &c.common, iod, length, 1237 GFP_KERNEL); 1238 } 1239 1240 if (length != cmd.data_len) 1241 status = -ENOMEM; 1242 else 1243 status = nvme_submit_admin_cmd(dev, &c, &cmd.result); 1244 1245 if (cmd.data_len) { 1246 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); 1247 nvme_free_iod(dev, iod); 1248 } 1249 1250 if (!status && copy_to_user(&ucmd->result, &cmd.result, 1251 sizeof(cmd.result))) 1252 status = -EFAULT; 1253 1254 return status; 1255} 1256 1257static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, 1258 unsigned long arg) 1259{ 1260 struct nvme_ns *ns = bdev->bd_disk->private_data; 1261 1262 switch (cmd) { 1263 case NVME_IOCTL_ID: 1264 return ns->ns_id; 1265 case NVME_IOCTL_ADMIN_CMD: 1266 return nvme_user_admin_cmd(ns->dev, (void __user *)arg); 1267 case NVME_IOCTL_SUBMIT_IO: 1268 return nvme_submit_io(ns, (void __user *)arg); 1269 default: 1270 return -ENOTTY; 1271 } 1272} 1273 1274static const struct block_device_operations nvme_fops = { 1275 .owner = THIS_MODULE, 1276 .ioctl = nvme_ioctl, 1277 .compat_ioctl = nvme_ioctl, 1278}; 1279 1280static void nvme_resubmit_bios(struct nvme_queue *nvmeq) 1281{ 1282 while (bio_list_peek(&nvmeq->sq_cong)) { 1283 struct bio *bio = bio_list_pop(&nvmeq->sq_cong); 1284 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; 1285 if (nvme_submit_bio_queue(nvmeq, ns, bio)) { 1286 bio_list_add_head(&nvmeq->sq_cong, bio); 1287 break; 1288 } 1289 if (bio_list_empty(&nvmeq->sq_cong)) 1290 remove_wait_queue(&nvmeq->sq_full, 1291 &nvmeq->sq_cong_wait); 1292 } 1293} 1294 1295static int nvme_kthread(void *data) 1296{ 1297 struct nvme_dev *dev; 1298 1299 while (!kthread_should_stop()) { 1300 __set_current_state(TASK_RUNNING); 1301 spin_lock(&dev_list_lock); 1302 list_for_each_entry(dev, &dev_list, node) { 1303 int i; 1304 for (i = 0; i < dev->queue_count; i++) { 1305 struct nvme_queue *nvmeq = dev->queues[i]; 1306 if (!nvmeq) 1307 continue; 1308 spin_lock_irq(&nvmeq->q_lock); 1309 if (nvme_process_cq(nvmeq)) 1310 printk("process_cq did something\n"); 1311 nvme_cancel_ios(nvmeq, true); 1312 nvme_resubmit_bios(nvmeq); 1313 
spin_unlock_irq(&nvmeq->q_lock); 1314 } 1315 } 1316 spin_unlock(&dev_list_lock); 1317 set_current_state(TASK_INTERRUPTIBLE); 1318 schedule_timeout(HZ); 1319 } 1320 return 0; 1321} 1322 1323static DEFINE_IDA(nvme_index_ida); 1324 1325static int nvme_get_ns_idx(void) 1326{ 1327 int index, error; 1328 1329 do { 1330 if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL)) 1331 return -1; 1332 1333 spin_lock(&dev_list_lock); 1334 error = ida_get_new(&nvme_index_ida, &index); 1335 spin_unlock(&dev_list_lock); 1336 } while (error == -EAGAIN); 1337 1338 if (error) 1339 index = -1; 1340 return index; 1341} 1342 1343static void nvme_put_ns_idx(int index) 1344{ 1345 spin_lock(&dev_list_lock); 1346 ida_remove(&nvme_index_ida, index); 1347 spin_unlock(&dev_list_lock); 1348} 1349 1350static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, 1351 struct nvme_id_ns *id, struct nvme_lba_range_type *rt) 1352{ 1353 struct nvme_ns *ns; 1354 struct gendisk *disk; 1355 int lbaf; 1356 1357 if (rt->attributes & NVME_LBART_ATTRIB_HIDE) 1358 return NULL; 1359 1360 ns = kzalloc(sizeof(*ns), GFP_KERNEL); 1361 if (!ns) 1362 return NULL; 1363 ns->queue = blk_alloc_queue(GFP_KERNEL); 1364 if (!ns->queue) 1365 goto out_free_ns; 1366 ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; 1367 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); 1368 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 1369/* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */ 1370 blk_queue_make_request(ns->queue, nvme_make_request); 1371 ns->dev = dev; 1372 ns->queue->queuedata = ns; 1373 1374 disk = alloc_disk(NVME_MINORS); 1375 if (!disk) 1376 goto out_free_queue; 1377 ns->ns_id = nsid; 1378 ns->disk = disk; 1379 lbaf = id->flbas & 0xf; 1380 ns->lba_shift = id->lbaf[lbaf].ds; 1381 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1382 if (dev->max_hw_sectors) 1383 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); 1384 1385 disk->major = nvme_major; 1386 disk->minors = NVME_MINORS; 1387 disk->first_minor = NVME_MINORS * nvme_get_ns_idx(); 1388 disk->fops = &nvme_fops; 1389 disk->private_data = ns; 1390 disk->queue = ns->queue; 1391 disk->driverfs_dev = &dev->pci_dev->dev; 1392 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); 1393 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); 1394 1395 return ns; 1396 1397 out_free_queue: 1398 blk_cleanup_queue(ns->queue); 1399 out_free_ns: 1400 kfree(ns); 1401 return NULL; 1402} 1403 1404static void nvme_ns_free(struct nvme_ns *ns) 1405{ 1406 int index = ns->disk->first_minor / NVME_MINORS; 1407 put_disk(ns->disk); 1408 nvme_put_ns_idx(index); 1409 blk_cleanup_queue(ns->queue); 1410 kfree(ns); 1411} 1412 1413static int set_queue_count(struct nvme_dev *dev, int count) 1414{ 1415 int status; 1416 u32 result; 1417 u32 q_count = (count - 1) | ((count - 1) << 16); 1418 1419 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, 1420 &result); 1421 if (status) 1422 return -EIO; 1423 return min(result & 0xffff, result >> 16) + 1; 1424} 1425 1426static int nvme_setup_io_queues(struct nvme_dev *dev) 1427{ 1428 int result, cpu, i, nr_io_queues, db_bar_size, q_depth; 1429 1430 nr_io_queues = num_online_cpus(); 1431 result = set_queue_count(dev, nr_io_queues); 1432 if (result < 0) 1433 return result; 1434 if (result < nr_io_queues) 1435 nr_io_queues = result; 1436 1437 /* Deregister the admin queue's interrupt */ 1438 free_irq(dev->entry[0].vector, dev->queues[0]); 1439 1440 db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); 1441 if 
(db_bar_size > 8192) { 1442 iounmap(dev->bar); 1443 dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), 1444 db_bar_size); 1445 dev->dbs = ((void __iomem *)dev->bar) + 4096; 1446 dev->queues[0]->q_db = dev->dbs; 1447 } 1448 1449 for (i = 0; i < nr_io_queues; i++) 1450 dev->entry[i].entry = i; 1451 for (;;) { 1452 result = pci_enable_msix(dev->pci_dev, dev->entry, 1453 nr_io_queues); 1454 if (result == 0) { 1455 break; 1456 } else if (result > 0) { 1457 nr_io_queues = result; 1458 continue; 1459 } else { 1460 nr_io_queues = 1; 1461 break; 1462 } 1463 } 1464 1465 result = queue_request_irq(dev, dev->queues[0], "nvme admin"); 1466 /* XXX: handle failure here */ 1467 1468 cpu = cpumask_first(cpu_online_mask); 1469 for (i = 0; i < nr_io_queues; i++) { 1470 irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); 1471 cpu = cpumask_next(cpu, cpu_online_mask); 1472 } 1473 1474 q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, 1475 NVME_Q_DEPTH); 1476 for (i = 0; i < nr_io_queues; i++) { 1477 dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); 1478 if (IS_ERR(dev->queues[i + 1])) 1479 return PTR_ERR(dev->queues[i + 1]); 1480 dev->queue_count++; 1481 } 1482 1483 for (; i < num_possible_cpus(); i++) { 1484 int target = i % rounddown_pow_of_two(dev->queue_count - 1); 1485 dev->queues[i + 1] = dev->queues[target + 1]; 1486 } 1487 1488 return 0; 1489} 1490 1491static void nvme_free_queues(struct nvme_dev *dev) 1492{ 1493 int i; 1494 1495 for (i = dev->queue_count - 1; i >= 0; i--) 1496 nvme_free_queue(dev, i); 1497} 1498 1499static int nvme_dev_add(struct nvme_dev *dev) 1500{ 1501 int res, nn, i; 1502 struct nvme_ns *ns, *next; 1503 struct nvme_id_ctrl *ctrl; 1504 struct nvme_id_ns *id_ns; 1505 void *mem; 1506 dma_addr_t dma_addr; 1507 1508 res = nvme_setup_io_queues(dev); 1509 if (res) 1510 return res; 1511 1512 mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, 1513 GFP_KERNEL); 1514 1515 res = nvme_identify(dev, 0, 1, dma_addr); 1516 if (res) { 1517 res = -EIO; 1518 goto out_free; 1519 } 1520 1521 ctrl = mem; 1522 nn = le32_to_cpup(&ctrl->nn); 1523 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); 1524 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); 1525 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); 1526 if (ctrl->mdts) { 1527 int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; 1528 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); 1529 } 1530 1531 id_ns = mem; 1532 for (i = 1; i <= nn; i++) { 1533 res = nvme_identify(dev, i, 0, dma_addr); 1534 if (res) 1535 continue; 1536 1537 if (id_ns->ncap == 0) 1538 continue; 1539 1540 res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, 1541 dma_addr + 4096, NULL); 1542 if (res) 1543 memset(mem + 4096, 0, 4096); 1544 1545 ns = nvme_alloc_ns(dev, i, mem, mem + 4096); 1546 if (ns) 1547 list_add_tail(&ns->list, &dev->namespaces); 1548 } 1549 list_for_each_entry(ns, &dev->namespaces, list) 1550 add_disk(ns->disk); 1551 1552 goto out; 1553 1554 out_free: 1555 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 1556 list_del(&ns->list); 1557 nvme_ns_free(ns); 1558 } 1559 1560 out: 1561 dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); 1562 return res; 1563} 1564 1565static int nvme_dev_remove(struct nvme_dev *dev) 1566{ 1567 struct nvme_ns *ns, *next; 1568 1569 spin_lock(&dev_list_lock); 1570 list_del(&dev->node); 1571 spin_unlock(&dev_list_lock); 1572 1573 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 1574 list_del(&ns->list); 1575 del_gendisk(ns->disk); 1576 
nvme_ns_free(ns); 1577 } 1578 1579 nvme_free_queues(dev); 1580 1581 return 0; 1582} 1583 1584static int nvme_setup_prp_pools(struct nvme_dev *dev) 1585{ 1586 struct device *dmadev = &dev->pci_dev->dev; 1587 dev->prp_page_pool = dma_pool_create("prp list page", dmadev, 1588 PAGE_SIZE, PAGE_SIZE, 0); 1589 if (!dev->prp_page_pool) 1590 return -ENOMEM; 1591 1592 /* Optimisation for I/Os between 4k and 128k */ 1593 dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, 1594 256, 256, 0); 1595 if (!dev->prp_small_pool) { 1596 dma_pool_destroy(dev->prp_page_pool); 1597 return -ENOMEM; 1598 } 1599 return 0; 1600} 1601 1602static void nvme_release_prp_pools(struct nvme_dev *dev) 1603{ 1604 dma_pool_destroy(dev->prp_page_pool); 1605 dma_pool_destroy(dev->prp_small_pool); 1606} 1607 1608static DEFINE_IDA(nvme_instance_ida); 1609 1610static int nvme_set_instance(struct nvme_dev *dev) 1611{ 1612 int instance, error; 1613 1614 do { 1615 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) 1616 return -ENODEV; 1617 1618 spin_lock(&dev_list_lock); 1619 error = ida_get_new(&nvme_instance_ida, &instance); 1620 spin_unlock(&dev_list_lock); 1621 } while (error == -EAGAIN); 1622 1623 if (error) 1624 return -ENODEV; 1625 1626 dev->instance = instance; 1627 return 0; 1628} 1629 1630static void nvme_release_instance(struct nvme_dev *dev) 1631{ 1632 spin_lock(&dev_list_lock); 1633 ida_remove(&nvme_instance_ida, dev->instance); 1634 spin_unlock(&dev_list_lock); 1635} 1636 1637static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 1638{ 1639 int bars, result = -ENOMEM; 1640 struct nvme_dev *dev; 1641 1642 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 1643 if (!dev) 1644 return -ENOMEM; 1645 dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), 1646 GFP_KERNEL); 1647 if (!dev->entry) 1648 goto free; 1649 dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), 1650 GFP_KERNEL); 1651 if (!dev->queues) 1652 goto free; 1653 1654 if (pci_enable_device_mem(pdev)) 1655 goto free; 1656 pci_set_master(pdev); 1657 bars = pci_select_bars(pdev, IORESOURCE_MEM); 1658 if (pci_request_selected_regions(pdev, bars, "nvme")) 1659 goto disable; 1660 1661 INIT_LIST_HEAD(&dev->namespaces); 1662 dev->pci_dev = pdev; 1663 pci_set_drvdata(pdev, dev); 1664 dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); 1665 dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); 1666 result = nvme_set_instance(dev); 1667 if (result) 1668 goto disable; 1669 1670 dev->entry[0].vector = pdev->irq; 1671 1672 result = nvme_setup_prp_pools(dev); 1673 if (result) 1674 goto disable_msix; 1675 1676 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 1677 if (!dev->bar) { 1678 result = -ENOMEM; 1679 goto disable_msix; 1680 } 1681 1682 result = nvme_configure_admin_queue(dev); 1683 if (result) 1684 goto unmap; 1685 dev->queue_count++; 1686 1687 spin_lock(&dev_list_lock); 1688 list_add(&dev->node, &dev_list); 1689 spin_unlock(&dev_list_lock); 1690 1691 result = nvme_dev_add(dev); 1692 if (result) 1693 goto delete; 1694 1695 return 0; 1696 1697 delete: 1698 spin_lock(&dev_list_lock); 1699 list_del(&dev->node); 1700 spin_unlock(&dev_list_lock); 1701 1702 nvme_free_queues(dev); 1703 unmap: 1704 iounmap(dev->bar); 1705 disable_msix: 1706 pci_disable_msix(pdev); 1707 nvme_release_instance(dev); 1708 nvme_release_prp_pools(dev); 1709 disable: 1710 pci_disable_device(pdev); 1711 pci_release_regions(pdev); 1712 free: 1713 kfree(dev->queues); 1714 kfree(dev->entry); 1715 kfree(dev); 1716 return result; 1717} 1718 1719static void 
nvme_remove(struct pci_dev *pdev) 1720{ 1721 struct nvme_dev *dev = pci_get_drvdata(pdev); 1722 nvme_dev_remove(dev); 1723 pci_disable_msix(pdev); 1724 iounmap(dev->bar); 1725 nvme_release_instance(dev); 1726 nvme_release_prp_pools(dev); 1727 pci_disable_device(pdev); 1728 pci_release_regions(pdev); 1729 kfree(dev->queues); 1730 kfree(dev->entry); 1731 kfree(dev); 1732} 1733 1734/* These functions are yet to be implemented */ 1735#define nvme_error_detected NULL 1736#define nvme_dump_registers NULL 1737#define nvme_link_reset NULL 1738#define nvme_slot_reset NULL 1739#define nvme_error_resume NULL 1740#define nvme_suspend NULL 1741#define nvme_resume NULL 1742 1743static const struct pci_error_handlers nvme_err_handler = { 1744 .error_detected = nvme_error_detected, 1745 .mmio_enabled = nvme_dump_registers, 1746 .link_reset = nvme_link_reset, 1747 .slot_reset = nvme_slot_reset, 1748 .resume = nvme_error_resume, 1749}; 1750 1751/* Move to pci_ids.h later */ 1752#define PCI_CLASS_STORAGE_EXPRESS 0x010802 1753 1754static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { 1755 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 1756 { 0, } 1757}; 1758MODULE_DEVICE_TABLE(pci, nvme_id_table); 1759 1760static struct pci_driver nvme_driver = { 1761 .name = "nvme", 1762 .id_table = nvme_id_table, 1763 .probe = nvme_probe, 1764 .remove = nvme_remove, 1765 .suspend = nvme_suspend, 1766 .resume = nvme_resume, 1767 .err_handler = &nvme_err_handler, 1768}; 1769 1770static int __init nvme_init(void) 1771{ 1772 int result; 1773 1774 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); 1775 if (IS_ERR(nvme_thread)) 1776 return PTR_ERR(nvme_thread); 1777 1778 result = register_blkdev(nvme_major, "nvme"); 1779 if (result < 0) 1780 goto kill_kthread; 1781 else if (result > 0) 1782 nvme_major = result; 1783 1784 result = pci_register_driver(&nvme_driver); 1785 if (result) 1786 goto unregister_blkdev; 1787 return 0; 1788 1789 unregister_blkdev: 1790 unregister_blkdev(nvme_major, "nvme"); 1791 kill_kthread: 1792 kthread_stop(nvme_thread); 1793 return result; 1794} 1795 1796static void __exit nvme_exit(void) 1797{ 1798 pci_unregister_driver(&nvme_driver); 1799 unregister_blkdev(nvme_major, "nvme"); 1800 kthread_stop(nvme_thread); 1801} 1802 1803MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); 1804MODULE_LICENSE("GPL"); 1805MODULE_VERSION("0.8"); 1806module_init(nvme_init); 1807module_exit(nvme_exit);
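Usage note (not part of the file above): the ioctls dispatched by nvme_ioctl() can be exercised from userspace against the block nodes this driver creates (named nvme%dn%d in nvme_alloc_ns()). The sketch below is a hypothetical example, assuming the NVME_IOCTL_* numbers and struct nvme_admin_cmd from this kernel's <linux/nvme.h> are visible to userspace; it reads the namespace ID and then issues an Identify Controller admin command (opcode 0x06, CNS=1 in CDW10, which nvme_user_admin_cmd() copies into cdw10 of the on-wire command). NVME_IOCTL_ADMIN_CMD requires CAP_SYS_ADMIN.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme.h>		/* NVME_IOCTL_*, struct nvme_admin_cmd (assumed exported) */

int main(void)
{
	struct nvme_admin_cmd cmd;
	void *buf;
	int fd, nsid;

	fd = open("/dev/nvme0n1", O_RDONLY);	/* name format from nvme_alloc_ns() */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* NVME_IOCTL_ID: the driver simply returns ns->ns_id. */
	nsid = ioctl(fd, NVME_IOCTL_ID);
	printf("namespace id: %d\n", nsid);

	/* Identify Controller: 4096-byte buffer, opcode 0x06, CNS = 1. */
	buf = calloc(1, 4096);
	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = 0x06;		/* nvme_admin_identify */
	cmd.cdw10 = 1;			/* CNS = 1: controller data structure */
	cmd.addr = (uintptr_t)buf;
	cmd.data_len = 4096;

	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) == 0)
		printf("model: %.40s\n", (char *)buf + 24);	/* MN at bytes 24-63 */
	else
		perror("NVME_IOCTL_ADMIN_CMD");

	free(buf);
	close(fd);
	return 0;
}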