Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.3-rc6 · 3391 lines · 85 kB
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/nvme.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/list_sort.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_MINORS		(1U << MINORBITS)
#define NVME_Q_DEPTH		1024
#define NVME_AQ_DEPTH		256
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define ADMIN_TIMEOUT		(admin_timeout * HZ)
#define SHUTDOWN_TIMEOUT	(shutdown_timeout * HZ)

static unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static int nvme_major;
module_param(nvme_major, int, 0);

static int nvme_char_major;
module_param(nvme_char_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;

static struct class *nvme_class;

static void nvme_reset_failed_dev(struct work_struct *ws);
static int nvme_reset(struct nvme_dev *dev);
static int nvme_process_cq(struct nvme_queue *nvmeq);

struct async_cmd_info {
	struct kthread_work work;
	struct kthread_worker *worker;
	struct request *req;
	u32 result;
	int status;
	void *ctx;
};

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	char irqname[24];	/* nvme4294967295-65535\0 */
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	struct nvme_command __iomem *sq_cmds_io;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	s16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
	struct async_cmd_info cmdinfo;
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	int aborted;
	struct nvme_queue *nvmeq;
	struct nvme_iod iod[0];
};

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->page_size)
#define NVME_INT_MASK		0x01

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
168 */ 169static int nvme_npages(unsigned size, struct nvme_dev *dev) 170{ 171 unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); 172 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); 173} 174 175static unsigned int nvme_cmd_size(struct nvme_dev *dev) 176{ 177 unsigned int ret = sizeof(struct nvme_cmd_info); 178 179 ret += sizeof(struct nvme_iod); 180 ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); 181 ret += sizeof(struct scatterlist) * NVME_INT_PAGES; 182 183 return ret; 184} 185 186static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 187 unsigned int hctx_idx) 188{ 189 struct nvme_dev *dev = data; 190 struct nvme_queue *nvmeq = dev->queues[0]; 191 192 WARN_ON(hctx_idx != 0); 193 WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); 194 WARN_ON(nvmeq->tags); 195 196 hctx->driver_data = nvmeq; 197 nvmeq->tags = &dev->admin_tagset.tags[0]; 198 return 0; 199} 200 201static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 202{ 203 struct nvme_queue *nvmeq = hctx->driver_data; 204 205 nvmeq->tags = NULL; 206} 207 208static int nvme_admin_init_request(void *data, struct request *req, 209 unsigned int hctx_idx, unsigned int rq_idx, 210 unsigned int numa_node) 211{ 212 struct nvme_dev *dev = data; 213 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); 214 struct nvme_queue *nvmeq = dev->queues[0]; 215 216 BUG_ON(!nvmeq); 217 cmd->nvmeq = nvmeq; 218 return 0; 219} 220 221static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 222 unsigned int hctx_idx) 223{ 224 struct nvme_dev *dev = data; 225 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; 226 227 if (!nvmeq->tags) 228 nvmeq->tags = &dev->tagset.tags[hctx_idx]; 229 230 WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); 231 hctx->driver_data = nvmeq; 232 return 0; 233} 234 235static int nvme_init_request(void *data, struct request *req, 236 unsigned int hctx_idx, unsigned int rq_idx, 237 unsigned int numa_node) 238{ 239 struct nvme_dev *dev = data; 240 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); 241 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; 242 243 BUG_ON(!nvmeq); 244 cmd->nvmeq = nvmeq; 245 return 0; 246} 247 248static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, 249 nvme_completion_fn handler) 250{ 251 cmd->fn = handler; 252 cmd->ctx = ctx; 253 cmd->aborted = 0; 254 blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); 255} 256 257static void *iod_get_private(struct nvme_iod *iod) 258{ 259 return (void *) (iod->private & ~0x1UL); 260} 261 262/* 263 * If bit 0 is set, the iod is embedded in the request payload. 
264 */ 265static bool iod_should_kfree(struct nvme_iod *iod) 266{ 267 return (iod->private & NVME_INT_MASK) == 0; 268} 269 270/* Special values must be less than 0x1000 */ 271#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) 272#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) 273#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) 274#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) 275 276static void special_completion(struct nvme_queue *nvmeq, void *ctx, 277 struct nvme_completion *cqe) 278{ 279 if (ctx == CMD_CTX_CANCELLED) 280 return; 281 if (ctx == CMD_CTX_COMPLETED) { 282 dev_warn(nvmeq->q_dmadev, 283 "completed id %d twice on queue %d\n", 284 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 285 return; 286 } 287 if (ctx == CMD_CTX_INVALID) { 288 dev_warn(nvmeq->q_dmadev, 289 "invalid id %d completed on queue %d\n", 290 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 291 return; 292 } 293 dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); 294} 295 296static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) 297{ 298 void *ctx; 299 300 if (fn) 301 *fn = cmd->fn; 302 ctx = cmd->ctx; 303 cmd->fn = special_completion; 304 cmd->ctx = CMD_CTX_CANCELLED; 305 return ctx; 306} 307 308static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, 309 struct nvme_completion *cqe) 310{ 311 u32 result = le32_to_cpup(&cqe->result); 312 u16 status = le16_to_cpup(&cqe->status) >> 1; 313 314 if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) 315 ++nvmeq->dev->event_limit; 316 if (status != NVME_SC_SUCCESS) 317 return; 318 319 switch (result & 0xff07) { 320 case NVME_AER_NOTICE_NS_CHANGED: 321 dev_info(nvmeq->q_dmadev, "rescanning\n"); 322 schedule_work(&nvmeq->dev->scan_work); 323 default: 324 dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); 325 } 326} 327 328static void abort_completion(struct nvme_queue *nvmeq, void *ctx, 329 struct nvme_completion *cqe) 330{ 331 struct request *req = ctx; 332 333 u16 status = le16_to_cpup(&cqe->status) >> 1; 334 u32 result = le32_to_cpup(&cqe->result); 335 336 blk_mq_free_request(req); 337 338 dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); 339 ++nvmeq->dev->abort_limit; 340} 341 342static void async_completion(struct nvme_queue *nvmeq, void *ctx, 343 struct nvme_completion *cqe) 344{ 345 struct async_cmd_info *cmdinfo = ctx; 346 cmdinfo->result = le32_to_cpup(&cqe->result); 347 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; 348 queue_kthread_work(cmdinfo->worker, &cmdinfo->work); 349 blk_mq_free_request(cmdinfo->req); 350} 351 352static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, 353 unsigned int tag) 354{ 355 struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag); 356 357 return blk_mq_rq_to_pdu(req); 358} 359 360/* 361 * Called with local interrupts disabled and the q_lock held. May not sleep. 
362 */ 363static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, 364 nvme_completion_fn *fn) 365{ 366 struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); 367 void *ctx; 368 if (tag >= nvmeq->q_depth) { 369 *fn = special_completion; 370 return CMD_CTX_INVALID; 371 } 372 if (fn) 373 *fn = cmd->fn; 374 ctx = cmd->ctx; 375 cmd->fn = special_completion; 376 cmd->ctx = CMD_CTX_COMPLETED; 377 return ctx; 378} 379 380/** 381 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell 382 * @nvmeq: The queue to use 383 * @cmd: The command to send 384 * 385 * Safe to use from interrupt context 386 */ 387static void __nvme_submit_cmd(struct nvme_queue *nvmeq, 388 struct nvme_command *cmd) 389{ 390 u16 tail = nvmeq->sq_tail; 391 392 if (nvmeq->sq_cmds_io) 393 memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd)); 394 else 395 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); 396 397 if (++tail == nvmeq->q_depth) 398 tail = 0; 399 writel(tail, nvmeq->q_db); 400 nvmeq->sq_tail = tail; 401} 402 403static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 404{ 405 unsigned long flags; 406 spin_lock_irqsave(&nvmeq->q_lock, flags); 407 __nvme_submit_cmd(nvmeq, cmd); 408 spin_unlock_irqrestore(&nvmeq->q_lock, flags); 409} 410 411static __le64 **iod_list(struct nvme_iod *iod) 412{ 413 return ((void *)iod) + iod->offset; 414} 415 416static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, 417 unsigned nseg, unsigned long private) 418{ 419 iod->private = private; 420 iod->offset = offsetof(struct nvme_iod, sg[nseg]); 421 iod->npages = -1; 422 iod->length = nbytes; 423 iod->nents = 0; 424} 425 426static struct nvme_iod * 427__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, 428 unsigned long priv, gfp_t gfp) 429{ 430 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + 431 sizeof(__le64 *) * nvme_npages(bytes, dev) + 432 sizeof(struct scatterlist) * nseg, gfp); 433 434 if (iod) 435 iod_init(iod, bytes, nseg, priv); 436 437 return iod; 438} 439 440static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, 441 gfp_t gfp) 442{ 443 unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(rq) : 444 sizeof(struct nvme_dsm_range); 445 struct nvme_iod *iod; 446 447 if (rq->nr_phys_segments <= NVME_INT_PAGES && 448 size <= NVME_INT_BYTES(dev)) { 449 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); 450 451 iod = cmd->iod; 452 iod_init(iod, size, rq->nr_phys_segments, 453 (unsigned long) rq | NVME_INT_MASK); 454 return iod; 455 } 456 457 return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, 458 (unsigned long) rq, gfp); 459} 460 461static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) 462{ 463 const int last_prp = dev->page_size / 8 - 1; 464 int i; 465 __le64 **list = iod_list(iod); 466 dma_addr_t prp_dma = iod->first_dma; 467 468 if (iod->npages == 0) 469 dma_pool_free(dev->prp_small_pool, list[0], prp_dma); 470 for (i = 0; i < iod->npages; i++) { 471 __le64 *prp_list = list[i]; 472 dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); 473 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); 474 prp_dma = next_prp_dma; 475 } 476 477 if (iod_should_kfree(iod)) 478 kfree(iod); 479} 480 481static int nvme_error_status(u16 status) 482{ 483 switch (status & 0x7ff) { 484 case NVME_SC_SUCCESS: 485 return 0; 486 case NVME_SC_CAP_EXCEEDED: 487 return -ENOSPC; 488 default: 489 return -EIO; 490 } 491} 492 493#ifdef CONFIG_BLK_DEV_INTEGRITY 494static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) 495{ 496 if (be32_to_cpu(pi->ref_tag) == v) 497 pi->ref_tag = cpu_to_be32(p); 498} 499 500static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) 501{ 502 if (be32_to_cpu(pi->ref_tag) == p) 503 pi->ref_tag = cpu_to_be32(v); 504} 505 506/** 507 * nvme_dif_remap - remaps ref tags to bip seed and physical lba 508 * 509 * The virtual start sector is the one that was originally submitted by the 510 * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical 511 * start sector may be different. Remap protection information to match the 512 * physical LBA on writes, and back to the original seed on reads. 513 * 514 * Type 0 and 3 do not have a ref tag, so no remapping required. 
515 */ 516static void nvme_dif_remap(struct request *req, 517 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) 518{ 519 struct nvme_ns *ns = req->rq_disk->private_data; 520 struct bio_integrity_payload *bip; 521 struct t10_pi_tuple *pi; 522 void *p, *pmap; 523 u32 i, nlb, ts, phys, virt; 524 525 if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) 526 return; 527 528 bip = bio_integrity(req->bio); 529 if (!bip) 530 return; 531 532 pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; 533 534 p = pmap; 535 virt = bip_get_seed(bip); 536 phys = nvme_block_nr(ns, blk_rq_pos(req)); 537 nlb = (blk_rq_bytes(req) >> ns->lba_shift); 538 ts = ns->disk->integrity->tuple_size; 539 540 for (i = 0; i < nlb; i++, virt++, phys++) { 541 pi = (struct t10_pi_tuple *)p; 542 dif_swap(phys, virt, pi); 543 p += ts; 544 } 545 kunmap_atomic(pmap); 546} 547 548static int nvme_noop_verify(struct blk_integrity_iter *iter) 549{ 550 return 0; 551} 552 553static int nvme_noop_generate(struct blk_integrity_iter *iter) 554{ 555 return 0; 556} 557 558struct blk_integrity nvme_meta_noop = { 559 .name = "NVME_META_NOOP", 560 .generate_fn = nvme_noop_generate, 561 .verify_fn = nvme_noop_verify, 562}; 563 564static void nvme_init_integrity(struct nvme_ns *ns) 565{ 566 struct blk_integrity integrity; 567 568 switch (ns->pi_type) { 569 case NVME_NS_DPS_PI_TYPE3: 570 integrity = t10_pi_type3_crc; 571 break; 572 case NVME_NS_DPS_PI_TYPE1: 573 case NVME_NS_DPS_PI_TYPE2: 574 integrity = t10_pi_type1_crc; 575 break; 576 default: 577 integrity = nvme_meta_noop; 578 break; 579 } 580 integrity.tuple_size = ns->ms; 581 blk_integrity_register(ns->disk, &integrity); 582 blk_queue_max_integrity_segments(ns->queue, 1); 583} 584#else /* CONFIG_BLK_DEV_INTEGRITY */ 585static void nvme_dif_remap(struct request *req, 586 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) 587{ 588} 589static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) 590{ 591} 592static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) 593{ 594} 595static void nvme_init_integrity(struct nvme_ns *ns) 596{ 597} 598#endif 599 600static void req_completion(struct nvme_queue *nvmeq, void *ctx, 601 struct nvme_completion *cqe) 602{ 603 struct nvme_iod *iod = ctx; 604 struct request *req = iod_get_private(iod); 605 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); 606 607 u16 status = le16_to_cpup(&cqe->status) >> 1; 608 609 if (unlikely(status)) { 610 if (!(status & NVME_SC_DNR || blk_noretry_request(req)) 611 && (jiffies - req->start_time) < req->timeout) { 612 unsigned long flags; 613 614 blk_mq_requeue_request(req); 615 spin_lock_irqsave(req->q->queue_lock, flags); 616 if (!blk_queue_stopped(req->q)) 617 blk_mq_kick_requeue_list(req->q); 618 spin_unlock_irqrestore(req->q->queue_lock, flags); 619 return; 620 } 621 622 if (req->cmd_type == REQ_TYPE_DRV_PRIV) { 623 if (cmd_rq->ctx == CMD_CTX_CANCELLED) 624 status = -EINTR; 625 } else { 626 status = nvme_error_status(status); 627 } 628 } 629 630 if (req->cmd_type == REQ_TYPE_DRV_PRIV) { 631 u32 result = le32_to_cpup(&cqe->result); 632 req->special = (void *)(uintptr_t)result; 633 } 634 635 if (cmd_rq->aborted) 636 dev_warn(nvmeq->dev->dev, 637 "completing aborted command with status:%04x\n", 638 status); 639 640 if (iod->nents) { 641 dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents, 642 rq_data_dir(req) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); 643 if (blk_integrity_rq(req)) { 644 if (!rq_data_dir(req)) 645 nvme_dif_remap(req, nvme_dif_complete); 646 dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1, 647 rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 648 } 649 } 650 nvme_free_iod(nvmeq->dev, iod); 651 652 blk_mq_complete_request(req, status); 653} 654 655/* length is in bytes. gfp flags indicates whether we may sleep. */ 656static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, 657 int total_len, gfp_t gfp) 658{ 659 struct dma_pool *pool; 660 int length = total_len; 661 struct scatterlist *sg = iod->sg; 662 int dma_len = sg_dma_len(sg); 663 u64 dma_addr = sg_dma_address(sg); 664 u32 page_size = dev->page_size; 665 int offset = dma_addr & (page_size - 1); 666 __le64 *prp_list; 667 __le64 **list = iod_list(iod); 668 dma_addr_t prp_dma; 669 int nprps, i; 670 671 length -= (page_size - offset); 672 if (length <= 0) 673 return total_len; 674 675 dma_len -= (page_size - offset); 676 if (dma_len) { 677 dma_addr += (page_size - offset); 678 } else { 679 sg = sg_next(sg); 680 dma_addr = sg_dma_address(sg); 681 dma_len = sg_dma_len(sg); 682 } 683 684 if (length <= page_size) { 685 iod->first_dma = dma_addr; 686 return total_len; 687 } 688 689 nprps = DIV_ROUND_UP(length, page_size); 690 if (nprps <= (256 / 8)) { 691 pool = dev->prp_small_pool; 692 iod->npages = 0; 693 } else { 694 pool = dev->prp_page_pool; 695 iod->npages = 1; 696 } 697 698 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 699 if (!prp_list) { 700 iod->first_dma = dma_addr; 701 iod->npages = -1; 702 return (total_len - length) + page_size; 703 } 704 list[0] = prp_list; 705 iod->first_dma = prp_dma; 706 i = 0; 707 for (;;) { 708 if (i == page_size >> 3) { 709 __le64 *old_prp_list = prp_list; 710 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 711 if (!prp_list) 712 return total_len - length; 713 list[iod->npages++] = prp_list; 714 prp_list[0] = old_prp_list[i - 1]; 715 old_prp_list[i - 1] = cpu_to_le64(prp_dma); 716 i = 1; 717 } 718 prp_list[i++] = cpu_to_le64(dma_addr); 719 dma_len -= page_size; 720 dma_addr += page_size; 721 length -= page_size; 722 if (length <= 0) 723 break; 724 if (dma_len > 0) 725 continue; 726 BUG_ON(dma_len < 0); 727 sg = sg_next(sg); 728 dma_addr = sg_dma_address(sg); 729 dma_len = sg_dma_len(sg); 730 } 731 732 return total_len; 733} 734 735static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, 736 struct nvme_iod *iod) 737{ 738 struct nvme_command cmnd; 739 740 memcpy(&cmnd, req->cmd, sizeof(cmnd)); 741 cmnd.rw.command_id = req->tag; 742 if (req->nr_phys_segments) { 743 cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 744 cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); 745 } 746 747 __nvme_submit_cmd(nvmeq, &cmnd); 748} 749 750/* 751 * We reuse the small pool to allocate the 16-byte range here as it is not 752 * worth having a special pool for these or additional cases to handle freeing 753 * the iod. 
 */
static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct request *req, struct nvme_iod *iod)
{
	struct nvme_dsm_range *range =
			(struct nvme_dsm_range *)iod_list(iod)[0];
	struct nvme_command cmnd;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

	memset(&cmnd, 0, sizeof(cmnd));
	cmnd.dsm.opcode = nvme_cmd_dsm;
	cmnd.dsm.command_id = req->tag;
	cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd.dsm.nr = 0;
	cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	__nvme_submit_cmd(nvmeq, &cmnd);
}

static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								int cmdid)
{
	struct nvme_command cmnd;

	memset(&cmnd, 0, sizeof(cmnd));
	cmnd.common.opcode = nvme_cmd_flush;
	cmnd.common.command_id = cmdid;
	cmnd.common.nsid = cpu_to_le32(ns->ns_id);

	__nvme_submit_cmd(nvmeq, &cmnd);
}

static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
							struct nvme_ns *ns)
{
	struct request *req = iod_get_private(iod);
	struct nvme_command cmnd;
	u16 control = 0;
	u32 dsmgmt = 0;

	if (req->cmd_flags & REQ_FUA)
		control |= NVME_RW_FUA;
	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	if (req->cmd_flags & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	memset(&cmnd, 0, sizeof(cmnd));
	cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
	cmnd.rw.command_id = req->tag;
	cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
	cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
	cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
	cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);

	if (ns->ms) {
		switch (ns->pi_type) {
		case NVME_NS_DPS_PI_TYPE3:
			control |= NVME_RW_PRINFO_PRCHK_GUARD;
			break;
		case NVME_NS_DPS_PI_TYPE1:
		case NVME_NS_DPS_PI_TYPE2:
			control |= NVME_RW_PRINFO_PRCHK_GUARD |
					NVME_RW_PRINFO_PRCHK_REF;
			cmnd.rw.reftag = cpu_to_le32(
					nvme_block_nr(ns, blk_rq_pos(req)));
			break;
		}
		if (blk_integrity_rq(req))
			cmnd.rw.metadata =
				cpu_to_le64(sg_dma_address(iod->meta_sg));
		else
			control |= NVME_RW_PRINFO_PRACT;
	}

	cmnd.rw.control = cpu_to_le16(control);
	cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt);

	__nvme_submit_cmd(nvmeq, &cmnd);

	return 0;
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_iod *iod;
	enum dma_data_direction dma_dir;

	/*
	 * If formatted with metadata, require the block layer provide a buffer
	 * unless this namespace is formatted such that the metadata can be
	 * stripped/generated by the controller with PRACT=1.
861 */ 862 if (ns && ns->ms && !blk_integrity_rq(req)) { 863 if (!(ns->pi_type && ns->ms == 8) && 864 req->cmd_type != REQ_TYPE_DRV_PRIV) { 865 blk_mq_complete_request(req, -EFAULT); 866 return BLK_MQ_RQ_QUEUE_OK; 867 } 868 } 869 870 iod = nvme_alloc_iod(req, dev, GFP_ATOMIC); 871 if (!iod) 872 return BLK_MQ_RQ_QUEUE_BUSY; 873 874 if (req->cmd_flags & REQ_DISCARD) { 875 void *range; 876 /* 877 * We reuse the small pool to allocate the 16-byte range here 878 * as it is not worth having a special pool for these or 879 * additional cases to handle freeing the iod. 880 */ 881 range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, 882 &iod->first_dma); 883 if (!range) 884 goto retry_cmd; 885 iod_list(iod)[0] = (__le64 *)range; 886 iod->npages = 0; 887 } else if (req->nr_phys_segments) { 888 dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; 889 890 sg_init_table(iod->sg, req->nr_phys_segments); 891 iod->nents = blk_rq_map_sg(req->q, req, iod->sg); 892 if (!iod->nents) 893 goto error_cmd; 894 895 if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) 896 goto retry_cmd; 897 898 if (blk_rq_bytes(req) != 899 nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { 900 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); 901 goto retry_cmd; 902 } 903 if (blk_integrity_rq(req)) { 904 if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) 905 goto error_cmd; 906 907 sg_init_table(iod->meta_sg, 1); 908 if (blk_rq_map_integrity_sg( 909 req->q, req->bio, iod->meta_sg) != 1) 910 goto error_cmd; 911 912 if (rq_data_dir(req)) 913 nvme_dif_remap(req, nvme_dif_prep); 914 915 if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) 916 goto error_cmd; 917 } 918 } 919 920 nvme_set_info(cmd, iod, req_completion); 921 spin_lock_irq(&nvmeq->q_lock); 922 if (req->cmd_type == REQ_TYPE_DRV_PRIV) 923 nvme_submit_priv(nvmeq, req, iod); 924 else if (req->cmd_flags & REQ_DISCARD) 925 nvme_submit_discard(nvmeq, ns, req, iod); 926 else if (req->cmd_flags & REQ_FLUSH) 927 nvme_submit_flush(nvmeq, ns, req->tag); 928 else 929 nvme_submit_iod(nvmeq, iod, ns); 930 931 nvme_process_cq(nvmeq); 932 spin_unlock_irq(&nvmeq->q_lock); 933 return BLK_MQ_RQ_QUEUE_OK; 934 935 error_cmd: 936 nvme_free_iod(dev, iod); 937 return BLK_MQ_RQ_QUEUE_ERROR; 938 retry_cmd: 939 nvme_free_iod(dev, iod); 940 return BLK_MQ_RQ_QUEUE_BUSY; 941} 942 943static int nvme_process_cq(struct nvme_queue *nvmeq) 944{ 945 u16 head, phase; 946 947 head = nvmeq->cq_head; 948 phase = nvmeq->cq_phase; 949 950 for (;;) { 951 void *ctx; 952 nvme_completion_fn fn; 953 struct nvme_completion cqe = nvmeq->cqes[head]; 954 if ((le16_to_cpu(cqe.status) & 1) != phase) 955 break; 956 nvmeq->sq_head = le16_to_cpu(cqe.sq_head); 957 if (++head == nvmeq->q_depth) { 958 head = 0; 959 phase = !phase; 960 } 961 ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); 962 fn(nvmeq, ctx, &cqe); 963 } 964 965 /* If the controller ignores the cq head doorbell and continuously 966 * writes to the queue, it is theoretically possible to wrap around 967 * the queue twice and mistakenly return IRQ_NONE. Linux only 968 * requires that 0.1% of your interrupts are handled, so this isn't 969 * a big problem. 
970 */ 971 if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) 972 return 0; 973 974 writel(head, nvmeq->q_db + nvmeq->dev->db_stride); 975 nvmeq->cq_head = head; 976 nvmeq->cq_phase = phase; 977 978 nvmeq->cqe_seen = 1; 979 return 1; 980} 981 982static irqreturn_t nvme_irq(int irq, void *data) 983{ 984 irqreturn_t result; 985 struct nvme_queue *nvmeq = data; 986 spin_lock(&nvmeq->q_lock); 987 nvme_process_cq(nvmeq); 988 result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; 989 nvmeq->cqe_seen = 0; 990 spin_unlock(&nvmeq->q_lock); 991 return result; 992} 993 994static irqreturn_t nvme_irq_check(int irq, void *data) 995{ 996 struct nvme_queue *nvmeq = data; 997 struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; 998 if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) 999 return IRQ_NONE; 1000 return IRQ_WAKE_THREAD; 1001} 1002 1003/* 1004 * Returns 0 on success. If the result is negative, it's a Linux error code; 1005 * if the result is positive, it's an NVM Express status code 1006 */ 1007int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 1008 void *buffer, void __user *ubuffer, unsigned bufflen, 1009 u32 *result, unsigned timeout) 1010{ 1011 bool write = cmd->common.opcode & 1; 1012 struct bio *bio = NULL; 1013 struct request *req; 1014 int ret; 1015 1016 req = blk_mq_alloc_request(q, write, GFP_KERNEL, false); 1017 if (IS_ERR(req)) 1018 return PTR_ERR(req); 1019 1020 req->cmd_type = REQ_TYPE_DRV_PRIV; 1021 req->cmd_flags |= REQ_FAILFAST_DRIVER; 1022 req->__data_len = 0; 1023 req->__sector = (sector_t) -1; 1024 req->bio = req->biotail = NULL; 1025 1026 req->timeout = timeout ? timeout : ADMIN_TIMEOUT; 1027 1028 req->cmd = (unsigned char *)cmd; 1029 req->cmd_len = sizeof(struct nvme_command); 1030 req->special = (void *)0; 1031 1032 if (buffer && bufflen) { 1033 ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT); 1034 if (ret) 1035 goto out; 1036 } else if (ubuffer && bufflen) { 1037 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT); 1038 if (ret) 1039 goto out; 1040 bio = req->bio; 1041 } 1042 1043 blk_execute_rq(req->q, NULL, req, 0); 1044 if (bio) 1045 blk_rq_unmap_user(bio); 1046 if (result) 1047 *result = (u32)(uintptr_t)req->special; 1048 ret = req->errors; 1049 out: 1050 blk_mq_free_request(req); 1051 return ret; 1052} 1053 1054int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 1055 void *buffer, unsigned bufflen) 1056{ 1057 return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); 1058} 1059 1060static int nvme_submit_async_admin_req(struct nvme_dev *dev) 1061{ 1062 struct nvme_queue *nvmeq = dev->queues[0]; 1063 struct nvme_command c; 1064 struct nvme_cmd_info *cmd_info; 1065 struct request *req; 1066 1067 req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true); 1068 if (IS_ERR(req)) 1069 return PTR_ERR(req); 1070 1071 req->cmd_flags |= REQ_NO_TIMEOUT; 1072 cmd_info = blk_mq_rq_to_pdu(req); 1073 nvme_set_info(cmd_info, NULL, async_req_completion); 1074 1075 memset(&c, 0, sizeof(c)); 1076 c.common.opcode = nvme_admin_async_event; 1077 c.common.command_id = req->tag; 1078 1079 blk_mq_free_request(req); 1080 __nvme_submit_cmd(nvmeq, &c); 1081 return 0; 1082} 1083 1084static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, 1085 struct nvme_command *cmd, 1086 struct async_cmd_info *cmdinfo, unsigned timeout) 1087{ 1088 struct nvme_queue *nvmeq = dev->queues[0]; 1089 struct request *req; 1090 struct nvme_cmd_info *cmd_rq; 1091 1092 req = blk_mq_alloc_request(dev->admin_q, 
WRITE, GFP_KERNEL, false); 1093 if (IS_ERR(req)) 1094 return PTR_ERR(req); 1095 1096 req->timeout = timeout; 1097 cmd_rq = blk_mq_rq_to_pdu(req); 1098 cmdinfo->req = req; 1099 nvme_set_info(cmd_rq, cmdinfo, async_completion); 1100 cmdinfo->status = -EINTR; 1101 1102 cmd->common.command_id = req->tag; 1103 1104 nvme_submit_cmd(nvmeq, cmd); 1105 return 0; 1106} 1107 1108static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 1109{ 1110 struct nvme_command c; 1111 1112 memset(&c, 0, sizeof(c)); 1113 c.delete_queue.opcode = opcode; 1114 c.delete_queue.qid = cpu_to_le16(id); 1115 1116 return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); 1117} 1118 1119static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 1120 struct nvme_queue *nvmeq) 1121{ 1122 struct nvme_command c; 1123 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 1124 1125 /* 1126 * Note: we (ab)use the fact the the prp fields survive if no data 1127 * is attached to the request. 1128 */ 1129 memset(&c, 0, sizeof(c)); 1130 c.create_cq.opcode = nvme_admin_create_cq; 1131 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); 1132 c.create_cq.cqid = cpu_to_le16(qid); 1133 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 1134 c.create_cq.cq_flags = cpu_to_le16(flags); 1135 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); 1136 1137 return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); 1138} 1139 1140static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 1141 struct nvme_queue *nvmeq) 1142{ 1143 struct nvme_command c; 1144 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; 1145 1146 /* 1147 * Note: we (ab)use the fact the the prp fields survive if no data 1148 * is attached to the request. 1149 */ 1150 memset(&c, 0, sizeof(c)); 1151 c.create_sq.opcode = nvme_admin_create_sq; 1152 c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); 1153 c.create_sq.sqid = cpu_to_le16(qid); 1154 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 1155 c.create_sq.sq_flags = cpu_to_le16(flags); 1156 c.create_sq.cqid = cpu_to_le16(qid); 1157 1158 return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); 1159} 1160 1161static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 1162{ 1163 return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); 1164} 1165 1166static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) 1167{ 1168 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); 1169} 1170 1171int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) 1172{ 1173 struct nvme_command c = { }; 1174 int error; 1175 1176 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 1177 c.identify.opcode = nvme_admin_identify; 1178 c.identify.cns = cpu_to_le32(1); 1179 1180 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); 1181 if (!*id) 1182 return -ENOMEM; 1183 1184 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, 1185 sizeof(struct nvme_id_ctrl)); 1186 if (error) 1187 kfree(*id); 1188 return error; 1189} 1190 1191int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, 1192 struct nvme_id_ns **id) 1193{ 1194 struct nvme_command c = { }; 1195 int error; 1196 1197 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 1198 c.identify.opcode = nvme_admin_identify, 1199 c.identify.nsid = cpu_to_le32(nsid), 1200 1201 *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); 1202 if (!*id) 1203 return -ENOMEM; 1204 1205 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, 1206 sizeof(struct nvme_id_ns)); 1207 if (error) 1208 kfree(*id); 1209 return error; 
1210} 1211 1212int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, 1213 dma_addr_t dma_addr, u32 *result) 1214{ 1215 struct nvme_command c; 1216 1217 memset(&c, 0, sizeof(c)); 1218 c.features.opcode = nvme_admin_get_features; 1219 c.features.nsid = cpu_to_le32(nsid); 1220 c.features.prp1 = cpu_to_le64(dma_addr); 1221 c.features.fid = cpu_to_le32(fid); 1222 1223 return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, 1224 result, 0); 1225} 1226 1227int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, 1228 dma_addr_t dma_addr, u32 *result) 1229{ 1230 struct nvme_command c; 1231 1232 memset(&c, 0, sizeof(c)); 1233 c.features.opcode = nvme_admin_set_features; 1234 c.features.prp1 = cpu_to_le64(dma_addr); 1235 c.features.fid = cpu_to_le32(fid); 1236 c.features.dword11 = cpu_to_le32(dword11); 1237 1238 return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, 1239 result, 0); 1240} 1241 1242int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) 1243{ 1244 struct nvme_command c = { }; 1245 int error; 1246 1247 c.common.opcode = nvme_admin_get_log_page, 1248 c.common.nsid = cpu_to_le32(0xFFFFFFFF), 1249 c.common.cdw10[0] = cpu_to_le32( 1250 (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | 1251 NVME_LOG_SMART), 1252 1253 *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); 1254 if (!*log) 1255 return -ENOMEM; 1256 1257 error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, 1258 sizeof(struct nvme_smart_log)); 1259 if (error) 1260 kfree(*log); 1261 return error; 1262} 1263 1264/** 1265 * nvme_abort_req - Attempt aborting a request 1266 * 1267 * Schedule controller reset if the command was already aborted once before and 1268 * still hasn't been returned to the driver, or if this is the admin queue. 
1269 */ 1270static void nvme_abort_req(struct request *req) 1271{ 1272 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); 1273 struct nvme_queue *nvmeq = cmd_rq->nvmeq; 1274 struct nvme_dev *dev = nvmeq->dev; 1275 struct request *abort_req; 1276 struct nvme_cmd_info *abort_cmd; 1277 struct nvme_command cmd; 1278 1279 if (!nvmeq->qid || cmd_rq->aborted) { 1280 unsigned long flags; 1281 1282 spin_lock_irqsave(&dev_list_lock, flags); 1283 if (work_busy(&dev->reset_work)) 1284 goto out; 1285 list_del_init(&dev->node); 1286 dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n", 1287 req->tag, nvmeq->qid); 1288 dev->reset_workfn = nvme_reset_failed_dev; 1289 queue_work(nvme_workq, &dev->reset_work); 1290 out: 1291 spin_unlock_irqrestore(&dev_list_lock, flags); 1292 return; 1293 } 1294 1295 if (!dev->abort_limit) 1296 return; 1297 1298 abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, 1299 false); 1300 if (IS_ERR(abort_req)) 1301 return; 1302 1303 abort_cmd = blk_mq_rq_to_pdu(abort_req); 1304 nvme_set_info(abort_cmd, abort_req, abort_completion); 1305 1306 memset(&cmd, 0, sizeof(cmd)); 1307 cmd.abort.opcode = nvme_admin_abort_cmd; 1308 cmd.abort.cid = req->tag; 1309 cmd.abort.sqid = cpu_to_le16(nvmeq->qid); 1310 cmd.abort.command_id = abort_req->tag; 1311 1312 --dev->abort_limit; 1313 cmd_rq->aborted = 1; 1314 1315 dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, 1316 nvmeq->qid); 1317 nvme_submit_cmd(dev->queues[0], &cmd); 1318} 1319 1320static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) 1321{ 1322 struct nvme_queue *nvmeq = data; 1323 void *ctx; 1324 nvme_completion_fn fn; 1325 struct nvme_cmd_info *cmd; 1326 struct nvme_completion cqe; 1327 1328 if (!blk_mq_request_started(req)) 1329 return; 1330 1331 cmd = blk_mq_rq_to_pdu(req); 1332 1333 if (cmd->ctx == CMD_CTX_CANCELLED) 1334 return; 1335 1336 if (blk_queue_dying(req->q)) 1337 cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); 1338 else 1339 cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); 1340 1341 1342 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", 1343 req->tag, nvmeq->qid); 1344 ctx = cancel_cmd_info(cmd, &fn); 1345 fn(nvmeq, ctx, &cqe); 1346} 1347 1348static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) 1349{ 1350 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); 1351 struct nvme_queue *nvmeq = cmd->nvmeq; 1352 1353 dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, 1354 nvmeq->qid); 1355 spin_lock_irq(&nvmeq->q_lock); 1356 nvme_abort_req(req); 1357 spin_unlock_irq(&nvmeq->q_lock); 1358 1359 /* 1360 * The aborted req will be completed on receiving the abort req. 1361 * We enable the timer again. If hit twice, it'll cause a device reset, 1362 * as the device then is in a faulty state. 
1363 */ 1364 return BLK_EH_RESET_TIMER; 1365} 1366 1367static void nvme_free_queue(struct nvme_queue *nvmeq) 1368{ 1369 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1370 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1371 if (nvmeq->sq_cmds) 1372 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1373 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1374 kfree(nvmeq); 1375} 1376 1377static void nvme_free_queues(struct nvme_dev *dev, int lowest) 1378{ 1379 int i; 1380 1381 for (i = dev->queue_count - 1; i >= lowest; i--) { 1382 struct nvme_queue *nvmeq = dev->queues[i]; 1383 dev->queue_count--; 1384 dev->queues[i] = NULL; 1385 nvme_free_queue(nvmeq); 1386 } 1387} 1388 1389/** 1390 * nvme_suspend_queue - put queue into suspended state 1391 * @nvmeq - queue to suspend 1392 */ 1393static int nvme_suspend_queue(struct nvme_queue *nvmeq) 1394{ 1395 int vector; 1396 1397 spin_lock_irq(&nvmeq->q_lock); 1398 if (nvmeq->cq_vector == -1) { 1399 spin_unlock_irq(&nvmeq->q_lock); 1400 return 1; 1401 } 1402 vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; 1403 nvmeq->dev->online_queues--; 1404 nvmeq->cq_vector = -1; 1405 spin_unlock_irq(&nvmeq->q_lock); 1406 1407 if (!nvmeq->qid && nvmeq->dev->admin_q) 1408 blk_mq_freeze_queue_start(nvmeq->dev->admin_q); 1409 1410 irq_set_affinity_hint(vector, NULL); 1411 free_irq(vector, nvmeq); 1412 1413 return 0; 1414} 1415 1416static void nvme_clear_queue(struct nvme_queue *nvmeq) 1417{ 1418 spin_lock_irq(&nvmeq->q_lock); 1419 if (nvmeq->tags && *nvmeq->tags) 1420 blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq); 1421 spin_unlock_irq(&nvmeq->q_lock); 1422} 1423 1424static void nvme_disable_queue(struct nvme_dev *dev, int qid) 1425{ 1426 struct nvme_queue *nvmeq = dev->queues[qid]; 1427 1428 if (!nvmeq) 1429 return; 1430 if (nvme_suspend_queue(nvmeq)) 1431 return; 1432 1433 /* Don't tell the adapter to delete the admin queue. 1434 * Don't tell a removed adapter to delete IO queues. 
*/ 1435 if (qid && readl(&dev->bar->csts) != -1) { 1436 adapter_delete_sq(dev, qid); 1437 adapter_delete_cq(dev, qid); 1438 } 1439 1440 spin_lock_irq(&nvmeq->q_lock); 1441 nvme_process_cq(nvmeq); 1442 spin_unlock_irq(&nvmeq->q_lock); 1443} 1444 1445static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, 1446 int entry_size) 1447{ 1448 int q_depth = dev->q_depth; 1449 unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size); 1450 1451 if (q_size_aligned * nr_io_queues > dev->cmb_size) { 1452 u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); 1453 mem_per_q = round_down(mem_per_q, dev->page_size); 1454 q_depth = div_u64(mem_per_q, entry_size); 1455 1456 /* 1457 * Ensure the reduced q_depth is above some threshold where it 1458 * would be better to map queues in system memory with the 1459 * original depth 1460 */ 1461 if (q_depth < 64) 1462 return -ENOMEM; 1463 } 1464 1465 return q_depth; 1466} 1467 1468static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1469 int qid, int depth) 1470{ 1471 if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { 1472 unsigned offset = (qid - 1) * 1473 roundup(SQ_SIZE(depth), dev->page_size); 1474 nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; 1475 nvmeq->sq_cmds_io = dev->cmb + offset; 1476 } else { 1477 nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), 1478 &nvmeq->sq_dma_addr, GFP_KERNEL); 1479 if (!nvmeq->sq_cmds) 1480 return -ENOMEM; 1481 } 1482 1483 return 0; 1484} 1485 1486static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 1487 int depth) 1488{ 1489 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); 1490 if (!nvmeq) 1491 return NULL; 1492 1493 nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), 1494 &nvmeq->cq_dma_addr, GFP_KERNEL); 1495 if (!nvmeq->cqes) 1496 goto free_nvmeq; 1497 1498 if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) 1499 goto free_cqdma; 1500 1501 nvmeq->q_dmadev = dev->dev; 1502 nvmeq->dev = dev; 1503 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", 1504 dev->instance, qid); 1505 spin_lock_init(&nvmeq->q_lock); 1506 nvmeq->cq_head = 0; 1507 nvmeq->cq_phase = 1; 1508 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1509 nvmeq->q_depth = depth; 1510 nvmeq->qid = qid; 1511 nvmeq->cq_vector = -1; 1512 dev->queues[qid] = nvmeq; 1513 1514 /* make sure queue descriptor is set before queue count, for kthread */ 1515 mb(); 1516 dev->queue_count++; 1517 1518 return nvmeq; 1519 1520 free_cqdma: 1521 dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1522 nvmeq->cq_dma_addr); 1523 free_nvmeq: 1524 kfree(nvmeq); 1525 return NULL; 1526} 1527 1528static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1529 const char *name) 1530{ 1531 if (use_threaded_interrupts) 1532 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, 1533 nvme_irq_check, nvme_irq, IRQF_SHARED, 1534 name, nvmeq); 1535 return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, 1536 IRQF_SHARED, name, nvmeq); 1537} 1538 1539static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) 1540{ 1541 struct nvme_dev *dev = nvmeq->dev; 1542 1543 spin_lock_irq(&nvmeq->q_lock); 1544 nvmeq->sq_tail = 0; 1545 nvmeq->cq_head = 0; 1546 nvmeq->cq_phase = 1; 1547 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1548 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); 1549 dev->online_queues++; 1550 spin_unlock_irq(&nvmeq->q_lock); 1551} 1552 1553static int nvme_create_queue(struct nvme_queue *nvmeq, 
int qid) 1554{ 1555 struct nvme_dev *dev = nvmeq->dev; 1556 int result; 1557 1558 nvmeq->cq_vector = qid - 1; 1559 result = adapter_alloc_cq(dev, qid, nvmeq); 1560 if (result < 0) 1561 return result; 1562 1563 result = adapter_alloc_sq(dev, qid, nvmeq); 1564 if (result < 0) 1565 goto release_cq; 1566 1567 result = queue_request_irq(dev, nvmeq, nvmeq->irqname); 1568 if (result < 0) 1569 goto release_sq; 1570 1571 nvme_init_queue(nvmeq, qid); 1572 return result; 1573 1574 release_sq: 1575 adapter_delete_sq(dev, qid); 1576 release_cq: 1577 adapter_delete_cq(dev, qid); 1578 return result; 1579} 1580 1581static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) 1582{ 1583 unsigned long timeout; 1584 u32 bit = enabled ? NVME_CSTS_RDY : 0; 1585 1586 timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 1587 1588 while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { 1589 msleep(100); 1590 if (fatal_signal_pending(current)) 1591 return -EINTR; 1592 if (time_after(jiffies, timeout)) { 1593 dev_err(dev->dev, 1594 "Device not ready; aborting %s\n", enabled ? 1595 "initialisation" : "reset"); 1596 return -ENODEV; 1597 } 1598 } 1599 1600 return 0; 1601} 1602 1603/* 1604 * If the device has been passed off to us in an enabled state, just clear 1605 * the enabled bit. The spec says we should set the 'shutdown notification 1606 * bits', but doing so may cause the device to complete commands to the 1607 * admin queue ... and we don't know what memory that might be pointing at! 1608 */ 1609static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) 1610{ 1611 dev->ctrl_config &= ~NVME_CC_SHN_MASK; 1612 dev->ctrl_config &= ~NVME_CC_ENABLE; 1613 writel(dev->ctrl_config, &dev->bar->cc); 1614 1615 return nvme_wait_ready(dev, cap, false); 1616} 1617 1618static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) 1619{ 1620 dev->ctrl_config &= ~NVME_CC_SHN_MASK; 1621 dev->ctrl_config |= NVME_CC_ENABLE; 1622 writel(dev->ctrl_config, &dev->bar->cc); 1623 1624 return nvme_wait_ready(dev, cap, true); 1625} 1626 1627static int nvme_shutdown_ctrl(struct nvme_dev *dev) 1628{ 1629 unsigned long timeout; 1630 1631 dev->ctrl_config &= ~NVME_CC_SHN_MASK; 1632 dev->ctrl_config |= NVME_CC_SHN_NORMAL; 1633 1634 writel(dev->ctrl_config, &dev->bar->cc); 1635 1636 timeout = SHUTDOWN_TIMEOUT + jiffies; 1637 while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != 1638 NVME_CSTS_SHST_CMPLT) { 1639 msleep(100); 1640 if (fatal_signal_pending(current)) 1641 return -EINTR; 1642 if (time_after(jiffies, timeout)) { 1643 dev_err(dev->dev, 1644 "Device shutdown incomplete; abort shutdown\n"); 1645 return -ENODEV; 1646 } 1647 } 1648 1649 return 0; 1650} 1651 1652static struct blk_mq_ops nvme_mq_admin_ops = { 1653 .queue_rq = nvme_queue_rq, 1654 .map_queue = blk_mq_map_queue, 1655 .init_hctx = nvme_admin_init_hctx, 1656 .exit_hctx = nvme_admin_exit_hctx, 1657 .init_request = nvme_admin_init_request, 1658 .timeout = nvme_timeout, 1659}; 1660 1661static struct blk_mq_ops nvme_mq_ops = { 1662 .queue_rq = nvme_queue_rq, 1663 .map_queue = blk_mq_map_queue, 1664 .init_hctx = nvme_init_hctx, 1665 .init_request = nvme_init_request, 1666 .timeout = nvme_timeout, 1667}; 1668 1669static void nvme_dev_remove_admin(struct nvme_dev *dev) 1670{ 1671 if (dev->admin_q && !blk_queue_dying(dev->admin_q)) { 1672 blk_cleanup_queue(dev->admin_q); 1673 blk_mq_free_tag_set(&dev->admin_tagset); 1674 } 1675} 1676 1677static int nvme_alloc_admin_tags(struct nvme_dev *dev) 1678{ 1679 if (!dev->admin_q) { 1680 dev->admin_tagset.ops = 
&nvme_mq_admin_ops; 1681 dev->admin_tagset.nr_hw_queues = 1; 1682 dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; 1683 dev->admin_tagset.reserved_tags = 1; 1684 dev->admin_tagset.timeout = ADMIN_TIMEOUT; 1685 dev->admin_tagset.numa_node = dev_to_node(dev->dev); 1686 dev->admin_tagset.cmd_size = nvme_cmd_size(dev); 1687 dev->admin_tagset.driver_data = dev; 1688 1689 if (blk_mq_alloc_tag_set(&dev->admin_tagset)) 1690 return -ENOMEM; 1691 1692 dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); 1693 if (IS_ERR(dev->admin_q)) { 1694 blk_mq_free_tag_set(&dev->admin_tagset); 1695 return -ENOMEM; 1696 } 1697 if (!blk_get_queue(dev->admin_q)) { 1698 nvme_dev_remove_admin(dev); 1699 dev->admin_q = NULL; 1700 return -ENODEV; 1701 } 1702 } else 1703 blk_mq_unfreeze_queue(dev->admin_q); 1704 1705 return 0; 1706} 1707 1708static int nvme_configure_admin_queue(struct nvme_dev *dev) 1709{ 1710 int result; 1711 u32 aqa; 1712 u64 cap = readq(&dev->bar->cap); 1713 struct nvme_queue *nvmeq; 1714 unsigned page_shift = PAGE_SHIFT; 1715 unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; 1716 unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; 1717 1718 if (page_shift < dev_page_min) { 1719 dev_err(dev->dev, 1720 "Minimum device page size (%u) too large for " 1721 "host (%u)\n", 1 << dev_page_min, 1722 1 << page_shift); 1723 return -ENODEV; 1724 } 1725 if (page_shift > dev_page_max) { 1726 dev_info(dev->dev, 1727 "Device maximum page size (%u) smaller than " 1728 "host (%u); enabling work-around\n", 1729 1 << dev_page_max, 1 << page_shift); 1730 page_shift = dev_page_max; 1731 } 1732 1733 dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ? 1734 NVME_CAP_NSSRC(cap) : 0; 1735 1736 if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO)) 1737 writel(NVME_CSTS_NSSRO, &dev->bar->csts); 1738 1739 result = nvme_disable_ctrl(dev, cap); 1740 if (result < 0) 1741 return result; 1742 1743 nvmeq = dev->queues[0]; 1744 if (!nvmeq) { 1745 nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); 1746 if (!nvmeq) 1747 return -ENOMEM; 1748 } 1749 1750 aqa = nvmeq->q_depth - 1; 1751 aqa |= aqa << 16; 1752 1753 dev->page_size = 1 << page_shift; 1754 1755 dev->ctrl_config = NVME_CC_CSS_NVM; 1756 dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; 1757 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; 1758 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 1759 1760 writel(aqa, &dev->bar->aqa); 1761 writeq(nvmeq->sq_dma_addr, &dev->bar->asq); 1762 writeq(nvmeq->cq_dma_addr, &dev->bar->acq); 1763 1764 result = nvme_enable_ctrl(dev, cap); 1765 if (result) 1766 goto free_nvmeq; 1767 1768 nvmeq->cq_vector = 0; 1769 result = queue_request_irq(dev, nvmeq, nvmeq->irqname); 1770 if (result) { 1771 nvmeq->cq_vector = -1; 1772 goto free_nvmeq; 1773 } 1774 1775 return result; 1776 1777 free_nvmeq: 1778 nvme_free_queues(dev, 0); 1779 return result; 1780} 1781 1782static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 1783{ 1784 struct nvme_dev *dev = ns->dev; 1785 struct nvme_user_io io; 1786 struct nvme_command c; 1787 unsigned length, meta_len; 1788 int status, write; 1789 dma_addr_t meta_dma = 0; 1790 void *meta = NULL; 1791 void __user *metadata; 1792 1793 if (copy_from_user(&io, uio, sizeof(io))) 1794 return -EFAULT; 1795 1796 switch (io.opcode) { 1797 case nvme_cmd_write: 1798 case nvme_cmd_read: 1799 case nvme_cmd_compare: 1800 break; 1801 default: 1802 return -EINVAL; 1803 } 1804 1805 length = (io.nblocks + 1) << ns->lba_shift; 1806 meta_len = (io.nblocks + 1) * ns->ms; 1807 metadata = 
(void __user *)(unsigned long)io.metadata; 1808 write = io.opcode & 1; 1809 1810 if (ns->ext) { 1811 length += meta_len; 1812 meta_len = 0; 1813 } 1814 if (meta_len) { 1815 if (((io.metadata & 3) || !io.metadata) && !ns->ext) 1816 return -EINVAL; 1817 1818 meta = dma_alloc_coherent(dev->dev, meta_len, 1819 &meta_dma, GFP_KERNEL); 1820 1821 if (!meta) { 1822 status = -ENOMEM; 1823 goto unmap; 1824 } 1825 if (write) { 1826 if (copy_from_user(meta, metadata, meta_len)) { 1827 status = -EFAULT; 1828 goto unmap; 1829 } 1830 } 1831 } 1832 1833 memset(&c, 0, sizeof(c)); 1834 c.rw.opcode = io.opcode; 1835 c.rw.flags = io.flags; 1836 c.rw.nsid = cpu_to_le32(ns->ns_id); 1837 c.rw.slba = cpu_to_le64(io.slba); 1838 c.rw.length = cpu_to_le16(io.nblocks); 1839 c.rw.control = cpu_to_le16(io.control); 1840 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 1841 c.rw.reftag = cpu_to_le32(io.reftag); 1842 c.rw.apptag = cpu_to_le16(io.apptag); 1843 c.rw.appmask = cpu_to_le16(io.appmask); 1844 c.rw.metadata = cpu_to_le64(meta_dma); 1845 1846 status = __nvme_submit_sync_cmd(ns->queue, &c, NULL, 1847 (void __user *)io.addr, length, NULL, 0); 1848 unmap: 1849 if (meta) { 1850 if (status == NVME_SC_SUCCESS && !write) { 1851 if (copy_to_user(metadata, meta, meta_len)) 1852 status = -EFAULT; 1853 } 1854 dma_free_coherent(dev->dev, meta_len, meta, meta_dma); 1855 } 1856 return status; 1857} 1858 1859static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, 1860 struct nvme_passthru_cmd __user *ucmd) 1861{ 1862 struct nvme_passthru_cmd cmd; 1863 struct nvme_command c; 1864 unsigned timeout = 0; 1865 int status; 1866 1867 if (!capable(CAP_SYS_ADMIN)) 1868 return -EACCES; 1869 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1870 return -EFAULT; 1871 1872 memset(&c, 0, sizeof(c)); 1873 c.common.opcode = cmd.opcode; 1874 c.common.flags = cmd.flags; 1875 c.common.nsid = cpu_to_le32(cmd.nsid); 1876 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1877 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1878 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); 1879 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); 1880 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); 1881 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); 1882 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); 1883 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); 1884 1885 if (cmd.timeout_ms) 1886 timeout = msecs_to_jiffies(cmd.timeout_ms); 1887 1888 status = __nvme_submit_sync_cmd(ns ? 
ns->queue : dev->admin_q, &c, 1889 NULL, (void __user *)cmd.addr, cmd.data_len, 1890 &cmd.result, timeout); 1891 if (status >= 0) { 1892 if (put_user(cmd.result, &ucmd->result)) 1893 return -EFAULT; 1894 } 1895 1896 return status; 1897} 1898 1899static int nvme_subsys_reset(struct nvme_dev *dev) 1900{ 1901 if (!dev->subsystem) 1902 return -ENOTTY; 1903 1904 writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */ 1905 return 0; 1906} 1907 1908static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, 1909 unsigned long arg) 1910{ 1911 struct nvme_ns *ns = bdev->bd_disk->private_data; 1912 1913 switch (cmd) { 1914 case NVME_IOCTL_ID: 1915 force_successful_syscall_return(); 1916 return ns->ns_id; 1917 case NVME_IOCTL_ADMIN_CMD: 1918 return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); 1919 case NVME_IOCTL_IO_CMD: 1920 return nvme_user_cmd(ns->dev, ns, (void __user *)arg); 1921 case NVME_IOCTL_SUBMIT_IO: 1922 return nvme_submit_io(ns, (void __user *)arg); 1923 case SG_GET_VERSION_NUM: 1924 return nvme_sg_get_version_num((void __user *)arg); 1925 case SG_IO: 1926 return nvme_sg_io(ns, (void __user *)arg); 1927 default: 1928 return -ENOTTY; 1929 } 1930} 1931 1932#ifdef CONFIG_COMPAT 1933static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, 1934 unsigned int cmd, unsigned long arg) 1935{ 1936 switch (cmd) { 1937 case SG_IO: 1938 return -ENOIOCTLCMD; 1939 } 1940 return nvme_ioctl(bdev, mode, cmd, arg); 1941} 1942#else 1943#define nvme_compat_ioctl NULL 1944#endif 1945 1946static int nvme_open(struct block_device *bdev, fmode_t mode) 1947{ 1948 int ret = 0; 1949 struct nvme_ns *ns; 1950 1951 spin_lock(&dev_list_lock); 1952 ns = bdev->bd_disk->private_data; 1953 if (!ns) 1954 ret = -ENXIO; 1955 else if (!kref_get_unless_zero(&ns->dev->kref)) 1956 ret = -ENXIO; 1957 spin_unlock(&dev_list_lock); 1958 1959 return ret; 1960} 1961 1962static void nvme_free_dev(struct kref *kref); 1963 1964static void nvme_release(struct gendisk *disk, fmode_t mode) 1965{ 1966 struct nvme_ns *ns = disk->private_data; 1967 struct nvme_dev *dev = ns->dev; 1968 1969 kref_put(&dev->kref, nvme_free_dev); 1970} 1971 1972static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) 1973{ 1974 /* some standard values */ 1975 geo->heads = 1 << 6; 1976 geo->sectors = 1 << 5; 1977 geo->cylinders = get_capacity(bd->bd_disk) >> 11; 1978 return 0; 1979} 1980 1981static void nvme_config_discard(struct nvme_ns *ns) 1982{ 1983 u32 logical_block_size = queue_logical_block_size(ns->queue); 1984 ns->queue->limits.discard_zeroes_data = 0; 1985 ns->queue->limits.discard_alignment = logical_block_size; 1986 ns->queue->limits.discard_granularity = logical_block_size; 1987 blk_queue_max_discard_sectors(ns->queue, 0xffffffff); 1988 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); 1989} 1990 1991static int nvme_revalidate_disk(struct gendisk *disk) 1992{ 1993 struct nvme_ns *ns = disk->private_data; 1994 struct nvme_dev *dev = ns->dev; 1995 struct nvme_id_ns *id; 1996 u8 lbaf, pi_type; 1997 u16 old_ms; 1998 unsigned short bs; 1999 2000 if (nvme_identify_ns(dev, ns->ns_id, &id)) { 2001 dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__, 2002 dev->instance, ns->ns_id); 2003 return -ENODEV; 2004 } 2005 if (id->ncap == 0) { 2006 kfree(id); 2007 return -ENODEV; 2008 } 2009 2010 old_ms = ns->ms; 2011 lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; 2012 ns->lba_shift = id->lbaf[lbaf].ds; 2013 ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); 2014 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); 
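	/*
	 * The NVME_NS_FLBAS_META_EXT flag in FLBAS means the namespace uses
	 * extended data LBAs, i.e. the metadata is transferred inline with
	 * the data rather than in a separate buffer.  ns->ext is consulted
	 * below so that a separate blk_integrity profile is only registered
	 * for formats that carry metadata in a separate buffer.
	 */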
2015 2016 /* 2017 * If identify namespace failed, use default 512 byte block size so 2018 * block layer can use before failing read/write for 0 capacity. 2019 */ 2020 if (ns->lba_shift == 0) 2021 ns->lba_shift = 9; 2022 bs = 1 << ns->lba_shift; 2023 2024 /* XXX: PI implementation requires metadata equal t10 pi tuple size */ 2025 pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 2026 id->dps & NVME_NS_DPS_PI_MASK : 0; 2027 2028 if (blk_get_integrity(disk) && (ns->pi_type != pi_type || 2029 ns->ms != old_ms || 2030 bs != queue_logical_block_size(disk->queue) || 2031 (ns->ms && ns->ext))) 2032 blk_integrity_unregister(disk); 2033 2034 ns->pi_type = pi_type; 2035 blk_queue_logical_block_size(ns->queue, bs); 2036 2037 if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) && 2038 !ns->ext) 2039 nvme_init_integrity(ns); 2040 2041 if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) 2042 set_capacity(disk, 0); 2043 else 2044 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); 2045 2046 if (dev->oncs & NVME_CTRL_ONCS_DSM) 2047 nvme_config_discard(ns); 2048 2049 kfree(id); 2050 return 0; 2051} 2052 2053static const struct block_device_operations nvme_fops = { 2054 .owner = THIS_MODULE, 2055 .ioctl = nvme_ioctl, 2056 .compat_ioctl = nvme_compat_ioctl, 2057 .open = nvme_open, 2058 .release = nvme_release, 2059 .getgeo = nvme_getgeo, 2060 .revalidate_disk= nvme_revalidate_disk, 2061}; 2062 2063static int nvme_kthread(void *data) 2064{ 2065 struct nvme_dev *dev, *next; 2066 2067 while (!kthread_should_stop()) { 2068 set_current_state(TASK_INTERRUPTIBLE); 2069 spin_lock(&dev_list_lock); 2070 list_for_each_entry_safe(dev, next, &dev_list, node) { 2071 int i; 2072 u32 csts = readl(&dev->bar->csts); 2073 2074 if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || 2075 csts & NVME_CSTS_CFS) { 2076 if (work_busy(&dev->reset_work)) 2077 continue; 2078 list_del_init(&dev->node); 2079 dev_warn(dev->dev, 2080 "Failed status: %x, reset controller\n", 2081 readl(&dev->bar->csts)); 2082 dev->reset_workfn = nvme_reset_failed_dev; 2083 queue_work(nvme_workq, &dev->reset_work); 2084 continue; 2085 } 2086 for (i = 0; i < dev->queue_count; i++) { 2087 struct nvme_queue *nvmeq = dev->queues[i]; 2088 if (!nvmeq) 2089 continue; 2090 spin_lock_irq(&nvmeq->q_lock); 2091 nvme_process_cq(nvmeq); 2092 2093 while ((i == 0) && (dev->event_limit > 0)) { 2094 if (nvme_submit_async_admin_req(dev)) 2095 break; 2096 dev->event_limit--; 2097 } 2098 spin_unlock_irq(&nvmeq->q_lock); 2099 } 2100 } 2101 spin_unlock(&dev_list_lock); 2102 schedule_timeout(round_jiffies_relative(HZ)); 2103 } 2104 return 0; 2105} 2106 2107static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) 2108{ 2109 struct nvme_ns *ns; 2110 struct gendisk *disk; 2111 int node = dev_to_node(dev->dev); 2112 2113 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 2114 if (!ns) 2115 return; 2116 2117 ns->queue = blk_mq_init_queue(&dev->tagset); 2118 if (IS_ERR(ns->queue)) 2119 goto out_free_ns; 2120 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); 2121 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 2122 ns->dev = dev; 2123 ns->queue->queuedata = ns; 2124 2125 disk = alloc_disk_node(0, node); 2126 if (!disk) 2127 goto out_free_queue; 2128 2129 ns->ns_id = nsid; 2130 ns->disk = disk; 2131 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ 2132 list_add_tail(&ns->list, &dev->namespaces); 2133 2134 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 2135 if 
(dev->max_hw_sectors) { 2136 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); 2137 blk_queue_max_segments(ns->queue, 2138 ((dev->max_hw_sectors << 9) / dev->page_size) + 1); 2139 } 2140 if (dev->stripe_size) 2141 blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); 2142 if (dev->vwc & NVME_CTRL_VWC_PRESENT) 2143 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); 2144 blk_queue_virt_boundary(ns->queue, dev->page_size - 1); 2145 2146 disk->major = nvme_major; 2147 disk->first_minor = 0; 2148 disk->fops = &nvme_fops; 2149 disk->private_data = ns; 2150 disk->queue = ns->queue; 2151 disk->driverfs_dev = dev->device; 2152 disk->flags = GENHD_FL_EXT_DEVT; 2153 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); 2154 2155 /* 2156 * Initialize capacity to 0 until we establish the namespace format and 2157 * setup integrity extentions if necessary. The revalidate_disk after 2158 * add_disk allows the driver to register with integrity if the format 2159 * requires it. 2160 */ 2161 set_capacity(disk, 0); 2162 if (nvme_revalidate_disk(ns->disk)) 2163 goto out_free_disk; 2164 2165 add_disk(ns->disk); 2166 if (ns->ms) { 2167 struct block_device *bd = bdget_disk(ns->disk, 0); 2168 if (!bd) 2169 return; 2170 if (blkdev_get(bd, FMODE_READ, NULL)) { 2171 bdput(bd); 2172 return; 2173 } 2174 blkdev_reread_part(bd); 2175 blkdev_put(bd, FMODE_READ); 2176 } 2177 return; 2178 out_free_disk: 2179 kfree(disk); 2180 list_del(&ns->list); 2181 out_free_queue: 2182 blk_cleanup_queue(ns->queue); 2183 out_free_ns: 2184 kfree(ns); 2185} 2186 2187static void nvme_create_io_queues(struct nvme_dev *dev) 2188{ 2189 unsigned i; 2190 2191 for (i = dev->queue_count; i <= dev->max_qid; i++) 2192 if (!nvme_alloc_queue(dev, i, dev->q_depth)) 2193 break; 2194 2195 for (i = dev->online_queues; i <= dev->queue_count - 1; i++) 2196 if (nvme_create_queue(dev->queues[i], i)) 2197 break; 2198} 2199 2200static int set_queue_count(struct nvme_dev *dev, int count) 2201{ 2202 int status; 2203 u32 result; 2204 u32 q_count = (count - 1) | ((count - 1) << 16); 2205 2206 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, 2207 &result); 2208 if (status < 0) 2209 return status; 2210 if (status > 0) { 2211 dev_err(dev->dev, "Could not set queue count (%d)\n", status); 2212 return 0; 2213 } 2214 return min(result & 0xffff, result >> 16) + 1; 2215} 2216 2217static void __iomem *nvme_map_cmb(struct nvme_dev *dev) 2218{ 2219 u64 szu, size, offset; 2220 u32 cmbloc; 2221 resource_size_t bar_size; 2222 struct pci_dev *pdev = to_pci_dev(dev->dev); 2223 void __iomem *cmb; 2224 dma_addr_t dma_addr; 2225 2226 if (!use_cmb_sqes) 2227 return NULL; 2228 2229 dev->cmbsz = readl(&dev->bar->cmbsz); 2230 if (!(NVME_CMB_SZ(dev->cmbsz))) 2231 return NULL; 2232 2233 cmbloc = readl(&dev->bar->cmbloc); 2234 2235 szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); 2236 size = szu * NVME_CMB_SZ(dev->cmbsz); 2237 offset = szu * NVME_CMB_OFST(cmbloc); 2238 bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc)); 2239 2240 if (offset > bar_size) 2241 return NULL; 2242 2243 /* 2244 * Controllers may support a CMB size larger than their BAR, 2245 * for example, due to being behind a bridge. 
Reduce the CMB to 2246 * the reported size of the BAR 2247 */ 2248 if (size > bar_size - offset) 2249 size = bar_size - offset; 2250 2251 dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset; 2252 cmb = ioremap_wc(dma_addr, size); 2253 if (!cmb) 2254 return NULL; 2255 2256 dev->cmb_dma_addr = dma_addr; 2257 dev->cmb_size = size; 2258 return cmb; 2259} 2260 2261static inline void nvme_release_cmb(struct nvme_dev *dev) 2262{ 2263 if (dev->cmb) { 2264 iounmap(dev->cmb); 2265 dev->cmb = NULL; 2266 } 2267} 2268 2269static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) 2270{ 2271 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); 2272} 2273 2274static int nvme_setup_io_queues(struct nvme_dev *dev) 2275{ 2276 struct nvme_queue *adminq = dev->queues[0]; 2277 struct pci_dev *pdev = to_pci_dev(dev->dev); 2278 int result, i, vecs, nr_io_queues, size; 2279 2280 nr_io_queues = num_possible_cpus(); 2281 result = set_queue_count(dev, nr_io_queues); 2282 if (result <= 0) 2283 return result; 2284 if (result < nr_io_queues) 2285 nr_io_queues = result; 2286 2287 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { 2288 result = nvme_cmb_qdepth(dev, nr_io_queues, 2289 sizeof(struct nvme_command)); 2290 if (result > 0) 2291 dev->q_depth = result; 2292 else 2293 nvme_release_cmb(dev); 2294 } 2295 2296 size = db_bar_size(dev, nr_io_queues); 2297 if (size > 8192) { 2298 iounmap(dev->bar); 2299 do { 2300 dev->bar = ioremap(pci_resource_start(pdev, 0), size); 2301 if (dev->bar) 2302 break; 2303 if (!--nr_io_queues) 2304 return -ENOMEM; 2305 size = db_bar_size(dev, nr_io_queues); 2306 } while (1); 2307 dev->dbs = ((void __iomem *)dev->bar) + 4096; 2308 adminq->q_db = dev->dbs; 2309 } 2310 2311 /* Deregister the admin queue's interrupt */ 2312 free_irq(dev->entry[0].vector, adminq); 2313 2314 /* 2315 * If we enable msix early due to not intx, disable it again before 2316 * setting up the full range we need. 2317 */ 2318 if (!pdev->irq) 2319 pci_disable_msix(pdev); 2320 2321 for (i = 0; i < nr_io_queues; i++) 2322 dev->entry[i].entry = i; 2323 vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); 2324 if (vecs < 0) { 2325 vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32)); 2326 if (vecs < 0) { 2327 vecs = 1; 2328 } else { 2329 for (i = 0; i < vecs; i++) 2330 dev->entry[i].vector = i + pdev->irq; 2331 } 2332 } 2333 2334 /* 2335 * Should investigate if there's a performance win from allocating 2336 * more queues than interrupt vectors; it might allow the submission 2337 * path to scale better, even if the receive path is limited by the 2338 * number of interrupts. 
2339 */ 2340 nr_io_queues = vecs; 2341 dev->max_qid = nr_io_queues; 2342 2343 result = queue_request_irq(dev, adminq, adminq->irqname); 2344 if (result) { 2345 adminq->cq_vector = -1; 2346 goto free_queues; 2347 } 2348 2349 /* Free previously allocated queues that are no longer usable */ 2350 nvme_free_queues(dev, nr_io_queues + 1); 2351 nvme_create_io_queues(dev); 2352 2353 return 0; 2354 2355 free_queues: 2356 nvme_free_queues(dev, 1); 2357 return result; 2358} 2359 2360static void nvme_free_namespace(struct nvme_ns *ns) 2361{ 2362 list_del(&ns->list); 2363 2364 spin_lock(&dev_list_lock); 2365 ns->disk->private_data = NULL; 2366 spin_unlock(&dev_list_lock); 2367 2368 put_disk(ns->disk); 2369 kfree(ns); 2370} 2371 2372static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 2373{ 2374 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 2375 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 2376 2377 return nsa->ns_id - nsb->ns_id; 2378} 2379 2380static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) 2381{ 2382 struct nvme_ns *ns; 2383 2384 list_for_each_entry(ns, &dev->namespaces, list) { 2385 if (ns->ns_id == nsid) 2386 return ns; 2387 if (ns->ns_id > nsid) 2388 break; 2389 } 2390 return NULL; 2391} 2392 2393static inline bool nvme_io_incapable(struct nvme_dev *dev) 2394{ 2395 return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || 2396 dev->online_queues < 2); 2397} 2398 2399static void nvme_ns_remove(struct nvme_ns *ns) 2400{ 2401 bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); 2402 2403 if (kill) 2404 blk_set_queue_dying(ns->queue); 2405 if (ns->disk->flags & GENHD_FL_UP) { 2406 if (blk_get_integrity(ns->disk)) 2407 blk_integrity_unregister(ns->disk); 2408 del_gendisk(ns->disk); 2409 } 2410 if (kill || !blk_queue_dying(ns->queue)) { 2411 blk_mq_abort_requeue_list(ns->queue); 2412 blk_cleanup_queue(ns->queue); 2413 } 2414} 2415 2416static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) 2417{ 2418 struct nvme_ns *ns, *next; 2419 unsigned i; 2420 2421 for (i = 1; i <= nn; i++) { 2422 ns = nvme_find_ns(dev, i); 2423 if (ns) { 2424 if (revalidate_disk(ns->disk)) { 2425 nvme_ns_remove(ns); 2426 nvme_free_namespace(ns); 2427 } 2428 } else 2429 nvme_alloc_ns(dev, i); 2430 } 2431 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 2432 if (ns->ns_id > nn) { 2433 nvme_ns_remove(ns); 2434 nvme_free_namespace(ns); 2435 } 2436 } 2437 list_sort(NULL, &dev->namespaces, ns_cmp); 2438} 2439 2440static void nvme_set_irq_hints(struct nvme_dev *dev) 2441{ 2442 struct nvme_queue *nvmeq; 2443 int i; 2444 2445 for (i = 0; i < dev->online_queues; i++) { 2446 nvmeq = dev->queues[i]; 2447 2448 if (!nvmeq->tags || !(*nvmeq->tags)) 2449 continue; 2450 2451 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, 2452 blk_mq_tags_cpumask(*nvmeq->tags)); 2453 } 2454} 2455 2456static void nvme_dev_scan(struct work_struct *work) 2457{ 2458 struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); 2459 struct nvme_id_ctrl *ctrl; 2460 2461 if (!dev->tagset.tags) 2462 return; 2463 if (nvme_identify_ctrl(dev, &ctrl)) 2464 return; 2465 nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); 2466 kfree(ctrl); 2467 nvme_set_irq_hints(dev); 2468} 2469 2470/* 2471 * Return: error value if an error occurred setting up the queues or calling 2472 * Identify Device. 0 if these succeeded, even if adding some of the 2473 * namespaces failed. At the moment, these failures are silent. 
TBD which 2474 * failures should be reported. 2475 */ 2476static int nvme_dev_add(struct nvme_dev *dev) 2477{ 2478 struct pci_dev *pdev = to_pci_dev(dev->dev); 2479 int res; 2480 struct nvme_id_ctrl *ctrl; 2481 int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; 2482 2483 res = nvme_identify_ctrl(dev, &ctrl); 2484 if (res) { 2485 dev_err(dev->dev, "Identify Controller failed (%d)\n", res); 2486 return -EIO; 2487 } 2488 2489 dev->oncs = le16_to_cpup(&ctrl->oncs); 2490 dev->abort_limit = ctrl->acl + 1; 2491 dev->vwc = ctrl->vwc; 2492 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); 2493 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); 2494 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); 2495 if (ctrl->mdts) 2496 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); 2497 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && 2498 (pdev->device == 0x0953) && ctrl->vs[3]) { 2499 unsigned int max_hw_sectors; 2500 2501 dev->stripe_size = 1 << (ctrl->vs[3] + shift); 2502 max_hw_sectors = dev->stripe_size >> (shift - 9); 2503 if (dev->max_hw_sectors) { 2504 dev->max_hw_sectors = min(max_hw_sectors, 2505 dev->max_hw_sectors); 2506 } else 2507 dev->max_hw_sectors = max_hw_sectors; 2508 } 2509 kfree(ctrl); 2510 2511 if (!dev->tagset.tags) { 2512 dev->tagset.ops = &nvme_mq_ops; 2513 dev->tagset.nr_hw_queues = dev->online_queues - 1; 2514 dev->tagset.timeout = NVME_IO_TIMEOUT; 2515 dev->tagset.numa_node = dev_to_node(dev->dev); 2516 dev->tagset.queue_depth = 2517 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; 2518 dev->tagset.cmd_size = nvme_cmd_size(dev); 2519 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; 2520 dev->tagset.driver_data = dev; 2521 2522 if (blk_mq_alloc_tag_set(&dev->tagset)) 2523 return 0; 2524 } 2525 schedule_work(&dev->scan_work); 2526 return 0; 2527} 2528 2529static int nvme_dev_map(struct nvme_dev *dev) 2530{ 2531 u64 cap; 2532 int bars, result = -ENOMEM; 2533 struct pci_dev *pdev = to_pci_dev(dev->dev); 2534 2535 if (pci_enable_device_mem(pdev)) 2536 return result; 2537 2538 dev->entry[0].vector = pdev->irq; 2539 pci_set_master(pdev); 2540 bars = pci_select_bars(pdev, IORESOURCE_MEM); 2541 if (!bars) 2542 goto disable_pci; 2543 2544 if (pci_request_selected_regions(pdev, bars, "nvme")) 2545 goto disable_pci; 2546 2547 if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && 2548 dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) 2549 goto disable; 2550 2551 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 2552 if (!dev->bar) 2553 goto disable; 2554 2555 if (readl(&dev->bar->csts) == -1) { 2556 result = -ENODEV; 2557 goto unmap; 2558 } 2559 2560 /* 2561 * Some devices don't advertse INTx interrupts, pre-enable a single 2562 * MSIX vec for setup. We'll adjust this later. 
2563 */ 2564 if (!pdev->irq) { 2565 result = pci_enable_msix(pdev, dev->entry, 1); 2566 if (result < 0) 2567 goto unmap; 2568 } 2569 2570 cap = readq(&dev->bar->cap); 2571 dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); 2572 dev->db_stride = 1 << NVME_CAP_STRIDE(cap); 2573 dev->dbs = ((void __iomem *)dev->bar) + 4096; 2574 if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) 2575 dev->cmb = nvme_map_cmb(dev); 2576 2577 return 0; 2578 2579 unmap: 2580 iounmap(dev->bar); 2581 dev->bar = NULL; 2582 disable: 2583 pci_release_regions(pdev); 2584 disable_pci: 2585 pci_disable_device(pdev); 2586 return result; 2587} 2588 2589static void nvme_dev_unmap(struct nvme_dev *dev) 2590{ 2591 struct pci_dev *pdev = to_pci_dev(dev->dev); 2592 2593 if (pdev->msi_enabled) 2594 pci_disable_msi(pdev); 2595 else if (pdev->msix_enabled) 2596 pci_disable_msix(pdev); 2597 2598 if (dev->bar) { 2599 iounmap(dev->bar); 2600 dev->bar = NULL; 2601 pci_release_regions(pdev); 2602 } 2603 2604 if (pci_is_enabled(pdev)) 2605 pci_disable_device(pdev); 2606} 2607 2608struct nvme_delq_ctx { 2609 struct task_struct *waiter; 2610 struct kthread_worker *worker; 2611 atomic_t refcount; 2612}; 2613 2614static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) 2615{ 2616 dq->waiter = current; 2617 mb(); 2618 2619 for (;;) { 2620 set_current_state(TASK_KILLABLE); 2621 if (!atomic_read(&dq->refcount)) 2622 break; 2623 if (!schedule_timeout(ADMIN_TIMEOUT) || 2624 fatal_signal_pending(current)) { 2625 /* 2626 * Disable the controller first since we can't trust it 2627 * at this point, but leave the admin queue enabled 2628 * until all queue deletion requests are flushed. 2629 * FIXME: This may take a while if there are more h/w 2630 * queues than admin tags. 2631 */ 2632 set_current_state(TASK_RUNNING); 2633 nvme_disable_ctrl(dev, readq(&dev->bar->cap)); 2634 nvme_clear_queue(dev->queues[0]); 2635 flush_kthread_worker(dq->worker); 2636 nvme_disable_queue(dev, 0); 2637 return; 2638 } 2639 } 2640 set_current_state(TASK_RUNNING); 2641} 2642 2643static void nvme_put_dq(struct nvme_delq_ctx *dq) 2644{ 2645 atomic_dec(&dq->refcount); 2646 if (dq->waiter) 2647 wake_up_process(dq->waiter); 2648} 2649 2650static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) 2651{ 2652 atomic_inc(&dq->refcount); 2653 return dq; 2654} 2655 2656static void nvme_del_queue_end(struct nvme_queue *nvmeq) 2657{ 2658 struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; 2659 nvme_put_dq(dq); 2660} 2661 2662static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, 2663 kthread_work_func_t fn) 2664{ 2665 struct nvme_command c; 2666 2667 memset(&c, 0, sizeof(c)); 2668 c.delete_queue.opcode = opcode; 2669 c.delete_queue.qid = cpu_to_le16(nvmeq->qid); 2670 2671 init_kthread_work(&nvmeq->cmdinfo.work, fn); 2672 return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, 2673 ADMIN_TIMEOUT); 2674} 2675 2676static void nvme_del_cq_work_handler(struct kthread_work *work) 2677{ 2678 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2679 cmdinfo.work); 2680 nvme_del_queue_end(nvmeq); 2681} 2682 2683static int nvme_delete_cq(struct nvme_queue *nvmeq) 2684{ 2685 return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, 2686 nvme_del_cq_work_handler); 2687} 2688 2689static void nvme_del_sq_work_handler(struct kthread_work *work) 2690{ 2691 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2692 cmdinfo.work); 2693 int status = nvmeq->cmdinfo.status; 2694 2695 if (!status) 2696 status = 
nvme_delete_cq(nvmeq); 2697 if (status) 2698 nvme_del_queue_end(nvmeq); 2699} 2700 2701static int nvme_delete_sq(struct nvme_queue *nvmeq) 2702{ 2703 return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, 2704 nvme_del_sq_work_handler); 2705} 2706 2707static void nvme_del_queue_start(struct kthread_work *work) 2708{ 2709 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2710 cmdinfo.work); 2711 if (nvme_delete_sq(nvmeq)) 2712 nvme_del_queue_end(nvmeq); 2713} 2714 2715static void nvme_disable_io_queues(struct nvme_dev *dev) 2716{ 2717 int i; 2718 DEFINE_KTHREAD_WORKER_ONSTACK(worker); 2719 struct nvme_delq_ctx dq; 2720 struct task_struct *kworker_task = kthread_run(kthread_worker_fn, 2721 &worker, "nvme%d", dev->instance); 2722 2723 if (IS_ERR(kworker_task)) { 2724 dev_err(dev->dev, 2725 "Failed to create queue del task\n"); 2726 for (i = dev->queue_count - 1; i > 0; i--) 2727 nvme_disable_queue(dev, i); 2728 return; 2729 } 2730 2731 dq.waiter = NULL; 2732 atomic_set(&dq.refcount, 0); 2733 dq.worker = &worker; 2734 for (i = dev->queue_count - 1; i > 0; i--) { 2735 struct nvme_queue *nvmeq = dev->queues[i]; 2736 2737 if (nvme_suspend_queue(nvmeq)) 2738 continue; 2739 nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); 2740 nvmeq->cmdinfo.worker = dq.worker; 2741 init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); 2742 queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); 2743 } 2744 nvme_wait_dq(&dq, dev); 2745 kthread_stop(kworker_task); 2746} 2747 2748/* 2749* Remove the node from the device list and check 2750* for whether or not we need to stop the nvme_thread. 2751*/ 2752static void nvme_dev_list_remove(struct nvme_dev *dev) 2753{ 2754 struct task_struct *tmp = NULL; 2755 2756 spin_lock(&dev_list_lock); 2757 list_del_init(&dev->node); 2758 if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { 2759 tmp = nvme_thread; 2760 nvme_thread = NULL; 2761 } 2762 spin_unlock(&dev_list_lock); 2763 2764 if (tmp) 2765 kthread_stop(tmp); 2766} 2767 2768static void nvme_freeze_queues(struct nvme_dev *dev) 2769{ 2770 struct nvme_ns *ns; 2771 2772 list_for_each_entry(ns, &dev->namespaces, list) { 2773 blk_mq_freeze_queue_start(ns->queue); 2774 2775 spin_lock_irq(ns->queue->queue_lock); 2776 queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); 2777 spin_unlock_irq(ns->queue->queue_lock); 2778 2779 blk_mq_cancel_requeue_work(ns->queue); 2780 blk_mq_stop_hw_queues(ns->queue); 2781 } 2782} 2783 2784static void nvme_unfreeze_queues(struct nvme_dev *dev) 2785{ 2786 struct nvme_ns *ns; 2787 2788 list_for_each_entry(ns, &dev->namespaces, list) { 2789 queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); 2790 blk_mq_unfreeze_queue(ns->queue); 2791 blk_mq_start_stopped_hw_queues(ns->queue, true); 2792 blk_mq_kick_requeue_list(ns->queue); 2793 } 2794} 2795 2796static void nvme_dev_shutdown(struct nvme_dev *dev) 2797{ 2798 int i; 2799 u32 csts = -1; 2800 2801 nvme_dev_list_remove(dev); 2802 2803 if (dev->bar) { 2804 nvme_freeze_queues(dev); 2805 csts = readl(&dev->bar->csts); 2806 } 2807 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { 2808 for (i = dev->queue_count - 1; i >= 0; i--) { 2809 struct nvme_queue *nvmeq = dev->queues[i]; 2810 nvme_suspend_queue(nvmeq); 2811 } 2812 } else { 2813 nvme_disable_io_queues(dev); 2814 nvme_shutdown_ctrl(dev); 2815 nvme_disable_queue(dev, 0); 2816 } 2817 nvme_dev_unmap(dev); 2818 2819 for (i = dev->queue_count - 1; i >= 0; i--) 2820 nvme_clear_queue(dev->queues[i]); 2821} 2822 2823static void nvme_dev_remove(struct nvme_dev *dev) 2824{ 2825 struct 
nvme_ns *ns; 2826 2827 list_for_each_entry(ns, &dev->namespaces, list) 2828 nvme_ns_remove(ns); 2829} 2830 2831static int nvme_setup_prp_pools(struct nvme_dev *dev) 2832{ 2833 dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, 2834 PAGE_SIZE, PAGE_SIZE, 0); 2835 if (!dev->prp_page_pool) 2836 return -ENOMEM; 2837 2838 /* Optimisation for I/Os between 4k and 128k */ 2839 dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, 2840 256, 256, 0); 2841 if (!dev->prp_small_pool) { 2842 dma_pool_destroy(dev->prp_page_pool); 2843 return -ENOMEM; 2844 } 2845 return 0; 2846} 2847 2848static void nvme_release_prp_pools(struct nvme_dev *dev) 2849{ 2850 dma_pool_destroy(dev->prp_page_pool); 2851 dma_pool_destroy(dev->prp_small_pool); 2852} 2853 2854static DEFINE_IDA(nvme_instance_ida); 2855 2856static int nvme_set_instance(struct nvme_dev *dev) 2857{ 2858 int instance, error; 2859 2860 do { 2861 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) 2862 return -ENODEV; 2863 2864 spin_lock(&dev_list_lock); 2865 error = ida_get_new(&nvme_instance_ida, &instance); 2866 spin_unlock(&dev_list_lock); 2867 } while (error == -EAGAIN); 2868 2869 if (error) 2870 return -ENODEV; 2871 2872 dev->instance = instance; 2873 return 0; 2874} 2875 2876static void nvme_release_instance(struct nvme_dev *dev) 2877{ 2878 spin_lock(&dev_list_lock); 2879 ida_remove(&nvme_instance_ida, dev->instance); 2880 spin_unlock(&dev_list_lock); 2881} 2882 2883static void nvme_free_namespaces(struct nvme_dev *dev) 2884{ 2885 struct nvme_ns *ns, *next; 2886 2887 list_for_each_entry_safe(ns, next, &dev->namespaces, list) 2888 nvme_free_namespace(ns); 2889} 2890 2891static void nvme_free_dev(struct kref *kref) 2892{ 2893 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); 2894 2895 put_device(dev->dev); 2896 put_device(dev->device); 2897 nvme_free_namespaces(dev); 2898 nvme_release_instance(dev); 2899 if (dev->tagset.tags) 2900 blk_mq_free_tag_set(&dev->tagset); 2901 if (dev->admin_q) 2902 blk_put_queue(dev->admin_q); 2903 kfree(dev->queues); 2904 kfree(dev->entry); 2905 kfree(dev); 2906} 2907 2908static int nvme_dev_open(struct inode *inode, struct file *f) 2909{ 2910 struct nvme_dev *dev; 2911 int instance = iminor(inode); 2912 int ret = -ENODEV; 2913 2914 spin_lock(&dev_list_lock); 2915 list_for_each_entry(dev, &dev_list, node) { 2916 if (dev->instance == instance) { 2917 if (!dev->admin_q) { 2918 ret = -EWOULDBLOCK; 2919 break; 2920 } 2921 if (!kref_get_unless_zero(&dev->kref)) 2922 break; 2923 f->private_data = dev; 2924 ret = 0; 2925 break; 2926 } 2927 } 2928 spin_unlock(&dev_list_lock); 2929 2930 return ret; 2931} 2932 2933static int nvme_dev_release(struct inode *inode, struct file *f) 2934{ 2935 struct nvme_dev *dev = f->private_data; 2936 kref_put(&dev->kref, nvme_free_dev); 2937 return 0; 2938} 2939 2940static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 2941{ 2942 struct nvme_dev *dev = f->private_data; 2943 struct nvme_ns *ns; 2944 2945 switch (cmd) { 2946 case NVME_IOCTL_ADMIN_CMD: 2947 return nvme_user_cmd(dev, NULL, (void __user *)arg); 2948 case NVME_IOCTL_IO_CMD: 2949 if (list_empty(&dev->namespaces)) 2950 return -ENOTTY; 2951 ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); 2952 return nvme_user_cmd(dev, ns, (void __user *)arg); 2953 case NVME_IOCTL_RESET: 2954 dev_warn(dev->dev, "resetting controller\n"); 2955 return nvme_reset(dev); 2956 case NVME_IOCTL_SUBSYS_RESET: 2957 return nvme_subsys_reset(dev); 2958 default: 2959 return -ENOTTY; 2960 } 2961} 
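
/*
 * Illustrative userspace sketch, not part of the driver. nvme_ioctl() above serves the
 * namespace block node (named "nvme%dn%d" in nvme_alloc_ns()), and nvme_dev_ioctl() serves
 * the per-controller character node created in nvme_probe(). The sketch below queries the
 * namespace id via NVME_IOCTL_ID and, only if asked, triggers NVME_IOCTL_RESET. The device
 * paths /dev/nvme0n1 and /dev/nvme0 and the "--reset" flag are assumptions for illustration;
 * <linux/nvme.h> must come from kernel headers that define these ioctls.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvme.h>        /* NVME_IOCTL_ID, NVME_IOCTL_RESET */

int main(int argc, char **argv)
{
        const char *ns_path = argc > 1 ? argv[1] : "/dev/nvme0n1"; /* namespace block node (assumed name) */
        const char *ctrl_path = "/dev/nvme0";                      /* controller char node (assumed name) */
        int do_reset = argc > 2 && strcmp(argv[2], "--reset") == 0;
        int fd, nsid;

        fd = open(ns_path, O_RDONLY);
        if (fd < 0) {
                perror(ns_path);
                return 1;
        }
        /*
         * NVME_IOCTL_ID is handled in nvme_ioctl() above: the namespace id is
         * returned directly as the syscall result (>= 1 on success).
         */
        nsid = ioctl(fd, NVME_IOCTL_ID);
        if (nsid < 0)
                perror("NVME_IOCTL_ID");
        else
                printf("%s: namespace id %d\n", ns_path, nsid);
        close(fd);

        if (!do_reset)
                return 0;

        /* NVME_IOCTL_RESET is dispatched by nvme_dev_ioctl() to nvme_reset(). */
        fd = open(ctrl_path, O_RDWR);
        if (fd < 0) {
                perror(ctrl_path);
                return 1;
        }
        if (ioctl(fd, NVME_IOCTL_RESET) < 0)
                perror("NVME_IOCTL_RESET");
        close(fd);
        return 0;
}

A note on the design: the reset is gated behind an explicit flag because NVME_IOCTL_RESET really schedules nvme_reset_failed_dev() on the controller, which tears down and rebuilds all I/O queues.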
2962 2963static const struct file_operations nvme_dev_fops = { 2964 .owner = THIS_MODULE, 2965 .open = nvme_dev_open, 2966 .release = nvme_dev_release, 2967 .unlocked_ioctl = nvme_dev_ioctl, 2968 .compat_ioctl = nvme_dev_ioctl, 2969}; 2970 2971static int nvme_dev_start(struct nvme_dev *dev) 2972{ 2973 int result; 2974 bool start_thread = false; 2975 2976 result = nvme_dev_map(dev); 2977 if (result) 2978 return result; 2979 2980 result = nvme_configure_admin_queue(dev); 2981 if (result) 2982 goto unmap; 2983 2984 spin_lock(&dev_list_lock); 2985 if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { 2986 start_thread = true; 2987 nvme_thread = NULL; 2988 } 2989 list_add(&dev->node, &dev_list); 2990 spin_unlock(&dev_list_lock); 2991 2992 if (start_thread) { 2993 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); 2994 wake_up_all(&nvme_kthread_wait); 2995 } else 2996 wait_event_killable(nvme_kthread_wait, nvme_thread); 2997 2998 if (IS_ERR_OR_NULL(nvme_thread)) { 2999 result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; 3000 goto disable; 3001 } 3002 3003 nvme_init_queue(dev->queues[0], 0); 3004 result = nvme_alloc_admin_tags(dev); 3005 if (result) 3006 goto disable; 3007 3008 result = nvme_setup_io_queues(dev); 3009 if (result) 3010 goto free_tags; 3011 3012 dev->event_limit = 1; 3013 return result; 3014 3015 free_tags: 3016 nvme_dev_remove_admin(dev); 3017 blk_put_queue(dev->admin_q); 3018 dev->admin_q = NULL; 3019 dev->queues[0]->tags = NULL; 3020 disable: 3021 nvme_disable_queue(dev, 0); 3022 nvme_dev_list_remove(dev); 3023 unmap: 3024 nvme_dev_unmap(dev); 3025 return result; 3026} 3027 3028static int nvme_remove_dead_ctrl(void *arg) 3029{ 3030 struct nvme_dev *dev = (struct nvme_dev *)arg; 3031 struct pci_dev *pdev = to_pci_dev(dev->dev); 3032 3033 if (pci_get_drvdata(pdev)) 3034 pci_stop_and_remove_bus_device_locked(pdev); 3035 kref_put(&dev->kref, nvme_free_dev); 3036 return 0; 3037} 3038 3039static void nvme_remove_disks(struct work_struct *ws) 3040{ 3041 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); 3042 3043 nvme_free_queues(dev, 1); 3044 nvme_dev_remove(dev); 3045} 3046 3047static int nvme_dev_resume(struct nvme_dev *dev) 3048{ 3049 int ret; 3050 3051 ret = nvme_dev_start(dev); 3052 if (ret) 3053 return ret; 3054 if (dev->online_queues < 2) { 3055 spin_lock(&dev_list_lock); 3056 dev->reset_workfn = nvme_remove_disks; 3057 queue_work(nvme_workq, &dev->reset_work); 3058 spin_unlock(&dev_list_lock); 3059 } else { 3060 nvme_unfreeze_queues(dev); 3061 nvme_dev_add(dev); 3062 } 3063 return 0; 3064} 3065 3066static void nvme_dead_ctrl(struct nvme_dev *dev) 3067{ 3068 dev_warn(dev->dev, "Device failed to resume\n"); 3069 kref_get(&dev->kref); 3070 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", 3071 dev->instance))) { 3072 dev_err(dev->dev, 3073 "Failed to start controller remove task\n"); 3074 kref_put(&dev->kref, nvme_free_dev); 3075 } 3076} 3077 3078static void nvme_dev_reset(struct nvme_dev *dev) 3079{ 3080 bool in_probe = work_busy(&dev->probe_work); 3081 3082 nvme_dev_shutdown(dev); 3083 3084 /* Synchronize with device probe so that work will see failure status 3085 * and exit gracefully without trying to schedule another reset */ 3086 flush_work(&dev->probe_work); 3087 3088 /* Fail this device if reset occured during probe to avoid 3089 * infinite initialization loops. 
*/ 3090 if (in_probe) { 3091 nvme_dead_ctrl(dev); 3092 return; 3093 } 3094 /* Schedule device resume asynchronously so the reset work is available 3095 * to cleanup errors that may occur during reinitialization */ 3096 schedule_work(&dev->probe_work); 3097} 3098 3099static void nvme_reset_failed_dev(struct work_struct *ws) 3100{ 3101 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); 3102 nvme_dev_reset(dev); 3103} 3104 3105static void nvme_reset_workfn(struct work_struct *work) 3106{ 3107 struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); 3108 dev->reset_workfn(work); 3109} 3110 3111static int nvme_reset(struct nvme_dev *dev) 3112{ 3113 int ret = -EBUSY; 3114 3115 if (!dev->admin_q || blk_queue_dying(dev->admin_q)) 3116 return -ENODEV; 3117 3118 spin_lock(&dev_list_lock); 3119 if (!work_pending(&dev->reset_work)) { 3120 dev->reset_workfn = nvme_reset_failed_dev; 3121 queue_work(nvme_workq, &dev->reset_work); 3122 ret = 0; 3123 } 3124 spin_unlock(&dev_list_lock); 3125 3126 if (!ret) { 3127 flush_work(&dev->reset_work); 3128 flush_work(&dev->probe_work); 3129 return 0; 3130 } 3131 3132 return ret; 3133} 3134 3135static ssize_t nvme_sysfs_reset(struct device *dev, 3136 struct device_attribute *attr, const char *buf, 3137 size_t count) 3138{ 3139 struct nvme_dev *ndev = dev_get_drvdata(dev); 3140 int ret; 3141 3142 ret = nvme_reset(ndev); 3143 if (ret < 0) 3144 return ret; 3145 3146 return count; 3147} 3148static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); 3149 3150static void nvme_async_probe(struct work_struct *work); 3151static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 3152{ 3153 int node, result = -ENOMEM; 3154 struct nvme_dev *dev; 3155 3156 node = dev_to_node(&pdev->dev); 3157 if (node == NUMA_NO_NODE) 3158 set_dev_node(&pdev->dev, 0); 3159 3160 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); 3161 if (!dev) 3162 return -ENOMEM; 3163 dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), 3164 GFP_KERNEL, node); 3165 if (!dev->entry) 3166 goto free; 3167 dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), 3168 GFP_KERNEL, node); 3169 if (!dev->queues) 3170 goto free; 3171 3172 INIT_LIST_HEAD(&dev->namespaces); 3173 dev->reset_workfn = nvme_reset_failed_dev; 3174 INIT_WORK(&dev->reset_work, nvme_reset_workfn); 3175 dev->dev = get_device(&pdev->dev); 3176 pci_set_drvdata(pdev, dev); 3177 result = nvme_set_instance(dev); 3178 if (result) 3179 goto put_pci; 3180 3181 result = nvme_setup_prp_pools(dev); 3182 if (result) 3183 goto release; 3184 3185 kref_init(&dev->kref); 3186 dev->device = device_create(nvme_class, &pdev->dev, 3187 MKDEV(nvme_char_major, dev->instance), 3188 dev, "nvme%d", dev->instance); 3189 if (IS_ERR(dev->device)) { 3190 result = PTR_ERR(dev->device); 3191 goto release_pools; 3192 } 3193 get_device(dev->device); 3194 dev_set_drvdata(dev->device, dev); 3195 3196 result = device_create_file(dev->device, &dev_attr_reset_controller); 3197 if (result) 3198 goto put_dev; 3199 3200 INIT_LIST_HEAD(&dev->node); 3201 INIT_WORK(&dev->scan_work, nvme_dev_scan); 3202 INIT_WORK(&dev->probe_work, nvme_async_probe); 3203 schedule_work(&dev->probe_work); 3204 return 0; 3205 3206 put_dev: 3207 device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); 3208 put_device(dev->device); 3209 release_pools: 3210 nvme_release_prp_pools(dev); 3211 release: 3212 nvme_release_instance(dev); 3213 put_pci: 3214 put_device(dev->dev); 3215 free: 3216 kfree(dev->queues); 3217 
kfree(dev->entry); 3218 kfree(dev); 3219 return result; 3220} 3221 3222static void nvme_async_probe(struct work_struct *work) 3223{ 3224 struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); 3225 3226 if (nvme_dev_resume(dev) && !work_busy(&dev->reset_work)) 3227 nvme_dead_ctrl(dev); 3228} 3229 3230static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) 3231{ 3232 struct nvme_dev *dev = pci_get_drvdata(pdev); 3233 3234 if (prepare) 3235 nvme_dev_shutdown(dev); 3236 else 3237 nvme_dev_resume(dev); 3238} 3239 3240static void nvme_shutdown(struct pci_dev *pdev) 3241{ 3242 struct nvme_dev *dev = pci_get_drvdata(pdev); 3243 nvme_dev_shutdown(dev); 3244} 3245 3246static void nvme_remove(struct pci_dev *pdev) 3247{ 3248 struct nvme_dev *dev = pci_get_drvdata(pdev); 3249 3250 spin_lock(&dev_list_lock); 3251 list_del_init(&dev->node); 3252 spin_unlock(&dev_list_lock); 3253 3254 pci_set_drvdata(pdev, NULL); 3255 flush_work(&dev->probe_work); 3256 flush_work(&dev->reset_work); 3257 flush_work(&dev->scan_work); 3258 device_remove_file(dev->device, &dev_attr_reset_controller); 3259 nvme_dev_remove(dev); 3260 nvme_dev_shutdown(dev); 3261 nvme_dev_remove_admin(dev); 3262 device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); 3263 nvme_free_queues(dev, 0); 3264 nvme_release_cmb(dev); 3265 nvme_release_prp_pools(dev); 3266 kref_put(&dev->kref, nvme_free_dev); 3267} 3268 3269/* These functions are yet to be implemented */ 3270#define nvme_error_detected NULL 3271#define nvme_dump_registers NULL 3272#define nvme_link_reset NULL 3273#define nvme_slot_reset NULL 3274#define nvme_error_resume NULL 3275 3276#ifdef CONFIG_PM_SLEEP 3277static int nvme_suspend(struct device *dev) 3278{ 3279 struct pci_dev *pdev = to_pci_dev(dev); 3280 struct nvme_dev *ndev = pci_get_drvdata(pdev); 3281 3282 nvme_dev_shutdown(ndev); 3283 return 0; 3284} 3285 3286static int nvme_resume(struct device *dev) 3287{ 3288 struct pci_dev *pdev = to_pci_dev(dev); 3289 struct nvme_dev *ndev = pci_get_drvdata(pdev); 3290 3291 if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) { 3292 ndev->reset_workfn = nvme_reset_failed_dev; 3293 queue_work(nvme_workq, &ndev->reset_work); 3294 } 3295 return 0; 3296} 3297#endif 3298 3299static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); 3300 3301static const struct pci_error_handlers nvme_err_handler = { 3302 .error_detected = nvme_error_detected, 3303 .mmio_enabled = nvme_dump_registers, 3304 .link_reset = nvme_link_reset, 3305 .slot_reset = nvme_slot_reset, 3306 .resume = nvme_error_resume, 3307 .reset_notify = nvme_reset_notify, 3308}; 3309 3310/* Move to pci_ids.h later */ 3311#define PCI_CLASS_STORAGE_EXPRESS 0x010802 3312 3313static const struct pci_device_id nvme_id_table[] = { 3314 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 3315 { 0, } 3316}; 3317MODULE_DEVICE_TABLE(pci, nvme_id_table); 3318 3319static struct pci_driver nvme_driver = { 3320 .name = "nvme", 3321 .id_table = nvme_id_table, 3322 .probe = nvme_probe, 3323 .remove = nvme_remove, 3324 .shutdown = nvme_shutdown, 3325 .driver = { 3326 .pm = &nvme_dev_pm_ops, 3327 }, 3328 .err_handler = &nvme_err_handler, 3329}; 3330 3331static int __init nvme_init(void) 3332{ 3333 int result; 3334 3335 init_waitqueue_head(&nvme_kthread_wait); 3336 3337 nvme_workq = create_singlethread_workqueue("nvme"); 3338 if (!nvme_workq) 3339 return -ENOMEM; 3340 3341 result = register_blkdev(nvme_major, "nvme"); 3342 if (result < 0) 3343 goto kill_workq; 3344 else if (result > 0) 3345 
nvme_major = result; 3346 3347 result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", 3348 &nvme_dev_fops); 3349 if (result < 0) 3350 goto unregister_blkdev; 3351 else if (result > 0) 3352 nvme_char_major = result; 3353 3354 nvme_class = class_create(THIS_MODULE, "nvme"); 3355 if (IS_ERR(nvme_class)) { 3356 result = PTR_ERR(nvme_class); 3357 goto unregister_chrdev; 3358 } 3359 3360 result = pci_register_driver(&nvme_driver); 3361 if (result) 3362 goto destroy_class; 3363 return 0; 3364 3365 destroy_class: 3366 class_destroy(nvme_class); 3367 unregister_chrdev: 3368 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 3369 unregister_blkdev: 3370 unregister_blkdev(nvme_major, "nvme"); 3371 kill_workq: 3372 destroy_workqueue(nvme_workq); 3373 return result; 3374} 3375 3376static void __exit nvme_exit(void) 3377{ 3378 pci_unregister_driver(&nvme_driver); 3379 unregister_blkdev(nvme_major, "nvme"); 3380 destroy_workqueue(nvme_workq); 3381 class_destroy(nvme_class); 3382 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 3383 BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); 3384 _nvme_check_size(); 3385} 3386 3387MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); 3388MODULE_LICENSE("GPL"); 3389MODULE_VERSION("1.0"); 3390module_init(nvme_init); 3391module_exit(nvme_exit);
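
/*
 * Illustrative userspace sketch, not part of the driver. The reset_controller attribute
 * registered in nvme_probe() (class "nvme", device name "nvme%d", dev_attr_reset_controller)
 * reaches the same nvme_reset() path as the ioctl shown earlier, but through sysfs. The
 * instance number 0 in the path is an assumption; the attribute is created with S_IWUSR,
 * so the write normally requires root.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        /* Controller instance 0 assumed; adjust the instance number as needed. */
        const char *path = argc > 1 ? argv[1] :
                           "/sys/class/nvme/nvme0/reset_controller";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return 1;
        }
        /* nvme_sysfs_reset() ignores the written value; any successful write schedules a reset. */
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}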