/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/nvme.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/list_sort.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_MINORS         (1U << MINORBITS)
#define NVME_Q_DEPTH        1024
#define NVME_AQ_DEPTH       256
#define SQ_SIZE(depth)      ((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)      ((depth) * sizeof(struct nvme_completion))
#define ADMIN_TIMEOUT       (admin_timeout * HZ)
#define SHUTDOWN_TIMEOUT    (shutdown_timeout * HZ)

static unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static int nvme_major;
module_param(nvme_major, int, 0);

static int nvme_char_major;
module_param(nvme_char_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

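/*
 * The parameters declared with 0644 permissions above are adjustable both
 * at load time and at runtime through sysfs.  A minimal sketch, assuming
 * the driver is built as a module named "nvme" (values are examples):
 *
 *	modprobe nvme io_timeout=60 use_cmb_sqes=0
 *	echo 120 > /sys/module/nvme/parameters/io_timeout
 */
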
static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;

static struct class *nvme_class;

static void nvme_reset_failed_dev(struct work_struct *ws);
static int nvme_reset(struct nvme_dev *dev);
static int nvme_process_cq(struct nvme_queue *nvmeq);

struct async_cmd_info {
    struct kthread_work work;
    struct kthread_worker *worker;
    struct request *req;
    u32 result;
    int status;
    void *ctx;
};

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
    struct device *q_dmadev;
    struct nvme_dev *dev;
    char irqname[24];   /* nvme4294967295-65535\0 */
    spinlock_t q_lock;
    struct nvme_command *sq_cmds;
    struct nvme_command __iomem *sq_cmds_io;
    volatile struct nvme_completion *cqes;
    struct blk_mq_tags **tags;
    dma_addr_t sq_dma_addr;
    dma_addr_t cq_dma_addr;
    u32 __iomem *q_db;
    u16 q_depth;
    s16 cq_vector;
    u16 sq_head;
    u16 sq_tail;
    u16 cq_head;
    u16 qid;
    u8 cq_phase;
    u8 cqe_seen;
    struct async_cmd_info cmdinfo;
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
    BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
    BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
    BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
    BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
    BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
    BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
    BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
    BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
    BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
    BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
    BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
    BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
        struct nvme_completion *);

struct nvme_cmd_info {
    nvme_completion_fn fn;
    void *ctx;
    int aborted;
    struct nvme_queue *nvmeq;
    struct nvme_iod iod[0];
};

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES      2
#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size)
#define NVME_INT_MASK       0x01

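/*
 * iod->private stores the request pointer for an embedded iod.  struct
 * request is at least word aligned, so bit 0 of the pointer is always
 * clear; NVME_INT_MASK reuses that bit to flag "this iod lives inside the
 * request pdu" (see nvme_alloc_iod() and iod_should_kfree() below).
 */
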
/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
    unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);

    return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static unsigned int nvme_cmd_size(struct nvme_dev *dev)
{
    unsigned int ret = sizeof(struct nvme_cmd_info);

    ret += sizeof(struct nvme_iod);
    ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
    ret += sizeof(struct scatterlist) * NVME_INT_PAGES;

    return ret;
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
        unsigned int hctx_idx)
{
    struct nvme_dev *dev = data;
    struct nvme_queue *nvmeq = dev->queues[0];

    WARN_ON(hctx_idx != 0);
    WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
    WARN_ON(nvmeq->tags);

    hctx->driver_data = nvmeq;
    nvmeq->tags = &dev->admin_tagset.tags[0];
    return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
    struct nvme_queue *nvmeq = hctx->driver_data;

    nvmeq->tags = NULL;
}

static int nvme_admin_init_request(void *data, struct request *req,
        unsigned int hctx_idx, unsigned int rq_idx,
        unsigned int numa_node)
{
    struct nvme_dev *dev = data;
    struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
    struct nvme_queue *nvmeq = dev->queues[0];

    BUG_ON(!nvmeq);
    cmd->nvmeq = nvmeq;
    return 0;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
        unsigned int hctx_idx)
{
    struct nvme_dev *dev = data;
    struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

    if (!nvmeq->tags)
        nvmeq->tags = &dev->tagset.tags[hctx_idx];

    WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
    hctx->driver_data = nvmeq;
    return 0;
}

static int nvme_init_request(void *data, struct request *req,
        unsigned int hctx_idx, unsigned int rq_idx,
        unsigned int numa_node)
{
    struct nvme_dev *dev = data;
    struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
    struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

    BUG_ON(!nvmeq);
    cmd->nvmeq = nvmeq;
    return 0;
}

static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
        nvme_completion_fn handler)
{
    cmd->fn = handler;
    cmd->ctx = ctx;
    cmd->aborted = 0;
    blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
}

static void *iod_get_private(struct nvme_iod *iod)
{
    return (void *) (iod->private & ~0x1UL);
}

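/*
 * nvme_cmd_size() above sizes the blk-mq per-request payload so that a
 * small iod, its PRP list pointers and NVME_INT_PAGES scatterlist entries
 * are preallocated along with the request itself; typical small I/Os then
 * never allocate memory in the submission fast path.
 */
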
/*
 * If bit 0 is set, the iod is embedded in the request payload.
 */
static bool iod_should_kfree(struct nvme_iod *iod)
{
    return (iod->private & NVME_INT_MASK) == 0;
}

/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE        ((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED   (0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED   (0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID     (0x314 + CMD_CTX_BASE)

static void special_completion(struct nvme_queue *nvmeq, void *ctx,
        struct nvme_completion *cqe)
{
    if (ctx == CMD_CTX_CANCELLED)
        return;
    if (ctx == CMD_CTX_COMPLETED) {
        dev_warn(nvmeq->q_dmadev,
                "completed id %d twice on queue %d\n",
                cqe->command_id, le16_to_cpup(&cqe->sq_id));
        return;
    }
    if (ctx == CMD_CTX_INVALID) {
        dev_warn(nvmeq->q_dmadev,
                "invalid id %d completed on queue %d\n",
                cqe->command_id, le16_to_cpup(&cqe->sq_id));
        return;
    }
    dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
}

static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
{
    void *ctx;

    if (fn)
        *fn = cmd->fn;
    ctx = cmd->ctx;
    cmd->fn = special_completion;
    cmd->ctx = CMD_CTX_CANCELLED;
    return ctx;
}

static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
        struct nvme_completion *cqe)
{
    u32 result = le32_to_cpup(&cqe->result);
    u16 status = le16_to_cpup(&cqe->status) >> 1;

    if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
        ++nvmeq->dev->event_limit;
    if (status != NVME_SC_SUCCESS)
        return;

    switch (result & 0xff07) {
    case NVME_AER_NOTICE_NS_CHANGED:
        dev_info(nvmeq->q_dmadev, "rescanning\n");
        schedule_work(&nvmeq->dev->scan_work);
        break;
    default:
        dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
    }
}

static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
        struct nvme_completion *cqe)
{
    struct request *req = ctx;

    u16 status = le16_to_cpup(&cqe->status) >> 1;
    u32 result = le32_to_cpup(&cqe->result);

    blk_mq_free_request(req);

    dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
    ++nvmeq->dev->abort_limit;
}

static void async_completion(struct nvme_queue *nvmeq, void *ctx,
        struct nvme_completion *cqe)
{
    struct async_cmd_info *cmdinfo = ctx;

    cmdinfo->result = le32_to_cpup(&cqe->result);
    cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
    queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
    blk_mq_free_request(cmdinfo->req);
}

static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
        unsigned int tag)
{
    struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);

    return blk_mq_rq_to_pdu(req);
}

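/*
 * The CMD_CTX_* values above are built on POISON_POINTER_DELTA so they can
 * never collide with a real ctx pointer and will fault if dereferenced.
 * A stale or duplicated completion therefore shows up loudly in
 * special_completion() instead of silently corrupting memory.
 */
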
/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
        nvme_completion_fn *fn)
{
    struct nvme_cmd_info *cmd;
    void *ctx;

    if (tag >= nvmeq->q_depth) {
        *fn = special_completion;
        return CMD_CTX_INVALID;
    }

    cmd = get_cmd_from_tag(nvmeq, tag);
    if (fn)
        *fn = cmd->fn;
    ctx = cmd->ctx;
    cmd->fn = special_completion;
    cmd->ctx = CMD_CTX_COMPLETED;
    return ctx;
}

/**
 * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
        struct nvme_command *cmd)
{
    u16 tail = nvmeq->sq_tail;

    if (nvmeq->sq_cmds_io)
        memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
    else
        memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));

    if (++tail == nvmeq->q_depth)
        tail = 0;
    writel(tail, nvmeq->q_db);
    nvmeq->sq_tail = tail;
}

static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
    unsigned long flags;

    spin_lock_irqsave(&nvmeq->q_lock, flags);
    __nvme_submit_cmd(nvmeq, cmd);
    spin_unlock_irqrestore(&nvmeq->q_lock, flags);
}

static __le64 **iod_list(struct nvme_iod *iod)
{
    return ((void *)iod) + iod->offset;
}

static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
        unsigned nseg, unsigned long private)
{
    iod->private = private;
    iod->offset = offsetof(struct nvme_iod, sg[nseg]);
    iod->npages = -1;
    iod->length = nbytes;
    iod->nents = 0;
}

static struct nvme_iod *
__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
        unsigned long priv, gfp_t gfp)
{
    struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
            sizeof(__le64 *) * nvme_npages(bytes, dev) +
            sizeof(struct scatterlist) * nseg, gfp);

    if (iod)
        iod_init(iod, bytes, nseg, priv);

    return iod;
}

static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
        gfp_t gfp)
{
    unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
            sizeof(struct nvme_dsm_range);
    struct nvme_iod *iod;

    if (rq->nr_phys_segments <= NVME_INT_PAGES &&
            size <= NVME_INT_BYTES(dev)) {
        struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);

        iod = cmd->iod;
        iod_init(iod, size, rq->nr_phys_segments,
                (unsigned long) rq | NVME_INT_MASK);
        return iod;
    }

    return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
            (unsigned long) rq, gfp);
}

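/*
 * When a transfer needs more than one PRP list page, nvme_setup_prps()
 * links the pages together: the last 8-byte slot of each page holds the
 * DMA address of the next page rather than a data pointer.  nvme_free_iod()
 * below walks that chain to return every page to its pool.
 */
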
static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
    const int last_prp = dev->page_size / 8 - 1;
    int i;
    __le64 **list = iod_list(iod);
    dma_addr_t prp_dma = iod->first_dma;

    if (iod->npages == 0)
        dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
    for (i = 0; i < iod->npages; i++) {
        __le64 *prp_list = list[i];
        dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);

        dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
        prp_dma = next_prp_dma;
    }

    if (iod_should_kfree(iod))
        kfree(iod);
}

static int nvme_error_status(u16 status)
{
    switch (status & 0x7ff) {
    case NVME_SC_SUCCESS:
        return 0;
    case NVME_SC_CAP_EXCEEDED:
        return -ENOSPC;
    default:
        return -EIO;
    }
}

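/*
 * T10 protection information (DIF) support.  With PI types 1 and 2 the
 * controller checks an incrementing reference tag against the physical
 * LBA of each block.  The block layer seeds the tags from the virtual
 * sector a bio was submitted against, which differs from the physical
 * sector once partitions or MD/DM remapping are involved, so the tags
 * have to be rewritten on the way down and restored on the way up.
 */
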
#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
    if (be32_to_cpu(pi->ref_tag) == v)
        pi->ref_tag = cpu_to_be32(p);
}

static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
    if (be32_to_cpu(pi->ref_tag) == p)
        pi->ref_tag = cpu_to_be32(v);
}

/**
 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
 *
 * The virtual start sector is the one that was originally submitted by the
 * block layer.  Due to partitioning, MD/DM cloning, etc. the actual physical
 * start sector may be different.  Remap protection information to match the
 * physical LBA on writes, and back to the original seed on reads.
 *
 * Type 0 and 3 do not have a ref tag, so no remapping required.
 */
static void nvme_dif_remap(struct request *req,
        void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
    struct nvme_ns *ns = req->rq_disk->private_data;
    struct bio_integrity_payload *bip;
    struct t10_pi_tuple *pi;
    void *p, *pmap;
    u32 i, nlb, ts, phys, virt;

    if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
        return;

    bip = bio_integrity(req->bio);
    if (!bip)
        return;

    pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;

    p = pmap;
    virt = bip_get_seed(bip);
    phys = nvme_block_nr(ns, blk_rq_pos(req));
    nlb = (blk_rq_bytes(req) >> ns->lba_shift);
    ts = ns->disk->integrity->tuple_size;

    for (i = 0; i < nlb; i++, virt++, phys++) {
        pi = (struct t10_pi_tuple *)p;
        dif_swap(phys, virt, pi);
        p += ts;
    }
    kunmap_atomic(pmap);
}

static int nvme_noop_verify(struct blk_integrity_iter *iter)
{
    return 0;
}

static int nvme_noop_generate(struct blk_integrity_iter *iter)
{
    return 0;
}

struct blk_integrity nvme_meta_noop = {
    .name        = "NVME_META_NOOP",
    .generate_fn = nvme_noop_generate,
    .verify_fn   = nvme_noop_verify,
};

static void nvme_init_integrity(struct nvme_ns *ns)
{
    struct blk_integrity integrity;

    switch (ns->pi_type) {
    case NVME_NS_DPS_PI_TYPE3:
        integrity = t10_pi_type3_crc;
        break;
    case NVME_NS_DPS_PI_TYPE1:
    case NVME_NS_DPS_PI_TYPE2:
        integrity = t10_pi_type1_crc;
        break;
    default:
        integrity = nvme_meta_noop;
        break;
    }
    integrity.tuple_size = ns->ms;
    blk_integrity_register(ns->disk, &integrity);
    blk_queue_max_integrity_segments(ns->queue, 1);
}
#else /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_dif_remap(struct request *req,
        void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
}
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_init_integrity(struct nvme_ns *ns)
{
}
#endif

static void req_completion(struct nvme_queue *nvmeq, void *ctx,
        struct nvme_completion *cqe)
{
    struct nvme_iod *iod = ctx;
    struct request *req = iod_get_private(iod);
    struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
    u16 status = le16_to_cpup(&cqe->status) >> 1;
    bool requeue = false;
    int error = 0;

    if (unlikely(status)) {
        if (!(status & NVME_SC_DNR || blk_noretry_request(req))
                && (jiffies - req->start_time) < req->timeout) {
            unsigned long flags;

            requeue = true;
            blk_mq_requeue_request(req);
            spin_lock_irqsave(req->q->queue_lock, flags);
            if (!blk_queue_stopped(req->q))
                blk_mq_kick_requeue_list(req->q);
            spin_unlock_irqrestore(req->q->queue_lock, flags);
            goto release_iod;
        }

        if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
            if (cmd_rq->ctx == CMD_CTX_CANCELLED)
                error = -EINTR;
            else
                error = status;
        } else {
            error = nvme_error_status(status);
        }
    }

    if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
        u32 result = le32_to_cpup(&cqe->result);

        req->special = (void *)(uintptr_t)result;
    }

    if (cmd_rq->aborted)
        dev_warn(nvmeq->dev->dev,
                "completing aborted command with status:%04x\n",
                error);

release_iod:
    if (iod->nents) {
        dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
                rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
        if (blk_integrity_rq(req)) {
            if (!rq_data_dir(req))
                nvme_dif_remap(req, nvme_dif_complete);
            dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
                    rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
        }
    }
    nvme_free_iod(nvmeq->dev, iod);

    if (likely(!requeue))
        blk_mq_complete_request(req, error);
}

/* length is in bytes.  gfp flags indicates whether we may sleep. */
static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
        int total_len, gfp_t gfp)
{
    struct dma_pool *pool;
    int length = total_len;
    struct scatterlist *sg = iod->sg;
    int dma_len = sg_dma_len(sg);
    u64 dma_addr = sg_dma_address(sg);
    u32 page_size = dev->page_size;
    int offset = dma_addr & (page_size - 1);
    __le64 *prp_list;
    __le64 **list = iod_list(iod);
    dma_addr_t prp_dma;
    int nprps, i;

    length -= (page_size - offset);
    if (length <= 0)
        return total_len;

    dma_len -= (page_size - offset);
    if (dma_len) {
        dma_addr += (page_size - offset);
    } else {
        sg = sg_next(sg);
        dma_addr = sg_dma_address(sg);
        dma_len = sg_dma_len(sg);
    }

    if (length <= page_size) {
        iod->first_dma = dma_addr;
        return total_len;
    }

    nprps = DIV_ROUND_UP(length, page_size);
    if (nprps <= (256 / 8)) {
        pool = dev->prp_small_pool;
        iod->npages = 0;
    } else {
        pool = dev->prp_page_pool;
        iod->npages = 1;
    }

    prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
    if (!prp_list) {
        iod->first_dma = dma_addr;
        iod->npages = -1;
        return (total_len - length) + page_size;
    }
    list[0] = prp_list;
    iod->first_dma = prp_dma;
    i = 0;
    for (;;) {
        if (i == page_size >> 3) {
            __le64 *old_prp_list = prp_list;

            prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
            if (!prp_list)
                return total_len - length;
            list[iod->npages++] = prp_list;
            prp_list[0] = old_prp_list[i - 1];
            old_prp_list[i - 1] = cpu_to_le64(prp_dma);
            i = 1;
        }
        prp_list[i++] = cpu_to_le64(dma_addr);
        dma_len -= page_size;
        dma_addr += page_size;
        length -= page_size;
        if (length <= 0)
            break;
        if (dma_len > 0)
            continue;
        BUG_ON(dma_len < 0);
        sg = sg_next(sg);
        dma_addr = sg_dma_address(sg);
        dma_len = sg_dma_len(sg);
    }

    return total_len;
}

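/*
 * PRP addressing recap for the submission helpers below: prp1 holds the
 * address of the first (possibly unaligned) data page.  If the transfer
 * fits within two pages, prp2 is simply the second page; anything larger
 * makes prp2 point at a PRP list built by nvme_setup_prps() above, with
 * one 8-byte entry per page and chained list pages as described earlier.
 */
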
static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
        struct nvme_iod *iod)
{
    struct nvme_command cmnd;

    memcpy(&cmnd, req->cmd, sizeof(cmnd));
    cmnd.rw.command_id = req->tag;
    if (req->nr_phys_segments) {
        cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
        cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
    }

    __nvme_submit_cmd(nvmeq, &cmnd);
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
        struct request *req, struct nvme_iod *iod)
{
    struct nvme_dsm_range *range =
            (struct nvme_dsm_range *)iod_list(iod)[0];
    struct nvme_command cmnd;

    range->cattr = cpu_to_le32(0);
    range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
    range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

    memset(&cmnd, 0, sizeof(cmnd));
    cmnd.dsm.opcode = nvme_cmd_dsm;
    cmnd.dsm.command_id = req->tag;
    cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
    cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
    cmnd.dsm.nr = 0;
    cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

    __nvme_submit_cmd(nvmeq, &cmnd);
}

static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
        int cmdid)
{
    struct nvme_command cmnd;

    memset(&cmnd, 0, sizeof(cmnd));
    cmnd.common.opcode = nvme_cmd_flush;
    cmnd.common.command_id = cmdid;
    cmnd.common.nsid = cpu_to_le32(ns->ns_id);

    __nvme_submit_cmd(nvmeq, &cmnd);
}

static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
        struct nvme_ns *ns)
{
    struct request *req = iod_get_private(iod);
    struct nvme_command cmnd;
    u16 control = 0;
    u32 dsmgmt = 0;

    if (req->cmd_flags & REQ_FUA)
        control |= NVME_RW_FUA;
    if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
        control |= NVME_RW_LR;

    if (req->cmd_flags & REQ_RAHEAD)
        dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

    memset(&cmnd, 0, sizeof(cmnd));
    cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
    cmnd.rw.command_id = req->tag;
    cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
    cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
    cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
    cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
    cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);

    if (ns->ms) {
        switch (ns->pi_type) {
        case NVME_NS_DPS_PI_TYPE3:
            control |= NVME_RW_PRINFO_PRCHK_GUARD;
            break;
        case NVME_NS_DPS_PI_TYPE1:
        case NVME_NS_DPS_PI_TYPE2:
            control |= NVME_RW_PRINFO_PRCHK_GUARD |
                    NVME_RW_PRINFO_PRCHK_REF;
            cmnd.rw.reftag = cpu_to_le32(
                    nvme_block_nr(ns, blk_rq_pos(req)));
            break;
        }
        if (blk_integrity_rq(req))
            cmnd.rw.metadata =
                    cpu_to_le64(sg_dma_address(iod->meta_sg));
        else
            control |= NVME_RW_PRINFO_PRACT;
    }

    cmnd.rw.control = cpu_to_le16(control);
    cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt);

    __nvme_submit_cmd(nvmeq, &cmnd);

    return 0;
}

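/*
 * Metadata handling note for nvme_queue_rq() below: a namespace formatted
 * with 8 bytes of metadata and protection info can have the controller
 * insert and strip the PI itself (PRACT, set above), so no host metadata
 * buffer is needed.  Any other metadata format requires the block layer
 * to supply an integrity buffer, otherwise the request is rejected.
 */
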
/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
        const struct blk_mq_queue_data *bd)
{
    struct nvme_ns *ns = hctx->queue->queuedata;
    struct nvme_queue *nvmeq = hctx->driver_data;
    struct nvme_dev *dev = nvmeq->dev;
    struct request *req = bd->rq;
    struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
    struct nvme_iod *iod;
    enum dma_data_direction dma_dir;

    /*
     * If formatted with metadata, require the block layer to provide a
     * buffer unless this namespace is formatted such that the metadata
     * can be stripped/generated by the controller with PRACT=1.
     */
    if (ns && ns->ms && !blk_integrity_rq(req)) {
        if (!(ns->pi_type && ns->ms == 8) &&
                req->cmd_type != REQ_TYPE_DRV_PRIV) {
            blk_mq_complete_request(req, -EFAULT);
            return BLK_MQ_RQ_QUEUE_OK;
        }
    }

    iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
    if (!iod)
        return BLK_MQ_RQ_QUEUE_BUSY;

    if (req->cmd_flags & REQ_DISCARD) {
        void *range;

        /*
         * We reuse the small pool to allocate the 16-byte range here
         * as it is not worth having a special pool for these or
         * additional cases to handle freeing the iod.
         */
        range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
                &iod->first_dma);
        if (!range)
            goto retry_cmd;
        iod_list(iod)[0] = (__le64 *)range;
        iod->npages = 0;
    } else if (req->nr_phys_segments) {
        dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;

        sg_init_table(iod->sg, req->nr_phys_segments);
        iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
        if (!iod->nents)
            goto error_cmd;

        if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
            goto retry_cmd;

        if (blk_rq_bytes(req) !=
                nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
            dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
            goto retry_cmd;
        }
        if (blk_integrity_rq(req)) {
            if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
                goto error_cmd;

            sg_init_table(iod->meta_sg, 1);
            if (blk_rq_map_integrity_sg(
                    req->q, req->bio, iod->meta_sg) != 1)
                goto error_cmd;

            if (rq_data_dir(req))
                nvme_dif_remap(req, nvme_dif_prep);

            if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
                goto error_cmd;
        }
    }

    nvme_set_info(cmd, iod, req_completion);
    spin_lock_irq(&nvmeq->q_lock);
    if (req->cmd_type == REQ_TYPE_DRV_PRIV)
        nvme_submit_priv(nvmeq, req, iod);
    else if (req->cmd_flags & REQ_DISCARD)
        nvme_submit_discard(nvmeq, ns, req, iod);
    else if (req->cmd_flags & REQ_FLUSH)
        nvme_submit_flush(nvmeq, ns, req->tag);
    else
        nvme_submit_iod(nvmeq, iod, ns);

    nvme_process_cq(nvmeq);
    spin_unlock_irq(&nvmeq->q_lock);
    return BLK_MQ_RQ_QUEUE_OK;

 error_cmd:
    nvme_free_iod(dev, iod);
    return BLK_MQ_RQ_QUEUE_ERROR;
 retry_cmd:
    nvme_free_iod(dev, iod);
    return BLK_MQ_RQ_QUEUE_BUSY;
}

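/*
 * Completion queue entries carry a phase tag that the controller inverts
 * on every pass through the ring.  nvme_process_cq() below therefore only
 * consumes entries whose phase bit matches the expected cq_phase, and it
 * toggles the expectation each time the head index wraps; no head pointer
 * ever needs to be read back from hardware.
 */
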
static int nvme_process_cq(struct nvme_queue *nvmeq)
{
    u16 head, phase;

    head = nvmeq->cq_head;
    phase = nvmeq->cq_phase;

    for (;;) {
        void *ctx;
        nvme_completion_fn fn;
        struct nvme_completion cqe = nvmeq->cqes[head];

        if ((le16_to_cpu(cqe.status) & 1) != phase)
            break;
        nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
        if (++head == nvmeq->q_depth) {
            head = 0;
            phase = !phase;
        }
        ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
        fn(nvmeq, ctx, &cqe);
    }

    /* If the controller ignores the cq head doorbell and continuously
     * writes to the queue, it is theoretically possible to wrap around
     * the queue twice and mistakenly return IRQ_NONE.  Linux only
     * requires that 0.1% of your interrupts are handled, so this isn't
     * a big problem.
     */
    if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
        return 0;

    writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
    nvmeq->cq_head = head;
    nvmeq->cq_phase = phase;

    nvmeq->cqe_seen = 1;
    return 1;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
    irqreturn_t result;
    struct nvme_queue *nvmeq = data;

    spin_lock(&nvmeq->q_lock);
    nvme_process_cq(nvmeq);
    result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
    nvmeq->cqe_seen = 0;
    spin_unlock(&nvmeq->q_lock);
    return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
    struct nvme_queue *nvmeq = data;
    struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];

    if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
        return IRQ_NONE;
    return IRQ_WAKE_THREAD;
}

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code.
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
        void *buffer, void __user *ubuffer, unsigned bufflen,
        u32 *result, unsigned timeout)
{
    bool write = cmd->common.opcode & 1;
    struct bio *bio = NULL;
    struct request *req;
    int ret;

    req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
    if (IS_ERR(req))
        return PTR_ERR(req);

    req->cmd_type = REQ_TYPE_DRV_PRIV;
    req->cmd_flags |= REQ_FAILFAST_DRIVER;
    req->__data_len = 0;
    req->__sector = (sector_t) -1;
    req->bio = req->biotail = NULL;

    req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

    req->cmd = (unsigned char *)cmd;
    req->cmd_len = sizeof(struct nvme_command);
    req->special = (void *)0;

    if (buffer && bufflen) {
        ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
        if (ret)
            goto out;
    } else if (ubuffer && bufflen) {
        ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
        if (ret)
            goto out;
        bio = req->bio;
    }

    blk_execute_rq(req->q, NULL, req, 0);
    if (bio)
        blk_rq_unmap_user(bio);
    if (result)
        *result = (u32)(uintptr_t)req->special;
    ret = req->errors;
 out:
    blk_mq_free_request(req);
    return ret;
}

int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
        void *buffer, unsigned bufflen)
{
    return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
}

static int nvme_submit_async_admin_req(struct nvme_dev *dev)
{
    struct nvme_queue *nvmeq = dev->queues[0];
    struct nvme_command c;
    struct nvme_cmd_info *cmd_info;
    struct request *req;

    req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
    if (IS_ERR(req))
        return PTR_ERR(req);

    req->cmd_flags |= REQ_NO_TIMEOUT;
    cmd_info = blk_mq_rq_to_pdu(req);
    nvme_set_info(cmd_info, NULL, async_req_completion);

    memset(&c, 0, sizeof(c));
    c.common.opcode = nvme_admin_async_event;
    c.common.command_id = req->tag;

    /*
     * Only the tag, already copied into the command, is used from here
     * on; the request itself is returned before the command is posted.
     */
    blk_mq_free_request(req);
    __nvme_submit_cmd(nvmeq, &c);
    return 0;
}

static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
        struct nvme_command *cmd,
        struct async_cmd_info *cmdinfo, unsigned timeout)
{
    struct nvme_queue *nvmeq = dev->queues[0];
    struct request *req;
    struct nvme_cmd_info *cmd_rq;

    req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
    if (IS_ERR(req))
        return PTR_ERR(req);

    req->timeout = timeout;
    cmd_rq = blk_mq_rq_to_pdu(req);
    cmdinfo->req = req;
    nvme_set_info(cmd_rq, cmdinfo, async_completion);
    cmdinfo->status = -EINTR;

    cmd->common.command_id = req->tag;

    nvme_submit_cmd(nvmeq, cmd);
    return 0;
}

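/*
 * Queue management helpers.  Completion queues are created before the
 * submission queues that post to them and deleted in the opposite order
 * (see nvme_create_queue() and nvme_disable_queue()), since a submission
 * queue must always reference a live completion queue.
 */
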
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
    struct nvme_command c;

    memset(&c, 0, sizeof(c));
    c.delete_queue.opcode = opcode;
    c.delete_queue.qid = cpu_to_le16(id);

    return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
        struct nvme_queue *nvmeq)
{
    struct nvme_command c;
    int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

    /*
     * Note: we (ab)use the fact that the prp fields survive if no data
     * is attached to the request.
     */
    memset(&c, 0, sizeof(c));
    c.create_cq.opcode = nvme_admin_create_cq;
    c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
    c.create_cq.cqid = cpu_to_le16(qid);
    c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
    c.create_cq.cq_flags = cpu_to_le16(flags);
    c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

    return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
        struct nvme_queue *nvmeq)
{
    struct nvme_command c;
    int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

    /*
     * Note: we (ab)use the fact that the prp fields survive if no data
     * is attached to the request.
     */
    memset(&c, 0, sizeof(c));
    c.create_sq.opcode = nvme_admin_create_sq;
    c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
    c.create_sq.sqid = cpu_to_le16(qid);
    c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
    c.create_sq.sq_flags = cpu_to_le16(flags);
    c.create_sq.cqid = cpu_to_le16(qid);

    return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
    return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
    return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
{
    struct nvme_command c = { };
    int error;

    /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
    c.identify.opcode = nvme_admin_identify;
    c.identify.cns = cpu_to_le32(1);

    *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
    if (!*id)
        return -ENOMEM;

    error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
            sizeof(struct nvme_id_ctrl));
    if (error)
        kfree(*id);
    return error;
}

int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
        struct nvme_id_ns **id)
{
    struct nvme_command c = { };
    int error;

    /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
    c.identify.opcode = nvme_admin_identify;
    c.identify.nsid = cpu_to_le32(nsid);

    *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
    if (!*id)
        return -ENOMEM;

    error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
            sizeof(struct nvme_id_ns));
    if (error)
        kfree(*id);
    return error;
}

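/*
 * Both identify helpers use the same opcode; the CNS field selects the
 * data structure.  CNS=1 returns the 4KB controller data structure, while
 * the default CNS=0 together with a namespace ID (as in nvme_identify_ns()
 * above) returns that namespace's structure.
 */
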
int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
        dma_addr_t dma_addr, u32 *result)
{
    struct nvme_command c;

    memset(&c, 0, sizeof(c));
    c.features.opcode = nvme_admin_get_features;
    c.features.nsid = cpu_to_le32(nsid);
    c.features.prp1 = cpu_to_le64(dma_addr);
    c.features.fid = cpu_to_le32(fid);

    return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
            result, 0);
}

int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
        dma_addr_t dma_addr, u32 *result)
{
    struct nvme_command c;

    memset(&c, 0, sizeof(c));
    c.features.opcode = nvme_admin_set_features;
    c.features.prp1 = cpu_to_le64(dma_addr);
    c.features.fid = cpu_to_le32(fid);
    c.features.dword11 = cpu_to_le32(dword11);

    return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
            result, 0);
}

int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
{
    struct nvme_command c = { };
    int error;

    c.common.opcode = nvme_admin_get_log_page;
    c.common.nsid = cpu_to_le32(0xFFFFFFFF);
    c.common.cdw10[0] = cpu_to_le32(
            (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
            NVME_LOG_SMART);

    *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
    if (!*log)
        return -ENOMEM;

    error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
            sizeof(struct nvme_smart_log));
    if (error)
        kfree(*log);
    return error;
}

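/*
 * Layout of cdw10 for Get Log Page, as built above: the low bits carry the
 * log identifier (NVME_LOG_SMART) and bits 27:16 the number of dwords to
 * return, encoded as a 0's based value; hence the (size / 4 - 1) << 16.
 * The all-ones nsid requests the controller-wide log.
 */
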
/**
 * nvme_abort_req - Attempt aborting a request
 *
 * Schedule controller reset if the command was already aborted once before and
 * still hasn't been returned to the driver, or if this is the admin queue.
 */
static void nvme_abort_req(struct request *req)
{
    struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
    struct nvme_queue *nvmeq = cmd_rq->nvmeq;
    struct nvme_dev *dev = nvmeq->dev;
    struct request *abort_req;
    struct nvme_cmd_info *abort_cmd;
    struct nvme_command cmd;

    if (!nvmeq->qid || cmd_rq->aborted) {
        unsigned long flags;

        spin_lock_irqsave(&dev_list_lock, flags);
        if (work_busy(&dev->reset_work))
            goto out;
        list_del_init(&dev->node);
        dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
                req->tag, nvmeq->qid);
        dev->reset_workfn = nvme_reset_failed_dev;
        queue_work(nvme_workq, &dev->reset_work);
 out:
        spin_unlock_irqrestore(&dev_list_lock, flags);
        return;
    }

    if (!dev->abort_limit)
        return;

    abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
            false);
    if (IS_ERR(abort_req))
        return;

    abort_cmd = blk_mq_rq_to_pdu(abort_req);
    nvme_set_info(abort_cmd, abort_req, abort_completion);

    memset(&cmd, 0, sizeof(cmd));
    cmd.abort.opcode = nvme_admin_abort_cmd;
    cmd.abort.cid = req->tag;
    cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
    cmd.abort.command_id = abort_req->tag;

    --dev->abort_limit;
    cmd_rq->aborted = 1;

    dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
            nvmeq->qid);
    nvme_submit_cmd(dev->queues[0], &cmd);
}

static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
{
    struct nvme_queue *nvmeq = data;
    void *ctx;
    nvme_completion_fn fn;
    struct nvme_cmd_info *cmd;
    struct nvme_completion cqe;

    if (!blk_mq_request_started(req))
        return;

    cmd = blk_mq_rq_to_pdu(req);

    if (cmd->ctx == CMD_CTX_CANCELLED)
        return;

    if (blk_queue_dying(req->q))
        cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
    else
        cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);

    dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
            req->tag, nvmeq->qid);
    ctx = cancel_cmd_info(cmd, &fn);
    fn(nvmeq, ctx, &cqe);
}

static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
    struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
    struct nvme_queue *nvmeq = cmd->nvmeq;

    dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
            nvmeq->qid);
    spin_lock_irq(&nvmeq->q_lock);
    nvme_abort_req(req);
    spin_unlock_irq(&nvmeq->q_lock);

    /*
     * The aborted req will be completed on receiving the abort req.
     * We enable the timer again.  If hit twice, it'll cause a device
     * reset, as the device is then in a faulty state.
     */
    return BLK_EH_RESET_TIMER;
}

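/*
 * nvme_abort_req() above is deliberately best effort: abort_limit tracks
 * the controller's advertised Abort Command Limit (set up elsewhere in
 * this driver), so once it is exhausted no further aborts are sent and
 * the timeout simply keeps being re-armed until the command completes or
 * a reset is escalated.
 */
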
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
    dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
            (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
    if (nvmeq->sq_cmds)
        dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
                nvmeq->sq_cmds, nvmeq->sq_dma_addr);
    kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
    int i;

    for (i = dev->queue_count - 1; i >= lowest; i--) {
        struct nvme_queue *nvmeq = dev->queues[i];

        dev->queue_count--;
        dev->queues[i] = NULL;
        nvme_free_queue(nvmeq);
    }
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq: queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
    int vector;

    spin_lock_irq(&nvmeq->q_lock);
    if (nvmeq->cq_vector == -1) {
        spin_unlock_irq(&nvmeq->q_lock);
        return 1;
    }
    vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
    nvmeq->dev->online_queues--;
    nvmeq->cq_vector = -1;
    spin_unlock_irq(&nvmeq->q_lock);

    if (!nvmeq->qid && nvmeq->dev->admin_q)
        blk_mq_freeze_queue_start(nvmeq->dev->admin_q);

    irq_set_affinity_hint(vector, NULL);
    free_irq(vector, nvmeq);

    return 0;
}

static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
    spin_lock_irq(&nvmeq->q_lock);
    if (nvmeq->tags && *nvmeq->tags)
        blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
    spin_unlock_irq(&nvmeq->q_lock);
}

static void nvme_disable_queue(struct nvme_dev *dev, int qid)
{
    struct nvme_queue *nvmeq = dev->queues[qid];

    if (!nvmeq)
        return;
    if (nvme_suspend_queue(nvmeq))
        return;

    /*
     * Don't tell the adapter to delete the admin queue.
     * Don't tell a removed adapter to delete IO queues.
     */
    if (qid && readl(&dev->bar->csts) != -1) {
        adapter_delete_sq(dev, qid);
        adapter_delete_cq(dev, qid);
    }

    spin_lock_irq(&nvmeq->q_lock);
    nvme_process_cq(nvmeq);
    spin_unlock_irq(&nvmeq->q_lock);
}

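/*
 * Queue sizing against the controller memory buffer: when I/O submission
 * queues are placed in the CMB, nvme_cmb_qdepth() below shrinks the queue
 * depth until all queues fit, and returns -ENOMEM if that would drop the
 * depth below 64 entries, at which point the queues are better placed in
 * host memory at the original depth.
 */
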
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
        int entry_size)
{
    int q_depth = dev->q_depth;
    unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size);

    if (q_size_aligned * nr_io_queues > dev->cmb_size) {
        u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);

        mem_per_q = round_down(mem_per_q, dev->page_size);
        q_depth = div_u64(mem_per_q, entry_size);

        /*
         * Ensure the reduced q_depth is above some threshold where it
         * would be better to map queues in system memory with the
         * original depth
         */
        if (q_depth < 64)
            return -ENOMEM;
    }

    return q_depth;
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
        int qid, int depth)
{
    if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
        unsigned offset = (qid - 1) *
                roundup(SQ_SIZE(depth), dev->page_size);

        nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
        nvmeq->sq_cmds_io = dev->cmb + offset;
    } else {
        nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
                &nvmeq->sq_dma_addr, GFP_KERNEL);
        if (!nvmeq->sq_cmds)
            return -ENOMEM;
    }

    return 0;
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
        int depth)
{
    struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);

    if (!nvmeq)
        return NULL;

    nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
            &nvmeq->cq_dma_addr, GFP_KERNEL);
    if (!nvmeq->cqes)
        goto free_nvmeq;

    if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
        goto free_cqdma;

    nvmeq->q_dmadev = dev->dev;
    nvmeq->dev = dev;
    snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
            dev->instance, qid);
    spin_lock_init(&nvmeq->q_lock);
    nvmeq->cq_head = 0;
    nvmeq->cq_phase = 1;
    nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
    nvmeq->q_depth = depth;
    nvmeq->qid = qid;
    nvmeq->cq_vector = -1;
    dev->queues[qid] = nvmeq;

    /* make sure queue descriptor is set before queue count, for kthread */
    mb();
    dev->queue_count++;

    return nvmeq;

 free_cqdma:
    dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
            nvmeq->cq_dma_addr);
 free_nvmeq:
    kfree(nvmeq);
    return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
        const char *name)
{
    if (use_threaded_interrupts)
        return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
                nvme_irq_check, nvme_irq, IRQF_SHARED,
                name, nvmeq);
    return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
            IRQF_SHARED, name, nvmeq);
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
    struct nvme_dev *dev = nvmeq->dev;

    spin_lock_irq(&nvmeq->q_lock);
    nvmeq->sq_tail = 0;
    nvmeq->cq_head = 0;
    nvmeq->cq_phase = 1;
    nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
    memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
    dev->online_queues++;
    spin_unlock_irq(&nvmeq->q_lock);
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
    struct nvme_dev *dev = nvmeq->dev;
    int result;

    nvmeq->cq_vector = qid - 1;
    result = adapter_alloc_cq(dev, qid, nvmeq);
    if (result < 0)
        return result;

    result = adapter_alloc_sq(dev, qid, nvmeq);
    if (result < 0)
        goto release_cq;

    result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
    if (result < 0)
        goto release_sq;

    nvme_init_queue(nvmeq, qid);
    return result;

 release_sq:
    adapter_delete_sq(dev, qid);
 release_cq:
    adapter_delete_cq(dev, qid);
    return result;
}

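/*
 * Doorbell layout used above: each queue pair owns two doorbells in BAR0,
 * spaced by the controller's doorbell stride.  q_db points at the
 * submission queue tail doorbell (slot qid * 2 * db_stride); the matching
 * completion queue head doorbell sits one stride further on, which is why
 * nvme_process_cq() writes to q_db + dev->db_stride.
 */
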
static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
    unsigned long timeout;
    u32 bit = enabled ? NVME_CSTS_RDY : 0;

    timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

    while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
        msleep(100);
        if (fatal_signal_pending(current))
            return -EINTR;
        if (time_after(jiffies, timeout)) {
            dev_err(dev->dev,
                    "Device not ready; aborting %s\n", enabled ?
                    "initialisation" : "reset");
            return -ENODEV;
        }
    }

    return 0;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
{
    dev->ctrl_config &= ~NVME_CC_SHN_MASK;
    dev->ctrl_config &= ~NVME_CC_ENABLE;
    writel(dev->ctrl_config, &dev->bar->cc);

    return nvme_wait_ready(dev, cap, false);
}

static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
    dev->ctrl_config &= ~NVME_CC_SHN_MASK;
    dev->ctrl_config |= NVME_CC_ENABLE;
    writel(dev->ctrl_config, &dev->bar->cc);

    return nvme_wait_ready(dev, cap, true);
}

static int nvme_shutdown_ctrl(struct nvme_dev *dev)
{
    unsigned long timeout;

    dev->ctrl_config &= ~NVME_CC_SHN_MASK;
    dev->ctrl_config |= NVME_CC_SHN_NORMAL;

    writel(dev->ctrl_config, &dev->bar->cc);

    timeout = SHUTDOWN_TIMEOUT + jiffies;
    while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
            NVME_CSTS_SHST_CMPLT) {
        msleep(100);
        if (fatal_signal_pending(current))
            return -EINTR;
        if (time_after(jiffies, timeout)) {
            dev_err(dev->dev,
                    "Device shutdown incomplete; abort shutdown\n");
            return -ENODEV;
        }
    }

    return 0;
}

static struct blk_mq_ops nvme_mq_admin_ops = {
    .queue_rq       = nvme_queue_rq,
    .map_queue      = blk_mq_map_queue,
    .init_hctx      = nvme_admin_init_hctx,
    .exit_hctx      = nvme_admin_exit_hctx,
    .init_request   = nvme_admin_init_request,
    .timeout        = nvme_timeout,
};

static struct blk_mq_ops nvme_mq_ops = {
    .queue_rq       = nvme_queue_rq,
    .map_queue      = blk_mq_map_queue,
    .init_hctx      = nvme_init_hctx,
    .init_request   = nvme_init_request,
    .timeout        = nvme_timeout,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
    if (dev->admin_q && !blk_queue_dying(dev->admin_q)) {
        blk_cleanup_queue(dev->admin_q);
        blk_mq_free_tag_set(&dev->admin_tagset);
    }
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
    if (!dev->admin_q) {
        dev->admin_tagset.ops = &nvme_mq_admin_ops;
        dev->admin_tagset.nr_hw_queues = 1;
        dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
        dev->admin_tagset.reserved_tags = 1;
        dev->admin_tagset.timeout = ADMIN_TIMEOUT;
        dev->admin_tagset.numa_node = dev_to_node(dev->dev);
        dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
        dev->admin_tagset.driver_data = dev;

        if (blk_mq_alloc_tag_set(&dev->admin_tagset))
            return -ENOMEM;

        dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
        if (IS_ERR(dev->admin_q)) {
            blk_mq_free_tag_set(&dev->admin_tagset);
            return -ENOMEM;
        }
        if (!blk_get_queue(dev->admin_q)) {
            nvme_dev_remove_admin(dev);
            dev->admin_q = NULL;
            return -ENODEV;
        }
    } else
        blk_mq_unfreeze_queue(dev->admin_q);

    return 0;
}

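/*
 * The admin tag set above advertises NVME_AQ_DEPTH - 1 usable tags and
 * reserves one of them: that reserved tag is what
 * nvme_submit_async_admin_req() allocates (reserved=true), so an async
 * event command can be posted even when the admin queue is otherwise full.
 */
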
static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
    int result;
    u32 aqa;
    u64 cap = readq(&dev->bar->cap);
    struct nvme_queue *nvmeq;
    unsigned page_shift = PAGE_SHIFT;
    unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
    unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;

    if (page_shift < dev_page_min) {
        dev_err(dev->dev,
                "Minimum device page size (%u) too large for "
                "host (%u)\n", 1 << dev_page_min,
                1 << page_shift);
        return -ENODEV;
    }
    if (page_shift > dev_page_max) {
        dev_info(dev->dev,
                "Device maximum page size (%u) smaller than "
                "host (%u); enabling work-around\n",
                1 << dev_page_max, 1 << page_shift);
        page_shift = dev_page_max;
    }

    dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ?
            NVME_CAP_NSSRC(cap) : 0;

    if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO))
        writel(NVME_CSTS_NSSRO, &dev->bar->csts);

    result = nvme_disable_ctrl(dev, cap);
    if (result < 0)
        return result;

    nvmeq = dev->queues[0];
    if (!nvmeq) {
        nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
        if (!nvmeq)
            return -ENOMEM;
    }

    aqa = nvmeq->q_depth - 1;
    aqa |= aqa << 16;

    dev->page_size = 1 << page_shift;

    dev->ctrl_config = NVME_CC_CSS_NVM;
    dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
    dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
    dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

    writel(aqa, &dev->bar->aqa);
    writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
    writeq(nvmeq->cq_dma_addr, &dev->bar->acq);

    result = nvme_enable_ctrl(dev, cap);
    if (result)
        goto free_nvmeq;

    nvmeq->cq_vector = 0;
    result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
    if (result) {
        nvmeq->cq_vector = -1;
        goto free_nvmeq;
    }

    return result;

 free_nvmeq:
    nvme_free_queues(dev, 0);
    return result;
}

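/*
 * Register programming done above: AQA takes the admin queue sizes as 0's
 * based values, submission queue size in the low half and completion queue
 * size in the high half, hence aqa |= aqa << 16 for equal depths.  CC
 * encodes the host memory page size as a power of two relative to 4KB,
 * which is where (page_shift - 12) << NVME_CC_MPS_SHIFT comes from.
 */
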
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
    struct nvme_dev *dev = ns->dev;
    struct nvme_user_io io;
    struct nvme_command c;
    unsigned length, meta_len;
    int status, write;
    dma_addr_t meta_dma = 0;
    void *meta = NULL;
    void __user *metadata;

    if (copy_from_user(&io, uio, sizeof(io)))
        return -EFAULT;

    switch (io.opcode) {
    case nvme_cmd_write:
    case nvme_cmd_read:
    case nvme_cmd_compare:
        break;
    default:
        return -EINVAL;
    }

    length = (io.nblocks + 1) << ns->lba_shift;
    meta_len = (io.nblocks + 1) * ns->ms;
    metadata = (void __user *)(uintptr_t)io.metadata;
    write = io.opcode & 1;

    if (ns->ext) {
        length += meta_len;
        meta_len = 0;
    }
    if (meta_len) {
        if (((io.metadata & 3) || !io.metadata) && !ns->ext)
            return -EINVAL;

        meta = dma_alloc_coherent(dev->dev, meta_len,
                &meta_dma, GFP_KERNEL);
        if (!meta) {
            status = -ENOMEM;
            goto unmap;
        }
        if (write) {
            if (copy_from_user(meta, metadata, meta_len)) {
                status = -EFAULT;
                goto unmap;
            }
        }
    }

    memset(&c, 0, sizeof(c));
    c.rw.opcode = io.opcode;
    c.rw.flags = io.flags;
    c.rw.nsid = cpu_to_le32(ns->ns_id);
    c.rw.slba = cpu_to_le64(io.slba);
    c.rw.length = cpu_to_le16(io.nblocks);
    c.rw.control = cpu_to_le16(io.control);
    c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
    c.rw.reftag = cpu_to_le32(io.reftag);
    c.rw.apptag = cpu_to_le16(io.apptag);
    c.rw.appmask = cpu_to_le16(io.appmask);
    c.rw.metadata = cpu_to_le64(meta_dma);

    status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
            (void __user *)(uintptr_t)io.addr, length, NULL, 0);
 unmap:
    if (meta) {
        if (status == NVME_SC_SUCCESS && !write) {
            if (copy_to_user(metadata, meta, meta_len))
                status = -EFAULT;
        }
        dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
    }
    return status;
}

static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
        struct nvme_passthru_cmd __user *ucmd)
{
    struct nvme_passthru_cmd cmd;
    struct nvme_command c;
    unsigned timeout = 0;
    int status;

    if (!capable(CAP_SYS_ADMIN))
        return -EACCES;
    if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
        return -EFAULT;

    memset(&c, 0, sizeof(c));
    c.common.opcode = cmd.opcode;
    c.common.flags = cmd.flags;
    c.common.nsid = cpu_to_le32(cmd.nsid);
    c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
    c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
    c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
    c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
    c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
    c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
    c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
    c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

    if (cmd.timeout_ms)
        timeout = msecs_to_jiffies(cmd.timeout_ms);

    status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
            NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
            &cmd.result, timeout);
    if (status >= 0) {
        if (put_user(cmd.result, &ucmd->result))
            return -EFAULT;
    }

    return status;
}

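/*
 * For reference, userspace drives nvme_user_cmd() through the
 * NVME_IOCTL_ADMIN_CMD / NVME_IOCTL_IO_CMD ioctls handled further down.
 * A minimal sketch (error handling omitted; the field values here are
 * illustrative examples only):
 *
 *	struct nvme_passthru_cmd cmd = {
 *		.opcode   = 0x06,			// Identify
 *		.addr     = (__u64)(uintptr_t)buf,	// 4096-byte buffer
 *		.data_len = 4096,
 *		.cdw10    = 1,				// CNS=1: controller
 *	};
 *	ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 *
 * The caller needs CAP_SYS_ADMIN, per the check above.
 */
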
static int nvme_subsys_reset(struct nvme_dev *dev)
{
    if (!dev->subsystem)
        return -ENOTTY;

    writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */
    return 0;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
        unsigned long arg)
{
    struct nvme_ns *ns = bdev->bd_disk->private_data;

    switch (cmd) {
    case NVME_IOCTL_ID:
        force_successful_syscall_return();
        return ns->ns_id;
    case NVME_IOCTL_ADMIN_CMD:
        return nvme_user_cmd(ns->dev, NULL, (void __user *)arg);
    case NVME_IOCTL_IO_CMD:
        return nvme_user_cmd(ns->dev, ns, (void __user *)arg);
    case NVME_IOCTL_SUBMIT_IO:
        return nvme_submit_io(ns, (void __user *)arg);
    case SG_GET_VERSION_NUM:
        return nvme_sg_get_version_num((void __user *)arg);
    case SG_IO:
        return nvme_sg_io(ns, (void __user *)arg);
    default:
        return -ENOTTY;
    }
}

#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
        unsigned int cmd, unsigned long arg)
{
    switch (cmd) {
    case SG_IO:
        return -ENOIOCTLCMD;
    }
    return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl NULL
#endif

static int nvme_open(struct block_device *bdev, fmode_t mode)
{
    int ret = 0;
    struct nvme_ns *ns;

    spin_lock(&dev_list_lock);
    ns = bdev->bd_disk->private_data;
    if (!ns)
        ret = -ENXIO;
    else if (!kref_get_unless_zero(&ns->dev->kref))
        ret = -ENXIO;
    spin_unlock(&dev_list_lock);

    return ret;
}

static void nvme_free_dev(struct kref *kref);

static void nvme_release(struct gendisk *disk, fmode_t mode)
{
    struct nvme_ns *ns = disk->private_data;
    struct nvme_dev *dev = ns->dev;

    kref_put(&dev->kref, nvme_free_dev);
}

static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
    /* some standard values */
    geo->heads = 1 << 6;
    geo->sectors = 1 << 5;
    geo->cylinders = get_capacity(bd->bd_disk) >> 11;
    return 0;
}

static void nvme_config_discard(struct nvme_ns *ns)
{
    u32 logical_block_size = queue_logical_block_size(ns->queue);

    ns->queue->limits.discard_zeroes_data = 0;
    ns->queue->limits.discard_alignment = logical_block_size;
    ns->queue->limits.discard_granularity = logical_block_size;
    blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
    queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}

static int nvme_revalidate_disk(struct gendisk *disk)
{
    struct nvme_ns *ns = disk->private_data;
    struct nvme_dev *dev = ns->dev;
    struct nvme_id_ns *id;
    u8 lbaf, pi_type;
    u16 old_ms;
    unsigned short bs;

    if (nvme_identify_ns(dev, ns->ns_id, &id)) {
        dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__,
                dev->instance, ns->ns_id);
        return -ENODEV;
    }
    if (id->ncap == 0) {
        kfree(id);
        return -ENODEV;
    }

    old_ms = ns->ms;
    lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
    ns->lba_shift = id->lbaf[lbaf].ds;
    ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
    ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);

    /*
     * If identify reported a zero LBA data size, fall back to the
     * default 512 byte block size so the block layer can be used
     * until reads and writes fail for the zero-capacity device.
     */
    if (ns->lba_shift == 0)
        ns->lba_shift = 9;
    bs = 1 << ns->lba_shift;

    /* XXX: PI implementation requires metadata equal t10 pi tuple size */
    pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
            id->dps & NVME_NS_DPS_PI_MASK : 0;

    if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
            ns->ms != old_ms ||
            bs != queue_logical_block_size(disk->queue) ||
            (ns->ms && ns->ext)))
        blk_integrity_unregister(disk);

    ns->pi_type = pi_type;
    blk_queue_logical_block_size(ns->queue, bs);

    if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) &&
            !ns->ext)
        nvme_init_integrity(ns);

    if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
        set_capacity(disk, 0);
    else
        set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

    if (dev->oncs & NVME_CTRL_ONCS_DSM)
        nvme_config_discard(ns);

    kfree(id);
    return 0;
}

NVME_NS_FLBAS_META_EXT); 2021 2022 /* 2023 * If Identify Namespace failed, use a default 512 byte block size so 2024 * the block layer can use the queue before failing reads/writes for 0 capacity. 2025 */ 2026 if (ns->lba_shift == 0) 2027 ns->lba_shift = 9; 2028 bs = 1 << ns->lba_shift; 2029 2030 /* XXX: PI implementation requires metadata size equal to the T10 PI tuple size */ 2031 pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 2032 id->dps & NVME_NS_DPS_PI_MASK : 0; 2033 2034 if (blk_get_integrity(disk) && (ns->pi_type != pi_type || 2035 ns->ms != old_ms || 2036 bs != queue_logical_block_size(disk->queue) || 2037 (ns->ms && ns->ext))) 2038 blk_integrity_unregister(disk); 2039 2040 ns->pi_type = pi_type; 2041 blk_queue_logical_block_size(ns->queue, bs); 2042 2043 if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) && 2044 !ns->ext) 2045 nvme_init_integrity(ns); 2046 2047 if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) 2048 set_capacity(disk, 0); 2049 else 2050 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); 2051 2052 if (dev->oncs & NVME_CTRL_ONCS_DSM) 2053 nvme_config_discard(ns); 2054 2055 kfree(id); 2056 return 0; 2057} 2058 2059static const struct block_device_operations nvme_fops = { 2060 .owner = THIS_MODULE, 2061 .ioctl = nvme_ioctl, 2062 .compat_ioctl = nvme_compat_ioctl, 2063 .open = nvme_open, 2064 .release = nvme_release, 2065 .getgeo = nvme_getgeo, 2066 .revalidate_disk= nvme_revalidate_disk, 2067}; 2068 2069static int nvme_kthread(void *data) 2070{ 2071 struct nvme_dev *dev, *next; 2072 2073 while (!kthread_should_stop()) { 2074 set_current_state(TASK_INTERRUPTIBLE); 2075 spin_lock(&dev_list_lock); 2076 list_for_each_entry_safe(dev, next, &dev_list, node) { 2077 int i; 2078 u32 csts = readl(&dev->bar->csts); 2079 2080 if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || 2081 csts & NVME_CSTS_CFS) { 2082 if (work_busy(&dev->reset_work)) 2083 continue; 2084 list_del_init(&dev->node); 2085 dev_warn(dev->dev, 2086 "Failed status: %x, reset controller\n", 2087 readl(&dev->bar->csts)); 2088 dev->reset_workfn = nvme_reset_failed_dev; 2089 queue_work(nvme_workq, &dev->reset_work); 2090 continue; 2091 } 2092 for (i = 0; i < dev->queue_count; i++) { 2093 struct nvme_queue *nvmeq = dev->queues[i]; 2094 if (!nvmeq) 2095 continue; 2096 spin_lock_irq(&nvmeq->q_lock); 2097 nvme_process_cq(nvmeq); 2098 2099 while ((i == 0) && (dev->event_limit > 0)) { 2100 if (nvme_submit_async_admin_req(dev)) 2101 break; 2102 dev->event_limit--; 2103 } 2104 spin_unlock_irq(&nvmeq->q_lock); 2105 } 2106 } 2107 spin_unlock(&dev_list_lock); 2108 schedule_timeout(round_jiffies_relative(HZ)); 2109 } 2110 return 0; 2111} 2112 2113static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) 2114{ 2115 struct nvme_ns *ns; 2116 struct gendisk *disk; 2117 int node = dev_to_node(dev->dev); 2118 2119 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 2120 if (!ns) 2121 return; 2122 2123 ns->queue = blk_mq_init_queue(&dev->tagset); 2124 if (IS_ERR(ns->queue)) 2125 goto out_free_ns; 2126 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); 2127 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 2128 ns->dev = dev; 2129 ns->queue->queuedata = ns; 2130 2131 disk = alloc_disk_node(0, node); 2132 if (!disk) 2133 goto out_free_queue; 2134 2135 ns->ns_id = nsid; 2136 ns->disk = disk; 2137 ns->lba_shift = 9; /* default to 512-byte blocks until the disk is validated */ 2138 list_add_tail(&ns->list, &dev->namespaces); 2139 2140 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 
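 /*
  * Illustrative note (added for exposition, not in the original source):
  * the limits applied below bound a request by the transfer size the
  * controller reported. max_hw_sectors is in 512-byte units, so
  * (max_hw_sectors << 9) is the transfer size in bytes; dividing by the
  * device page size gives the number of PRP data pages, and the "+ 1"
  * allows for a first page that is not page-aligned.
  */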
2141 if (dev->max_hw_sectors) { 2142 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); 2143 blk_queue_max_segments(ns->queue, 2144 ((dev->max_hw_sectors << 9) / dev->page_size) + 1); 2145 } 2146 if (dev->stripe_size) 2147 blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); 2148 if (dev->vwc & NVME_CTRL_VWC_PRESENT) 2149 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); 2150 blk_queue_virt_boundary(ns->queue, dev->page_size - 1); 2151 2152 disk->major = nvme_major; 2153 disk->first_minor = 0; 2154 disk->fops = &nvme_fops; 2155 disk->private_data = ns; 2156 disk->queue = ns->queue; 2157 disk->driverfs_dev = dev->device; 2158 disk->flags = GENHD_FL_EXT_DEVT; 2159 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); 2160 2161 /* 2162 * Initialize capacity to 0 until we establish the namespace format and 2163 * set up integrity extensions if necessary. The revalidate_disk after 2164 * add_disk allows the driver to register with integrity if the format 2165 * requires it. 2166 */ 2167 set_capacity(disk, 0); 2168 if (nvme_revalidate_disk(ns->disk)) 2169 goto out_free_disk; 2170 2171 add_disk(ns->disk); 2172 if (ns->ms) { 2173 struct block_device *bd = bdget_disk(ns->disk, 0); 2174 if (!bd) 2175 return; 2176 if (blkdev_get(bd, FMODE_READ, NULL)) { 2177 bdput(bd); 2178 return; 2179 } 2180 blkdev_reread_part(bd); 2181 blkdev_put(bd, FMODE_READ); 2182 } 2183 return; 2184 out_free_disk: 2185 kfree(disk); 2186 list_del(&ns->list); 2187 out_free_queue: 2188 blk_cleanup_queue(ns->queue); 2189 out_free_ns: 2190 kfree(ns); 2191} 2192 2193static void nvme_create_io_queues(struct nvme_dev *dev) 2194{ 2195 unsigned i; 2196 2197 for (i = dev->queue_count; i <= dev->max_qid; i++) 2198 if (!nvme_alloc_queue(dev, i, dev->q_depth)) 2199 break; 2200 2201 for (i = dev->online_queues; i <= dev->queue_count - 1; i++) 2202 if (nvme_create_queue(dev->queues[i], i)) 2203 break; 2204} 2205 2206static int set_queue_count(struct nvme_dev *dev, int count) 2207{ 2208 int status; 2209 u32 result; 2210 u32 q_count = (count - 1) | ((count - 1) << 16); 2211 2212 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, 2213 &result); 2214 if (status < 0) 2215 return status; 2216 if (status > 0) { 2217 dev_err(dev->dev, "Could not set queue count (%d)\n", status); 2218 return 0; 2219 } 2220 return min(result & 0xffff, result >> 16) + 1; 2221} 2222 2223static void __iomem *nvme_map_cmb(struct nvme_dev *dev) 2224{ 2225 u64 szu, size, offset; 2226 u32 cmbloc; 2227 resource_size_t bar_size; 2228 struct pci_dev *pdev = to_pci_dev(dev->dev); 2229 void __iomem *cmb; 2230 dma_addr_t dma_addr; 2231 2232 if (!use_cmb_sqes) 2233 return NULL; 2234 2235 dev->cmbsz = readl(&dev->bar->cmbsz); 2236 if (!(NVME_CMB_SZ(dev->cmbsz))) 2237 return NULL; 2238 2239 cmbloc = readl(&dev->bar->cmbloc); 2240 2241 szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); 2242 size = szu * NVME_CMB_SZ(dev->cmbsz); 2243 offset = szu * NVME_CMB_OFST(cmbloc); 2244 bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc)); 2245 2246 if (offset > bar_size) 2247 return NULL; 2248 2249 /* 2250 * Controllers may support a CMB size larger than their BAR, 2251 * for example, due to being behind a bridge. 
Reduce the CMB to 2252 * the reported size of the BAR. 2253 */ 2254 if (size > bar_size - offset) 2255 size = bar_size - offset; 2256 2257 dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset; 2258 cmb = ioremap_wc(dma_addr, size); 2259 if (!cmb) 2260 return NULL; 2261 2262 dev->cmb_dma_addr = dma_addr; 2263 dev->cmb_size = size; 2264 return cmb; 2265} 2266 2267static inline void nvme_release_cmb(struct nvme_dev *dev) 2268{ 2269 if (dev->cmb) { 2270 iounmap(dev->cmb); 2271 dev->cmb = NULL; 2272 } 2273} 2274 2275static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) 2276{ 2277 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); 2278} 2279 2280static int nvme_setup_io_queues(struct nvme_dev *dev) 2281{ 2282 struct nvme_queue *adminq = dev->queues[0]; 2283 struct pci_dev *pdev = to_pci_dev(dev->dev); 2284 int result, i, vecs, nr_io_queues, size; 2285 2286 nr_io_queues = num_possible_cpus(); 2287 result = set_queue_count(dev, nr_io_queues); 2288 if (result <= 0) 2289 return result; 2290 if (result < nr_io_queues) 2291 nr_io_queues = result; 2292 2293 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { 2294 result = nvme_cmb_qdepth(dev, nr_io_queues, 2295 sizeof(struct nvme_command)); 2296 if (result > 0) 2297 dev->q_depth = result; 2298 else 2299 nvme_release_cmb(dev); 2300 } 2301 2302 size = db_bar_size(dev, nr_io_queues); 2303 if (size > 8192) { 2304 iounmap(dev->bar); 2305 do { 2306 dev->bar = ioremap(pci_resource_start(pdev, 0), size); 2307 if (dev->bar) 2308 break; 2309 if (!--nr_io_queues) 2310 return -ENOMEM; 2311 size = db_bar_size(dev, nr_io_queues); 2312 } while (1); 2313 dev->dbs = ((void __iomem *)dev->bar) + 4096; 2314 adminq->q_db = dev->dbs; 2315 } 2316 2317 /* Deregister the admin queue's interrupt */ 2318 free_irq(dev->entry[0].vector, adminq); 2319 2320 /* 2321 * If we enabled MSI-X early because the device advertises no INTx, 2322 * disable it again before setting up the full range we need. 2323 */ 2324 if (!pdev->irq) 2325 pci_disable_msix(pdev); 2326 2327 for (i = 0; i < nr_io_queues; i++) 2328 dev->entry[i].entry = i; 2329 vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); 2330 if (vecs < 0) { 2331 vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32)); 2332 if (vecs < 0) { 2333 vecs = 1; 2334 } else { 2335 for (i = 0; i < vecs; i++) 2336 dev->entry[i].vector = i + pdev->irq; 2337 } 2338 } 2339 2340 /* 2341 * Should investigate if there's a performance win from allocating 2342 * more queues than interrupt vectors; it might allow the submission 2343 * path to scale better, even if the receive path is limited by the 2344 * number of interrupts. 
2345 */ 2346 nr_io_queues = vecs; 2347 dev->max_qid = nr_io_queues; 2348 2349 result = queue_request_irq(dev, adminq, adminq->irqname); 2350 if (result) { 2351 adminq->cq_vector = -1; 2352 goto free_queues; 2353 } 2354 2355 /* Free previously allocated queues that are no longer usable */ 2356 nvme_free_queues(dev, nr_io_queues + 1); 2357 nvme_create_io_queues(dev); 2358 2359 return 0; 2360 2361 free_queues: 2362 nvme_free_queues(dev, 1); 2363 return result; 2364} 2365 2366static void nvme_free_namespace(struct nvme_ns *ns) 2367{ 2368 list_del(&ns->list); 2369 2370 spin_lock(&dev_list_lock); 2371 ns->disk->private_data = NULL; 2372 spin_unlock(&dev_list_lock); 2373 2374 put_disk(ns->disk); 2375 kfree(ns); 2376} 2377 2378static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 2379{ 2380 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 2381 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 2382 2383 return nsa->ns_id - nsb->ns_id; 2384} 2385 2386static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) 2387{ 2388 struct nvme_ns *ns; 2389 2390 list_for_each_entry(ns, &dev->namespaces, list) { 2391 if (ns->ns_id == nsid) 2392 return ns; 2393 if (ns->ns_id > nsid) 2394 break; 2395 } 2396 return NULL; 2397} 2398 2399static inline bool nvme_io_incapable(struct nvme_dev *dev) 2400{ 2401 return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || 2402 dev->online_queues < 2); 2403} 2404 2405static void nvme_ns_remove(struct nvme_ns *ns) 2406{ 2407 bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); 2408 2409 if (kill) 2410 blk_set_queue_dying(ns->queue); 2411 if (ns->disk->flags & GENHD_FL_UP) { 2412 if (blk_get_integrity(ns->disk)) 2413 blk_integrity_unregister(ns->disk); 2414 del_gendisk(ns->disk); 2415 } 2416 if (kill || !blk_queue_dying(ns->queue)) { 2417 blk_mq_abort_requeue_list(ns->queue); 2418 blk_cleanup_queue(ns->queue); 2419 } 2420} 2421 2422static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) 2423{ 2424 struct nvme_ns *ns, *next; 2425 unsigned i; 2426 2427 for (i = 1; i <= nn; i++) { 2428 ns = nvme_find_ns(dev, i); 2429 if (ns) { 2430 if (revalidate_disk(ns->disk)) { 2431 nvme_ns_remove(ns); 2432 nvme_free_namespace(ns); 2433 } 2434 } else 2435 nvme_alloc_ns(dev, i); 2436 } 2437 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 2438 if (ns->ns_id > nn) { 2439 nvme_ns_remove(ns); 2440 nvme_free_namespace(ns); 2441 } 2442 } 2443 list_sort(NULL, &dev->namespaces, ns_cmp); 2444} 2445 2446static void nvme_set_irq_hints(struct nvme_dev *dev) 2447{ 2448 struct nvme_queue *nvmeq; 2449 int i; 2450 2451 for (i = 0; i < dev->online_queues; i++) { 2452 nvmeq = dev->queues[i]; 2453 2454 if (!nvmeq->tags || !(*nvmeq->tags)) 2455 continue; 2456 2457 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, 2458 blk_mq_tags_cpumask(*nvmeq->tags)); 2459 } 2460} 2461 2462static void nvme_dev_scan(struct work_struct *work) 2463{ 2464 struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); 2465 struct nvme_id_ctrl *ctrl; 2466 2467 if (!dev->tagset.tags) 2468 return; 2469 if (nvme_identify_ctrl(dev, &ctrl)) 2470 return; 2471 nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); 2472 kfree(ctrl); 2473 nvme_set_irq_hints(dev); 2474} 2475 2476/* 2477 * Return: error value if an error occurred setting up the queues or calling 2478 * Identify Device. 0 if these succeeded, even if adding some of the 2479 * namespaces failed. At the moment, these failures are silent. 
TBD which 2480 * failures should be reported. 2481 */ 2482static int nvme_dev_add(struct nvme_dev *dev) 2483{ 2484 struct pci_dev *pdev = to_pci_dev(dev->dev); 2485 int res; 2486 struct nvme_id_ctrl *ctrl; 2487 int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; 2488 2489 res = nvme_identify_ctrl(dev, &ctrl); 2490 if (res) { 2491 dev_err(dev->dev, "Identify Controller failed (%d)\n", res); 2492 return -EIO; 2493 } 2494 2495 dev->oncs = le16_to_cpup(&ctrl->oncs); 2496 dev->abort_limit = ctrl->acl + 1; 2497 dev->vwc = ctrl->vwc; 2498 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); 2499 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); 2500 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); 2501 if (ctrl->mdts) 2502 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); 2503 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && 2504 (pdev->device == 0x0953) && ctrl->vs[3]) { 2505 unsigned int max_hw_sectors; 2506 2507 dev->stripe_size = 1 << (ctrl->vs[3] + shift); 2508 max_hw_sectors = dev->stripe_size >> (shift - 9); 2509 if (dev->max_hw_sectors) { 2510 dev->max_hw_sectors = min(max_hw_sectors, 2511 dev->max_hw_sectors); 2512 } else 2513 dev->max_hw_sectors = max_hw_sectors; 2514 } 2515 kfree(ctrl); 2516 2517 if (!dev->tagset.tags) { 2518 dev->tagset.ops = &nvme_mq_ops; 2519 dev->tagset.nr_hw_queues = dev->online_queues - 1; 2520 dev->tagset.timeout = NVME_IO_TIMEOUT; 2521 dev->tagset.numa_node = dev_to_node(dev->dev); 2522 dev->tagset.queue_depth = 2523 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; 2524 dev->tagset.cmd_size = nvme_cmd_size(dev); 2525 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; 2526 dev->tagset.driver_data = dev; 2527 2528 if (blk_mq_alloc_tag_set(&dev->tagset)) 2529 return 0; 2530 } 2531 schedule_work(&dev->scan_work); 2532 return 0; 2533} 2534 2535static int nvme_dev_map(struct nvme_dev *dev) 2536{ 2537 u64 cap; 2538 int bars, result = -ENOMEM; 2539 struct pci_dev *pdev = to_pci_dev(dev->dev); 2540 2541 if (pci_enable_device_mem(pdev)) 2542 return result; 2543 2544 dev->entry[0].vector = pdev->irq; 2545 pci_set_master(pdev); 2546 bars = pci_select_bars(pdev, IORESOURCE_MEM); 2547 if (!bars) 2548 goto disable_pci; 2549 2550 if (pci_request_selected_regions(pdev, bars, "nvme")) 2551 goto disable_pci; 2552 2553 if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && 2554 dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) 2555 goto disable; 2556 2557 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 2558 if (!dev->bar) 2559 goto disable; 2560 2561 if (readl(&dev->bar->csts) == -1) { 2562 result = -ENODEV; 2563 goto unmap; 2564 } 2565 2566 /* 2567 * Some devices don't advertise INTx interrupts, so pre-enable a single 2568 * MSI-X vector for setup. We'll adjust this later. 
2569 */ 2570 if (!pdev->irq) { 2571 result = pci_enable_msix(pdev, dev->entry, 1); 2572 if (result < 0) 2573 goto unmap; 2574 } 2575 2576 cap = readq(&dev->bar->cap); 2577 dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); 2578 dev->db_stride = 1 << NVME_CAP_STRIDE(cap); 2579 dev->dbs = ((void __iomem *)dev->bar) + 4096; 2580 if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) 2581 dev->cmb = nvme_map_cmb(dev); 2582 2583 return 0; 2584 2585 unmap: 2586 iounmap(dev->bar); 2587 dev->bar = NULL; 2588 disable: 2589 pci_release_regions(pdev); 2590 disable_pci: 2591 pci_disable_device(pdev); 2592 return result; 2593} 2594 2595static void nvme_dev_unmap(struct nvme_dev *dev) 2596{ 2597 struct pci_dev *pdev = to_pci_dev(dev->dev); 2598 2599 if (pdev->msi_enabled) 2600 pci_disable_msi(pdev); 2601 else if (pdev->msix_enabled) 2602 pci_disable_msix(pdev); 2603 2604 if (dev->bar) { 2605 iounmap(dev->bar); 2606 dev->bar = NULL; 2607 pci_release_regions(pdev); 2608 } 2609 2610 if (pci_is_enabled(pdev)) 2611 pci_disable_device(pdev); 2612} 2613 2614struct nvme_delq_ctx { 2615 struct task_struct *waiter; 2616 struct kthread_worker *worker; 2617 atomic_t refcount; 2618}; 2619 2620static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) 2621{ 2622 dq->waiter = current; 2623 mb(); 2624 2625 for (;;) { 2626 set_current_state(TASK_KILLABLE); 2627 if (!atomic_read(&dq->refcount)) 2628 break; 2629 if (!schedule_timeout(ADMIN_TIMEOUT) || 2630 fatal_signal_pending(current)) { 2631 /* 2632 * Disable the controller first since we can't trust it 2633 * at this point, but leave the admin queue enabled 2634 * until all queue deletion requests are flushed. 2635 * FIXME: This may take a while if there are more h/w 2636 * queues than admin tags. 2637 */ 2638 set_current_state(TASK_RUNNING); 2639 nvme_disable_ctrl(dev, readq(&dev->bar->cap)); 2640 nvme_clear_queue(dev->queues[0]); 2641 flush_kthread_worker(dq->worker); 2642 nvme_disable_queue(dev, 0); 2643 return; 2644 } 2645 } 2646 set_current_state(TASK_RUNNING); 2647} 2648 2649static void nvme_put_dq(struct nvme_delq_ctx *dq) 2650{ 2651 atomic_dec(&dq->refcount); 2652 if (dq->waiter) 2653 wake_up_process(dq->waiter); 2654} 2655 2656static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) 2657{ 2658 atomic_inc(&dq->refcount); 2659 return dq; 2660} 2661 2662static void nvme_del_queue_end(struct nvme_queue *nvmeq) 2663{ 2664 struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; 2665 nvme_put_dq(dq); 2666} 2667 2668static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, 2669 kthread_work_func_t fn) 2670{ 2671 struct nvme_command c; 2672 2673 memset(&c, 0, sizeof(c)); 2674 c.delete_queue.opcode = opcode; 2675 c.delete_queue.qid = cpu_to_le16(nvmeq->qid); 2676 2677 init_kthread_work(&nvmeq->cmdinfo.work, fn); 2678 return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, 2679 ADMIN_TIMEOUT); 2680} 2681 2682static void nvme_del_cq_work_handler(struct kthread_work *work) 2683{ 2684 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2685 cmdinfo.work); 2686 nvme_del_queue_end(nvmeq); 2687} 2688 2689static int nvme_delete_cq(struct nvme_queue *nvmeq) 2690{ 2691 return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, 2692 nvme_del_cq_work_handler); 2693} 2694 2695static void nvme_del_sq_work_handler(struct kthread_work *work) 2696{ 2697 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2698 cmdinfo.work); 2699 int status = nvmeq->cmdinfo.status; 2700 2701 if (!status) 2702 status = 
nvme_delete_cq(nvmeq); 2703 if (status) 2704 nvme_del_queue_end(nvmeq); 2705} 2706 2707static int nvme_delete_sq(struct nvme_queue *nvmeq) 2708{ 2709 return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, 2710 nvme_del_sq_work_handler); 2711} 2712 2713static void nvme_del_queue_start(struct kthread_work *work) 2714{ 2715 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2716 cmdinfo.work); 2717 if (nvme_delete_sq(nvmeq)) 2718 nvme_del_queue_end(nvmeq); 2719} 2720 2721static void nvme_disable_io_queues(struct nvme_dev *dev) 2722{ 2723 int i; 2724 DEFINE_KTHREAD_WORKER_ONSTACK(worker); 2725 struct nvme_delq_ctx dq; 2726 struct task_struct *kworker_task = kthread_run(kthread_worker_fn, 2727 &worker, "nvme%d", dev->instance); 2728 2729 if (IS_ERR(kworker_task)) { 2730 dev_err(dev->dev, 2731 "Failed to create queue del task\n"); 2732 for (i = dev->queue_count - 1; i > 0; i--) 2733 nvme_disable_queue(dev, i); 2734 return; 2735 } 2736 2737 dq.waiter = NULL; 2738 atomic_set(&dq.refcount, 0); 2739 dq.worker = &worker; 2740 for (i = dev->queue_count - 1; i > 0; i--) { 2741 struct nvme_queue *nvmeq = dev->queues[i]; 2742 2743 if (nvme_suspend_queue(nvmeq)) 2744 continue; 2745 nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); 2746 nvmeq->cmdinfo.worker = dq.worker; 2747 init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); 2748 queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); 2749 } 2750 nvme_wait_dq(&dq, dev); 2751 kthread_stop(kworker_task); 2752} 2753 2754/* 2755* Remove the node from the device list and check 2756* whether we need to stop the nvme_thread. 2757*/ 2758static void nvme_dev_list_remove(struct nvme_dev *dev) 2759{ 2760 struct task_struct *tmp = NULL; 2761 2762 spin_lock(&dev_list_lock); 2763 list_del_init(&dev->node); 2764 if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { 2765 tmp = nvme_thread; 2766 nvme_thread = NULL; 2767 } 2768 spin_unlock(&dev_list_lock); 2769 2770 if (tmp) 2771 kthread_stop(tmp); 2772} 2773 2774static void nvme_freeze_queues(struct nvme_dev *dev) 2775{ 2776 struct nvme_ns *ns; 2777 2778 list_for_each_entry(ns, &dev->namespaces, list) { 2779 blk_mq_freeze_queue_start(ns->queue); 2780 2781 spin_lock_irq(ns->queue->queue_lock); 2782 queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); 2783 spin_unlock_irq(ns->queue->queue_lock); 2784 2785 blk_mq_cancel_requeue_work(ns->queue); 2786 blk_mq_stop_hw_queues(ns->queue); 2787 } 2788} 2789 2790static void nvme_unfreeze_queues(struct nvme_dev *dev) 2791{ 2792 struct nvme_ns *ns; 2793 2794 list_for_each_entry(ns, &dev->namespaces, list) { 2795 queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); 2796 blk_mq_unfreeze_queue(ns->queue); 2797 blk_mq_start_stopped_hw_queues(ns->queue, true); 2798 blk_mq_kick_requeue_list(ns->queue); 2799 } 2800} 2801 2802static void nvme_dev_shutdown(struct nvme_dev *dev) 2803{ 2804 int i; 2805 u32 csts = -1; 2806 2807 nvme_dev_list_remove(dev); 2808 2809 if (dev->bar) { 2810 nvme_freeze_queues(dev); 2811 csts = readl(&dev->bar->csts); 2812 } 2813 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { 2814 for (i = dev->queue_count - 1; i >= 0; i--) { 2815 struct nvme_queue *nvmeq = dev->queues[i]; 2816 nvme_suspend_queue(nvmeq); 2817 } 2818 } else { 2819 nvme_disable_io_queues(dev); 2820 nvme_shutdown_ctrl(dev); 2821 nvme_disable_queue(dev, 0); 2822 } 2823 nvme_dev_unmap(dev); 2824 2825 for (i = dev->queue_count - 1; i >= 0; i--) 2826 nvme_clear_queue(dev->queues[i]); 2827} 2828 2829static void nvme_dev_remove(struct nvme_dev *dev) 2830{ 2831 struct 
nvme_ns *ns; 2832 2833 list_for_each_entry(ns, &dev->namespaces, list) 2834 nvme_ns_remove(ns); 2835} 2836 2837static int nvme_setup_prp_pools(struct nvme_dev *dev) 2838{ 2839 dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, 2840 PAGE_SIZE, PAGE_SIZE, 0); 2841 if (!dev->prp_page_pool) 2842 return -ENOMEM; 2843 2844 /* Optimisation for I/Os between 4k and 128k */ 2845 dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, 2846 256, 256, 0); 2847 if (!dev->prp_small_pool) { 2848 dma_pool_destroy(dev->prp_page_pool); 2849 return -ENOMEM; 2850 } 2851 return 0; 2852} 2853 2854static void nvme_release_prp_pools(struct nvme_dev *dev) 2855{ 2856 dma_pool_destroy(dev->prp_page_pool); 2857 dma_pool_destroy(dev->prp_small_pool); 2858} 2859 2860static DEFINE_IDA(nvme_instance_ida); 2861 2862static int nvme_set_instance(struct nvme_dev *dev) 2863{ 2864 int instance, error; 2865 2866 do { 2867 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) 2868 return -ENODEV; 2869 2870 spin_lock(&dev_list_lock); 2871 error = ida_get_new(&nvme_instance_ida, &instance); 2872 spin_unlock(&dev_list_lock); 2873 } while (error == -EAGAIN); 2874 2875 if (error) 2876 return -ENODEV; 2877 2878 dev->instance = instance; 2879 return 0; 2880} 2881 2882static void nvme_release_instance(struct nvme_dev *dev) 2883{ 2884 spin_lock(&dev_list_lock); 2885 ida_remove(&nvme_instance_ida, dev->instance); 2886 spin_unlock(&dev_list_lock); 2887} 2888 2889static void nvme_free_namespaces(struct nvme_dev *dev) 2890{ 2891 struct nvme_ns *ns, *next; 2892 2893 list_for_each_entry_safe(ns, next, &dev->namespaces, list) 2894 nvme_free_namespace(ns); 2895} 2896 2897static void nvme_free_dev(struct kref *kref) 2898{ 2899 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); 2900 2901 put_device(dev->dev); 2902 put_device(dev->device); 2903 nvme_free_namespaces(dev); 2904 nvme_release_instance(dev); 2905 if (dev->tagset.tags) 2906 blk_mq_free_tag_set(&dev->tagset); 2907 if (dev->admin_q) 2908 blk_put_queue(dev->admin_q); 2909 kfree(dev->queues); 2910 kfree(dev->entry); 2911 kfree(dev); 2912} 2913 2914static int nvme_dev_open(struct inode *inode, struct file *f) 2915{ 2916 struct nvme_dev *dev; 2917 int instance = iminor(inode); 2918 int ret = -ENODEV; 2919 2920 spin_lock(&dev_list_lock); 2921 list_for_each_entry(dev, &dev_list, node) { 2922 if (dev->instance == instance) { 2923 if (!dev->admin_q) { 2924 ret = -EWOULDBLOCK; 2925 break; 2926 } 2927 if (!kref_get_unless_zero(&dev->kref)) 2928 break; 2929 f->private_data = dev; 2930 ret = 0; 2931 break; 2932 } 2933 } 2934 spin_unlock(&dev_list_lock); 2935 2936 return ret; 2937} 2938 2939static int nvme_dev_release(struct inode *inode, struct file *f) 2940{ 2941 struct nvme_dev *dev = f->private_data; 2942 kref_put(&dev->kref, nvme_free_dev); 2943 return 0; 2944} 2945 2946static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 2947{ 2948 struct nvme_dev *dev = f->private_data; 2949 struct nvme_ns *ns; 2950 2951 switch (cmd) { 2952 case NVME_IOCTL_ADMIN_CMD: 2953 return nvme_user_cmd(dev, NULL, (void __user *)arg); 2954 case NVME_IOCTL_IO_CMD: 2955 if (list_empty(&dev->namespaces)) 2956 return -ENOTTY; 2957 ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); 2958 return nvme_user_cmd(dev, ns, (void __user *)arg); 2959 case NVME_IOCTL_RESET: 2960 dev_warn(dev->dev, "resetting controller\n"); 2961 return nvme_reset(dev); 2962 case NVME_IOCTL_SUBSYS_RESET: 2963 return nvme_subsys_reset(dev); 2964 default: 2965 return -ENOTTY; 2966 } 2967} 
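/*
 * Illustrative sketch (added for exposition, not part of the driver):
 * issuing an Identify Controller admin command through the char-device
 * ioctl path above. Assumes the uapi definitions from <linux/nvme.h>;
 * requires CAP_SYS_ADMIN, and error handling is elided.
 *
 *	struct nvme_admin_cmd cmd = { 0 };
 *	void *buf = calloc(1, 4096);
 *	int fd = open("/dev/nvme0", O_RDONLY);
 *
 *	cmd.opcode = 0x06;			// Identify
 *	cmd.addr = (__u64)(uintptr_t)buf;
 *	cmd.data_len = 4096;			// sizeof(struct nvme_id_ctrl)
 *	cmd.cdw10 = 1;				// CNS 1: identify the controller
 *	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) == 0)
 *		printf("model: %.40s\n", (char *)buf + 24);	// mn[] at offset 24
 */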
2968 2969static const struct file_operations nvme_dev_fops = { 2970 .owner = THIS_MODULE, 2971 .open = nvme_dev_open, 2972 .release = nvme_dev_release, 2973 .unlocked_ioctl = nvme_dev_ioctl, 2974 .compat_ioctl = nvme_dev_ioctl, 2975}; 2976 2977static int nvme_dev_start(struct nvme_dev *dev) 2978{ 2979 int result; 2980 bool start_thread = false; 2981 2982 result = nvme_dev_map(dev); 2983 if (result) 2984 return result; 2985 2986 result = nvme_configure_admin_queue(dev); 2987 if (result) 2988 goto unmap; 2989 2990 spin_lock(&dev_list_lock); 2991 if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { 2992 start_thread = true; 2993 nvme_thread = NULL; 2994 } 2995 list_add(&dev->node, &dev_list); 2996 spin_unlock(&dev_list_lock); 2997 2998 if (start_thread) { 2999 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); 3000 wake_up_all(&nvme_kthread_wait); 3001 } else 3002 wait_event_killable(nvme_kthread_wait, nvme_thread); 3003 3004 if (IS_ERR_OR_NULL(nvme_thread)) { 3005 result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; 3006 goto disable; 3007 } 3008 3009 nvme_init_queue(dev->queues[0], 0); 3010 result = nvme_alloc_admin_tags(dev); 3011 if (result) 3012 goto disable; 3013 3014 result = nvme_setup_io_queues(dev); 3015 if (result) 3016 goto free_tags; 3017 3018 dev->event_limit = 1; 3019 return result; 3020 3021 free_tags: 3022 nvme_dev_remove_admin(dev); 3023 blk_put_queue(dev->admin_q); 3024 dev->admin_q = NULL; 3025 dev->queues[0]->tags = NULL; 3026 disable: 3027 nvme_disable_queue(dev, 0); 3028 nvme_dev_list_remove(dev); 3029 unmap: 3030 nvme_dev_unmap(dev); 3031 return result; 3032} 3033 3034static int nvme_remove_dead_ctrl(void *arg) 3035{ 3036 struct nvme_dev *dev = (struct nvme_dev *)arg; 3037 struct pci_dev *pdev = to_pci_dev(dev->dev); 3038 3039 if (pci_get_drvdata(pdev)) 3040 pci_stop_and_remove_bus_device_locked(pdev); 3041 kref_put(&dev->kref, nvme_free_dev); 3042 return 0; 3043} 3044 3045static void nvme_remove_disks(struct work_struct *ws) 3046{ 3047 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); 3048 3049 nvme_free_queues(dev, 1); 3050 nvme_dev_remove(dev); 3051} 3052 3053static int nvme_dev_resume(struct nvme_dev *dev) 3054{ 3055 int ret; 3056 3057 ret = nvme_dev_start(dev); 3058 if (ret) 3059 return ret; 3060 if (dev->online_queues < 2) { 3061 spin_lock(&dev_list_lock); 3062 dev->reset_workfn = nvme_remove_disks; 3063 queue_work(nvme_workq, &dev->reset_work); 3064 spin_unlock(&dev_list_lock); 3065 } else { 3066 nvme_unfreeze_queues(dev); 3067 nvme_dev_add(dev); 3068 } 3069 return 0; 3070} 3071 3072static void nvme_dead_ctrl(struct nvme_dev *dev) 3073{ 3074 dev_warn(dev->dev, "Device failed to resume\n"); 3075 kref_get(&dev->kref); 3076 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", 3077 dev->instance))) { 3078 dev_err(dev->dev, 3079 "Failed to start controller remove task\n"); 3080 kref_put(&dev->kref, nvme_free_dev); 3081 } 3082} 3083 3084static void nvme_dev_reset(struct nvme_dev *dev) 3085{ 3086 bool in_probe = work_busy(&dev->probe_work); 3087 3088 nvme_dev_shutdown(dev); 3089 3090 /* Synchronize with device probe so that work will see failure status 3091 * and exit gracefully without trying to schedule another reset */ 3092 flush_work(&dev->probe_work); 3093 3094 /* Fail this device if reset occurred during probe to avoid 3095 * infinite initialization loops. 
*/ 3096 if (in_probe) { 3097 nvme_dead_ctrl(dev); 3098 return; 3099 } 3100 /* Schedule device resume asynchronously so the reset work is available 3101 * to clean up errors that may occur during reinitialization */ 3102 schedule_work(&dev->probe_work); 3103} 3104 3105static void nvme_reset_failed_dev(struct work_struct *ws) 3106{ 3107 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); 3108 nvme_dev_reset(dev); 3109} 3110 3111static void nvme_reset_workfn(struct work_struct *work) 3112{ 3113 struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); 3114 dev->reset_workfn(work); 3115} 3116 3117static int nvme_reset(struct nvme_dev *dev) 3118{ 3119 int ret = -EBUSY; 3120 3121 if (!dev->admin_q || blk_queue_dying(dev->admin_q)) 3122 return -ENODEV; 3123 3124 spin_lock(&dev_list_lock); 3125 if (!work_pending(&dev->reset_work)) { 3126 dev->reset_workfn = nvme_reset_failed_dev; 3127 queue_work(nvme_workq, &dev->reset_work); 3128 ret = 0; 3129 } 3130 spin_unlock(&dev_list_lock); 3131 3132 if (!ret) { 3133 flush_work(&dev->reset_work); 3134 flush_work(&dev->probe_work); 3135 return 0; 3136 } 3137 3138 return ret; 3139} 3140 3141static ssize_t nvme_sysfs_reset(struct device *dev, 3142 struct device_attribute *attr, const char *buf, 3143 size_t count) 3144{ 3145 struct nvme_dev *ndev = dev_get_drvdata(dev); 3146 int ret; 3147 3148 ret = nvme_reset(ndev); 3149 if (ret < 0) 3150 return ret; 3151 3152 return count; 3153} 3154static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); 3155 3156static void nvme_async_probe(struct work_struct *work); 3157static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 3158{ 3159 int node, result = -ENOMEM; 3160 struct nvme_dev *dev; 3161 3162 node = dev_to_node(&pdev->dev); 3163 if (node == NUMA_NO_NODE) 3164 set_dev_node(&pdev->dev, 0); 3165 3166 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); 3167 if (!dev) 3168 return -ENOMEM; 3169 dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), 3170 GFP_KERNEL, node); 3171 if (!dev->entry) 3172 goto free; 3173 dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), 3174 GFP_KERNEL, node); 3175 if (!dev->queues) 3176 goto free; 3177 3178 INIT_LIST_HEAD(&dev->namespaces); 3179 dev->reset_workfn = nvme_reset_failed_dev; 3180 INIT_WORK(&dev->reset_work, nvme_reset_workfn); 3181 dev->dev = get_device(&pdev->dev); 3182 pci_set_drvdata(pdev, dev); 3183 result = nvme_set_instance(dev); 3184 if (result) 3185 goto put_pci; 3186 3187 result = nvme_setup_prp_pools(dev); 3188 if (result) 3189 goto release; 3190 3191 kref_init(&dev->kref); 3192 dev->device = device_create(nvme_class, &pdev->dev, 3193 MKDEV(nvme_char_major, dev->instance), 3194 dev, "nvme%d", dev->instance); 3195 if (IS_ERR(dev->device)) { 3196 result = PTR_ERR(dev->device); 3197 goto release_pools; 3198 } 3199 get_device(dev->device); 3200 dev_set_drvdata(dev->device, dev); 3201 3202 result = device_create_file(dev->device, &dev_attr_reset_controller); 3203 if (result) 3204 goto put_dev; 3205 3206 INIT_LIST_HEAD(&dev->node); 3207 INIT_WORK(&dev->scan_work, nvme_dev_scan); 3208 INIT_WORK(&dev->probe_work, nvme_async_probe); 3209 schedule_work(&dev->probe_work); 3210 return 0; 3211 3212 put_dev: 3213 device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); 3214 put_device(dev->device); 3215 release_pools: 3216 nvme_release_prp_pools(dev); 3217 release: 3218 nvme_release_instance(dev); 3219 put_pci: 3220 put_device(dev->dev); 3221 free: 3222 kfree(dev->queues); 3223 
kfree(dev->entry); 3224 kfree(dev); 3225 return result; 3226} 3227 3228static void nvme_async_probe(struct work_struct *work) 3229{ 3230 struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); 3231 3232 if (nvme_dev_resume(dev) && !work_busy(&dev->reset_work)) 3233 nvme_dead_ctrl(dev); 3234} 3235 3236static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) 3237{ 3238 struct nvme_dev *dev = pci_get_drvdata(pdev); 3239 3240 if (prepare) 3241 nvme_dev_shutdown(dev); 3242 else 3243 nvme_dev_resume(dev); 3244} 3245 3246static void nvme_shutdown(struct pci_dev *pdev) 3247{ 3248 struct nvme_dev *dev = pci_get_drvdata(pdev); 3249 nvme_dev_shutdown(dev); 3250} 3251 3252static void nvme_remove(struct pci_dev *pdev) 3253{ 3254 struct nvme_dev *dev = pci_get_drvdata(pdev); 3255 3256 spin_lock(&dev_list_lock); 3257 list_del_init(&dev->node); 3258 spin_unlock(&dev_list_lock); 3259 3260 pci_set_drvdata(pdev, NULL); 3261 flush_work(&dev->probe_work); 3262 flush_work(&dev->reset_work); 3263 flush_work(&dev->scan_work); 3264 device_remove_file(dev->device, &dev_attr_reset_controller); 3265 nvme_dev_remove(dev); 3266 nvme_dev_shutdown(dev); 3267 nvme_dev_remove_admin(dev); 3268 device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); 3269 nvme_free_queues(dev, 0); 3270 nvme_release_cmb(dev); 3271 nvme_release_prp_pools(dev); 3272 kref_put(&dev->kref, nvme_free_dev); 3273} 3274 3275/* These functions are yet to be implemented */ 3276#define nvme_error_detected NULL 3277#define nvme_dump_registers NULL 3278#define nvme_link_reset NULL 3279#define nvme_slot_reset NULL 3280#define nvme_error_resume NULL 3281 3282#ifdef CONFIG_PM_SLEEP 3283static int nvme_suspend(struct device *dev) 3284{ 3285 struct pci_dev *pdev = to_pci_dev(dev); 3286 struct nvme_dev *ndev = pci_get_drvdata(pdev); 3287 3288 nvme_dev_shutdown(ndev); 3289 return 0; 3290} 3291 3292static int nvme_resume(struct device *dev) 3293{ 3294 struct pci_dev *pdev = to_pci_dev(dev); 3295 struct nvme_dev *ndev = pci_get_drvdata(pdev); 3296 3297 if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) { 3298 ndev->reset_workfn = nvme_reset_failed_dev; 3299 queue_work(nvme_workq, &ndev->reset_work); 3300 } 3301 return 0; 3302} 3303#endif 3304 3305static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); 3306 3307static const struct pci_error_handlers nvme_err_handler = { 3308 .error_detected = nvme_error_detected, 3309 .mmio_enabled = nvme_dump_registers, 3310 .link_reset = nvme_link_reset, 3311 .slot_reset = nvme_slot_reset, 3312 .resume = nvme_error_resume, 3313 .reset_notify = nvme_reset_notify, 3314}; 3315 3316/* Move to pci_ids.h later */ 3317#define PCI_CLASS_STORAGE_EXPRESS 0x010802 3318 3319static const struct pci_device_id nvme_id_table[] = { 3320 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 3321 { 0, } 3322}; 3323MODULE_DEVICE_TABLE(pci, nvme_id_table); 3324 3325static struct pci_driver nvme_driver = { 3326 .name = "nvme", 3327 .id_table = nvme_id_table, 3328 .probe = nvme_probe, 3329 .remove = nvme_remove, 3330 .shutdown = nvme_shutdown, 3331 .driver = { 3332 .pm = &nvme_dev_pm_ops, 3333 }, 3334 .err_handler = &nvme_err_handler, 3335}; 3336 3337static int __init nvme_init(void) 3338{ 3339 int result; 3340 3341 init_waitqueue_head(&nvme_kthread_wait); 3342 3343 nvme_workq = create_singlethread_workqueue("nvme"); 3344 if (!nvme_workq) 3345 return -ENOMEM; 3346 3347 result = register_blkdev(nvme_major, "nvme"); 3348 if (result < 0) 3349 goto kill_workq; 3350 else if (result > 0) 3351 
nvme_major = result; 3352 3353 result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", 3354 &nvme_dev_fops); 3355 if (result < 0) 3356 goto unregister_blkdev; 3357 else if (result > 0) 3358 nvme_char_major = result; 3359 3360 nvme_class = class_create(THIS_MODULE, "nvme"); 3361 if (IS_ERR(nvme_class)) { 3362 result = PTR_ERR(nvme_class); 3363 goto unregister_chrdev; 3364 } 3365 3366 result = pci_register_driver(&nvme_driver); 3367 if (result) 3368 goto destroy_class; 3369 return 0; 3370 3371 destroy_class: 3372 class_destroy(nvme_class); 3373 unregister_chrdev: 3374 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 3375 unregister_blkdev: 3376 unregister_blkdev(nvme_major, "nvme"); 3377 kill_workq: 3378 destroy_workqueue(nvme_workq); 3379 return result; 3380} 3381 3382static void __exit nvme_exit(void) 3383{ 3384 pci_unregister_driver(&nvme_driver); 3385 unregister_blkdev(nvme_major, "nvme"); 3386 destroy_workqueue(nvme_workq); 3387 class_destroy(nvme_class); 3388 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 3389 BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); 3390 _nvme_check_size(); 3391} 3392 3393MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); 3394MODULE_LICENSE("GPL"); 3395MODULE_VERSION("1.0"); 3396module_init(nvme_init); 3397module_exit(nvme_exit);
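/*
 * Usage note (added for exposition, not part of the source): the
 * per-controller reset_controller attribute registered in nvme_probe()
 * gives userspace a way to trigger nvme_reset(), e.g.:
 *
 *	echo 1 > /sys/class/nvme/nvme0/reset_controller
 *
 * The same reset can be requested via ioctl(fd, NVME_IOCTL_RESET) on the
 * character device handled by nvme_dev_ioctl() above.
 */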