Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git


Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull followup block layer updates from Jens Axboe:
"Two things in this pull request:

- A block throttle oops fix (marked for stable) from Thadeu.

- The NVMe fixes/features queued up for 3.20, but merged later in the
process. From Keith. We should have gotten this merged earlier,
we're ironing out the kinks in the process. Will be ready for the
initial pull next series"

* 'for-linus' of git://git.kernel.dk/linux-block:
blk-throttle: check stats_cpu before reading it from sysfs
NVMe: Fix potential corruption on sync commands
NVMe: Remove unused variables
NVMe: Fix scsi mode select llbaa setting
NVMe: Fix potential corruption during shutdown
NVMe: Asynchronous controller probe
NVMe: Register management handle under nvme class
NVMe: Update SCSI Inquiry VPD 83h translation
NVMe: Metadata format support

+417 -213
+3 -0
block/blk-throttle.c
···
 	struct blkg_rwstat rwstat = { }, tmp;
 	int i, cpu;

+	if (tg->stats_cpu == NULL)
+		return 0;
+
 	for_each_possible_cpu(cpu) {
 		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
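The oops this fixes comes from reading tg->stats_cpu through per_cpu_ptr() before the lazy per-CPU allocation has happened (or after it failed), so the sysfs read path must tolerate a NULL pointer. A minimal userspace sketch of the same read-side guard, with illustrative names rather than the kernel API:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's throttle-group structures. */
struct tg_stats {
	unsigned long reads;
	unsigned long writes;
};

struct throttle_group {
	struct tg_stats *stats_cpu;	/* NULL until first I/O allocates it */
};

static unsigned long tg_total_ios(const struct throttle_group *tg)
{
	/* Mirrors the fix: bail out before dereferencing the stats area. */
	if (tg->stats_cpu == NULL)
		return 0;
	return tg->stats_cpu->reads + tg->stats_cpu->writes;
}

int main(void)
{
	struct throttle_group tg = { .stats_cpu = NULL };

	printf("%lu\n", tg_total_ios(&tg));	/* prints 0 instead of crashing */
	return 0;
}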
+331 -165
drivers/block/nvme-core.c
···
 #include <linux/ptrace.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/t10-pi.h>
 #include <linux/types.h>
 #include <scsi/sg.h>
 #include <asm-generic/io-64-nonatomic-lo-hi.h>

+#define NVME_MINORS		(1U << MINORBITS)
 #define NVME_Q_DEPTH		1024
 #define NVME_AQ_DEPTH		64
 #define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
 #define ADMIN_TIMEOUT		(admin_timeout * HZ)
 #define SHUTDOWN_TIMEOUT	(shutdown_timeout * HZ)
-#define IOD_TIMEOUT		(retry_time * HZ)

 static unsigned char admin_timeout = 60;
 module_param(admin_timeout, byte, 0644);
···
 module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

-static unsigned char retry_time = 30;
-module_param(retry_time, byte, 0644);
-MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O");
-
 static unsigned char shutdown_timeout = 5;
 module_param(shutdown_timeout, byte, 0644);
 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

 static int nvme_major;
 module_param(nvme_major, int, 0);
+
+static int nvme_char_major;
+module_param(nvme_char_major, int, 0);

 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
···
 static struct task_struct *nvme_thread;
 static struct workqueue_struct *nvme_workq;
 static wait_queue_head_t nvme_kthread_wait;
-static struct notifier_block nvme_nb;
+
+static struct class *nvme_class;

 static void nvme_reset_failed_dev(struct work_struct *ws);
 static int nvme_process_cq(struct nvme_queue *nvmeq);
···
  * commands and one for I/O commands).
  */
 struct nvme_queue {
-	struct llist_node node;
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
 	char irqname[24];	/* nvme4294967295-65535\0 */
···
 	}
 }

+static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+	if (be32_to_cpu(pi->ref_tag) == v)
+		pi->ref_tag = cpu_to_be32(p);
+}
+
+static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+	if (be32_to_cpu(pi->ref_tag) == p)
+		pi->ref_tag = cpu_to_be32(v);
+}
+
+/**
+ * nvme_dif_remap - remaps ref tags to bip seed and physical lba
+ *
+ * The virtual start sector is the one that was originally submitted by the
+ * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
+ * start sector may be different. Remap protection information to match the
+ * physical LBA on writes, and back to the original seed on reads.
+ *
+ * Type 0 and 3 do not have a ref tag, so no remapping required.
+ */
+static void nvme_dif_remap(struct request *req,
+			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
+{
+	struct nvme_ns *ns = req->rq_disk->private_data;
+	struct bio_integrity_payload *bip;
+	struct t10_pi_tuple *pi;
+	void *p, *pmap;
+	u32 i, nlb, ts, phys, virt;
+
+	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
+		return;
+
+	bip = bio_integrity(req->bio);
+	if (!bip)
+		return;
+
+	pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
+	if (!pmap)
+		return;
+
+	p = pmap;
+	virt = bip_get_seed(bip);
+	phys = nvme_block_nr(ns, blk_rq_pos(req));
+	nlb = (blk_rq_bytes(req) >> ns->lba_shift);
+	ts = ns->disk->integrity->tuple_size;
+
+	for (i = 0; i < nlb; i++, virt++, phys++) {
+		pi = (struct t10_pi_tuple *)p;
+		dif_swap(phys, virt, pi);
+		p += ts;
+	}
+	kunmap_atomic(pmap);
+}
+
 static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
···
 			"completing aborted command with status:%04x\n",
 			status);

-	if (iod->nents)
+	if (iod->nents) {
 		dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
 			rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		if (blk_integrity_rq(req)) {
+			if (!rq_data_dir(req))
+				nvme_dif_remap(req, nvme_dif_complete);
+			dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1,
+				rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		}
+	}
 	nvme_free_iod(nvmeq->dev, iod);

 	blk_mq_complete_request(req);
···
 	cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
 	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+
+	if (blk_integrity_rq(req)) {
+		cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg));
+		switch (ns->pi_type) {
+		case NVME_NS_DPS_PI_TYPE3:
+			control |= NVME_RW_PRINFO_PRCHK_GUARD;
+			break;
+		case NVME_NS_DPS_PI_TYPE1:
+		case NVME_NS_DPS_PI_TYPE2:
+			control |= NVME_RW_PRINFO_PRCHK_GUARD |
+					NVME_RW_PRINFO_PRCHK_REF;
+			cmnd->rw.reftag = cpu_to_le32(
+					nvme_block_nr(ns, blk_rq_pos(req)));
+			break;
+		}
+	} else if (ns->ms)
+		control |= NVME_RW_PRINFO_PRACT;
+
 	cmnd->rw.control = cpu_to_le16(control);
 	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);

···
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
 	enum dma_data_direction dma_dir;
+
+	/*
+	 * If formatted with metadata, require the block layer provide a buffer
+	 * unless this namespace is formatted such that the metadata can be
+	 * stripped/generated by the controller with PRACT=1.
+	 */
+	if (ns->ms && !blk_integrity_rq(req)) {
+		if (!(ns->pi_type && ns->ms == 8)) {
+			req->errors = -EFAULT;
+			blk_mq_complete_request(req);
+			return BLK_MQ_RQ_QUEUE_OK;
+		}
+	}

 	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
 	if (!iod)
···
 			dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg,
 					iod->nents, dma_dir);
 			goto retry_cmd;
+		}
+		if (blk_integrity_rq(req)) {
+			if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
+				goto error_cmd;
+
+			sg_init_table(iod->meta_sg, 1);
+			if (blk_rq_map_integrity_sg(
+					req->q, req->bio, iod->meta_sg) != 1)
+				goto error_cmd;
+
+			if (rq_data_dir(req))
+				nvme_dif_remap(req, nvme_dif_prep);
+
+			if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
+				goto error_cmd;
 		}
 	}
···
 	return IRQ_WAKE_THREAD;
 }

-static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info *
-								cmd_info)
-{
-	spin_lock_irq(&nvmeq->q_lock);
-	cancel_cmd_info(cmd_info, NULL);
-	spin_unlock_irq(&nvmeq->q_lock);
-}
-
 struct sync_cmd_info {
 	struct task_struct *task;
 	u32 result;
···
 static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
 						u32 *result, unsigned timeout)
 {
-	int ret;
 	struct sync_cmd_info cmdinfo;
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
···

 	nvme_set_info(cmd_rq, &cmdinfo, sync_completion);

-	set_current_state(TASK_KILLABLE);
-	ret = nvme_submit_cmd(nvmeq, cmd);
-	if (ret) {
-		nvme_finish_cmd(nvmeq, req->tag, NULL);
-		set_current_state(TASK_RUNNING);
-	}
-	ret = schedule_timeout(timeout);
-
-	/*
-	 * Ensure that sync_completion has either run, or that it will
-	 * never run.
-	 */
-	nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req));
-
-	/*
-	 * We never got the completion
-	 */
-	if (cmdinfo.status == -EINTR)
-		return -EINTR;
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	nvme_submit_cmd(nvmeq, cmd);
+	schedule();

 	if (result)
 		*result = cmdinfo.result;
-
 	return cmdinfo.status;
 }
···
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = cmd->nvmeq;

+	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
+							nvmeq->qid);
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_abort_req(req);
+	spin_unlock_irq(&nvmeq->q_lock);
+
 	/*
 	 * The aborted req will be completed on receiving the abort req.
 	 * We enable the timer again. If hit twice, it'll cause a device reset,
 	 * as the device then is in a faulty state.
 	 */
-	int ret = BLK_EH_RESET_TIMER;
-
-	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
-							nvmeq->qid);
-
-	spin_lock_irq(&nvmeq->q_lock);
-	if (!nvmeq->dev->initialized) {
-		/*
-		 * Force cancelled command frees the request, which requires we
-		 * return BLK_EH_NOT_HANDLED.
-		 */
-		nvme_cancel_queue_ios(nvmeq->hctx, req, nvmeq, reserved);
-		ret = BLK_EH_NOT_HANDLED;
-	} else
-		nvme_abort_req(req);
-	spin_unlock_irq(&nvmeq->q_lock);
-
-	return ret;
+	return BLK_EH_RESET_TIMER;
 }

 static void nvme_free_queue(struct nvme_queue *nvmeq)
···
 	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;

 	spin_lock_irq(&nvmeq->q_lock);
-	nvme_process_cq(nvmeq);
 	if (hctx && hctx->tags)
 		blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
···
 	}
 	if (!qid && dev->admin_q)
 		blk_mq_freeze_queue_start(dev->admin_q);
-	nvme_clear_queue(nvmeq);
+
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_process_cq(nvmeq);
+	spin_unlock_irq(&nvmeq->q_lock);
 }

 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
···
 	return 0;
 }

+static void nvme_config_discard(struct nvme_ns *ns)
+{
+	u32 logical_block_size = queue_logical_block_size(ns->queue);
+	ns->queue->limits.discard_zeroes_data = 0;
+	ns->queue->limits.discard_alignment = logical_block_size;
+	ns->queue->limits.discard_granularity = logical_block_size;
+	ns->queue->limits.max_discard_sectors = 0xffffffff;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+}
+
+static int nvme_noop_verify(struct blk_integrity_iter *iter)
+{
+	return 0;
+}
+
+static int nvme_noop_generate(struct blk_integrity_iter *iter)
+{
+	return 0;
+}
+
+struct blk_integrity nvme_meta_noop = {
+	.name			= "NVME_META_NOOP",
+	.generate_fn		= nvme_noop_generate,
+	.verify_fn		= nvme_noop_verify,
+};
+
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+	struct blk_integrity integrity;
+
+	switch (ns->pi_type) {
+	case NVME_NS_DPS_PI_TYPE3:
+		integrity = t10_pi_type3_crc;
+		break;
+	case NVME_NS_DPS_PI_TYPE1:
+	case NVME_NS_DPS_PI_TYPE2:
+		integrity = t10_pi_type1_crc;
+		break;
+	default:
+		integrity = nvme_meta_noop;
+		break;
+	}
+	integrity.tuple_size = ns->ms;
+	blk_integrity_register(ns->disk, &integrity);
+	blk_queue_max_integrity_segments(ns->queue, 1);
+}
+
 static int nvme_revalidate_disk(struct gendisk *disk)
 {
 	struct nvme_ns *ns = disk->private_data;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_id_ns *id;
 	dma_addr_t dma_addr;
-	int lbaf;
+	int lbaf, pi_type, old_ms;
+	unsigned short bs;

 	id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
 								GFP_KERNEL);
···
 							__func__);
 		return 0;
 	}
+	if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
+		dev_warn(&dev->pci_dev->dev,
+			"identify failed ns:%d, setting capacity to 0\n",
+			ns->ns_id);
+		memset(id, 0, sizeof(*id));
+	}

-	if (nvme_identify(dev, ns->ns_id, 0, dma_addr))
-		goto free;
-
-	lbaf = id->flbas & 0xf;
+	old_ms = ns->ms;
+	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
 	ns->lba_shift = id->lbaf[lbaf].ds;
+	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);

-	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
-	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
- free:
+	/*
+	 * If identify namespace failed, use default 512 byte block size so
+	 * block layer can use before failing read/write for 0 capacity.
+	 */
+	if (ns->lba_shift == 0)
+		ns->lba_shift = 9;
+	bs = 1 << ns->lba_shift;
+
+	/* XXX: PI implementation requires metadata equal t10 pi tuple size */
+	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
+					id->dps & NVME_NS_DPS_PI_MASK : 0;
+
+	if (disk->integrity && (ns->pi_type != pi_type || ns->ms != old_ms ||
+				bs != queue_logical_block_size(disk->queue) ||
+				(ns->ms && id->flbas & NVME_NS_FLBAS_META_EXT)))
+		blk_integrity_unregister(disk);
+
+	ns->pi_type = pi_type;
+	blk_queue_logical_block_size(ns->queue, bs);
+
+	if (ns->ms && !disk->integrity && (disk->flags & GENHD_FL_UP) &&
+				!(id->flbas & NVME_NS_FLBAS_META_EXT))
+		nvme_init_integrity(ns);
+
+	if (id->ncap == 0 || (ns->ms && !disk->integrity))
+		set_capacity(disk, 0);
+	else
+		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+	if (dev->oncs & NVME_CTRL_ONCS_DSM)
+		nvme_config_discard(ns);
+
 	dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
 	return 0;
 }
···
 	spin_lock(&dev_list_lock);
 	list_for_each_entry_safe(dev, next, &dev_list, node) {
 		int i;
-		if (readl(&dev->bar->csts) & NVME_CSTS_CFS &&
-							dev->initialized) {
+		if (readl(&dev->bar->csts) & NVME_CSTS_CFS) {
 			if (work_busy(&dev->reset_work))
 				continue;
 			list_del_init(&dev->node);
···
 	return 0;
 }

-static void nvme_config_discard(struct nvme_ns *ns)
-{
-	u32 logical_block_size = queue_logical_block_size(ns->queue);
-	ns->queue->limits.discard_zeroes_data = 0;
-	ns->queue->limits.discard_alignment = logical_block_size;
-	ns->queue->limits.discard_granularity = logical_block_size;
-	ns->queue->limits.max_discard_sectors = 0xffffffff;
-	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
-}
-
-static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
-			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
+static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
 {
 	struct nvme_ns *ns;
 	struct gendisk *disk;
 	int node = dev_to_node(&dev->pci_dev->dev);
-	int lbaf;
-
-	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
-		return NULL;

 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
-		return NULL;
+		return;
+
 	ns->queue = blk_mq_init_queue(&dev->tagset);
 	if (IS_ERR(ns->queue))
 		goto out_free_ns;
···

 	ns->ns_id = nsid;
 	ns->disk = disk;
-	lbaf = id->flbas & 0xf;
-	ns->lba_shift = id->lbaf[lbaf].ds;
-	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
+	list_add_tail(&ns->list, &dev->namespaces);
+
 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
 	if (dev->max_hw_sectors)
 		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
···
 	disk->fops = &nvme_fops;
 	disk->private_data = ns;
 	disk->queue = ns->queue;
-	disk->driverfs_dev = &dev->pci_dev->dev;
+	disk->driverfs_dev = dev->device;
 	disk->flags = GENHD_FL_EXT_DEVT;
 	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
-	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

-	if (dev->oncs & NVME_CTRL_ONCS_DSM)
-		nvme_config_discard(ns);
-
-	return ns;
-
+	/*
+	 * Initialize capacity to 0 until we establish the namespace format and
+	 * setup integrity extensions if necessary. The revalidate_disk after
+	 * add_disk allows the driver to register with integrity if the format
+	 * requires it.
+	 */
+	set_capacity(disk, 0);
+	nvme_revalidate_disk(ns->disk);
+	add_disk(ns->disk);
+	if (ns->ms)
+		revalidate_disk(ns->disk);
+	return;
 out_free_queue:
 	blk_cleanup_queue(ns->queue);
 out_free_ns:
 	kfree(ns);
-	return NULL;
 }
···
 	struct pci_dev *pdev = dev->pci_dev;
 	int res;
 	unsigned nn, i;
-	struct nvme_ns *ns;
 	struct nvme_id_ctrl *ctrl;
-	struct nvme_id_ns *id_ns;
 	void *mem;
 	dma_addr_t dma_addr;
 	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;

-	mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);
+	mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL);
 	if (!mem)
 		return -ENOMEM;

 	res = nvme_identify(dev, 0, 1, dma_addr);
 	if (res) {
 		dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
-		res = -EIO;
-		goto out;
+		dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+		return -EIO;
 	}

 	ctrl = mem;
···
 		} else
 			dev->max_hw_sectors = max_hw_sectors;
 	}
+	dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);

 	dev->tagset.ops = &nvme_mq_ops;
 	dev->tagset.nr_hw_queues = dev->online_queues - 1;
···
 	dev->tagset.driver_data = dev;

 	if (blk_mq_alloc_tag_set(&dev->tagset))
-		goto out;
+		return 0;

-	id_ns = mem;
-	for (i = 1; i <= nn; i++) {
-		res = nvme_identify(dev, i, 0, dma_addr);
-		if (res)
-			continue;
+	for (i = 1; i <= nn; i++)
+		nvme_alloc_ns(dev, i);

-		if (id_ns->ncap == 0)
-			continue;
-
-		res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
-							dma_addr + 4096, NULL);
-		if (res)
-			memset(mem + 4096, 0, 4096);
-
-		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
-		if (ns)
-			list_add_tail(&ns->list, &dev->namespaces);
-	}
-	list_for_each_entry(ns, &dev->namespaces, list)
-		add_disk(ns->disk);
-	res = 0;
-
- out:
-	dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
-	return res;
+	return 0;
 }

 static int nvme_dev_map(struct nvme_dev *dev)
···
 static void nvme_del_queue_end(struct nvme_queue *nvmeq)
 {
 	struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
-
-	nvme_clear_queue(nvmeq);
 	nvme_put_dq(dq);
 }
···
 	int i;
 	u32 csts = -1;

-	dev->initialized = 0;
 	nvme_dev_list_remove(dev);

 	if (dev->bar) {
···
 		for (i = dev->queue_count - 1; i >= 0; i--) {
 			struct nvme_queue *nvmeq = dev->queues[i];
 			nvme_suspend_queue(nvmeq);
-			nvme_clear_queue(nvmeq);
 		}
 	} else {
 		nvme_disable_io_queues(dev);
···
 		nvme_disable_queue(dev, 0);
 	}
 	nvme_dev_unmap(dev);
+
+	for (i = dev->queue_count - 1; i >= 0; i--)
+		nvme_clear_queue(dev->queues[i]);
 }

 static void nvme_dev_remove(struct nvme_dev *dev)
···
 	struct nvme_ns *ns;

 	list_for_each_entry(ns, &dev->namespaces, list) {
-		if (ns->disk->flags & GENHD_FL_UP)
+		if (ns->disk->flags & GENHD_FL_UP) {
+			if (ns->disk->integrity)
+				blk_integrity_unregister(ns->disk);
 			del_gendisk(ns->disk);
+		}
 		if (!blk_queue_dying(ns->queue)) {
 			blk_mq_abort_requeue_list(ns->queue);
 			blk_cleanup_queue(ns->queue);
···
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);

 	pci_dev_put(dev->pci_dev);
+	put_device(dev->device);
 	nvme_free_namespaces(dev);
 	nvme_release_instance(dev);
 	blk_mq_free_tag_set(&dev->tagset);
···

 static int nvme_dev_open(struct inode *inode, struct file *f)
 {
-	struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev,
-								miscdev);
-	kref_get(&dev->kref);
-	f->private_data = dev;
-	return 0;
+	struct nvme_dev *dev;
+	int instance = iminor(inode);
+	int ret = -ENODEV;
+
+	spin_lock(&dev_list_lock);
+	list_for_each_entry(dev, &dev_list, node) {
+		if (dev->instance == instance) {
+			if (!dev->admin_q) {
+				ret = -EWOULDBLOCK;
+				break;
+			}
+			if (!kref_get_unless_zero(&dev->kref))
+				break;
+			f->private_data = dev;
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock(&dev_list_lock);
+
+	return ret;
 }

 static int nvme_dev_release(struct inode *inode, struct file *f)
···
 		nvme_unfreeze_queues(dev);
 		nvme_set_irq_hints(dev);
 	}
-	dev->initialized = 1;
 	return 0;
 }
···
 	dev->reset_workfn(work);
 }

+static void nvme_async_probe(struct work_struct *work);
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	int node, result = -ENOMEM;
···
 		goto release;

 	kref_init(&dev->kref);
-	result = nvme_dev_start(dev);
-	if (result)
+	dev->device = device_create(nvme_class, &pdev->dev,
+				MKDEV(nvme_char_major, dev->instance),
+				dev, "nvme%d", dev->instance);
+	if (IS_ERR(dev->device)) {
+		result = PTR_ERR(dev->device);
 		goto release_pools;
+	}
+	get_device(dev->device);

-	if (dev->online_queues > 1)
-		result = nvme_dev_add(dev);
-	if (result)
-		goto shutdown;
-
-	scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
-	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
-	dev->miscdev.parent = &pdev->dev;
-	dev->miscdev.name = dev->name;
-	dev->miscdev.fops = &nvme_dev_fops;
-	result = misc_register(&dev->miscdev);
-	if (result)
-		goto remove;
-
-	nvme_set_irq_hints(dev);
-
-	dev->initialized = 1;
+	INIT_WORK(&dev->probe_work, nvme_async_probe);
+	schedule_work(&dev->probe_work);
 	return 0;

- remove:
-	nvme_dev_remove(dev);
-	nvme_dev_remove_admin(dev);
-	nvme_free_namespaces(dev);
- shutdown:
-	nvme_dev_shutdown(dev);
 release_pools:
-	nvme_free_queues(dev, 0);
 	nvme_release_prp_pools(dev);
 release:
 	nvme_release_instance(dev);
···
 	kfree(dev->entry);
 	kfree(dev);
 	return result;
+}
+
+static void nvme_async_probe(struct work_struct *work)
+{
+	struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
+	int result;
+
+	result = nvme_dev_start(dev);
+	if (result)
+		goto reset;
+
+	if (dev->online_queues > 1)
+		result = nvme_dev_add(dev);
+	if (result)
+		goto reset;
+
+	nvme_set_irq_hints(dev);
+	return;
+ reset:
+	if (!work_busy(&dev->reset_work)) {
+		dev->reset_workfn = nvme_reset_failed_dev;
+		queue_work(nvme_workq, &dev->reset_work);
+	}
 }
···
 	spin_unlock(&dev_list_lock);

 	pci_set_drvdata(pdev, NULL);
+	flush_work(&dev->probe_work);
 	flush_work(&dev->reset_work);
-	misc_deregister(&dev->miscdev);
 	nvme_dev_shutdown(dev);
 	nvme_dev_remove(dev);
 	nvme_dev_remove_admin(dev);
+	device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
 	nvme_free_queues(dev, 0);
 	nvme_release_prp_pools(dev);
 	kref_put(&dev->kref, nvme_free_dev);
···
 	else if (result > 0)
 		nvme_major = result;

+	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
+							&nvme_dev_fops);
+	if (result < 0)
+		goto unregister_blkdev;
+	else if (result > 0)
+		nvme_char_major = result;
+
+	nvme_class = class_create(THIS_MODULE, "nvme");
+	if (!nvme_class)
+		goto unregister_chrdev;
+
 	result = pci_register_driver(&nvme_driver);
 	if (result)
-		goto unregister_blkdev;
+		goto destroy_class;
 	return 0;

+ destroy_class:
+	class_destroy(nvme_class);
+ unregister_chrdev:
+	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
 unregister_blkdev:
 	unregister_blkdev(nvme_major, "nvme");
 kill_workq:
···
 static void __exit nvme_exit(void)
 {
 	pci_unregister_driver(&nvme_driver);
-	unregister_hotcpu_notifier(&nvme_nb);
 	unregister_blkdev(nvme_major, "nvme");
 	destroy_workqueue(nvme_workq);
+	class_destroy(nvme_class);
+	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
 	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
 	_nvme_check_size();
 }
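The most subtle piece of the metadata work above is nvme_dif_remap(): partitions and DM/MD stacking mean the ref tag seeded by the block layer (the virtual start sector) can differ from the LBA the device actually sees, so Type 1/2 protection information is rewritten virt to phys before submission and phys back to virt on completion. A standalone sketch of that two-way remap on plain u32 tags (illustrative only; the driver operates on struct t10_pi_tuple inside the bio integrity payload):

#include <stdint.h>
#include <stdio.h>

/* Rewrite each sequential ref tag that still carries the expected value. */
static void remap_ref_tags(uint32_t *ref_tags, uint32_t nlb,
			   uint32_t from, uint32_t to)
{
	for (uint32_t i = 0; i < nlb; i++, from++, to++)
		if (ref_tags[i] == from)	/* only matching tags are remapped */
			ref_tags[i] = to;
}

int main(void)
{
	uint32_t tags[4] = { 100, 101, 102, 103 };	/* seeded at virt = 100 */

	remap_ref_tags(tags, 4, 100, 4196);	/* prep: virt -> phys before submit */
	remap_ref_tags(tags, 4, 4196, 100);	/* complete: phys -> virt on read */
	printf("%u %u\n", tags[0], tags[3]);	/* back to 100 103 */
	return 0;
}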
+54 -42
drivers/block/nvme-scsi.c
···
 	struct nvme_dev *dev = ns->dev;
 	dma_addr_t dma_addr;
 	void *mem;
-	struct nvme_id_ctrl *id_ctrl;
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
-	u8 ieee[4];
 	int xfer_len;
 	__be32 tmp_id = cpu_to_be32(ns->ns_id);

···
 		goto out_dma;
 	}

-	/* nvme controller identify */
-	nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
-	res = nvme_trans_status_code(hdr, nvme_sc);
-	if (res)
-		goto out_free;
-	if (nvme_sc) {
-		res = nvme_sc;
-		goto out_free;
-	}
-	id_ctrl = mem;
-
-	/* Since SCSI tried to save 4 bits... [SPC-4(r34) Table 591] */
-	ieee[0] = id_ctrl->ieee[0] << 4;
-	ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4;
-	ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4;
-	ieee[3] = id_ctrl->ieee[2] >> 4;
-
-	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+	memset(inq_response, 0, alloc_len);
 	inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;    /* Page Code */
-	inq_response[3] = 20;      /* Page Length */
-	/* Designation Descriptor start */
-	inq_response[4] = 0x01;    /* Proto ID=0h | Code set=1h */
-	inq_response[5] = 0x03;    /* PIV=0b | Asso=00b | Designator Type=3h */
-	inq_response[6] = 0x00;    /* Rsvd */
-	inq_response[7] = 16;      /* Designator Length */
-	/* Designator start */
-	inq_response[8] = 0x60 | ieee[3]; /* NAA=6h | IEEE ID MSB, High nibble*/
-	inq_response[9] = ieee[2]; /* IEEE ID */
-	inq_response[10] = ieee[1]; /* IEEE ID */
-	inq_response[11] = ieee[0]; /* IEEE ID| Vendor Specific ID... */
-	inq_response[12] = (dev->pci_dev->vendor & 0xFF00) >> 8;
-	inq_response[13] = (dev->pci_dev->vendor & 0x00FF);
-	inq_response[14] = dev->serial[0];
-	inq_response[15] = dev->serial[1];
-	inq_response[16] = dev->model[0];
-	inq_response[17] = dev->model[1];
-	memcpy(&inq_response[18], &tmp_id, sizeof(u32));
-	/* Last 2 bytes are zero */
+	if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) {
+		struct nvme_id_ns *id_ns = mem;
+		void *eui = id_ns->eui64;
+		int len = sizeof(id_ns->eui64);

-	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+		nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+		res = nvme_trans_status_code(hdr, nvme_sc);
+		if (res)
+			goto out_free;
+		if (nvme_sc) {
+			res = nvme_sc;
+			goto out_free;
+		}
+
+		if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) {
+			if (bitmap_empty(eui, len * 8)) {
+				eui = id_ns->nguid;
+				len = sizeof(id_ns->nguid);
+			}
+		}
+		if (bitmap_empty(eui, len * 8))
+			goto scsi_string;
+
+		inq_response[3] = 4 + len; /* Page Length */
+		/* Designation Descriptor start */
+		inq_response[4] = 0x01;    /* Proto ID=0h | Code set=1h */
+		inq_response[5] = 0x02;    /* PIV=0b | Asso=00b | Designator Type=2h */
+		inq_response[6] = 0x00;    /* Rsvd */
+		inq_response[7] = len;     /* Designator Length */
+		memcpy(&inq_response[8], eui, len);
+	} else {
+ scsi_string:
+		if (alloc_len < 72) {
+			res = nvme_trans_completion(hdr,
+					SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+			goto out_free;
+		}
+		inq_response[3] = 0x48;    /* Page Length */
+		/* Designation Descriptor start */
+		inq_response[4] = 0x03;    /* Proto ID=0h | Code set=3h */
+		inq_response[5] = 0x08;    /* PIV=0b | Asso=00b | Designator Type=8h */
+		inq_response[6] = 0x00;    /* Rsvd */
+		inq_response[7] = 0x44;    /* Designator Length */
+
+		sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor);
+		memcpy(&inq_response[12], dev->model, sizeof(dev->model));
+		sprintf(&inq_response[52], "%04x", tmp_id);
+		memcpy(&inq_response[56], dev->serial, sizeof(dev->serial));
+	}
+	xfer_len = alloc_len;
 	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);

 out_free:
···
 		/* 10 Byte CDB */
 		*bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) +
 			parm_list[MODE_SELECT_10_BD_OFFSET + 1];
-		*llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] &&
+		*llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] &
 				MODE_SELECT_10_LLBAA_MASK;
 	} else {
 		/* 6 Byte CDB */
···
 	page_code = GET_INQ_PAGE_CODE(cmd);
 	alloc_len = GET_INQ_ALLOC_LENGTH(cmd);

-	inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL);
+	inq_response = kmalloc(alloc_len, GFP_KERNEL);
 	if (inq_response == NULL) {
 		res = -ENOMEM;
 		goto out_mem;
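For reference, the new VPD page 83h translation prefers a binary EUI-64 designator (type 2h) when the controller reports NVMe 1.1 or later, falling back to a SCSI name string otherwise. A sketch of the descriptor bytes it builds, using made-up EUI-64 contents (the byte meanings follow SPC-4; the buffer and values here are illustrative, not driver code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t eui64[8] = { 0x00, 0x25, 0x38, 0x01, 0x02, 0x03, 0x04, 0x05 };
	uint8_t vpd[16] = { 0 };

	vpd[1] = 0x83;			/* Device Identification VPD page */
	vpd[3] = 4 + sizeof(eui64);	/* page length: one 12-byte descriptor */
	vpd[4] = 0x01;			/* protocol id 0h, code set 1h (binary) */
	vpd[5] = 0x02;			/* PIV 0, association 0, designator type 2h (EUI-64) */
	vpd[7] = sizeof(eui64);		/* designator length */
	memcpy(&vpd[8], eui64, sizeof(eui64));

	for (size_t i = 0; i < sizeof(vpd); i++)
		printf("%02x ", vpd[i]);	/* dump the descriptor bytes */
	printf("\n");
	return 0;
}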
+4 -5
include/linux/nvme.h
···

 #include <uapi/linux/nvme.h>
 #include <linux/pci.h>
-#include <linux/miscdevice.h>
 #include <linux/kref.h>
 #include <linux/blk-mq.h>

···
 	NVME_CSTS_SHST_MASK	= 3 << 2,
 };

-#define NVME_VS(major, minor)	(major << 16 | minor)
-
 extern unsigned char nvme_io_timeout;
 #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)

···
 	struct nvme_bar __iomem *bar;
 	struct list_head namespaces;
 	struct kref kref;
-	struct miscdevice miscdev;
+	struct device *device;
 	work_func_t reset_workfn;
 	struct work_struct reset_work;
+	struct work_struct probe_work;
 	char name[12];
 	char serial[20];
 	char model[40];
···
 	u16 abort_limit;
 	u8 event_limit;
 	u8 vwc;
-	u8 initialized;
 };

 /*
···
 	unsigned ns_id;
 	int lba_shift;
 	int ms;
+	int pi_type;
 	u64 mode_select_num_blocks;
 	u32 mode_select_block_len;
 };
···
 	int nents;		/* Used in scatterlist */
 	int length;		/* Of data, in bytes */
 	dma_addr_t first_dma;
+	struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */
 	struct scatterlist sg[0];
 };
+25 -1
include/uapi/linux/nvme.h
···
 	__le16			nawun;
 	__le16			nawupf;
 	__le16			nacwu;
-	__u8			rsvd40[80];
+	__le16			nabsn;
+	__le16			nabo;
+	__le16			nabspf;
+	__u16			rsvd46;
+	__le64			nvmcap[2];
+	__u8			rsvd64[40];
+	__u8			nguid[16];
 	__u8			eui64[8];
 	struct nvme_lbaf	lbaf[16];
 	__u8			rsvd192[192];
···

 enum {
 	NVME_NS_FEAT_THIN	= 1 << 0,
+	NVME_NS_FLBAS_LBA_MASK	= 0xf,
+	NVME_NS_FLBAS_META_EXT	= 0x10,
 	NVME_LBAF_RP_BEST	= 0,
 	NVME_LBAF_RP_BETTER	= 1,
 	NVME_LBAF_RP_GOOD	= 2,
 	NVME_LBAF_RP_DEGRADED	= 3,
+	NVME_NS_DPC_PI_LAST	= 1 << 4,
+	NVME_NS_DPC_PI_FIRST	= 1 << 3,
+	NVME_NS_DPC_PI_TYPE3	= 1 << 2,
+	NVME_NS_DPC_PI_TYPE2	= 1 << 1,
+	NVME_NS_DPC_PI_TYPE1	= 1 << 0,
+	NVME_NS_DPS_PI_FIRST	= 1 << 3,
+	NVME_NS_DPS_PI_MASK	= 0x7,
+	NVME_NS_DPS_PI_TYPE1	= 1,
+	NVME_NS_DPS_PI_TYPE2	= 2,
+	NVME_NS_DPS_PI_TYPE3	= 3,
 };

 struct nvme_smart_log {
···
 	NVME_RW_DSM_LATENCY_LOW	= 3 << 4,
 	NVME_RW_DSM_SEQ_REQ	= 1 << 6,
 	NVME_RW_DSM_COMPRESSED	= 1 << 7,
+	NVME_RW_PRINFO_PRCHK_REF = 1 << 10,
+	NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
+	NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
+	NVME_RW_PRINFO_PRACT	= 1 << 13,
 };

 struct nvme_dsm_cmd {
···
 	__u32	timeout_ms;
 	__u32	result;
 };
+
+#define NVME_VS(major, minor) (((major) << 16) | ((minor) << 8))

 #define nvme_admin_cmd nvme_passthru_cmd
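One small but consequential change at the end: NVME_VS() moved to the uapi header and now places the minor version in bits 15:8, matching the layout of the controller's version register (major in bits 31:16, minor in bits 15:8), so comparisons such as readl(&dev->bar->vs) >= NVME_VS(1, 1) behave correctly. A quick standalone check of the two encodings (the old macro from include/linux/nvme.h is reproduced here for comparison):

#include <assert.h>
#include <stdio.h>

#define NVME_VS_OLD(major, minor)	(major << 16 | minor)
#define NVME_VS(major, minor)		(((major) << 16) | ((minor) << 8))

int main(void)
{
	unsigned int vs_reg = 0x00010100;	/* VS register of an NVMe 1.1 controller */

	assert(NVME_VS(1, 1) == vs_reg);	/* new encoding matches the register */
	assert(NVME_VS_OLD(1, 1) != vs_reg);	/* old encoding (0x10001) compares wrong */
	printf("new: %#x, old: %#x\n", NVME_VS(1, 1), NVME_VS_OLD(1, 1));
	return 0;
}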