Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
"Small collection of fixes that would be nice to have in -rc1. This
contains:

- NVMe pull request from Christoph, mostly with fixes for nvme-pci,
  the host memory buffer support in particular.

- Error handling fixup for cgwb_create(), in case allocation of 'wb'
fails. From Christophe Jaillet.

- Ensure that trace_block_getrq() only fills in 'dev' when a bio is
  actually present, avoiding a potential NULL deref. From Greg Thelen.

- Regression fix for dm-mq with blk-mq, fixing a problem with
stacking IO schedulers. From me.

- string.h fixup, fixing an issue with memcpy_and_pad(). The original
  change came in through an NVMe dependency, which is why I'm
  including it here. From Martin Wilck.

- Fix potential int overflow in __blkdev_sectors_to_bio_pages(), from
Mikulas.

- MBR enable fix for sed-opal, from Scott"

* 'for-linus' of git://git.kernel.dk/linux-block:
block: directly insert blk-mq request from blk_insert_cloned_request()
mm/backing-dev.c: fix an error handling path in 'cgwb_create()'
string.h: un-fortify memcpy_and_pad
nvme-pci: implement the HMB entry number and size limitations
nvme-pci: propagate (some) errors from host memory buffer setup
nvme-pci: use appropriate initial chunk size for HMB allocation
nvme-pci: fix host memory buffer allocation fallback
nvme: fix lightnvm check
block: fix integer overflow in __blkdev_sectors_to_bio_pages()
block: sed-opal: Set MBRDone on S3 resume path if TPER is MBREnabled
block: tolerate tracing of NULL bio

+134 -81
+6 -1
block/blk-core.c
···
         if (q->mq_ops) {
                 if (blk_queue_io_stat(q))
                         blk_account_io_start(rq, true);
-                blk_mq_sched_insert_request(rq, false, true, false, false);
+                /*
+                 * Since we have a scheduler attached on the top device,
+                 * bypass a potential scheduler on the bottom device for
+                 * insert.
+                 */
+                blk_mq_request_bypass_insert(rq);
                 return BLK_STS_OK;
         }
+2 -2
block/blk-lib.c
···
  */
 static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
 {
-        sector_t bytes = (nr_sects << 9) + PAGE_SIZE - 1;
+        sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512);

-        return min(bytes >> PAGE_SHIFT, (sector_t)BIO_MAX_PAGES);
+        return min(pages, (sector_t)BIO_MAX_PAGES);
 }

 /**
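The blk-lib.c fix is easiest to see with concrete numbers: with a 32-bit
sector_t (a 32-bit kernel without CONFIG_LBDAF), shifting the sector count
left by 9 wraps for ranges of 8 GiB and more, so the old code could return
far too few pages, while dividing first, as DIV_ROUND_UP_SECTOR_T() does,
stays in range. A minimal user-space sketch of the two calculations,
assuming 4 KiB pages and a 32-bit sector type; illustration only, not
kernel code:

/*
 * User-space sketch of the overflow fixed above; not kernel code.
 * Assumes a 32-bit sector_t and 4 KiB pages.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t sector32_t;            /* stand-in for a 32-bit sector_t */

/* old calculation: shift to bytes first, then round up to pages (can wrap) */
static sector32_t old_pages(sector32_t nr_sects)
{
        const sector32_t page_size = 4096;      /* assumed 4 KiB pages */
        sector32_t bytes = (nr_sects << 9) + page_size - 1;

        return bytes >> 12;
}

/* new calculation: divide sectors by sectors-per-page, rounding up */
static sector32_t new_pages(sector32_t nr_sects)
{
        const sector32_t spp = 4096 / 512;      /* 8 sectors per page */

        return (nr_sects + spp - 1) / spp;
}

int main(void)
{
        sector32_t nr_sects = 1u << 24;         /* 16M sectors = 8 GiB */

        /* (1 << 24) << 9 is 1 << 33, which wraps to 0 in 32 bits */
        printf("old: %u pages\n", (unsigned int)old_pages(nr_sects));
        printf("new: %u pages\n", (unsigned int)new_pages(nr_sects));
        return 0;
}

With these inputs the old formula reports 0 pages while the divide-first
version reports the expected 2097152.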
+16
block/blk-mq.c
···
         blk_mq_hctx_mark_pending(hctx, ctx);
 }

+/*
+ * Should only be used carefully, when the caller knows we want to
+ * bypass a potential IO scheduler on the target device.
+ */
+void blk_mq_request_bypass_insert(struct request *rq)
+{
+        struct blk_mq_ctx *ctx = rq->mq_ctx;
+        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+
+        spin_lock(&hctx->lock);
+        list_add_tail(&rq->queuelist, &hctx->dispatch);
+        spin_unlock(&hctx->lock);
+
+        blk_mq_run_hw_queue(hctx, false);
+}
+
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
                             struct list_head *list)
+1
block/blk-mq.h
···
  */
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                                 bool at_head);
+void blk_mq_request_bypass_insert(struct request *rq);
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
                                 struct list_head *list);
+1
block/opal_proto.h
···
 #define GENERIC_HOST_SESSION_NUM 0x41

 #define TPER_SYNC_SUPPORTED 0x01
+#define MBR_ENABLED_MASK 0x10

 #define TINY_ATOM_DATA_MASK 0x3F
 #define TINY_ATOM_SIGNED 0x40
+32
block/sed-opal.c
···
 struct opal_dev {
         bool supported;
+        bool mbr_enabled;

         void *data;
         sec_send_recv *send_recv;
···
         return true;
 }

+static bool check_mbrenabled(const void *data)
+{
+        const struct d0_locking_features *lfeat = data;
+        u8 sup_feat = lfeat->supported_features;
+
+        return !!(sup_feat & MBR_ENABLED_MASK);
+}
+
 static bool check_sum(const void *data)
 {
         const struct d0_single_user_mode *sum = data;
···
         u32 hlen = be32_to_cpu(hdr->length);

         print_buffer(dev->resp, hlen);
+        dev->mbr_enabled = false;

         if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
                 pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
···
                 check_geometry(dev, body);
                 break;
         case FC_LOCKING:
+                dev->mbr_enabled = check_mbrenabled(body->features);
+                break;
         case FC_ENTERPRISE:
         case FC_DATASTORE:
                 /* some ignored properties */
···
         return next(dev);
 }

+static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key)
+{
+        u8 mbr_done_tf = 1;
+        const struct opal_step mbrdone_step [] = {
+                { opal_discovery0, },
+                { start_admin1LSP_opal_session, key },
+                { set_mbr_done, &mbr_done_tf },
+                { end_opal_session, },
+                { NULL, }
+        };
+
+        dev->steps = mbrdone_step;
+        return next(dev);
+}
+
 static int opal_lock_unlock(struct opal_dev *dev,
                             struct opal_lock_unlock *lk_unlk)
 {
···
                                  suspend->unlk.session.opal_key.lr,
                                  suspend->unlk.session.sum);
                         was_failure = true;
+                }
+                if (dev->mbr_enabled) {
+                        ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key);
+                        if (ret)
+                                pr_debug("Failed to set MBR Done in S3 resume\n");
                 }
         }
         mutex_unlock(&dev->dev_lock);
+7 -4
drivers/nvme/host/core.c
···
                 ctrl->cntlid = le16_to_cpu(id->cntlid);
                 ctrl->hmpre = le32_to_cpu(id->hmpre);
                 ctrl->hmmin = le32_to_cpu(id->hmmin);
+                ctrl->hmminds = le32_to_cpu(id->hmminds);
+                ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
         }

         kfree(id);
···
         nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid);

-        if (nvme_nvm_ns_supported(ns, id) &&
-            nvme_nvm_register(ns, disk_name, node)) {
-                dev_warn(ctrl->device, "%s: LightNVM init failure\n", __func__);
-                goto out_free_id;
+        if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
+                if (nvme_nvm_register(ns, disk_name, node)) {
+                        dev_warn(ctrl->device, "LightNVM init failure\n");
+                        goto out_free_id;
+                }
         }

         disk = alloc_disk_node(0, node);
-26
drivers/nvme/host/lightnvm.c
···
         sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
                                         &nvm_dev_attr_group);
 }
-
-/* move to shared place when used in multiple places. */
-#define PCI_VENDOR_ID_CNEX 0x1d1d
-#define PCI_DEVICE_ID_CNEX_WL 0x2807
-#define PCI_DEVICE_ID_CNEX_QEMU 0x1f1f
-
-int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
-{
-        struct nvme_ctrl *ctrl = ns->ctrl;
-        /* XXX: this is poking into PCI structures from generic code! */
-        struct pci_dev *pdev = to_pci_dev(ctrl->dev);
-
-        /* QEMU NVMe simulator - PCI ID + Vendor specific bit */
-        if (pdev->vendor == PCI_VENDOR_ID_CNEX &&
-            pdev->device == PCI_DEVICE_ID_CNEX_QEMU &&
-            id->vs[0] == 0x1)
-                return 1;
-
-        /* CNEX Labs - PCI ID + Vendor specific bit */
-        if (pdev->vendor == PCI_VENDOR_ID_CNEX &&
-            pdev->device == PCI_DEVICE_ID_CNEX_WL &&
-            id->vs[0] == 0x1)
-                return 1;
-
-        return 0;
-}
+8 -5
drivers/nvme/host/nvme.h
···
          * The deepest sleep state should not be used.
          */
         NVME_QUIRK_NO_DEEPEST_PS = (1 << 5),
+
+        /*
+         * Supports the LighNVM command set if indicated in vs[1].
+         */
+        NVME_QUIRK_LIGHTNVM = (1 << 6),
 };

 /*
···
         u64 ps_max_latency_us;
         bool apst_enabled;

+        /* PCIe only: */
         u32 hmpre;
         u32 hmmin;
+        u32 hmminds;
+        u16 hmmaxd;

         /* Fabrics only */
         u16 sqsize;
···
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl);

 #ifdef CONFIG_NVM
-int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id);
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
 void nvme_nvm_unregister(struct nvme_ns *ns);
 int nvme_nvm_register_sysfs(struct nvme_ns *ns);
···
         return 0;
 }
 static inline void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) {};
-static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
-{
-        return 0;
-}
 static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd,
                                                         unsigned long arg)
 {
+50 -24
drivers/nvme/host/pci.c
···
         dev->host_mem_descs = NULL;
 }

-static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
+static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
+                u32 chunk_size)
 {
         struct nvme_host_mem_buf_desc *descs;
-        u32 chunk_size, max_entries, len;
+        u32 max_entries, len;
         dma_addr_t descs_dma;
         int i = 0;
         void **bufs;
         u64 size = 0, tmp;

-        /* start big and work our way down */
-        chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER);
-retry:
         tmp = (preferred + chunk_size - 1);
         do_div(tmp, chunk_size);
         max_entries = tmp;
+
+        if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
+                max_entries = dev->ctrl.hmmaxd;
+
         descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs),
                         &descs_dma, GFP_KERNEL);
         if (!descs)
···
                 i++;
         }

-        if (!size || (min && size < min)) {
-                dev_warn(dev->ctrl.device,
-                        "failed to allocate host memory buffer.\n");
+        if (!size)
                 goto out_free_bufs;
-        }

-        dev_info(dev->ctrl.device,
-                "allocated %lld MiB host memory buffer.\n",
-                size >> ilog2(SZ_1M));
         dev->nr_host_mem_descs = i;
         dev->host_mem_size = size;
         dev->host_mem_descs = descs;
···
         dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
                         descs_dma);
 out:
-        /* try a smaller chunk size if we failed early */
-        if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) {
-                chunk_size /= 2;
-                goto retry;
-        }
         dev->host_mem_descs = NULL;
         return -ENOMEM;
 }

-static void nvme_setup_host_mem(struct nvme_dev *dev)
+static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
+{
+        u32 chunk_size;
+
+        /* start big and work our way down */
+        for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
+             chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
+             chunk_size /= 2) {
+                if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
+                        if (!min || dev->host_mem_size >= min)
+                                return 0;
+                        nvme_free_host_mem(dev);
+                }
+        }
+
+        return -ENOMEM;
+}
+
+static int nvme_setup_host_mem(struct nvme_dev *dev)
 {
         u64 max = (u64)max_host_mem_size_mb * SZ_1M;
         u64 preferred = (u64)dev->ctrl.hmpre * 4096;
         u64 min = (u64)dev->ctrl.hmmin * 4096;
         u32 enable_bits = NVME_HOST_MEM_ENABLE;
+        int ret = 0;

         preferred = min(preferred, max);
         if (min > max) {
···
                         "min host memory (%lld MiB) above limit (%d MiB).\n",
                         min >> ilog2(SZ_1M), max_host_mem_size_mb);
                 nvme_free_host_mem(dev);
-                return;
+                return 0;
         }

         /*
···
         }

         if (!dev->host_mem_descs) {
-                if (nvme_alloc_host_mem(dev, min, preferred))
-                        return;
+                if (nvme_alloc_host_mem(dev, min, preferred)) {
+                        dev_warn(dev->ctrl.device,
+                                 "failed to allocate host memory buffer.\n");
+                        return 0; /* controller must work without HMB */
+                }
+
+                dev_info(dev->ctrl.device,
+                        "allocated %lld MiB host memory buffer.\n",
+                        dev->host_mem_size >> ilog2(SZ_1M));
         }

-        if (nvme_set_host_mem(dev, enable_bits))
+        ret = nvme_set_host_mem(dev, enable_bits);
+        if (ret)
                 nvme_free_host_mem(dev);
+        return ret;
 }

 static int nvme_setup_io_queues(struct nvme_dev *dev)
···
                         "unable to allocate dma for dbbuf\n");
         }

-        if (dev->ctrl.hmpre)
-                nvme_setup_host_mem(dev);
+        if (dev->ctrl.hmpre) {
+                result = nvme_setup_host_mem(dev);
+                if (result < 0)
+                        goto out;
+        }

         result = nvme_setup_io_queues(dev);
         if (result)
···
                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
         { PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+        { PCI_DEVICE(0x1d1d, 0x1f1f),   /* LighNVM qemu device */
+                .driver_data = NVME_QUIRK_LIGHTNVM, },
+        { PCI_DEVICE(0x1d1d, 0x2807),   /* CNEX WL */
+                .driver_data = NVME_QUIRK_LIGHTNVM, },
         { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
         { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
         { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
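To make the new retry policy concrete: nvme_alloc_host_mem() now starts at
min(preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES), halves the chunk size after
each failed or undersized attempt, and gives up once the chunk would drop
below max(hmminds * 4096, PAGE_SIZE * 2). Below is a stand-alone user-space
sketch of just that walk-down, with assumed values (4 KiB pages,
MAX_ORDER_NR_PAGES of 1024, a 256 MiB HMPRE and an HMMINDS of 0); it is an
illustration, not driver code.

/*
 * User-space sketch of the HMB chunk-size walk-down; assumed values,
 * not driver code.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t page_size = 4096;                /* assumed 4 KiB pages */
        const uint64_t max_order_nr_pages = 1024;       /* assumed, arch dependent */
        uint64_t preferred = 65536ULL * 4096;           /* HMPRE of 64K units = 256 MiB */
        uint64_t hmminds = 0;                           /* minimum chunk, in 4 KiB units */
        uint64_t floor, chunk;

        /* never go below the controller's minimum chunk or two pages */
        floor = hmminds * 4096 > page_size * 2 ? hmminds * 4096 : page_size * 2;

        /* start big and work our way down, as nvme_alloc_host_mem() does */
        for (chunk = preferred < page_size * max_order_nr_pages ?
                     preferred : page_size * max_order_nr_pages;
             chunk >= floor; chunk /= 2)
                printf("would try %llu KiB chunks\n",
                       (unsigned long long)(chunk / 1024));

        return 0;
}

With these numbers the driver would try 4 MiB chunks first and fall back as
far as 8 KiB before giving up, and the caller now only warns and continues,
since a controller must be able to operate without an HMB.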
+3 -1
include/linux/nvme.h
···
         __le16 mntmt;
         __le16 mxtmt;
         __le32 sanicap;
-        __u8 rsvd332[180];
+        __le32 hmminds;
+        __le16 hmmaxd;
+        __u8 rsvd338[174];
         __u8 sqes;
         __u8 cqes;
         __le16 maxcmd;
+2 -13
include/linux/string.h
···
  * @count: The number of bytes to copy
  * @pad: Character to use for padding if space is left in destination.
  */
-__FORTIFY_INLINE void memcpy_and_pad(void *dest, size_t dest_len,
-                                     const void *src, size_t count, int pad)
+static inline void memcpy_and_pad(void *dest, size_t dest_len,
+                                  const void *src, size_t count, int pad)
 {
-        size_t dest_size = __builtin_object_size(dest, 0);
-        size_t src_size = __builtin_object_size(src, 0);
-
-        if (__builtin_constant_p(dest_len) && __builtin_constant_p(count)) {
-                if (dest_size < dest_len && dest_size < count)
-                        __write_overflow();
-                else if (src_size < dest_len && src_size < count)
-                        __read_overflow3();
-        }
-        if (dest_size < dest_len)
-                fortify_panic(__func__);
         if (dest_len > count) {
                 memcpy(dest, src, count);
                 memset(dest + count, pad, dest_len - count);
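For reference, the helper copies at most @count bytes and fills any
remaining space in @dest with @pad (truncating to @dest_len when the
destination is smaller). A user-space sketch of those semantics follows,
similar in spirit to how the NVMe code pads fixed-width identify strings;
the truncating else branch is not visible in the hunk above and is an
assumption, as is the sample serial number, and a char * cast is added so
it builds as plain C.

/*
 * User-space sketch of memcpy_and_pad() semantics; not the kernel header.
 * The else branch is assumed, and (char *) avoids void * arithmetic.
 */
#include <stdio.h>
#include <string.h>

static void memcpy_and_pad_sketch(void *dest, size_t dest_len,
                                  const void *src, size_t count, int pad)
{
        if (dest_len > count) {
                memcpy(dest, src, count);
                memset((char *)dest + count, pad, dest_len - count);
        } else {
                memcpy(dest, src, dest_len);
        }
}

int main(void)
{
        char serial[20];

        /* space-pad a short ID string into a fixed-width field */
        memcpy_and_pad_sketch(serial, sizeof(serial), "SN12345678", 10, ' ');
        printf("[%.20s]\n", serial);    /* prints [SN12345678          ] */
        return 0;
}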
+2 -3
include/trace/events/block.h
···
         TP_fast_assign(
                 __entry->dev = bio ? bio_dev(bio) : 0;
-                __entry->dev = bio_dev(bio);
                 __entry->sector = bio ? bio->bi_iter.bi_sector : 0;
                 __entry->nr_sector = bio ? bio_sectors(bio) : 0;
                 blk_fill_rwbs(__entry->rwbs,
···
 /**
  * block_getrq - get a free request entry in queue for block IO operations
  * @q: queue for operations
- * @bio: pending block IO operation
+ * @bio: pending block IO operation (can be %NULL)
  * @rw: low bit indicates a read (%0) or a write (%1)
  *
  * A request struct for queue @q has been allocated to handle the
···
 /**
  * block_sleeprq - waiting to get a free request entry in queue for block IO operation
  * @q: queue for operation
- * @bio: pending block IO operation
+ * @bio: pending block IO operation (can be %NULL)
  * @rw: low bit indicates a read (%0) or a write (%1)
  *
  * In the case where a request struct cannot be provided for queue @q
+4 -2
mm/backing-dev.c
···
         /* need to create a new one */
         wb = kmalloc(sizeof(*wb), gfp);
-        if (!wb)
-                return -ENOMEM;
+        if (!wb) {
+                ret = -ENOMEM;
+                goto out_put;
+        }

         ret = wb_init(wb, bdi, blkcg_css->id, gfp);
         if (ret)