Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-6.13/block-20241118' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

- NVMe updates via Keith:
      - Use uring_cmd helper (Pavel)
      - Host Memory Buffer allocation enhancements (Christoph)
      - Target persistent reservation support (Guixin)
      - Persistent reservation tracing (Guixin)
      - NVMe 2.1 specification support (Keith)
      - Rotational Meta Support (Matias, Wang, Keith)
      - Volatile cache detection enhancement (Guixin)

- MD updates via Song:
      - Maintainers update
      - raid5 sync IO fix
      - Enhance handling of faulty and blocked devices
      - raid5-ppl atomic improvement
      - md-bitmap fix

- Support for manually defining embedded partition tables

- Zone append fixes and cleanups

- Stop sending the queued requests in the plug list to the driver
  ->queue_rqs() hook in reverse order.

- Zoned write plug cleanups

- Clean up disk stats tracking and add support for disk stats for
  passthrough IO

- Add preparatory support for file system atomic writes

- Add lockdep support for queue freezing. Already found a bunch of
issues, and some fixes for that are in here. More will be coming.

- Fix race between queue stopping/quiescing and IO queueing

- ublk recovery improvements

- Fix ublk mmap for 64k pages

- Various fixes and cleanups
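The plug-list ordering fix above boils down to switching from head insertion on a singly linked list to tail insertion with a head/tail pair. A standalone sketch (illustrative C only; the names `node`, `rq_list`, `add_head`, `add_tail` are made up for this example and are not the kernel's actual `struct rq_list` API) of why the old scheme handed requests to `->queue_rqs()` in reverse order:

```c
#include <assert.h>
#include <stddef.h>

struct node {
	int tag;
	struct node *next;
};

/* Tracking both head and tail is what makes cheap FIFO appends possible. */
struct rq_list {
	struct node *head;
	struct node *tail;
};

/* Old style: push at the head; walking the list then sees requests in
 * reverse submission order. */
static void add_head(struct rq_list *l, struct node *n)
{
	n->next = l->head;
	l->head = n;
	if (!l->tail)
		l->tail = n;
}

/* New style: append at the tail; walking the list preserves submission
 * (FIFO) order. */
static void add_tail(struct rq_list *l, struct node *n)
{
	n->next = NULL;
	if (l->tail)
		l->tail->next = n;
	else
		l->head = n;
	l->tail = n;
}

/* Returns the tag of the i-th element reached when walking from head. */
static int nth_tag(const struct rq_list *l, int i)
{
	const struct node *n = l->head;

	while (i-- > 0 && n)
		n = n->next;
	return n ? n->tag : -1;
}
```

Both insertions are O(1); the only cost of the FIFO variant is the extra tail pointer, which is why the series introduces a dedicated `rq_list` type rather than reversing lists at dispatch time.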

* tag 'for-6.13/block-20241118' of git://git.kernel.dk/linux: (118 commits)
MAINTAINERS: Update git tree for mdraid subsystem
block: make struct rq_list available for !CONFIG_BLOCK
block/genhd: use seq_put_decimal_ull for diskstats decimal values
block: don't reorder requests in blk_mq_add_to_batch
block: don't reorder requests in blk_add_rq_to_plug
block: add a rq_list type
block: remove rq_list_move
virtio_blk: reverse request order in virtio_queue_rqs
nvme-pci: reverse request order in nvme_queue_rqs
btrfs: validate queue limits
block: export blk_validate_limits
nvmet: add tracing of reservation commands
nvme: parse reservation commands's action and rtype to string
nvmet: report ns's vwc not present
md/raid5: Increase r5conf.cache_name size
block: remove the ioprio field from struct request
block: remove the write_hint field from struct request
nvme: check ns's volatile write cache not present
nvme: add rotational support
nvme: use command set independent id ns if available
...

+3582 -908
+7
Documentation/ABI/stable/sysfs-block
···
424 424    [RW] This file is used to control (on/off) the iostats
425 425    accounting of the disk.
426 426
    427 +  What:          /sys/block/<disk>/queue/iostats_passthrough
    428 +  Date:          October 2024
    429 +  Contact:       linux-block@vger.kernel.org
    430 +  Description:
    431 +          [RW] This file is used to control (on/off) the iostats
    432 +          accounting of the disk for passthrough commands.
    433 +
427 434
428 435  What:          /sys/block/<disk>/queue/logical_block_size
429 436  Date:          May 2009
+4 -1
Documentation/block/cmdline-partition.rst
···
39 39   create a link to block device partition with the name "PARTNAME".
40 40   User space application can access partition by partition name.
41 41
   42 + ro
   43 +     read-only. Flag the partition as read-only.
   44 +
42 45   Example:
43 46
44 47       eMMC disk names are "mmcblk0" and "mmcblk0boot0".
45 48
46 49   bootargs::
47 50
48    -      'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
   51 +      'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot)ro,-(kernel)'
49 52
50 53   dmesg::
51 54
+17 -5
Documentation/block/ublk.rst
···
199 199
200 200  - user recovery feature description
201 201
202     - Two new features are added for user recovery: ``UBLK_F_USER_RECOVERY`` and
203     - ``UBLK_F_USER_RECOVERY_REISSUE``.
    202 + Three new features are added for user recovery: ``UBLK_F_USER_RECOVERY``,
    203 + ``UBLK_F_USER_RECOVERY_REISSUE``, and ``UBLK_F_USER_RECOVERY_FAIL_IO``. To
    204 + enable recovery of ublk devices after the ublk server exits, the ublk server
    205 + should specify the ``UBLK_F_USER_RECOVERY`` flag when creating the device. The
    206 + ublk server may additionally specify at most one of
    207 + ``UBLK_F_USER_RECOVERY_REISSUE`` and ``UBLK_F_USER_RECOVERY_FAIL_IO`` to
    208 + modify how I/O is handled while the ublk server is dying/dead (this is called
    209 + the ``nosrv`` case in the driver code).
204 210
205     - With ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon(ublk server's io
    211 + With just ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon(ublk server's io
206 212  handler) is dying, ublk does not delete ``/dev/ublkb*`` during the whole
207 213  recovery stage and ublk device ID is kept. It is ublk server's
208 214  responsibility to recover the device context by its own knowledge.
209 215  Requests which have not been issued to userspace are requeued. Requests
210 216  which have been issued to userspace are aborted.
211 217
212     - With ``UBLK_F_USER_RECOVERY_REISSUE`` set, after one ubq_daemon(ublk
213     - server's io handler) is dying, contrary to ``UBLK_F_USER_RECOVERY``,
    218 + With ``UBLK_F_USER_RECOVERY_REISSUE`` additionally set, after one ubq_daemon
    219 + (ublk server's io handler) is dying, contrary to ``UBLK_F_USER_RECOVERY``,
214 220  requests which have been issued to userspace are requeued and will be
215 221  re-issued to the new process after handling ``UBLK_CMD_END_USER_RECOVERY``.
216 222  ``UBLK_F_USER_RECOVERY_REISSUE`` is designed for backends who tolerate
217 223  double-write since the driver may issue the same I/O request twice. It
218 224  might be useful to a read-only FS or a VM backend.
    225 +
    226 + With ``UBLK_F_USER_RECOVERY_FAIL_IO`` additionally set, after the ublk server
    227 + exits, requests which have issued to userspace are failed, as are any
    228 + subsequently issued requests. Applications continuously issuing I/O against
    229 + devices with this flag set will see a stream of I/O errors until a new ublk
    230 + server recovers the device.
219 231
220 232  Unprivileged ublk device is supported by passing ``UBLK_F_UNPRIVILEGED_DEV``.
221 233  Once the flag is set, all control commands can be sent by unprivileged
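The recovery-flag rules documented above (the modifier flags only make sense together with ``UBLK_F_USER_RECOVERY``, and at most one of ``REISSUE``/``FAIL_IO`` may be set) can be captured in a small validity check. This is an illustrative userspace sketch only; the bit values below are placeholders, not the real constants from ``<linux/ublk_cmd.h>``:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Placeholder bit positions for illustration; the real UBLK_F_* values
 * are defined in the ublk UAPI header. */
#define UBLK_F_USER_RECOVERY            (1ULL << 0)
#define UBLK_F_USER_RECOVERY_REISSUE    (1ULL << 1)
#define UBLK_F_USER_RECOVERY_FAIL_IO    (1ULL << 2)

/* Enforce the documented rule: REISSUE and FAIL_IO modify recovery
 * behaviour, so they require USER_RECOVERY and are mutually exclusive. */
static bool ublk_recovery_flags_valid(uint64_t flags)
{
	uint64_t modifiers = flags & (UBLK_F_USER_RECOVERY_REISSUE |
				      UBLK_F_USER_RECOVERY_FAIL_IO);

	if (modifiers && !(flags & UBLK_F_USER_RECOVERY))
		return false;
	/* at most one modifier: zero or a single set bit */
	return (modifiers & (modifiers - 1)) == 0;
}
```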
+52
Documentation/devicetree/bindings/mmc/mmc-card.yaml
···
13 13    This documents describes the devicetree bindings for a mmc-host controller
14 14    child node describing a mmc-card / an eMMC.
15 15
   16 +   It's possible to define a fixed partition table for an eMMC for the user
   17 +   partition, the 2 BOOT partition (boot1/2) and the 4 GP (gp1/2/3/4) if supported
   18 +   by the eMMC.
   19 +
16 20  properties:
17 21    compatible:
18 22      const: mmc-card
···
29 25      description:
30 26        Use this to indicate that the mmc-card has a broken hpi
31 27        implementation, and that hpi should not be used.
   28 +
   29 + patternProperties:
   30 +   "^partitions(-boot[12]|-gp[14])?$":
   31 +     $ref: /schemas/mtd/partitions/partitions.yaml
   32 +
   33 +     patternProperties:
   34 +       "^partition@[0-9a-f]+$":
   35 +         $ref: /schemas/mtd/partitions/partition.yaml
   36 +
   37 +         properties:
   38 +           reg:
   39 +             description: Must be multiple of 512 as it's converted
   40 +               internally from bytes to SECTOR_SIZE (512 bytes)
   41 +
   42 +         required:
   43 +           - reg
   44 +
   45 +         unevaluatedProperties: false
32 46
33 47  required:
34 48    - compatible
···
64 42        compatible = "mmc-card";
65 43        reg = <0>;
66 44        broken-hpi;
   45 +
   46 +       partitions {
   47 +         compatible = "fixed-partitions";
   48 +
   49 +         #address-cells = <1>;
   50 +         #size-cells = <1>;
   51 +
   52 +         partition@0 {
   53 +           label = "kernel"; /* Kernel */
   54 +           reg = <0x0 0x2000000>; /* 32 MB */
   55 +         };
   56 +
   57 +         partition@2000000 {
   58 +           label = "rootfs";
   59 +           reg = <0x2000000 0x40000000>; /* 1GB */
   60 +         };
   61 +       };
   62 +
   63 +       partitions-boot1 {
   64 +         compatible = "fixed-partitions";
   65 +
   66 +         #address-cells = <1>;
   67 +         #size-cells = <1>;
   68 +
   69 +         partition@0 {
   70 +           label = "bl";
   71 +           reg = <0x0 0x2000000>; /* 32MB */
   72 +           read-only;
   73 +         };
   74 +       };
67 75      };
68 76    };
69 77
+2 -2
MAINTAINERS
···
21393 21393
21394 21394  SOFTWARE RAID (Multiple Disks) SUPPORT
21395 21395  M:  Song Liu <song@kernel.org>
21396       - R:  Yu Kuai <yukuai3@huawei.com>
      21396 + M:  Yu Kuai <yukuai3@huawei.com>
21397 21397  L:  linux-raid@vger.kernel.org
21398 21398  S:  Supported
21399 21399  Q:  https://patchwork.kernel.org/project/linux-raid/list/
21400       - T:  git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
      21400 + T:  git git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux.git
21401 21401  F:  drivers/md/Kconfig
21402 21402  F:  drivers/md/Makefile
21403 21403  F:  drivers/md/md*
+5 -8
block/bio-integrity.c
···
199 199
200 200  static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
201 201                                     int nr_vecs, unsigned int len,
202     -                                    unsigned int direction, u32 seed)
    202 +                                    unsigned int direction)
203 203  {
204 204      bool write = direction == ITER_SOURCE;
205 205      struct bio_integrity_payload *bip;
···
247 247      }
248 248
249 249      bip->bip_flags |= BIP_COPY_USER;
250     -     bip->bip_iter.bi_sector = seed;
251 250      bip->bip_vcnt = nr_vecs;
252 251      return 0;
253 252  free_bip:
···
257 258  }
258 259
259 260  static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
260     -                                    int nr_vecs, unsigned int len, u32 seed)
    261 +                                    int nr_vecs, unsigned int len)
261 262  {
262 263      struct bio_integrity_payload *bip;
···
266 267          return PTR_ERR(bip);
267 268
268 269      memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
269     -     bip->bip_iter.bi_sector = seed;
270 270      bip->bip_iter.bi_size = len;
271 271      bip->bip_vcnt = nr_vecs;
272 272      return 0;
···
301 303      return nr_bvecs;
302 304  }
303 305
304     - int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
305     -                            u32 seed)
    306 + int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
306 307  {
307 308      struct request_queue *q = bdev_get_queue(bio->bi_bdev);
308 309      unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
···
347 350
348 351      if (copy)
349 352          ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
350     -                                       direction, seed);
    353 +                                       direction);
351 354      else
352     -         ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
    355 +         ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes);
353 356      if (ret)
354 357          goto release_pages;
355 358      if (bvec != stack_vec)
+12 -69
block/bio.c
···
1065 1065  EXPORT_SYMBOL(bio_add_pc_page);
1066 1066
1067 1067  /**
1068      -  * bio_add_zone_append_page - attempt to add page to zone-append bio
1069      -  * @bio: destination bio
1070      -  * @page: page to add
1071      -  * @len: vec entry length
1072      -  * @offset: vec entry offset
1073      -  *
1074      -  * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
1075      -  * for a zone-append request. This can fail for a number of reasons, such as the
1076      -  * bio being full or the target block device is not a zoned block device or
1077      -  * other limitations of the target block device. The target block device must
1078      -  * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
1079      -  * to an empty bio.
1080      -  *
1081      -  * Returns: number of bytes added to the bio, or 0 in case of a failure.
1082      -  */
1083      - int bio_add_zone_append_page(struct bio *bio, struct page *page,
1084      -         unsigned int len, unsigned int offset)
1085      - {
1086      -     struct request_queue *q = bdev_get_queue(bio->bi_bdev);
1087      -     bool same_page = false;
1088      -
1089      -     if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
1090      -         return 0;
1091      -
1092      -     if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
1093      -         return 0;
1094      -
1095      -     return bio_add_hw_page(q, bio, page, len, offset,
1096      -             queue_max_zone_append_sectors(q), &same_page);
1097      - }
1098      - EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
1099      -
1100      - /**
1101 1068   * __bio_add_page - add page(s) to a bio in a new segment
1102 1069   * @bio: destination bio
1103 1070   * @page: start page to add
···
1173 1206
1174 1207  void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
1175 1208  {
1176      -     size_t size = iov_iter_count(iter);
1177      -
1178 1209      WARN_ON_ONCE(bio->bi_max_vecs);
1179      -
1180      -     if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1181      -         struct request_queue *q = bdev_get_queue(bio->bi_bdev);
1182      -         size_t max_sectors = queue_max_zone_append_sectors(q);
1183      -
1184      -         size = min(size, max_sectors << SECTOR_SHIFT);
1185      -     }
1186 1210
1187 1211      bio->bi_vcnt = iter->nr_segs;
1188 1212      bio->bi_io_vec = (struct bio_vec *)iter->bvec;
1189 1213      bio->bi_iter.bi_bvec_done = iter->iov_offset;
1190      -     bio->bi_iter.bi_size = size;
     1214 +     bio->bi_iter.bi_size = iov_iter_count(iter);
1191 1215      bio_set_flag(bio, BIO_CLONED);
1192 1216  }
···
1200 1242          return 0;
1201 1243      }
1202 1244      bio_add_folio_nofail(bio, folio, len, offset);
1203      -     return 0;
1204      - }
1205      -
1206      - static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
1207      -         size_t len, size_t offset)
1208      - {
1209      -     struct request_queue *q = bdev_get_queue(bio->bi_bdev);
1210      -     bool same_page = false;
1211      -
1212      -     if (bio_add_hw_folio(q, bio, folio, len, offset,
1213      -             queue_max_zone_append_sectors(q), &same_page) != len)
1214      -         return -EINVAL;
1215      -     if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
1216      -         unpin_user_folio(folio, 1);
1217 1245      return 0;
1218 1246  }
···
1309 1365          len = get_contig_folio_len(&num_pages, pages, i,
1310 1366                                     folio, left, offset);
1311 1367
1312      -         if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1313      -             ret = bio_iov_add_zone_append_folio(bio, folio, len,
1314      -                     folio_offset);
1315      -             if (ret)
1316      -                 break;
1317      -         } else
1318      -             bio_iov_add_folio(bio, folio, len, folio_offset);
1319      -
     1368 +         bio_iov_add_folio(bio, folio, len, folio_offset);
1320 1369          offset = 0;
1321 1370      }
···
1665 1728  {
1666 1729      struct bio *split;
1667 1730
1668      -     BUG_ON(sectors <= 0);
1669      -     BUG_ON(sectors >= bio_sectors(bio));
     1731 +     if (WARN_ON_ONCE(sectors <= 0))
     1732 +         return ERR_PTR(-EINVAL);
     1733 +     if (WARN_ON_ONCE(sectors >= bio_sectors(bio)))
     1734 +         return ERR_PTR(-EINVAL);
1670 1735
1671 1736      /* Zone append commands cannot be split */
1672 1737      if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
1673      -         return NULL;
     1738 +         return ERR_PTR(-EINVAL);
     1739 +
     1740 +     /* atomic writes cannot be split */
     1741 +     if (bio->bi_opf & REQ_ATOMIC)
     1742 +         return ERR_PTR(-EINVAL);
1674 1743
1675 1744      split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
1676 1745      if (!split)
1677      -         return NULL;
     1746 +         return ERR_PTR(-ENOMEM);
1678 1747
1679 1748      split->bi_iter.bi_size = sectors << 9;
1680 1749
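The bio_split() change above stops returning NULL on failure and instead returns distinct error pointers (ERR_PTR(-EINVAL), ERR_PTR(-ENOMEM)), so callers such as blk-crypto-fallback switch from `!split` checks to `IS_ERR()`. A simplified userspace model of the ERR_PTR convention (the kernel's real helpers live in include/linux/err.h; this sketch mirrors their semantics, not their exact definitions):

```c
#include <assert.h>
#include <errno.h>
#include <stdint.h>

#define MAX_ERRNO 4095

/* Errno codes are encoded as the last MAX_ERRNO values of the address
 * space, which no valid pointer can occupy. */
static inline void *ERR_PTR(long error)
{
	return (void *)(uintptr_t)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)(intptr_t)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}
```

The payoff is that a caller can distinguish "invalid split request" from "allocation failure" from the single returned pointer, instead of collapsing every failure into NULL.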
+20 -6
block/blk-core.c
···
261 261      blk_mq_release(q);
262 262
263 263      ida_free(&blk_queue_ida, q->id);
    264 +     lockdep_unregister_key(&q->io_lock_cls_key);
    265 +     lockdep_unregister_key(&q->q_lock_cls_key);
264 266      call_rcu(&q->rcu_head, blk_free_queue_rcu);
265 267  }
266 268
···
280 278  }
281 279  EXPORT_SYMBOL(blk_put_queue);
282 280
283     - void blk_queue_start_drain(struct request_queue *q)
    281 + bool blk_queue_start_drain(struct request_queue *q)
284 282  {
285 283      /*
286 284       * When queue DYING flag is set, we need to block new req
287 285       * entering queue, so we call blk_freeze_queue_start() to
288 286       * prevent I/O from crossing blk_queue_enter().
289 287       */
290     -     blk_freeze_queue_start(q);
    288 +     bool freeze = __blk_freeze_queue_start(q, current);
291 289      if (queue_is_mq(q))
292 290          blk_mq_wake_waiters(q);
293 291      /* Make blk_queue_enter() reexamine the DYING flag. */
294 292      wake_up_all(&q->mq_freeze_wq);
    293 +
    294 +     return freeze;
295 295  }
296 296
297 297  /**
···
325 321          return -ENODEV;
326 322      }
327 323
    324 +     rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
    325 +     rwsem_release(&q->q_lockdep_map, _RET_IP_);
328 326      return 0;
329 327  }
330 328
···
358 352          goto dead;
359 353      }
360 354
    355 +     rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
    356 +     rwsem_release(&q->io_lockdep_map, _RET_IP_);
361 357      return 0;
362 358  dead:
363 359      bio_io_error(bio);
···
449 441              PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
450 442      if (error)
451 443          goto fail_stats;
    444 +     lockdep_register_key(&q->io_lock_cls_key);
    445 +     lockdep_register_key(&q->q_lock_cls_key);
    446 +     lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)",
    447 +                      &q->io_lock_cls_key, 0);
    448 +     lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)",
    449 +                      &q->q_lock_cls_key, 0);
452 450
453 451      q->nr_requests = BLKDEV_DEFAULT_RQ;
454 452
···
607 593          return BLK_STS_IOERR;
608 594
609 595      /* Make sure the BIO is small enough and will not get split */
610     -     if (nr_sectors > queue_max_zone_append_sectors(q))
    596 +     if (nr_sectors > q->limits.max_zone_append_sectors)
611 597          return BLK_STS_IOERR;
612 598
613 599      bio->bi_opf |= REQ_NOMERGE;
···
1120 1106          return;
1121 1107
1122 1108      plug->cur_ktime = 0;
1123      -     plug->mq_list = NULL;
1124      -     plug->cached_rq = NULL;
     1109 +     rq_list_init(&plug->mq_list);
     1110 +     rq_list_init(&plug->cached_rqs);
1125 1111      plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
1126 1112      plug->rq_count = 0;
1127 1113      plug->multiple_queues = false;
···
1217 1203       * queue for cached requests, we don't want a blocked task holding
1218 1204       * up a queue freeze/quiesce event.
1219 1205       */
1220      -     if (unlikely(!rq_list_empty(plug->cached_rq)))
     1206 +     if (unlikely(!rq_list_empty(&plug->cached_rqs)))
1221 1207          blk_mq_free_plug_rqs(plug);
1222 1208
1223 1209      plug->cur_ktime = 0;
+1 -1
block/blk-crypto-fallback.c
···
226 226
227 227          split_bio = bio_split(bio, num_sectors, GFP_NOIO,
228 228                                &crypto_bio_split);
229     -         if (!split_bio) {
    229 +         if (IS_ERR(split_bio)) {
230 230              bio->bi_status = BLK_STS_RESOURCE;
231 231              return false;
232 232          }
+2 -2
block/blk-integrity.c
···
113 113  EXPORT_SYMBOL(blk_rq_map_integrity_sg);
114 114
115 115  int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
116     -                               ssize_t bytes, u32 seed)
    116 +                               ssize_t bytes)
117 117  {
118     -     int ret = bio_integrity_map_user(rq->bio, ubuf, bytes, seed);
    118 +     int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
119 119
120 120      if (ret)
121 121          return ret;
+1 -8
block/blk-ioc.c
···
32 32      atomic_long_inc(&ioc->refcount);
33 33  }
34 34
35     - static void icq_free_icq_rcu(struct rcu_head *head)
36     - {
37     -     struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
38     -
39     -     kmem_cache_free(icq->__rcu_icq_cache, icq);
40     - }
41     -
42 35  /*
43 36   * Exit an icq. Called with ioc locked for blk-mq, and with both ioc
44 37   * and queue locked for legacy.
···
95 102       */
96 103      icq->__rcu_icq_cache = et->icq_cache;
97 104      icq->flags |= ICQ_DESTROYED;
98      -     call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
    105 +     kfree_rcu(icq, __rcu_head);
99 106  }
100 107
101 108  /*
+65 -42
block/blk-merge.c
···
107 107
108 108  static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
109 109  {
110     -     if (unlikely(split_sectors < 0)) {
111     -         bio->bi_status = errno_to_blk_status(split_sectors);
112     -         bio_endio(bio);
113     -         return NULL;
114     -     }
    110 +     if (unlikely(split_sectors < 0))
    111 +         goto error;
115 112
116 113      if (split_sectors) {
117 114          struct bio *split;
118 115
119 116          split = bio_split(bio, split_sectors, GFP_NOIO,
120 117                  &bio->bi_bdev->bd_disk->bio_split);
    118 +         if (IS_ERR(split)) {
    119 +             split_sectors = PTR_ERR(split);
    120 +             goto error;
    121 +         }
121 122          split->bi_opf |= REQ_NOMERGE;
122 123          blkcg_bio_issue_init(split);
123 124          bio_chain(split, bio);
···
129 128      }
130 129
131 130      return bio;
    131 + error:
    132 +     bio->bi_status = errno_to_blk_status(split_sectors);
    133 +     bio_endio(bio);
    134 +     return NULL;
132 135  }
133 136
134 137  struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
···
171 166      return bio_submit_split(bio, split_sectors);
172 167  }
173 168
174     - struct bio *bio_split_write_zeroes(struct bio *bio,
175     -         const struct queue_limits *lim, unsigned *nsegs)
176     - {
177     -     *nsegs = 0;
178     -     if (!lim->max_write_zeroes_sectors)
179     -         return bio;
180     -     if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
181     -         return bio;
182     -     return bio_submit_split(bio, lim->max_write_zeroes_sectors);
183     - }
184     -
185 169  static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
186 170                                                  bool is_atomic)
187 171  {
···
205 211       * We ignore lim->max_sectors for atomic writes because it may less
206 212       * than the actual bio size, which we cannot tolerate.
207 213       */
208     -     if (is_atomic)
    214 +     if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
    215 +         max_sectors = lim->max_write_zeroes_sectors;
    216 +     else if (is_atomic)
209 217          max_sectors = lim->atomic_write_max_sectors;
210 218      else
211 219          max_sectors = lim->max_sectors;
···
292 296      return len > 0 || bv->bv_len > max_len;
293 297  }
294 298
    299 + static unsigned int bio_split_alignment(struct bio *bio,
    300 +         const struct queue_limits *lim)
    301 + {
    302 +     if (op_is_write(bio_op(bio)) && lim->zone_write_granularity)
    303 +         return lim->zone_write_granularity;
    304 +     return lim->logical_block_size;
    305 + }
    306 +
295 307  /**
296 308   * bio_split_rw_at - check if and where to split a read/write bio
297 309   * @bio:  [in] bio to be split
···
362 358       * split size so that each bio is properly block size aligned, even if
363 359       * we do not use the full hardware limits.
364 360       */
365     -     bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
    361 +     bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim));
366 362
367 363      /*
368 364       * Bio splitting may cause subtle trouble such as hang when doing sync
···
392 388  struct bio *bio_split_zone_append(struct bio *bio,
393 389          const struct queue_limits *lim, unsigned *nr_segs)
394 390  {
395     -     unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
396 391      int split_sectors;
397 392
398 393      split_sectors = bio_split_rw_at(bio, lim, nr_segs,
399     -             max_sectors << SECTOR_SHIFT);
    394 +             lim->max_zone_append_sectors << SECTOR_SHIFT);
400 395      if (WARN_ON_ONCE(split_sectors > 0))
401 396          split_sectors = -EINVAL;
402 397      return bio_submit_split(bio, split_sectors);
    398 + }
    399 +
    400 + struct bio *bio_split_write_zeroes(struct bio *bio,
    401 +         const struct queue_limits *lim, unsigned *nsegs)
    402 + {
    403 +     unsigned int max_sectors = get_max_io_size(bio, lim);
    404 +
    405 +     *nsegs = 0;
    406 +
    407 +     /*
    408 +      * An unset limit should normally not happen, as bio submission is keyed
    409 +      * off having a non-zero limit. But SCSI can clear the limit in the
    410 +      * I/O completion handler, and we can race and see this. Splitting to a
    411 +      * zero limit obviously doesn't make sense, so band-aid it here.
    412 +      */
    413 +     if (!max_sectors)
    414 +         return bio;
    415 +     if (bio_sectors(bio) <= max_sectors)
    416 +         return bio;
    417 +     return bio_submit_split(bio, max_sectors);
403 418  }
404 419
405 420  /**
···
434 411   */
435 412  struct bio *bio_split_to_limits(struct bio *bio)
436 413  {
437      -     const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
438 414      unsigned int nr_segs;
439 415
440      -     return __bio_split_to_limits(bio, lim, &nr_segs);
     416 +     return __bio_split_to_limits(bio, bdev_limits(bio->bi_bdev), &nr_segs);
441 417  }
442 418  EXPORT_SYMBOL(bio_split_to_limits);
···
819 797
820 798  static void blk_account_io_merge_request(struct request *req)
821 799  {
822      -     if (blk_do_io_stat(req)) {
     800 +     if (req->rq_flags & RQF_IO_STAT) {
823 801          part_stat_lock();
824 802          part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
825 803          part_stat_local_dec(req->part,
···
867 845      if (rq_data_dir(req) != rq_data_dir(next))
868 846          return NULL;
869 847
870      -     /* Don't merge requests with different write hints. */
871      -     if (req->write_hint != next->write_hint)
872      -         return NULL;
873      -
874      -     if (req->ioprio != next->ioprio)
875      -         return NULL;
     848 +     if (req->bio && next->bio) {
     849 +         /* Don't merge requests with different write hints. */
     850 +         if (req->bio->bi_write_hint != next->bio->bi_write_hint)
     851 +             return NULL;
     852 +         if (req->bio->bi_ioprio != next->bio->bi_ioprio)
     853 +             return NULL;
     854 +     }
876 855
877 856      if (!blk_atomic_write_mergeable_rqs(req, next))
878 857          return NULL;
···
1002 979      if (!bio_crypt_rq_ctx_compatible(rq, bio))
1003 980          return false;
1004 981
1005      -     /* Don't merge requests with different write hints. */
1006      -     if (rq->write_hint != bio->bi_write_hint)
1007      -         return false;
1008      -
1009      -     if (rq->ioprio != bio_prio(bio))
1010      -         return false;
      982 +     if (rq->bio) {
      983 +         /* Don't merge requests with different write hints. */
      984 +         if (rq->bio->bi_write_hint != bio->bi_write_hint)
      985 +             return false;
      986 +         if (rq->bio->bi_ioprio != bio->bi_ioprio)
      987 +             return false;
      988 +     }
1011 989
1012 990      if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
1013 991          return false;
···
1029 1005
1030 1006  static void blk_account_io_merge_bio(struct request *req)
1031 1007  {
1032      -     if (!blk_do_io_stat(req))
1033      -         return;
1034      -
1035      -     part_stat_lock();
1036      -     part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
1037      -     part_stat_unlock();
     1008 +     if (req->rq_flags & RQF_IO_STAT) {
     1009 +         part_stat_lock();
     1010 +         part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
     1011 +         part_stat_unlock();
     1012 +     }
1038 1013  }
1039 1014
1040 1015  enum bio_merge_status bio_attempt_back_merge(struct request *req,
···
1179 1156      struct blk_plug *plug = current->plug;
1180 1157      struct request *rq;
1181 1158
1182      -     if (!plug || rq_list_empty(plug->mq_list))
     1159 +     if (!plug || rq_list_empty(&plug->mq_list))
1183 1160          return false;
1184 1161
1185 1162      rq_list_for_each(&plug->mq_list, rq) {
+211 -96
block/blk-mq.c
··· 92 92 { 93 93 struct mq_inflight *mi = priv; 94 94 95 - if (rq->part && blk_do_io_stat(rq) && 95 + if (rq->rq_flags & RQF_IO_STAT && 96 96 (!bdev_is_partition(mi->part) || rq->part == mi->part) && 97 97 blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) 98 98 mi->inflight[rq_data_dir(rq)]++; ··· 120 120 inflight[1] = mi.inflight[1]; 121 121 } 122 122 123 - void blk_freeze_queue_start(struct request_queue *q) 123 + #ifdef CONFIG_LOCKDEP 124 + static bool blk_freeze_set_owner(struct request_queue *q, 125 + struct task_struct *owner) 124 126 { 127 + if (!owner) 128 + return false; 129 + 130 + if (!q->mq_freeze_depth) { 131 + q->mq_freeze_owner = owner; 132 + q->mq_freeze_owner_depth = 1; 133 + return true; 134 + } 135 + 136 + if (owner == q->mq_freeze_owner) 137 + q->mq_freeze_owner_depth += 1; 138 + return false; 139 + } 140 + 141 + /* verify the last unfreeze in owner context */ 142 + static bool blk_unfreeze_check_owner(struct request_queue *q) 143 + { 144 + if (!q->mq_freeze_owner) 145 + return false; 146 + if (q->mq_freeze_owner != current) 147 + return false; 148 + if (--q->mq_freeze_owner_depth == 0) { 149 + q->mq_freeze_owner = NULL; 150 + return true; 151 + } 152 + return false; 153 + } 154 + 155 + #else 156 + 157 + static bool blk_freeze_set_owner(struct request_queue *q, 158 + struct task_struct *owner) 159 + { 160 + return false; 161 + } 162 + 163 + static bool blk_unfreeze_check_owner(struct request_queue *q) 164 + { 165 + return false; 166 + } 167 + #endif 168 + 169 + bool __blk_freeze_queue_start(struct request_queue *q, 170 + struct task_struct *owner) 171 + { 172 + bool freeze; 173 + 125 174 mutex_lock(&q->mq_freeze_lock); 175 + freeze = blk_freeze_set_owner(q, owner); 126 176 if (++q->mq_freeze_depth == 1) { 127 177 percpu_ref_kill(&q->q_usage_counter); 128 178 mutex_unlock(&q->mq_freeze_lock); ··· 181 131 } else { 182 132 mutex_unlock(&q->mq_freeze_lock); 183 133 } 134 + 135 + return freeze; 136 + } 137 + 138 + void blk_freeze_queue_start(struct 
request_queue *q) 139 + { 140 + if (__blk_freeze_queue_start(q, current)) 141 + blk_freeze_acquire_lock(q, false, false); 184 142 } 185 143 EXPORT_SYMBOL_GPL(blk_freeze_queue_start); 186 144 ··· 207 149 } 208 150 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); 209 151 210 - /* 211 - * Guarantee no request is in use, so we can change any data structure of 212 - * the queue afterward. 213 - */ 214 - void blk_freeze_queue(struct request_queue *q) 152 + void blk_mq_freeze_queue(struct request_queue *q) 215 153 { 216 - /* 217 - * In the !blk_mq case we are only calling this to kill the 218 - * q_usage_counter, otherwise this increases the freeze depth 219 - * and waits for it to return to zero. For this reason there is 220 - * no blk_unfreeze_queue(), and blk_freeze_queue() is not 221 - * exported to drivers as the only user for unfreeze is blk_mq. 222 - */ 223 154 blk_freeze_queue_start(q); 224 155 blk_mq_freeze_queue_wait(q); 225 156 } 226 - 227 - void blk_mq_freeze_queue(struct request_queue *q) 228 - { 229 - /* 230 - * ...just an alias to keep freeze and unfreeze actions balanced 231 - * in the blk_mq_* namespace 232 - */ 233 - blk_freeze_queue(q); 234 - } 235 157 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 236 158 237 - void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) 159 + bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) 238 160 { 161 + bool unfreeze; 162 + 239 163 mutex_lock(&q->mq_freeze_lock); 240 164 if (force_atomic) 241 165 q->q_usage_counter.data->force_atomic = true; ··· 227 187 percpu_ref_resurrect(&q->q_usage_counter); 228 188 wake_up_all(&q->mq_freeze_wq); 229 189 } 190 + unfreeze = blk_unfreeze_check_owner(q); 230 191 mutex_unlock(&q->mq_freeze_lock); 192 + 193 + return unfreeze; 231 194 } 232 195 233 196 void blk_mq_unfreeze_queue(struct request_queue *q) 234 197 { 235 - __blk_mq_unfreeze_queue(q, false); 198 + if (__blk_mq_unfreeze_queue(q, false)) 199 + blk_unfreeze_release_lock(q, false, false); 236 
200 } 237 201 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 202 + 203 + /* 204 + * non_owner variant of blk_freeze_queue_start 205 + * 206 + * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen 207 + * by the same task. This is fragile and should not be used if at all 208 + * possible. 209 + */ 210 + void blk_freeze_queue_start_non_owner(struct request_queue *q) 211 + { 212 + __blk_freeze_queue_start(q, NULL); 213 + } 214 + EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); 215 + 216 + /* non_owner variant of blk_mq_unfreeze_queue */ 217 + void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) 218 + { 219 + __blk_mq_unfreeze_queue(q, false); 220 + } 221 + EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); 238 222 239 223 /* 240 224 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the ··· 347 283 if (!blk_queue_skip_tagset_quiesce(q)) 348 284 blk_mq_quiesce_queue_nowait(q); 349 285 } 350 - blk_mq_wait_quiesce_done(set); 351 286 mutex_unlock(&set->tag_list_lock); 287 + 288 + blk_mq_wait_quiesce_done(set); 352 289 } 353 290 EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); 354 291 ··· 396 331 /* Set start and alloc time when the allocated request is actually used */ 397 332 static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) 398 333 { 399 - if (blk_mq_need_time_stamp(rq)) 400 - rq->start_time_ns = blk_time_get_ns(); 401 - else 402 - rq->start_time_ns = 0; 403 - 404 334 #ifdef CONFIG_BLK_RQ_ALLOC_TIME 405 335 if (blk_queue_rq_alloc_time(rq->q)) 406 - rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns; 336 + rq->alloc_time_ns = alloc_time_ns; 407 337 else 408 338 rq->alloc_time_ns = 0; 409 339 #endif ··· 419 359 420 360 if (data->flags & BLK_MQ_REQ_PM) 421 361 data->rq_flags |= RQF_PM; 422 - if (blk_queue_io_stat(q)) 423 - data->rq_flags |= RQF_IO_STAT; 424 362 rq->rq_flags = data->rq_flags; 425 363 426 364 if (data->rq_flags & RQF_SCHED_TAGS) { ··· 478 420 prefetch(tags->static_rqs[tag]); 479 421 tag_mask 
&= ~(1UL << i); 480 422 rq = blk_mq_rq_ctx_init(data, tags, tag); 481 - rq_list_add(data->cached_rq, rq); 423 + rq_list_add_head(data->cached_rqs, rq); 482 424 nr++; 483 425 } 484 426 if (!(data->rq_flags & RQF_SCHED_TAGS)) ··· 487 429 percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); 488 430 data->nr_tags -= nr; 489 431 490 - return rq_list_pop(data->cached_rq); 432 + return rq_list_pop(data->cached_rqs); 491 433 } 492 434 493 435 static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) ··· 584 526 .flags = flags, 585 527 .cmd_flags = opf, 586 528 .nr_tags = plug->nr_ios, 587 - .cached_rq = &plug->cached_rq, 529 + .cached_rqs = &plug->cached_rqs, 588 530 }; 589 531 struct request *rq; 590 532 ··· 609 551 if (!plug) 610 552 return NULL; 611 553 612 - if (rq_list_empty(plug->cached_rq)) { 554 + if (rq_list_empty(&plug->cached_rqs)) { 613 555 if (plug->nr_ios == 1) 614 556 return NULL; 615 557 rq = blk_mq_rq_cache_fill(q, plug, opf, flags); 616 558 if (!rq) 617 559 return NULL; 618 560 } else { 619 - rq = rq_list_peek(&plug->cached_rq); 561 + rq = rq_list_peek(&plug->cached_rqs); 620 562 if (!rq || rq->q != q) 621 563 return NULL; 622 564 ··· 625 567 if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) 626 568 return NULL; 627 569 628 - plug->cached_rq = rq_list_next(rq); 629 - blk_mq_rq_time_init(rq, 0); 570 + rq_list_pop(&plug->cached_rqs); 571 + blk_mq_rq_time_init(rq, blk_time_get_ns()); 630 572 } 631 573 632 574 rq->cmd_flags = opf; ··· 802 744 { 803 745 struct request *rq; 804 746 805 - while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) 747 + while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) 806 748 blk_mq_free_request(rq); 807 749 } 808 750 ··· 822 764 823 765 static void blk_account_io_completion(struct request *req, unsigned int bytes) 824 766 { 825 - if (req->part && blk_do_io_stat(req)) { 767 + if (req->rq_flags & RQF_IO_STAT) { 826 768 const int sgrp = op_stat_group(req_op(req)); 827 769 828 770 part_stat_lock(); ··· 842 
784 blk_op_str(req_op(req)), 843 785 (__force u32)(req->cmd_flags & ~REQ_OP_MASK), 844 786 req->nr_phys_segments, 845 - IOPRIO_PRIO_CLASS(req->ioprio)); 787 + IOPRIO_PRIO_CLASS(req_get_ioprio(req))); 846 788 } 847 789 848 790 /* ··· 1040 982 * normal IO on queueing nor completion. Accounting the 1041 983 * containing request is enough. 1042 984 */ 1043 - if (blk_do_io_stat(req) && req->part && 1044 - !(req->rq_flags & RQF_FLUSH_SEQ)) { 985 + if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { 1045 986 const int sgrp = op_stat_group(req_op(req)); 1046 987 1047 988 part_stat_lock(); ··· 1053 996 } 1054 997 } 1055 998 999 + static inline bool blk_rq_passthrough_stats(struct request *req) 1000 + { 1001 + struct bio *bio = req->bio; 1002 + 1003 + if (!blk_queue_passthrough_stat(req->q)) 1004 + return false; 1005 + 1006 + /* Requests without a bio do not transfer data. */ 1007 + if (!bio) 1008 + return false; 1009 + 1010 + /* 1011 + * Stats are accumulated in the bdev, so must have one attached to a 1012 + * bio to track stats. Most drivers do not set the bdev for passthrough 1013 + * requests, but nvme is one that will set it. 1014 + */ 1015 + if (!bio->bi_bdev) 1016 + return false; 1017 + 1018 + /* 1019 + * We don't know what a passthrough command does, but we know the 1020 + * payload size and data direction. Ensuring the size is aligned to the 1021 + * block size filters out most commands with payloads that don't 1022 + * represent sector access. 
1023 + */ 1024 + if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) 1025 + return false; 1026 + return true; 1027 + } 1028 + 1056 1029 static inline void blk_account_io_start(struct request *req) 1057 1030 { 1058 1031 trace_block_io_start(req); 1059 1032 1060 - if (blk_do_io_stat(req)) { 1061 - /* 1062 - * All non-passthrough requests are created from a bio with one 1063 - * exception: when a flush command that is part of a flush sequence 1064 - * generated by the state machine in blk-flush.c is cloned onto the 1065 - * lower device by dm-multipath we can get here without a bio. 1066 - */ 1067 - if (req->bio) 1068 - req->part = req->bio->bi_bdev; 1069 - else 1070 - req->part = req->q->disk->part0; 1033 + if (!blk_queue_io_stat(req->q)) 1034 + return; 1035 + if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) 1036 + return; 1071 1037 1072 - part_stat_lock(); 1073 - update_io_ticks(req->part, jiffies, false); 1074 - part_stat_local_inc(req->part, 1075 - in_flight[op_is_write(req_op(req))]); 1076 - part_stat_unlock(); 1077 - } 1038 + req->rq_flags |= RQF_IO_STAT; 1039 + req->start_time_ns = blk_time_get_ns(); 1040 + 1041 + /* 1042 + * All non-passthrough requests are created from a bio with one 1043 + * exception: when a flush command that is part of a flush sequence 1044 + * generated by the state machine in blk-flush.c is cloned onto the 1045 + * lower device by dm-multipath we can get here without a bio. 
1046 + */ 1047 + if (req->bio) 1048 + req->part = req->bio->bi_bdev; 1049 + else 1050 + req->part = req->q->disk->part0; 1051 + 1052 + part_stat_lock(); 1053 + update_io_ticks(req->part, jiffies, false); 1054 + part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); 1055 + part_stat_unlock(); 1078 1056 } 1079 1057 1080 1058 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) ··· 1392 1300 */ 1393 1301 if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) 1394 1302 plug->has_elevator = true; 1395 - rq->rq_next = NULL; 1396 - rq_list_add(&plug->mq_list, rq); 1303 + rq_list_add_tail(&plug->mq_list, rq); 1397 1304 plug->rq_count++; 1398 1305 } 1399 1306 ··· 1789 1698 1790 1699 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); 1791 1700 } 1792 - EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); 1793 1701 1794 1702 struct dispatch_rq_data { 1795 1703 struct blk_mq_hw_ctx *hctx; ··· 2290 2200 } 2291 2201 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 2292 2202 2203 + static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) 2204 + { 2205 + bool need_run; 2206 + 2207 + /* 2208 + * When queue is quiesced, we may be switching io scheduler, or 2209 + * updating nr_hw_queues, or other things, and we can't run queue 2210 + * any more, even blk_mq_hctx_has_pending() can't be called safely. 2211 + * 2212 + * And queue will be rerun in blk_mq_unquiesce_queue() if it is 2213 + * quiesced. 2214 + */ 2215 + __blk_mq_run_dispatch_ops(hctx->queue, false, 2216 + need_run = !blk_queue_quiesced(hctx->queue) && 2217 + blk_mq_hctx_has_pending(hctx)); 2218 + return need_run; 2219 + } 2220 + 2293 2221 /** 2294 2222 * blk_mq_run_hw_queue - Start to run a hardware queue. 2295 2223 * @hctx: Pointer to the hardware queue to run. 
··· 2328 2220 2329 2221 might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); 2330 2222 2331 - /* 2332 - * When queue is quiesced, we may be switching io scheduler, or 2333 - * updating nr_hw_queues, or other things, and we can't run queue 2334 - * any more, even __blk_mq_hctx_has_pending() can't be called safely. 2335 - * 2336 - * And queue will be rerun in blk_mq_unquiesce_queue() if it is 2337 - * quiesced. 2338 - */ 2339 - __blk_mq_run_dispatch_ops(hctx->queue, false, 2340 - need_run = !blk_queue_quiesced(hctx->queue) && 2341 - blk_mq_hctx_has_pending(hctx)); 2223 + need_run = blk_mq_hw_queue_need_run(hctx); 2224 + if (!need_run) { 2225 + unsigned long flags; 2342 2226 2343 - if (!need_run) 2344 - return; 2227 + /* 2228 + * Synchronize with blk_mq_unquiesce_queue(): because we check 2229 + * if the hw queue is quiesced locklessly above, we need to use 2230 + * ->queue_lock to make sure we see the up-to-date status and 2231 + * do not miss rerunning the hw queue. 2232 + */ 2233 + spin_lock_irqsave(&hctx->queue->queue_lock, flags); 2234 + need_run = blk_mq_hw_queue_need_run(hctx); 2235 + spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); 2236 + 2237 + if (!need_run) 2238 + return; 2239 + } 2345 2240 2346 2241 if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { 2347 2242 blk_mq_delay_run_hw_queue(hctx, 0); ··· 2501 2390 return; 2502 2391 2503 2392 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 2393 + /* 2394 + * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the 2395 + * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch 2396 + * list in the subsequent routine.
2397 + */ 2398 + smp_mb__after_atomic(); 2504 2399 blk_mq_run_hw_queue(hctx, async); 2505 2400 } 2506 2401 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); ··· 2659 2542 rq->cmd_flags |= REQ_FAILFAST_MASK; 2660 2543 2661 2544 rq->__sector = bio->bi_iter.bi_sector; 2662 - rq->write_hint = bio->bi_write_hint; 2663 2545 blk_rq_bio_prep(rq, bio, nr_segs); 2664 2546 if (bio_integrity(bio)) 2665 2547 rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, ··· 2736 2620 2737 2621 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { 2738 2622 blk_mq_insert_request(rq, 0); 2623 + blk_mq_run_hw_queue(hctx, false); 2739 2624 return; 2740 2625 } 2741 2626 ··· 2767 2650 2768 2651 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { 2769 2652 blk_mq_insert_request(rq, 0); 2653 + blk_mq_run_hw_queue(hctx, false); 2770 2654 return BLK_STS_OK; 2771 2655 } 2772 2656 ··· 2784 2666 blk_status_t ret = BLK_STS_OK; 2785 2667 2786 2668 while ((rq = rq_list_pop(&plug->mq_list))) { 2787 - bool last = rq_list_empty(plug->mq_list); 2669 + bool last = rq_list_empty(&plug->mq_list); 2788 2670 2789 2671 if (hctx != rq->mq_hctx) { 2790 2672 if (hctx) { ··· 2827 2709 { 2828 2710 struct blk_mq_hw_ctx *this_hctx = NULL; 2829 2711 struct blk_mq_ctx *this_ctx = NULL; 2830 - struct request *requeue_list = NULL; 2831 - struct request **requeue_lastp = &requeue_list; 2712 + struct rq_list requeue_list = {}; 2832 2713 unsigned int depth = 0; 2833 2714 bool is_passthrough = false; 2834 2715 LIST_HEAD(list); ··· 2841 2724 is_passthrough = blk_rq_is_passthrough(rq); 2842 2725 } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || 2843 2726 is_passthrough != blk_rq_is_passthrough(rq)) { 2844 - rq_list_add_tail(&requeue_lastp, rq); 2727 + rq_list_add_tail(&requeue_list, rq); 2845 2728 continue; 2846 2729 } 2847 - list_add(&rq->queuelist, &list); 2730 + list_add_tail(&rq->queuelist, &list); 2848 2731 depth++; 2849 - } while (!rq_list_empty(plug->mq_list)); 2732 + } while 
(!rq_list_empty(&plug->mq_list)); 2850 2733 2851 2734 plug->mq_list = requeue_list; 2852 2735 trace_block_unplug(this_hctx->queue, depth, !from_sched); ··· 2901 2784 if (q->mq_ops->queue_rqs) { 2902 2785 blk_mq_run_dispatch_ops(q, 2903 2786 __blk_mq_flush_plug_list(q, plug)); 2904 - if (rq_list_empty(plug->mq_list)) 2787 + if (rq_list_empty(&plug->mq_list)) 2905 2788 return; 2906 2789 } 2907 2790 2908 2791 blk_mq_run_dispatch_ops(q, 2909 2792 blk_mq_plug_issue_direct(plug)); 2910 - if (rq_list_empty(plug->mq_list)) 2793 + if (rq_list_empty(&plug->mq_list)) 2911 2794 return; 2912 2795 } 2913 2796 2914 2797 do { 2915 2798 blk_mq_dispatch_plug_list(plug, from_schedule); 2916 - } while (!rq_list_empty(plug->mq_list)); 2799 + } while (!rq_list_empty(&plug->mq_list)); 2917 2800 } 2918 2801 2919 2802 static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, ··· 2978 2861 if (plug) { 2979 2862 data.nr_tags = plug->nr_ios; 2980 2863 plug->nr_ios = 1; 2981 - data.cached_rq = &plug->cached_rq; 2864 + data.cached_rqs = &plug->cached_rqs; 2982 2865 } 2983 2866 2984 2867 rq = __blk_mq_alloc_requests(&data); ··· 3001 2884 3002 2885 if (!plug) 3003 2886 return NULL; 3004 - rq = rq_list_peek(&plug->cached_rq); 2887 + rq = rq_list_peek(&plug->cached_rqs); 3005 2888 if (!rq || rq->q != q) 3006 2889 return NULL; 3007 2890 if (type != rq->mq_hctx->type && ··· 3015 2898 static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, 3016 2899 struct bio *bio) 3017 2900 { 3018 - WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); 2901 + if (rq_list_pop(&plug->cached_rqs) != rq) 2902 + WARN_ON_ONCE(1); 3019 2903 3020 2904 /* 3021 2905 * If any qos ->throttle() end up blocking, we will have flushed the 3022 2906 * plug and hence killed the cached_rq list as well. Pop this entry 3023 2907 * before we throttle. 
3024 2908 */ 3025 - plug->cached_rq = rq_list_next(rq); 3026 2909 rq_qos_throttle(rq->q, bio); 3027 2910 3028 - blk_mq_rq_time_init(rq, 0); 2911 + blk_mq_rq_time_init(rq, blk_time_get_ns()); 3029 2912 rq->cmd_flags = bio->bi_opf; 3030 2913 INIT_LIST_HEAD(&rq->queuelist); 3031 2914 } ··· 3304 3187 rq->special_vec = rq_src->special_vec; 3305 3188 } 3306 3189 rq->nr_phys_segments = rq_src->nr_phys_segments; 3307 - rq->ioprio = rq_src->ioprio; 3308 - rq->write_hint = rq_src->write_hint; 3309 3190 3310 3191 if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) 3311 3192 goto free_and_out;
+14 -1
block/blk-mq.h
··· 155 155 156 156 /* allocate multiple requests/tags in one go */ 157 157 unsigned int nr_tags; 158 - struct request **cached_rq; 158 + struct rq_list *cached_rqs; 159 159 160 160 /* input & output parameter */ 161 161 struct blk_mq_ctx *ctx; ··· 230 230 231 231 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) 232 232 { 233 + /* Fast path: hardware queue is not stopped most of the time. */ 234 + if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 235 + return false; 236 + 237 + /* 238 + * This barrier orders the addition to the dispatch list before the 239 + * test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier 240 + * in blk_mq_start_stopped_hw_queue() so that the dispatch code 241 + * either sees BLK_MQ_S_STOPPED cleared or sees a non-empty dispatch 242 + * list, and never misses a request that needs dispatching. 243 + */ 244 + smp_mb(); 245 + 233 246 return test_bit(BLK_MQ_S_STOPPED, &hctx->state); 234 247 } 235 248
+1 -3
block/blk-rq-qos.c
··· 218 218 return -1; 219 219 220 220 data->got_token = true; 221 - smp_wmb(); 222 221 wake_up_process(data->task); 223 222 list_del_init_careful(&curr->entry); 224 223 return 1; ··· 273 274 * which means we now have two. Put our local token 274 275 * and wake anyone else potentially waiting for one. 275 276 */ 276 - smp_rmb(); 277 277 if (data.got_token) 278 278 cleanup_cb(rqw, private_data); 279 - break; 279 + return; 280 280 } 281 281 io_schedule(); 282 282 has_sleeper = true;
+20 -20
block/blk-settings.c
··· 50 50 lim->max_sectors = UINT_MAX; 51 51 lim->max_dev_sectors = UINT_MAX; 52 52 lim->max_write_zeroes_sectors = UINT_MAX; 53 - lim->max_zone_append_sectors = UINT_MAX; 53 + lim->max_hw_zone_append_sectors = UINT_MAX; 54 54 lim->max_user_discard_sectors = UINT_MAX; 55 55 } 56 56 EXPORT_SYMBOL(blk_set_stacking_limits); ··· 91 91 if (lim->zone_write_granularity < lim->logical_block_size) 92 92 lim->zone_write_granularity = lim->logical_block_size; 93 93 94 - if (lim->max_zone_append_sectors) { 95 - /* 96 - * The Zone Append size is limited by the maximum I/O size 97 - * and the zone size given that it can't span zones. 98 - */ 99 - lim->max_zone_append_sectors = 100 - min3(lim->max_hw_sectors, 101 - lim->max_zone_append_sectors, 102 - lim->chunk_sectors); 103 - } 104 - 94 + /* 95 + * The Zone Append size is limited by the maximum I/O size and the zone 96 + * size given that it can't span zones. 97 + * 98 + * If no max_hw_zone_append_sectors limit is provided, the block layer 99 + * will emulate it, else we're also bound by the hardware limit. 100 + */ 101 + lim->max_zone_append_sectors = 102 + min_not_zero(lim->max_hw_zone_append_sectors, 103 + min(lim->chunk_sectors, lim->max_hw_sectors)); 105 104 return 0; 106 105 } 107 106 ··· 222 223 * Check that the limits in lim are valid, initialize defaults for unset 223 224 * values, and cap values based on others where needed. 224 225 */ 225 - static int blk_validate_limits(struct queue_limits *lim) 226 + int blk_validate_limits(struct queue_limits *lim) 226 227 { 227 228 unsigned int max_hw_sectors; 228 229 unsigned int logical_block_sectors; ··· 365 366 return err; 366 367 return blk_validate_zoned_limits(lim); 367 368 } 369 + EXPORT_SYMBOL_GPL(blk_validate_limits); 368 370 369 371 /* 370 372 * Set the default limits for a newly allocated queue.
@lim contains the ··· 508 508 t->features |= (b->features & BLK_FEAT_INHERIT_MASK); 509 509 510 510 /* 511 - * BLK_FEAT_NOWAIT and BLK_FEAT_POLL need to be supported both by the 512 - * stacking driver and all underlying devices. The stacking driver sets 513 - * the flags before stacking the limits, and this will clear the flags 514 - * if any of the underlying devices does not support it. 511 + * Some features need to be supported both by the stacking driver and all 512 + * underlying devices. The stacking driver sets these flags before 513 + * stacking the limits, and this will clear the flags if any of the 514 + * underlying devices does not support it. 515 515 */ 516 516 if (!(b->features & BLK_FEAT_NOWAIT)) 517 517 t->features &= ~BLK_FEAT_NOWAIT; ··· 527 527 t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); 528 528 t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, 529 529 b->max_write_zeroes_sectors); 530 - t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t), 531 - queue_limits_max_zone_append_sectors(b)); 530 + t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors, 531 + b->max_hw_zone_append_sectors); 532 532 533 533 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, 534 534 b->seg_boundary_mask); ··· 661 661 void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, 662 662 sector_t offset, const char *pfx) 663 663 { 664 - if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits, 664 + if (blk_stack_limits(t, bdev_limits(bdev), 665 665 get_start_sect(bdev) + offset)) 666 666 pr_notice("%s: Warning: Device %pg is misaligned\n", 667 667 pfx, bdev);
+48 -32
block/blk-sysfs.c
··· 23 23 struct queue_sysfs_entry { 24 24 struct attribute attr; 25 25 ssize_t (*show)(struct gendisk *disk, char *page); 26 - int (*load_module)(struct gendisk *disk, const char *page, size_t count); 27 26 ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); 27 + void (*load_module)(struct gendisk *disk, const char *page, size_t count); 28 28 }; 29 29 30 30 static ssize_t 31 31 queue_var_show(unsigned long var, char *page) 32 32 { 33 - return sprintf(page, "%lu\n", var); 33 + return sysfs_emit(page, "%lu\n", var); 34 34 } 35 35 36 36 static ssize_t ··· 121 121 #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ 122 122 static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ 123 123 { \ 124 - return sprintf(page, "%llu\n", \ 124 + return sysfs_emit(page, "%llu\n", \ 125 125 (unsigned long long)disk->queue->limits._field << \ 126 126 SECTOR_SHIFT); \ 127 127 } ··· 131 131 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) 132 132 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) 133 133 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) 134 + QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) 134 135 135 136 #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ 136 137 static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ ··· 145 144 #define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ 146 145 static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ 147 146 { \ 148 - return sprintf(page, "%d\n", _val); \ 147 + return sysfs_emit(page, "%d\n", _val); \ 149 148 } 150 149 151 150 /* deprecated fields */ ··· 177 176 if (err) 178 177 return err; 179 178 return ret; 180 - } 181 - 182 - /* 183 - * For zone append queue_max_zone_append_sectors does not just return the 184 - * underlying queue limits, but actually contains a calculation. Because of 185 - * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here. 
186 - */ 187 - static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page) 188 - { 189 - return sprintf(page, "%llu\n", 190 - (u64)queue_max_zone_append_sectors(disk->queue) << 191 - SECTOR_SHIFT); 192 179 } 193 180 194 181 static ssize_t ··· 224 235 #define QUEUE_SYSFS_FEATURE(_name, _feature) \ 225 236 static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ 226 237 { \ 227 - return sprintf(page, "%u\n", \ 238 + return sysfs_emit(page, "%u\n", \ 228 239 !!(disk->queue->limits.features & _feature)); \ 229 240 } \ 230 241 static ssize_t queue_##_name##_store(struct gendisk *disk, \ ··· 241 252 #define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ 242 253 static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ 243 254 { \ 244 - return sprintf(page, "%u\n", \ 255 + return sysfs_emit(page, "%u\n", \ 245 256 !!(disk->queue->limits.features & _feature)); \ 246 257 } 247 258 ··· 252 263 static ssize_t queue_zoned_show(struct gendisk *disk, char *page) 253 264 { 254 265 if (blk_queue_is_zoned(disk->queue)) 255 - return sprintf(page, "host-managed\n"); 256 - return sprintf(page, "none\n"); 266 + return sysfs_emit(page, "host-managed\n"); 267 + return sysfs_emit(page, "none\n"); 257 268 } 258 269 259 270 static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) ··· 261 272 return queue_var_show(disk_nr_zones(disk), page); 262 273 } 263 274 275 + static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) 276 + { 277 + return queue_var_show(blk_queue_passthrough_stat(disk->queue), page); 278 + } 279 + 280 + static ssize_t queue_iostats_passthrough_store(struct gendisk *disk, 281 + const char *page, size_t count) 282 + { 283 + struct queue_limits lim; 284 + unsigned long ios; 285 + ssize_t ret; 286 + 287 + ret = queue_var_store(&ios, page, count); 288 + if (ret < 0) 289 + return ret; 290 + 291 + lim = queue_limits_start_update(disk->queue); 292 + if (ios) 293 + lim.flags |= 
BLK_FLAG_IOSTATS_PASSTHROUGH; 294 + else 295 + lim.flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH; 296 + 297 + ret = queue_limits_commit_update(disk->queue, &lim); 298 + if (ret) 299 + return ret; 300 + 301 + return count; 302 + } 264 303 static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) 265 304 { 266 305 return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | ··· 366 349 367 350 static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) 368 351 { 369 - return sprintf(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); 352 + return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); 370 353 } 371 354 372 355 static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, ··· 387 370 static ssize_t queue_wc_show(struct gendisk *disk, char *page) 388 371 { 389 372 if (blk_queue_write_cache(disk->queue)) 390 - return sprintf(page, "write back\n"); 391 - return sprintf(page, "write through\n"); 373 + return sysfs_emit(page, "write back\n"); 374 + return sysfs_emit(page, "write through\n"); 392 375 } 393 376 394 377 static ssize_t queue_wc_store(struct gendisk *disk, const char *page, ··· 468 451 469 452 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); 470 453 QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); 471 - QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); 454 + QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); 472 455 QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); 473 456 474 457 QUEUE_RO_ENTRY(queue_zoned, "zoned"); ··· 477 460 QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); 478 461 479 462 QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); 463 + QUEUE_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); 480 464 QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); 481 465 QUEUE_RW_ENTRY(queue_poll, "io_poll"); 482 466 QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); ··· 519 501 return 
-EINVAL; 520 502 521 503 if (wbt_disabled(disk->queue)) 522 - return sprintf(page, "0\n"); 504 + return sysfs_emit(page, "0\n"); 523 505 524 - return sprintf(page, "%llu\n", 506 + return sysfs_emit(page, "%llu\n", 525 507 div_u64(wbt_get_min_lat(disk->queue), 1000)); 526 508 } 527 509 ··· 596 578 &queue_atomic_write_unit_max_entry.attr, 597 579 &queue_write_same_max_entry.attr, 598 580 &queue_max_write_zeroes_sectors_entry.attr, 599 - &queue_zone_append_max_entry.attr, 581 + &queue_max_zone_append_sectors_entry.attr, 600 582 &queue_zone_write_granularity_entry.attr, 601 583 &queue_rotational_entry.attr, 602 584 &queue_zoned_entry.attr, ··· 604 586 &queue_max_open_zones_entry.attr, 605 587 &queue_max_active_zones_entry.attr, 606 588 &queue_nomerges_entry.attr, 589 + &queue_iostats_passthrough_entry.attr, 607 590 &queue_iostats_entry.attr, 608 591 &queue_stable_writes_entry.attr, 609 592 &queue_add_random_entry.attr, ··· 703 684 * queue to ensure that the module file can be read when the request 704 685 * queue is the one for the device storing the module file. 705 686 */ 706 - if (entry->load_module) { 707 - res = entry->load_module(disk, page, length); 708 - if (res) 709 - return res; 710 - } 687 + if (entry->load_module) 688 + entry->load_module(disk, page, length); 711 689 712 690 blk_mq_freeze_queue(q); 713 691 mutex_lock(&q->sysfs_lock);
+48 -28
block/blk-throttle.c
··· 1485 1485 goto out_finish; 1486 1486 1487 1487 ret = -EINVAL; 1488 - if (!strcmp(tok, "rbps") && val > 1) 1488 + if (!strcmp(tok, "rbps")) 1489 1489 v[0] = val; 1490 - else if (!strcmp(tok, "wbps") && val > 1) 1490 + else if (!strcmp(tok, "wbps")) 1491 1491 v[1] = val; 1492 - else if (!strcmp(tok, "riops") && val > 1) 1492 + else if (!strcmp(tok, "riops")) 1493 1493 v[2] = min_t(u64, val, UINT_MAX); 1494 - else if (!strcmp(tok, "wiops") && val > 1) 1494 + else if (!strcmp(tok, "wiops")) 1495 1495 v[3] = min_t(u64, val, UINT_MAX); 1496 1496 else 1497 1497 goto out_finish; ··· 1526 1526 cancel_work_sync(&td->dispatch_work); 1527 1527 } 1528 1528 1529 + static void tg_flush_bios(struct throtl_grp *tg) 1530 + { 1531 + struct throtl_service_queue *sq = &tg->service_queue; 1532 + 1533 + if (tg->flags & THROTL_TG_CANCELING) 1534 + return; 1535 + /* 1536 + * Set the flag to make sure throtl_pending_timer_fn() won't 1537 + * stop until all throttled bios are dispatched. 1538 + */ 1539 + tg->flags |= THROTL_TG_CANCELING; 1540 + 1541 + /* 1542 + * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup 1543 + * will be inserted to service queue without THROTL_TG_PENDING 1544 + * set in tg_update_disptime below. Then IO dispatched from 1545 + * child in tg_dispatch_one_bio will trigger double insertion 1546 + * and corrupt the tree. 1547 + */ 1548 + if (!(tg->flags & THROTL_TG_PENDING)) 1549 + return; 1550 + 1551 + /* 1552 + * Update disptime after setting the above flag to make sure 1553 + * throtl_select_dispatch() won't exit without dispatching. 
1554 + */ 1555 + tg_update_disptime(tg); 1556 + 1557 + throtl_schedule_pending_timer(sq, jiffies + 1); 1558 + } 1559 + 1560 + static void throtl_pd_offline(struct blkg_policy_data *pd) 1561 + { 1562 + tg_flush_bios(pd_to_tg(pd)); 1563 + } 1564 + 1529 1565 struct blkcg_policy blkcg_policy_throtl = { 1530 1566 .dfl_cftypes = throtl_files, 1531 1567 .legacy_cftypes = throtl_legacy_files, ··· 1569 1533 .pd_alloc_fn = throtl_pd_alloc, 1570 1534 .pd_init_fn = throtl_pd_init, 1571 1535 .pd_online_fn = throtl_pd_online, 1536 + .pd_offline_fn = throtl_pd_offline, 1572 1537 .pd_free_fn = throtl_pd_free, 1573 1538 }; 1574 1539 ··· 1590 1553 */ 1591 1554 rcu_read_lock(); 1592 1555 blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { 1593 - struct throtl_grp *tg = blkg_to_tg(blkg); 1594 - struct throtl_service_queue *sq = &tg->service_queue; 1595 - 1596 1556 /* 1597 - * Set the flag to make sure throtl_pending_timer_fn() won't 1598 - * stop until all throttled bios are dispatched. 1557 + * disk_release will call pd_offline_fn to cancel bios. 1558 + * However, disk_release can't be called if someone gets 1559 + * a refcount on the device and issues bios that are 1560 + * inflight after del_gendisk. 1561 + * Cancel bios here to ensure no bios are inflight after 1562 + * del_gendisk. 1599 1563 */ 1600 - tg->flags |= THROTL_TG_CANCELING; 1601 - 1602 - /* 1603 - * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup 1604 - * will be inserted to service queue without THROTL_TG_PENDING 1605 - * set in tg_update_disptime below. Then IO dispatched from 1606 - * child in tg_dispatch_one_bio will trigger double insertion 1607 - * and corrupt the tree. 1608 - */ 1609 - if (!(tg->flags & THROTL_TG_PENDING)) 1610 - continue; 1611 - 1612 - /* 1613 - * Update disptime after setting the above flag to make sure 1614 - * throtl_select_dispatch() won't exit without dispatching.
1615 - */ 1616 - tg_update_disptime(tg); 1617 - 1618 - throtl_schedule_pending_timer(sq, jiffies + 1); 1564 + tg_flush_bios(blkg_to_tg(blkg)); 1619 1565 } 1620 1566 rcu_read_unlock(); 1621 1567 spin_unlock_irq(&q->queue_lock);
+34 -34
block/blk-zoned.c
··· 18 18 #include <linux/vmalloc.h> 19 19 #include <linux/sched/mm.h> 20 20 #include <linux/spinlock.h> 21 - #include <linux/atomic.h> 21 + #include <linux/refcount.h> 22 22 #include <linux/mempool.h> 23 23 24 24 #include "blk.h" ··· 64 64 struct blk_zone_wplug { 65 65 struct hlist_node node; 66 66 struct list_head link; 67 - atomic_t ref; 67 + refcount_t ref; 68 68 spinlock_t lock; 69 69 unsigned int flags; 70 70 unsigned int zone_no; ··· 348 348 return ret; 349 349 } 350 350 351 - static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) 352 - { 353 - if (!disk->conv_zones_bitmap) 354 - return false; 355 - return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); 356 - } 357 - 358 351 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) 359 352 { 360 353 return zone->start + zone->len >= get_capacity(disk); ··· 404 411 405 412 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { 406 413 if (zwplug->zone_no == zno && 407 - atomic_inc_not_zero(&zwplug->ref)) { 414 + refcount_inc_not_zero(&zwplug->ref)) { 408 415 rcu_read_unlock(); 409 416 return zwplug; 410 417 } ··· 425 432 426 433 static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) 427 434 { 428 - if (atomic_dec_and_test(&zwplug->ref)) { 435 + if (refcount_dec_and_test(&zwplug->ref)) { 429 436 WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); 430 437 WARN_ON_ONCE(!list_empty(&zwplug->link)); 431 438 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); ··· 456 463 * taken when the plug was allocated and another reference taken by the 457 464 * caller context). 458 465 */ 459 - if (atomic_read(&zwplug->ref) > 2) 466 + if (refcount_read(&zwplug->ref) > 2) 460 467 return false; 461 468 462 469 /* We can remove zone write plugs for zones that are empty or full. 
*/ ··· 526 533 527 534 INIT_HLIST_NODE(&zwplug->node); 528 535 INIT_LIST_HEAD(&zwplug->link); 529 - atomic_set(&zwplug->ref, 2); 536 + refcount_set(&zwplug->ref, 2); 530 537 spin_lock_init(&zwplug->lock); 531 538 zwplug->flags = 0; 532 539 zwplug->zone_no = zno; ··· 617 624 * finished. 618 625 */ 619 626 zwplug->flags |= BLK_ZONE_WPLUG_ERROR; 620 - atomic_inc(&zwplug->ref); 627 + refcount_inc(&zwplug->ref); 621 628 622 629 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); 623 630 list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); ··· 702 709 struct blk_zone_wplug *zwplug; 703 710 704 711 /* Conventional zones cannot be reset nor finished. */ 705 - if (disk_zone_is_conv(disk, sector)) { 712 + if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { 706 713 bio_io_error(bio); 707 714 return true; 708 715 } ··· 956 963 } 957 964 958 965 /* Conventional zones do not need write plugging. */ 959 - if (disk_zone_is_conv(disk, sector)) { 966 + if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { 960 967 /* Zone append to conventional zones is not allowed. */ 961 968 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 962 969 bio_io_error(bio); ··· 1092 1099 * reference we take here. 
1093 1100 */ 1094 1101 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); 1095 - atomic_inc(&zwplug->ref); 1102 + refcount_inc(&zwplug->ref); 1096 1103 queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); 1097 1104 } 1098 1105 ··· 1437 1444 while (!hlist_empty(&disk->zone_wplugs_hash[i])) { 1438 1445 zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, 1439 1446 struct blk_zone_wplug, node); 1440 - atomic_inc(&zwplug->ref); 1447 + refcount_inc(&zwplug->ref); 1441 1448 disk_remove_zone_wplug(disk, zwplug); 1442 1449 disk_put_zone_wplug(zwplug); 1443 1450 } ··· 1446 1453 kfree(disk->zone_wplugs_hash); 1447 1454 disk->zone_wplugs_hash = NULL; 1448 1455 disk->zone_wplugs_hash_bits = 0; 1456 + } 1457 + 1458 + static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk, 1459 + unsigned long *bitmap) 1460 + { 1461 + unsigned int nr_conv_zones = 0; 1462 + unsigned long flags; 1463 + 1464 + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); 1465 + if (bitmap) 1466 + nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); 1467 + bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, 1468 + lockdep_is_held(&disk->zone_wplugs_lock)); 1469 + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); 1470 + 1471 + kfree_rcu_mightsleep(bitmap); 1472 + 1473 + return nr_conv_zones; 1449 1474 } 1450 1475 1451 1476 void disk_free_zone_resources(struct gendisk *disk) ··· 1489 1478 mempool_destroy(disk->zone_wplugs_pool); 1490 1479 disk->zone_wplugs_pool = NULL; 1491 1480 1492 - bitmap_free(disk->conv_zones_bitmap); 1493 - disk->conv_zones_bitmap = NULL; 1481 + disk_set_conv_zones_bitmap(disk, NULL); 1494 1482 disk->zone_capacity = 0; 1495 1483 disk->last_zone_capacity = 0; 1496 1484 disk->nr_zones = 0; ··· 1548 1538 struct blk_revalidate_zone_args *args) 1549 1539 { 1550 1540 struct request_queue *q = disk->queue; 1551 - unsigned int nr_seq_zones, nr_conv_zones = 0; 1541 + unsigned int nr_seq_zones, nr_conv_zones; 1552 1542 unsigned int pool_size; 1553 1543 struct 
queue_limits lim; 1554 1544 1555 1545 disk->nr_zones = args->nr_zones; 1556 1546 disk->zone_capacity = args->zone_capacity; 1557 1547 disk->last_zone_capacity = args->last_zone_capacity; 1558 - swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); 1559 - if (disk->conv_zones_bitmap) 1560 - nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, 1561 - disk->nr_zones); 1548 + nr_conv_zones = 1549 + disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); 1562 1550 if (nr_conv_zones >= disk->nr_zones) { 1563 1551 pr_warn("%s: Invalid number of conventional zones %u / %u\n", 1564 1552 disk->disk_name, nr_conv_zones, disk->nr_zones); ··· 1782 1774 return -ENODEV; 1783 1775 } 1784 1776 1785 - if (!queue_max_zone_append_sectors(q)) { 1786 - pr_warn("%s: Invalid 0 maximum zone append limit\n", 1787 - disk->disk_name); 1788 - return -ENODEV; 1789 - } 1790 - 1791 1777 /* 1792 1778 * Ensure that all memory allocations in this context are done as if 1793 1779 * GFP_NOIO was specified. ··· 1825 1823 disk_free_zone_resources(disk); 1826 1824 blk_mq_unfreeze_queue(q); 1827 1825 1828 - kfree(args.conv_zones_bitmap); 1829 - 1830 1826 return ret; 1831 1827 } 1832 1828 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); ··· 1851 1851 spin_lock_irqsave(&zwplug->lock, flags); 1852 1852 zwp_zone_no = zwplug->zone_no; 1853 1853 zwp_flags = zwplug->flags; 1854 - zwp_ref = atomic_read(&zwplug->ref); 1854 + zwp_ref = refcount_read(&zwplug->ref); 1855 1855 zwp_wp_offset = zwplug->wp_offset; 1856 1856 zwp_bio_list_size = bio_list_size(&zwplug->bio_list); 1857 1857 spin_unlock_irqrestore(&zwplug->lock, flags);
+28 -24
block/blk.h
···
 
 #include <linux/bio-integrity.h>
 #include <linux/blk-crypto.h>
+#include <linux/lockdep.h>
 #include <linux/memblock.h>    /* for max_pfn/max_low_pfn */
 #include <linux/sched/sysctl.h>
 #include <linux/timekeeping.h>
···
        gfp_t flags);
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
-void blk_freeze_queue(struct request_queue *q);
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
-void blk_queue_start_drain(struct request_queue *q);
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
+bool blk_queue_start_drain(struct request_queue *q);
+bool __blk_freeze_queue_start(struct request_queue *q,
+                 struct task_struct *owner);
 int __bio_queue_enter(struct request_queue *q, struct bio *bio);
 void submit_bio_noacct_nocheck(struct bio *bio);
 void bio_await_chain(struct bio *bio);
···
 {
    struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-   if (blk_try_enter_queue(q, false))
+   if (blk_try_enter_queue(q, false)) {
+       rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+       rwsem_release(&q->io_lockdep_map, _RET_IP_);
        return 0;
+   }
    return __bio_queue_enter(q, bio);
 }
···
        struct queue_limits *lim);
 int blk_dev_init(void);
 
-/*
- * Contribute to IO statistics IFF:
- *
- * a) it's attached to a gendisk, and
- * b) the queue had IO stats enabled when this request was started
- */
-static inline bool blk_do_io_stat(struct request *rq)
-{
-   return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq);
-}
-
 void update_io_ticks(struct block_device *part, unsigned long now, bool end);
 unsigned int part_in_flight(struct block_device *part);
···
 static inline bool bio_zone_write_plugging(struct bio *bio)
 {
    return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
-}
-static inline bool bio_is_zone_append(struct bio *bio)
-{
-   return bio_op(bio) == REQ_OP_ZONE_APPEND ||
-       bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
 }
 void blk_zone_write_plug_bio_merged(struct bio *bio);
 void blk_zone_write_plug_init_request(struct request *rq);
···
 {
    return false;
 }
-static inline bool bio_is_zone_append(struct bio *bio)
-{
-   return false;
-}
 static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
 {
 }
···
 #define ADDPART_FLAG_NONE  0
 #define ADDPART_FLAG_RAID  1
 #define ADDPART_FLAG_WHOLEDISK 2
+#define ADDPART_FLAG_READONLY  4
 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
        sector_t length);
 int bdev_del_partition(struct gendisk *disk, int partno);
···
 void blk_integrity_verify(struct bio *bio);
 void blk_integrity_prepare(struct request *rq);
 void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
+
+static inline void blk_freeze_acquire_lock(struct request_queue *q, bool
+       disk_dead, bool queue_dying)
+{
+   if (!disk_dead)
+       rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
+   if (!queue_dying)
+       rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_);
+}
+
+static inline void blk_unfreeze_release_lock(struct request_queue *q, bool
+       disk_dead, bool queue_dying)
+{
+   if (!queue_dying)
+       rwsem_release(&q->q_lockdep_map, _RET_IP_);
+   if (!disk_dead)
+       rwsem_release(&q->io_lockdep_map, _RET_IP_);
+}
 
 #endif /* BLK_INTERNAL_H */
+11 -7
block/elevator.c
···
     * drain any dispatch activities originated from passthrough
     * requests, then no need to quiesce queue which may add long boot
     * latency, especially when lots of disks are involved.
+    *
+    * Disk isn't added yet, so verifying queue lock only manually.
     */
-   blk_mq_freeze_queue(q);
+   blk_freeze_queue_start_non_owner(q);
+   blk_freeze_acquire_lock(q, true, false);
+   blk_mq_freeze_queue_wait(q);
+
    blk_mq_cancel_work_sync(q);
 
    err = blk_mq_init_sched(q, e);
 
-   blk_mq_unfreeze_queue(q);
+   blk_unfreeze_release_lock(q, true, false);
+   blk_mq_unfreeze_queue_non_owner(q);
 
    if (err) {
        pr_warn("\"%s\" elevator initialization failed, "
···
    return ret;
 }
 
-int elv_iosched_load_module(struct gendisk *disk, const char *buf,
-               size_t count)
+void elv_iosched_load_module(struct gendisk *disk, const char *buf,
+               size_t count)
 {
    char elevator_name[ELV_NAME_MAX];
    struct elevator_type *found;
    const char *name;
 
    if (!elv_support_iosched(disk->queue))
-       return -EOPNOTSUPP;
+       return;
 
    strscpy(elevator_name, buf, sizeof(elevator_name));
    name = strstrip(elevator_name);
···
 
    if (!found)
        request_module("%s-iosched", name);
-
-   return 0;
 }
 
 ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
+2 -2
block/elevator.h
···
  * io scheduler sysfs switching
  */
 ssize_t elv_iosched_show(struct gendisk *disk, char *page);
-int elv_iosched_load_module(struct gendisk *disk, const char *page,
-               size_t count);
+void elv_iosched_load_module(struct gendisk *disk, const char *page,
+               size_t count);
 ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
 
 extern bool elv_bio_merge_ok(struct request *, struct bio *);
+79 -57
block/genhd.c
···
 }
 
 /**
- * device_add_disk - add disk information to kernel list
+ * add_disk_fwnode - add disk information to kernel list with fwnode
  * @parent: parent device for the disk
  * @disk: per-device partitioning information
  * @groups: Additional per-device sysfs groups
+ * @fwnode: attached disk fwnode
  *
  * This function registers the partitioning information in @disk
- * with the kernel.
+ * with the kernel. Also attach a fwnode to the disk device.
  */
-int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
-                const struct attribute_group **groups)
+int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
+                const struct attribute_group **groups,
+                struct fwnode_handle *fwnode)
 
 {
    struct device *ddev = disk_to_dev(disk);
···
    ddev->parent = parent;
    ddev->groups = groups;
    dev_set_name(ddev, "%s", disk->disk_name);
+   if (fwnode)
+       device_set_node(ddev, fwnode);
    if (!(disk->flags & GENHD_FL_HIDDEN))
        ddev->devt = MKDEV(disk->major, disk->first_minor);
    ret = device_add(ddev);
···
    elevator_exit(disk->queue);
    return ret;
 }
+EXPORT_SYMBOL_GPL(add_disk_fwnode);
+
+/**
+ * device_add_disk - add disk information to kernel list
+ * @parent: parent device for the disk
+ * @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
+ *
+ * This function registers the partitioning information in @disk
+ * with the kernel.
+ */
+int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
+                const struct attribute_group **groups)
+{
+   return add_disk_fwnode(parent, disk, groups, NULL);
+}
 EXPORT_SYMBOL(device_add_disk);
 
 static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
···
    rcu_read_unlock();
 }
 
-static void __blk_mark_disk_dead(struct gendisk *disk)
+static bool __blk_mark_disk_dead(struct gendisk *disk)
 {
    /*
     * Fail any new I/O.
     */
    if (test_and_set_bit(GD_DEAD, &disk->state))
-       return;
+       return false;
 
    if (test_bit(GD_OWNS_QUEUE, &disk->state))
        blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue);
···
    /*
     * Prevent new I/O from crossing bio_queue_enter().
     */
-   blk_queue_start_drain(disk->queue);
+   return blk_queue_start_drain(disk->queue);
 }
 
 /**
···
    struct request_queue *q = disk->queue;
    struct block_device *part;
    unsigned long idx;
+   bool start_drain, queue_dying;
 
    might_sleep();
···
     * Drop all partitions now that the disk is marked dead.
     */
    mutex_lock(&disk->open_mutex);
-   __blk_mark_disk_dead(disk);
+   start_drain = __blk_mark_disk_dead(disk);
+   queue_dying = blk_queue_dying(q);
+   if (start_drain)
+       blk_freeze_acquire_lock(q, true, queue_dying);
    xa_for_each_start(&disk->part_tbl, idx, part, 1)
        drop_partition(part);
    mutex_unlock(&disk->open_mutex);
···
        if (queue_is_mq(q))
            blk_mq_exit_queue(q);
    }
+
+   if (start_drain)
+       blk_unfreeze_release_lock(q, true, queue_dying);
 }
 EXPORT_SYMBOL(del_gendisk);
···
    struct gendisk *disk = dev_to_disk(dev);
 
    if (!disk->bb)
-       return sprintf(page, "\n");
+       return sysfs_emit(page, "\n");
 
    return badblocks_show(disk->bb, page, 0);
 }
···
 {
    struct gendisk *disk = dev_to_disk(dev);
 
-   return sprintf(buf, "%d\n", disk->minors);
+   return sysfs_emit(buf, "%d\n", disk->minors);
 }
 
 static ssize_t disk_ext_range_show(struct device *dev,
···
 {
    struct gendisk *disk = dev_to_disk(dev);
 
-   return sprintf(buf, "%d\n",
+   return sysfs_emit(buf, "%d\n",
        (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
 }
···
 {
    struct gendisk *disk = dev_to_disk(dev);
 
-   return sprintf(buf, "%d\n",
+   return sysfs_emit(buf, "%d\n",
               (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
 }
···
 {
    struct gendisk *disk = dev_to_disk(dev);
 
-   return sprintf(buf, "%d\n",
+   return sysfs_emit(buf, "%d\n",
               (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
 }
···
 {
    struct gendisk *disk = dev_to_disk(dev);
 
-   return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
+   return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
 }
 
 ssize_t part_size_show(struct device *dev,
               struct device_attribute *attr, char *buf)
 {
-   return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
+   return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
 }
 
 ssize_t part_stat_show(struct device *dev,
···
        part_stat_unlock();
    }
    part_stat_read_all(bdev, &stat);
-   return sprintf(buf,
+   return sysfs_emit(buf,
        "%8lu %8lu %8llu %8u "
        "%8lu %8lu %8llu %8u "
        "%8u %8u %8u "
···
    else
        part_in_flight_rw(bdev, inflight);
 
-   return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
+   return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]);
 }
 
 static ssize_t disk_capability_show(struct device *dev,
                    struct device_attribute *attr, char *buf)
 {
    dev_warn_once(dev, "the capability attribute has been deprecated.\n");
-   return sprintf(buf, "0\n");
+   return sysfs_emit(buf, "0\n");
 }
 
 static ssize_t disk_alignment_offset_show(struct device *dev,
···
 {
    struct gendisk *disk = dev_to_disk(dev);
 
-   return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0));
+   return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0));
 }
 
 static ssize_t disk_discard_alignment_show(struct device *dev,
···
 {
    struct gendisk *disk = dev_to_disk(dev);
 
-   return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0));
+   return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0));
 }
 
 static ssize_t diskseq_show(struct device *dev,
···
 {
    struct gendisk *disk = dev_to_disk(dev);
 
-   return sprintf(buf, "%llu\n", disk->diskseq);
+   return sysfs_emit(buf, "%llu\n", disk->diskseq);
 }
 
 static ssize_t partscan_show(struct device *dev,
                 struct device_attribute *attr, char *buf)
 {
-   return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
+   return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
 }
 
 static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
···
 ssize_t part_fail_show(struct device *dev,
               struct device_attribute *attr, char *buf)
 {
-   return sprintf(buf, "%d\n",
+   return sysfs_emit(buf, "%d\n",
               bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL));
 }
···
        part_stat_unlock();
    }
    part_stat_read_all(hd, &stat);
-   seq_printf(seqf, "%4d %7d %pg "
-          "%lu %lu %lu %u "
-          "%lu %lu %lu %u "
-          "%u %u %u "
-          "%lu %lu %lu %u "
-          "%lu %u"
-          "\n",
-          MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
-          stat.ios[STAT_READ],
-          stat.merges[STAT_READ],
-          stat.sectors[STAT_READ],
-          (unsigned int)div_u64(stat.nsecs[STAT_READ],
-                    NSEC_PER_MSEC),
-          stat.ios[STAT_WRITE],
-          stat.merges[STAT_WRITE],
-          stat.sectors[STAT_WRITE],
-          (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
-                    NSEC_PER_MSEC),
-          inflight,
-          jiffies_to_msecs(stat.io_ticks),
-          (unsigned int)div_u64(stat.nsecs[STAT_READ] +
-                    stat.nsecs[STAT_WRITE] +
-                    stat.nsecs[STAT_DISCARD] +
-                    stat.nsecs[STAT_FLUSH],
-                    NSEC_PER_MSEC),
-          stat.ios[STAT_DISCARD],
-          stat.merges[STAT_DISCARD],
-          stat.sectors[STAT_DISCARD],
-          (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
-                    NSEC_PER_MSEC),
-          stat.ios[STAT_FLUSH],
-          (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
-                    NSEC_PER_MSEC)
-       );
+   seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4);
+   seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7);
+   seq_printf(seqf, " %pg", hd);
+   seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]);
+   seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]);
+   seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]);
+   seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ],
+                           NSEC_PER_MSEC));
+   seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]);
+   seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]);
+   seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]);
+   seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
+                           NSEC_PER_MSEC));
+   seq_put_decimal_ull(seqf, " ", inflight);
+   seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks));
+   seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] +
+                           stat.nsecs[STAT_WRITE] +
+                           stat.nsecs[STAT_DISCARD] +
+                           stat.nsecs[STAT_FLUSH],
+                           NSEC_PER_MSEC));
+   seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]);
+   seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]);
+   seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]);
+   seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
+                           NSEC_PER_MSEC));
+   seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]);
+   seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
+                           NSEC_PER_MSEC));
+   seq_putc(seqf, '\n');
    }
    rcu_read_unlock();
+9
block/partitions/Kconfig
···
      Say Y here if you want to read the partition table from bootargs.
      The format for the command line is just like mtdparts.
 
+config OF_PARTITION
+   bool "Device Tree partition support" if PARTITION_ADVANCED
+   depends on OF
+   help
+     Say Y here if you want to enable support for partition table
+     defined in Device Tree. (mainly for eMMC)
+     The format for the device tree node is just like MTD fixed-partition
+     schema.
+
 endmenu
+1
block/partitions/Makefile
···
 obj-$(CONFIG_MAC_PARTITION) += mac.o
 obj-$(CONFIG_LDM_PARTITION) += ldm.o
 obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
+obj-$(CONFIG_OF_PARTITION) += of.o
 obj-$(CONFIG_OSF_PARTITION) += osf.o
 obj-$(CONFIG_SGI_PARTITION) += sgi.o
 obj-$(CONFIG_SUN_PARTITION) += sun.o
+1
block/partitions/check.h
···
 int ldm_partition(struct parsed_partitions *state);
 int mac_partition(struct parsed_partitions *state);
 int msdos_partition(struct parsed_partitions *state);
+int of_partition(struct parsed_partitions *state);
 int osf_partition(struct parsed_partitions *state);
 int sgi_partition(struct parsed_partitions *state);
 int sun_partition(struct parsed_partitions *state);
+3
block/partitions/cmdline.c
···
        put_partition(state, slot, subpart->from >> 9,
                  subpart->size >> 9);
 
+       if (subpart->flags & PF_RDONLY)
+           state->parts[slot].flags |= ADDPART_FLAG_READONLY;
+
        info = &state->parts[slot].info;
 
        strscpy(info->volname, subpart->name, sizeof(info->volname));
+8
block/partitions/core.c
···
 #ifdef CONFIG_CMDLINE_PARTITION
    cmdline_partition,
 #endif
+#ifdef CONFIG_OF_PARTITION
+   of_partition,       /* cmdline have priority to OF */
+#endif
 #ifdef CONFIG_EFI_PARTITION
    efi_partition,      /* this must come before msdos */
 #endif
···
    add_uevent_var(env, "PARTN=%u", bdev_partno(part));
    if (part->bd_meta_info && part->bd_meta_info->volname[0])
        add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname);
+   if (part->bd_meta_info && part->bd_meta_info->uuid[0])
+       add_uevent_var(env, "PARTUUID=%s", part->bd_meta_info->uuid);
    return 0;
 }
···
        if (err)
            goto out_del;
    }
+
+   if (flags & ADDPART_FLAG_READONLY)
+       bdev_set_flag(bdev, BD_READ_ONLY);
 
    /* everything is up and running, commence */
    err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
+110
block/partitions/of.c
···
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blkdev.h>
+#include <linux/major.h>
+#include <linux/of.h>
+#include <linux/string.h>
+#include "check.h"
+
+static int validate_of_partition(struct device_node *np, int slot)
+{
+   u64 offset, size;
+   int len;
+
+   const __be32 *reg = of_get_property(np, "reg", &len);
+   int a_cells = of_n_addr_cells(np);
+   int s_cells = of_n_size_cells(np);
+
+   /* Make sure reg len match the expected addr and size cells */
+   if (len / sizeof(*reg) != a_cells + s_cells)
+       return -EINVAL;
+
+   /* Validate offset conversion from bytes to sectors */
+   offset = of_read_number(reg, a_cells);
+   if (offset % SECTOR_SIZE)
+       return -EINVAL;
+
+   /* Validate size conversion from bytes to sectors */
+   size = of_read_number(reg + a_cells, s_cells);
+   if (!size || size % SECTOR_SIZE)
+       return -EINVAL;
+
+   return 0;
+}
+
+static void add_of_partition(struct parsed_partitions *state, int slot,
+                struct device_node *np)
+{
+   struct partition_meta_info *info;
+   char tmp[sizeof(info->volname) + 4];
+   const char *partname;
+   int len;
+
+   const __be32 *reg = of_get_property(np, "reg", &len);
+   int a_cells = of_n_addr_cells(np);
+   int s_cells = of_n_size_cells(np);
+
+   /* Convert bytes to sector size */
+   u64 offset = of_read_number(reg, a_cells) / SECTOR_SIZE;
+   u64 size = of_read_number(reg + a_cells, s_cells) / SECTOR_SIZE;
+
+   put_partition(state, slot, offset, size);
+
+   if (of_property_read_bool(np, "read-only"))
+       state->parts[slot].flags |= ADDPART_FLAG_READONLY;
+
+   /*
+    * Follow MTD label logic, search for label property,
+    * fallback to node name if not found.
+    */
+   info = &state->parts[slot].info;
+   partname = of_get_property(np, "label", &len);
+   if (!partname)
+       partname = of_get_property(np, "name", &len);
+   strscpy(info->volname, partname, sizeof(info->volname));
+
+   snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
+   strlcat(state->pp_buf, tmp, PAGE_SIZE);
+}
+
+int of_partition(struct parsed_partitions *state)
+{
+   struct device *ddev = disk_to_dev(state->disk);
+   struct device_node *np;
+   int slot;
+
+   struct device_node *partitions_np = of_node_get(ddev->of_node);
+
+   if (!partitions_np ||
+       !of_device_is_compatible(partitions_np, "fixed-partitions"))
+       return 0;
+
+   slot = 1;
+   /* Validate partition offset and size */
+   for_each_child_of_node(partitions_np, np) {
+       if (validate_of_partition(np, slot)) {
+           of_node_put(np);
+           of_node_put(partitions_np);
+
+           return -1;
+       }
+
+       slot++;
+   }
+
+   slot = 1;
+   for_each_child_of_node(partitions_np, np) {
+       if (slot >= state->limit) {
+           of_node_put(np);
+           break;
+       }
+
+       add_of_partition(state, slot, np);
+
+       slot++;
+   }
+
+   strlcat(state->pp_buf, "\n", PAGE_SIZE);
+
+   return 1;
+}
+26
block/sed-opal.c
···
    return ret;
 }
 
+static int opal_set_new_sid_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
+{
+   int ret;
+   struct opal_key *newkey = &opal_pw->new_user_pw.opal_key;
+   struct opal_key *oldkey = &opal_pw->session.opal_key;
+
+   const struct opal_step pw_steps[] = {
+       { start_SIDASP_opal_session, oldkey },
+       { set_sid_cpin_pin, newkey },
+       { end_opal_session, }
+   };
+
+   if (!dev)
+       return -ENODEV;
+
+   mutex_lock(&dev->dev_lock);
+   setup_opal_dev(dev);
+   ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
+   mutex_unlock(&dev->dev_lock);
+
+   return ret;
+}
+
 static int opal_activate_user(struct opal_dev *dev,
                  struct opal_session_info *opal_session)
 {
···
        break;
    case IOC_OPAL_DISCOVERY:
        ret = opal_get_discv(dev, p);
+       break;
+   case IOC_OPAL_SET_SID_PW:
+       ret = opal_set_new_sid_pw(dev, p);
        break;
 
    default:
+44 -22
drivers/block/brd.c
···
  * (should share code eventually).
  */
 static LIST_HEAD(brd_devices);
+static DEFINE_MUTEX(brd_devices_mutex);
 static struct dentry *brd_debugfs_dir;
+
+static struct brd_device *brd_find_or_alloc_device(int i)
+{
+   struct brd_device *brd;
+
+   mutex_lock(&brd_devices_mutex);
+   list_for_each_entry(brd, &brd_devices, brd_list) {
+       if (brd->brd_number == i) {
+           mutex_unlock(&brd_devices_mutex);
+           return ERR_PTR(-EEXIST);
+       }
+   }
+
+   brd = kzalloc(sizeof(*brd), GFP_KERNEL);
+   if (!brd) {
+       mutex_unlock(&brd_devices_mutex);
+       return ERR_PTR(-ENOMEM);
+   }
+   brd->brd_number = i;
+   list_add_tail(&brd->brd_list, &brd_devices);
+   mutex_unlock(&brd_devices_mutex);
+   return brd;
+}
+
+static void brd_free_device(struct brd_device *brd)
+{
+   mutex_lock(&brd_devices_mutex);
+   list_del(&brd->brd_list);
+   mutex_unlock(&brd_devices_mutex);
+   kfree(brd);
+}
 
 static int brd_alloc(int i)
 {
···
            BLK_FEAT_NOWAIT,
    };
 
-   list_for_each_entry(brd, &brd_devices, brd_list)
-       if (brd->brd_number == i)
-           return -EEXIST;
-   brd = kzalloc(sizeof(*brd), GFP_KERNEL);
-   if (!brd)
-       return -ENOMEM;
-   brd->brd_number = i;
-   list_add_tail(&brd->brd_list, &brd_devices);
+   brd = brd_find_or_alloc_device(i);
+   if (IS_ERR(brd))
+       return PTR_ERR(brd);
 
    xa_init(&brd->brd_pages);
···
 out_cleanup_disk:
    put_disk(disk);
 out_free_dev:
-   list_del(&brd->brd_list);
-   kfree(brd);
+   brd_free_device(brd);
    return err;
 }
···
        del_gendisk(brd->brd_disk);
        put_disk(brd->brd_disk);
        brd_free_pages(brd);
-       list_del(&brd->brd_list);
-       kfree(brd);
+       brd_free_device(brd);
    }
 }
···
 {
    int err, i;
 
-   brd_check_and_reset_par();
-
-   brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
-
-   for (i = 0; i < rd_nr; i++) {
-       err = brd_alloc(i);
-       if (err)
-           goto out_free;
-   }
-
    /*
     * brd module now has a feature to instantiate underlying device
     * structure on-demand, provided that there is an access dev node.
···
     * dynamically.
     */
 
+   brd_check_and_reset_par();
+
+   brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
+
    if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
        err = -EIO;
        goto out_free;
    }
+
+   for (i = 0; i < rd_nr; i++)
+       brd_alloc(i);
 
    pr_info("brd: module loaded\n");
    return 0;
+6 -7
drivers/block/loop.c
···
 static bool lo_bdev_can_use_dio(struct loop_device *lo,
        struct block_device *backing_bdev)
 {
-   unsigned short sb_bsize = bdev_logical_block_size(backing_bdev);
+   unsigned int sb_bsize = bdev_logical_block_size(backing_bdev);
 
    if (queue_logical_block_size(lo->lo_queue) < sb_bsize)
        return false;
···
     * file-backed loop devices: discarded regions read back as zero.
     */
    if (S_ISBLK(inode->i_mode)) {
-       struct request_queue *backingq = bdev_get_queue(I_BDEV(inode));
+       struct block_device *bdev = I_BDEV(inode);
 
-       max_discard_sectors = backingq->limits.max_write_zeroes_sectors;
-       granularity = bdev_discard_granularity(I_BDEV(inode)) ?:
-           queue_physical_block_size(backingq);
+       max_discard_sectors = bdev_write_zeroes_sectors(bdev);
+       granularity = bdev_discard_granularity(bdev);
 
        /*
         * We use punch hole to reclaim the free space used by the
···
    return 0;
 }
 
-static unsigned short loop_default_blocksize(struct loop_device *lo,
+static unsigned int loop_default_blocksize(struct loop_device *lo,
        struct block_device *backing_bdev)
 {
    /* In case of direct I/O, match underlying block size */
···
    return SECTOR_SIZE;
 }
 
-static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize)
+static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize)
 {
    struct file *file = lo->lo_backing_file;
    struct inode *inode = file->f_mapping->host;
+6 -8
drivers/block/mtip32xx/mtip32xx.c
···
    int rv;
    unsigned long timeout, timetaken;
 
-   dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
+   dd->mmio = pcim_iomap_region(dd->pdev, MTIP_ABAR, MTIP_DRV_NAME);
+   if (IS_ERR(dd->mmio)) {
+       dev_err(&dd->pdev->dev, "Unable to request / ioremap PCI region\n");
+       return PTR_ERR(dd->mmio);
+   }
 
    mtip_detect_product(dd);
    if (dd->product_type == MTIP_PRODUCT_UNKNOWN) {
···
    rv = pcim_enable_device(pdev);
    if (rv < 0) {
        dev_err(&pdev->dev, "Unable to enable device\n");
-       goto iomap_err;
-   }
-
-   /* Map BAR5 to memory. */
-   rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME);
-   if (rv < 0) {
-       dev_err(&pdev->dev, "Unable to map regions\n");
        goto iomap_err;
    }
+4 -5
drivers/block/null_blk/main.c
···
    return BLK_STS_OK;
 }
 
-static void null_queue_rqs(struct request **rqlist)
+static void null_queue_rqs(struct rq_list *rqlist)
 {
-   struct request *requeue_list = NULL;
-   struct request **requeue_lastp = &requeue_list;
+   struct rq_list requeue_list = {};
    struct blk_mq_queue_data bd = { };
    blk_status_t ret;
···
        bd.rq = rq;
        ret = null_queue_rq(rq->mq_hctx, &bd);
        if (ret != BLK_STS_OK)
-           rq_list_add_tail(&requeue_lastp, rq);
-   } while (!rq_list_empty(*rqlist));
+           rq_list_add_tail(&requeue_list, rq);
+   } while (!rq_list_empty(rqlist));
 
    *rqlist = requeue_list;
 }
+1 -1
drivers/block/null_blk/zoned.c
···
 
    lim->features |= BLK_FEAT_ZONED;
    lim->chunk_sectors = dev->zone_size_sects;
-   lim->max_zone_append_sectors = dev->zone_append_max_sectors;
+   lim->max_hw_zone_append_sectors = dev->zone_append_max_sectors;
    lim->max_open_zones = dev->zone_max_open;
    lim->max_active_zones = dev->zone_max_active;
    return 0;
+1
drivers/block/rbd.c
···
     */
    blk_mq_freeze_queue(rbd_dev->disk->queue);
    blk_mark_disk_dead(rbd_dev->disk);
+   blk_mq_unfreeze_queue(rbd_dev->disk->queue);
 }
 
 del_gendisk(rbd_dev->disk);
+142 -66
drivers/block/ublk_drv.c
··· 60 60 | UBLK_F_UNPRIVILEGED_DEV \ 61 61 | UBLK_F_CMD_IOCTL_ENCODE \ 62 62 | UBLK_F_USER_COPY \ 63 - | UBLK_F_ZONED) 63 + | UBLK_F_ZONED \ 64 + | UBLK_F_USER_RECOVERY_FAIL_IO) 65 + 66 + #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ 67 + | UBLK_F_USER_RECOVERY_REISSUE \ 68 + | UBLK_F_USER_RECOVERY_FAIL_IO) 64 69 65 70 /* All UBLK_PARAM_TYPE_* should be included here */ 66 71 #define UBLK_PARAM_TYPE_ALL \ ··· 148 143 bool force_abort; 149 144 bool timeout; 150 145 bool canceling; 146 + bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ 151 147 unsigned short nr_io_ready; /* how many ios setup */ 152 148 spinlock_t cancel_lock; 153 149 struct ublk_device *dev; ··· 185 179 unsigned int nr_queues_ready; 186 180 unsigned int nr_privileged_daemon; 187 181 188 - struct work_struct quiesce_work; 189 - struct work_struct stop_work; 182 + struct work_struct nosrv_work; 190 183 }; 191 184 192 185 /* header of ublk_params */ ··· 669 664 return ublk_get_queue(ub, q_id)->io_cmd_buf; 670 665 } 671 666 667 + static inline int __ublk_queue_cmd_buf_size(int depth) 668 + { 669 + return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE); 670 + } 671 + 672 672 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) 673 673 { 674 674 struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 675 675 676 - return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc), 677 - PAGE_SIZE); 676 + return __ublk_queue_cmd_buf_size(ubq->q_depth); 678 677 } 679 678 680 - static inline bool ublk_queue_can_use_recovery_reissue( 681 - struct ublk_queue *ubq) 679 + static int ublk_max_cmd_buf_size(void) 680 + { 681 + return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH); 682 + } 683 + 684 + /* 685 + * Should I/O outstanding to the ublk server when it exits be reissued? 686 + * If not, outstanding I/O will get errors. 
687 + */ 688 + static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub) 689 + { 690 + return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && 691 + (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE); 692 + } 693 + 694 + /* 695 + * Should I/O issued while there is no ublk server queue? If not, I/O 696 + * issued while there is no ublk server will get errors. 697 + */ 698 + static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub) 699 + { 700 + return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && 701 + !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO); 702 + } 703 + 704 + /* 705 + * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy 706 + * of the device flags for smaller cache footprint - better for fast 707 + * paths. 708 + */ 709 + static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq) 682 710 { 683 711 return (ubq->flags & UBLK_F_USER_RECOVERY) && 684 - (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE); 712 + !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO); 685 713 } 686 714 687 - static inline bool ublk_queue_can_use_recovery( 688 - struct ublk_queue *ubq) 715 + /* 716 + * Should ublk devices be stopped (i.e. no recovery possible) when the 717 + * ublk server exits? If not, devices can be used again by a future 718 + * incarnation of a ublk server via the start_recovery/end_recovery 719 + * commands. 
720 + */ 721 + static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub) 689 722 { 690 - return ubq->flags & UBLK_F_USER_RECOVERY; 723 + return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY); 691 724 } 692 725 693 - static inline bool ublk_can_use_recovery(struct ublk_device *ub) 726 + static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub) 694 727 { 695 - return ub->dev_info.flags & UBLK_F_USER_RECOVERY; 728 + return ub->dev_info.state == UBLK_S_DEV_QUIESCED || 729 + ub->dev_info.state == UBLK_S_DEV_FAIL_IO; 696 730 } 697 731 698 732 static void ublk_free_disk(struct gendisk *disk) ··· 1107 1063 { 1108 1064 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); 1109 1065 1110 - if (ublk_queue_can_use_recovery_reissue(ubq)) 1066 + if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) 1111 1067 blk_mq_requeue_request(req, false); 1112 1068 else 1113 1069 ublk_put_req_ref(ubq, req); ··· 1135 1091 struct request *rq) 1136 1092 { 1137 1093 /* We cannot process this rq so just requeue it. */ 1138 - if (ublk_queue_can_use_recovery(ubq)) 1094 + if (ublk_nosrv_dev_should_queue_io(ubq->dev)) 1139 1095 blk_mq_requeue_request(rq, false); 1140 1096 else 1141 1097 blk_mq_end_request(rq, BLK_STS_IOERR); ··· 1280 1236 struct ublk_device *ub = ubq->dev; 1281 1237 1282 1238 if (ublk_abort_requests(ub, ubq)) { 1283 - if (ublk_can_use_recovery(ub)) 1284 - schedule_work(&ub->quiesce_work); 1285 - else 1286 - schedule_work(&ub->stop_work); 1239 + schedule_work(&ub->nosrv_work); 1287 1240 } 1288 1241 return BLK_EH_DONE; 1289 1242 } ··· 1294 1253 struct ublk_queue *ubq = hctx->driver_data; 1295 1254 struct request *rq = bd->rq; 1296 1255 blk_status_t res; 1256 + 1257 + if (unlikely(ubq->fail_io)) { 1258 + return BLK_STS_TARGET; 1259 + } 1297 1260 1298 1261 /* fill iod to slot in io cmd buffer */ 1299 1262 res = ublk_setup_iod(ubq, rq); ··· 1313 1268 * Note: force_abort is guaranteed to be seen because it is set 1314 1269 * before request queue is unquiesced. 
1315 1270 */ 1316 - if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort)) 1271 + if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) 1317 1272 return BLK_STS_IOERR; 1318 1273 1319 1274 if (unlikely(ubq->canceling)) { ··· 1367 1322 { 1368 1323 struct ublk_device *ub = filp->private_data; 1369 1324 size_t sz = vma->vm_end - vma->vm_start; 1370 - unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc); 1325 + unsigned max_sz = ublk_max_cmd_buf_size(); 1371 1326 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; 1372 1327 int q_id, ret = 0; 1373 1328 ··· 1534 1489 ublk_cancel_cmd(ubq, io, issue_flags); 1535 1490 1536 1491 if (need_schedule) { 1537 - if (ublk_can_use_recovery(ub)) 1538 - schedule_work(&ub->quiesce_work); 1539 - else 1540 - schedule_work(&ub->stop_work); 1492 + schedule_work(&ub->nosrv_work); 1541 1493 } 1542 1494 } 1543 1495 ··· 1597 1555 ub->dev_info.state = UBLK_S_DEV_QUIESCED; 1598 1556 } 1599 1557 1600 - static void ublk_quiesce_work_fn(struct work_struct *work) 1601 - { 1602 - struct ublk_device *ub = 1603 - container_of(work, struct ublk_device, quiesce_work); 1604 - 1605 - mutex_lock(&ub->mutex); 1606 - if (ub->dev_info.state != UBLK_S_DEV_LIVE) 1607 - goto unlock; 1608 - __ublk_quiesce_dev(ub); 1609 - unlock: 1610 - mutex_unlock(&ub->mutex); 1611 - ublk_cancel_dev(ub); 1612 - } 1613 - 1614 1558 static void ublk_unquiesce_dev(struct ublk_device *ub) 1615 1559 { 1616 1560 int i; ··· 1625 1597 mutex_lock(&ub->mutex); 1626 1598 if (ub->dev_info.state == UBLK_S_DEV_DEAD) 1627 1599 goto unlock; 1628 - if (ublk_can_use_recovery(ub)) { 1600 + if (ublk_nosrv_dev_should_queue_io(ub)) { 1629 1601 if (ub->dev_info.state == UBLK_S_DEV_LIVE) 1630 1602 __ublk_quiesce_dev(ub); 1631 1603 ublk_unquiesce_dev(ub); ··· 1640 1612 ub->ub_disk = NULL; 1641 1613 spin_unlock(&ub->lock); 1642 1614 put_disk(disk); 1615 + unlock: 1616 + mutex_unlock(&ub->mutex); 1617 + ublk_cancel_dev(ub); 1618 + } 1619 + 1620 + 
static void ublk_nosrv_work(struct work_struct *work) 1621 + { 1622 + struct ublk_device *ub = 1623 + container_of(work, struct ublk_device, nosrv_work); 1624 + int i; 1625 + 1626 + if (ublk_nosrv_should_stop_dev(ub)) { 1627 + ublk_stop_dev(ub); 1628 + return; 1629 + } 1630 + 1631 + mutex_lock(&ub->mutex); 1632 + if (ub->dev_info.state != UBLK_S_DEV_LIVE) 1633 + goto unlock; 1634 + 1635 + if (ublk_nosrv_dev_should_queue_io(ub)) { 1636 + __ublk_quiesce_dev(ub); 1637 + } else { 1638 + blk_mq_quiesce_queue(ub->ub_disk->queue); 1639 + ub->dev_info.state = UBLK_S_DEV_FAIL_IO; 1640 + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 1641 + ublk_get_queue(ub, i)->fail_io = true; 1642 + } 1643 + blk_mq_unquiesce_queue(ub->ub_disk->queue); 1644 + } 1645 + 1643 1646 unlock: 1644 1647 mutex_unlock(&ub->mutex); 1645 1648 ublk_cancel_dev(ub); ··· 2189 2130 return ret; 2190 2131 } 2191 2132 2192 - static void ublk_stop_work_fn(struct work_struct *work) 2193 - { 2194 - struct ublk_device *ub = 2195 - container_of(work, struct ublk_device, stop_work); 2196 - 2197 - ublk_stop_dev(ub); 2198 - } 2199 - 2200 2133 /* align max io buffer size with PAGE_SIZE */ 2201 2134 static void ublk_align_max_io_size(struct ublk_device *ub) 2202 2135 { ··· 2213 2162 static void ublk_remove(struct ublk_device *ub) 2214 2163 { 2215 2164 ublk_stop_dev(ub); 2216 - cancel_work_sync(&ub->stop_work); 2217 - cancel_work_sync(&ub->quiesce_work); 2165 + cancel_work_sync(&ub->nosrv_work); 2218 2166 cdev_device_del(&ub->cdev, &ub->cdev_dev); 2219 2167 ublk_put_device(ub); 2220 2168 ublks_added--; ··· 2279 2229 lim.features |= BLK_FEAT_ZONED; 2280 2230 lim.max_active_zones = p->max_active_zones; 2281 2231 lim.max_open_zones = p->max_open_zones; 2282 - lim.max_zone_append_sectors = p->max_zone_append_sectors; 2232 + lim.max_hw_zone_append_sectors = p->max_zone_append_sectors; 2283 2233 } 2284 2234 2285 2235 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { ··· 2422 2372 else if (!(info.flags & 
UBLK_F_UNPRIVILEGED_DEV)) 2423 2373 return -EPERM; 2424 2374 2375 + /* forbid nonsense combinations of recovery flags */ 2376 + switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) { 2377 + case 0: 2378 + case UBLK_F_USER_RECOVERY: 2379 + case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE): 2380 + case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO): 2381 + break; 2382 + default: 2383 + pr_warn("%s: invalid recovery flags %llx\n", __func__, 2384 + info.flags & UBLK_F_ALL_RECOVERY_FLAGS); 2385 + return -EINVAL; 2386 + } 2387 + 2425 2388 /* 2426 2389 * unprivileged device can't be trusted, but RECOVERY and 2427 2390 * RECOVERY_REISSUE still may hang error handling, so can't ··· 2487 2424 goto out_unlock; 2488 2425 mutex_init(&ub->mutex); 2489 2426 spin_lock_init(&ub->lock); 2490 - INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); 2491 - INIT_WORK(&ub->stop_work, ublk_stop_work_fn); 2427 + INIT_WORK(&ub->nosrv_work, ublk_nosrv_work); 2492 2428 2493 2429 ret = ublk_alloc_dev_number(ub, header->dev_id); 2494 2430 if (ret < 0) ··· 2622 2560 static int ublk_ctrl_stop_dev(struct ublk_device *ub) 2623 2561 { 2624 2562 ublk_stop_dev(ub); 2625 - cancel_work_sync(&ub->stop_work); 2626 - cancel_work_sync(&ub->quiesce_work); 2627 - 2563 + cancel_work_sync(&ub->nosrv_work); 2628 2564 return 0; 2629 2565 } 2630 2566 ··· 2759 2699 int i; 2760 2700 2761 2701 mutex_lock(&ub->mutex); 2762 - if (!ublk_can_use_recovery(ub)) 2702 + if (ublk_nosrv_should_stop_dev(ub)) 2763 2703 goto out_unlock; 2764 2704 if (!ub->nr_queues_ready) 2765 2705 goto out_unlock; ··· 2770 2710 * and related io_uring ctx is freed so file struct of /dev/ublkcX is 2771 2711 * released. 
2772 2712 * 2713 + * and one of the following holds 2714 + * 2773 2715 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: 2774 2716 * (a)has quiesced request queue 2775 2717 * (b)has requeued every inflight rqs whose io_flags is ACTIVE 2776 2718 * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE 2777 2719 * (d)has completed/camceled all ioucmds owned by ther dying process 2720 + * 2721 + * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not 2722 + * quiesced, but all I/O is being immediately errored 2778 2723 */ 2779 - if (test_bit(UB_STATE_OPEN, &ub->state) || 2780 - ub->dev_info.state != UBLK_S_DEV_QUIESCED) { 2724 + if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) { 2781 2725 ret = -EBUSY; 2782 2726 goto out_unlock; 2783 2727 } ··· 2805 2741 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2806 2742 int ublksrv_pid = (int)header->data[0]; 2807 2743 int ret = -EINVAL; 2744 + int i; 2808 2745 2809 2746 pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", 2810 2747 __func__, ub->dev_info.nr_hw_queues, header->dev_id); ··· 2817 2752 __func__, ub->dev_info.nr_hw_queues, header->dev_id); 2818 2753 2819 2754 mutex_lock(&ub->mutex); 2820 - if (!ublk_can_use_recovery(ub)) 2755 + if (ublk_nosrv_should_stop_dev(ub)) 2821 2756 goto out_unlock; 2822 2757 2823 - if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { 2758 + if (!ublk_dev_in_recoverable_state(ub)) { 2824 2759 ret = -EBUSY; 2825 2760 goto out_unlock; 2826 2761 } 2827 2762 ub->dev_info.ublksrv_pid = ublksrv_pid; 2828 2763 pr_devel("%s: new ublksrv_pid %d, dev id %d\n", 2829 2764 __func__, ublksrv_pid, header->dev_id); 2830 - blk_mq_unquiesce_queue(ub->ub_disk->queue); 2831 - pr_devel("%s: queue unquiesced, dev id %d.\n", 2832 - __func__, header->dev_id); 2833 - blk_mq_kick_requeue_list(ub->ub_disk->queue); 2834 - ub->dev_info.state = UBLK_S_DEV_LIVE; 2765 + 2766 + if (ublk_nosrv_dev_should_queue_io(ub)) 
{ 2767 + ub->dev_info.state = UBLK_S_DEV_LIVE; 2768 + blk_mq_unquiesce_queue(ub->ub_disk->queue); 2769 + pr_devel("%s: queue unquiesced, dev id %d.\n", 2770 + __func__, header->dev_id); 2771 + blk_mq_kick_requeue_list(ub->ub_disk->queue); 2772 + } else { 2773 + blk_mq_quiesce_queue(ub->ub_disk->queue); 2774 + ub->dev_info.state = UBLK_S_DEV_LIVE; 2775 + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 2776 + ublk_get_queue(ub, i)->fail_io = false; 2777 + } 2778 + blk_mq_unquiesce_queue(ub->ub_disk->queue); 2779 + } 2780 + 2835 2781 ret = 0; 2836 2782 out_unlock: 2837 2783 mutex_unlock(&ub->mutex);
+24 -29
drivers/block/virtio_blk.c
··· 471 471 return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK; 472 472 } 473 473 474 - static bool virtblk_add_req_batch(struct virtio_blk_vq *vq, 475 - struct request **rqlist) 474 + static void virtblk_add_req_batch(struct virtio_blk_vq *vq, 475 + struct rq_list *rqlist) 476 476 { 477 + struct request *req; 477 478 unsigned long flags; 478 - int err; 479 479 bool kick; 480 480 481 481 spin_lock_irqsave(&vq->lock, flags); 482 482 483 - while (!rq_list_empty(*rqlist)) { 484 - struct request *req = rq_list_pop(rqlist); 483 + while ((req = rq_list_pop(rqlist))) { 485 484 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); 485 + int err; 486 486 487 487 err = virtblk_add_req(vq->vq, vbr); 488 488 if (err) { ··· 495 495 kick = virtqueue_kick_prepare(vq->vq); 496 496 spin_unlock_irqrestore(&vq->lock, flags); 497 497 498 - return kick; 498 + if (kick) 499 + virtqueue_notify(vq->vq); 499 500 } 500 501 501 - static void virtio_queue_rqs(struct request **rqlist) 502 + static void virtio_queue_rqs(struct rq_list *rqlist) 502 503 { 503 - struct request *req, *next, *prev = NULL; 504 - struct request *requeue_list = NULL; 504 + struct rq_list submit_list = { }; 505 + struct rq_list requeue_list = { }; 506 + struct virtio_blk_vq *vq = NULL; 507 + struct request *req; 505 508 506 - rq_list_for_each_safe(rqlist, req, next) { 507 - struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx); 508 - bool kick; 509 + while ((req = rq_list_pop(rqlist))) { 510 + struct virtio_blk_vq *this_vq = get_virtio_blk_vq(req->mq_hctx); 509 511 510 - if (!virtblk_prep_rq_batch(req)) { 511 - rq_list_move(rqlist, &requeue_list, req, prev); 512 - req = prev; 513 - if (!req) 514 - continue; 515 - } 512 + if (vq && vq != this_vq) 513 + virtblk_add_req_batch(vq, &submit_list); 514 + vq = this_vq; 516 515 517 - if (!next || req->mq_hctx != next->mq_hctx) { 518 - req->rq_next = NULL; 519 - kick = virtblk_add_req_batch(vq, rqlist); 520 - if (kick) 521 - virtqueue_notify(vq->vq); 522 - 523 - 
*rqlist = next; 524 - prev = NULL; 525 - } else 526 - prev = req; 516 + if (virtblk_prep_rq_batch(req)) 517 + rq_list_add_tail(&submit_list, req); 518 + else 519 + rq_list_add_tail(&requeue_list, req); 527 520 } 528 521 522 + if (vq) 523 + virtblk_add_req_batch(vq, &submit_list); 529 524 *rqlist = requeue_list; 530 525 } 531 526 ··· 779 784 wg, v); 780 785 return -ENODEV; 781 786 } 782 - lim->max_zone_append_sectors = v; 787 + lim->max_hw_zone_append_sectors = v; 783 788 dev_dbg(&vdev->dev, "max append sectors = %u\n", v); 784 789 785 790 return 0;
+2 -2
drivers/md/dm-cache-target.c
··· 3362 3362 static void disable_passdown_if_not_supported(struct cache *cache) 3363 3363 { 3364 3364 struct block_device *origin_bdev = cache->origin_dev->bdev; 3365 - struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3365 + struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3366 3366 const char *reason = NULL; 3367 3367 3368 3368 if (!cache->features.discard_passdown) ··· 3384 3384 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3385 3385 { 3386 3386 struct block_device *origin_bdev = cache->origin_dev->bdev; 3387 - struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3387 + struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3388 3388 3389 3389 if (!cache->features.discard_passdown) { 3390 3390 /* No passdown is done so setting own virtual limits */
+2 -2
drivers/md/dm-clone-target.c
··· 2020 2020 static void disable_passdown_if_not_supported(struct clone *clone) 2021 2021 { 2022 2022 struct block_device *dest_dev = clone->dest_dev->bdev; 2023 - struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; 2023 + struct queue_limits *dest_limits = bdev_limits(dest_dev); 2024 2024 const char *reason = NULL; 2025 2025 2026 2026 if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) ··· 2041 2041 static void set_discard_limits(struct clone *clone, struct queue_limits *limits) 2042 2042 { 2043 2043 struct block_device *dest_bdev = clone->dest_dev->bdev; 2044 - struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits; 2044 + struct queue_limits *dest_limits = bdev_limits(dest_bdev); 2045 2045 2046 2046 if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { 2047 2047 /* No passdown is done so we set our own virtual limits */
+1 -1
drivers/md/dm-thin.c
··· 2842 2842 { 2843 2843 struct pool *pool = pt->pool; 2844 2844 struct block_device *data_bdev = pt->data_dev->bdev; 2845 - struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; 2845 + struct queue_limits *data_limits = bdev_limits(data_bdev); 2846 2846 const char *reason = NULL; 2847 2847 2848 2848 if (!pt->adjusted_pf.discard_passdown)
+2 -2
drivers/md/dm-zone.c
··· 344 344 clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 345 345 } else { 346 346 set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 347 - lim->max_zone_append_sectors = 0; 347 + lim->max_hw_zone_append_sectors = 0; 348 348 } 349 349 350 350 /* ··· 379 379 if (!zlim.mapped_nr_seq_zones) { 380 380 lim->max_open_zones = 0; 381 381 lim->max_active_zones = 0; 382 - lim->max_zone_append_sectors = 0; 382 + lim->max_hw_zone_append_sectors = 0; 383 383 lim->zone_write_granularity = 0; 384 384 lim->chunk_sectors = 0; 385 385 lim->features &= ~BLK_FEAT_ZONED;
+1
drivers/md/md-bitmap.c
··· 1285 1285 1286 1286 queue_work(md_bitmap_wq, &unplug_work.work); 1287 1287 wait_for_completion(&done); 1288 + destroy_work_on_stack(&unplug_work.work); 1288 1289 } 1289 1290 1290 1291 static void bitmap_unplug(struct mddev *mddev, bool sync)
+12 -3
drivers/md/md.c
··· 9784 9784 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 9785 9785 { 9786 9786 sysfs_notify_dirent_safe(rdev->sysfs_state); 9787 - wait_event_timeout(rdev->blocked_wait, 9788 - !test_bit(Blocked, &rdev->flags) && 9789 - !test_bit(BlockedBadBlocks, &rdev->flags), 9787 + wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev), 9790 9788 msecs_to_jiffies(5000)); 9791 9789 rdev_dec_pending(rdev, mddev); 9792 9790 } ··· 9813 9815 { 9814 9816 struct mddev *mddev = rdev->mddev; 9815 9817 int rv; 9818 + 9819 + /* 9820 + * Recording new badblocks for a faulty rdev will force unnecessary 9821 + * super block updating. This is fragile for external management because 9822 + * the userspace daemon may be trying to remove this device and deadlock 9823 + * may occur. This will probably be solved in mdadm, but it is safer to 9824 + * avoid it. 9825 + */ 9826 + if (test_bit(Faulty, &rdev->flags)) 9827 + return 1; 9828 + 9816 9829 if (is_new) 9817 9830 s += rdev->new_data_offset; 9818 9831 else
+24
drivers/md/md.h
··· 1002 1002 trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector); 1003 1003 } 1004 1004 1005 + static inline bool rdev_blocked(struct md_rdev *rdev) 1006 + { 1007 + /* 1008 + * Blocked will be set by the error handler and cleared by the daemon 1009 + * after updating the superblock; meanwhile, write IO should be blocked 1010 + * to prevent reading old data after power failure. 1011 + */ 1012 + if (test_bit(Blocked, &rdev->flags)) 1013 + return true; 1014 + 1015 + /* 1016 + * A Faulty device should not be accessed anymore; there is no need to 1017 + * wait for the bad block to be acknowledged. 1018 + */ 1019 + if (test_bit(Faulty, &rdev->flags)) 1020 + return false; 1021 + 1022 + /* rdev is blocked by badblocks. */ 1023 + if (test_bit(BlockedBadBlocks, &rdev->flags)) 1024 + return true; 1025 + 1026 + return false; 1027 + } 1028 + 1005 1029 #define mddev_add_trace_msg(mddev, fmt, args...) \ 1006 1030 do { \ 1007 1031 if (!mddev_is_dm(mddev)) \
+12
drivers/md/raid0.c
··· 466 466 struct bio *split = bio_split(bio, 467 467 zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, 468 468 &mddev->bio_set); 469 + 470 + if (IS_ERR(split)) { 471 + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); 472 + bio_endio(bio); 473 + return; 474 + } 469 475 bio_chain(split, bio); 470 476 submit_bio_noacct(bio); 471 477 bio = split; ··· 614 608 if (sectors < bio_sectors(bio)) { 615 609 struct bio *split = bio_split(bio, sectors, GFP_NOIO, 616 610 &mddev->bio_set); 611 + 612 + if (IS_ERR(split)) { 613 + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); 614 + bio_endio(bio); 615 + return true; 616 + } 617 617 bio_chain(split, bio); 618 618 raid0_map_submit_bio(mddev, bio); 619 619 bio = split;
+70 -38
drivers/md/raid1.c
··· 1322 1322 const enum req_op op = bio_op(bio); 1323 1323 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1324 1324 int max_sectors; 1325 - int rdisk; 1325 + int rdisk, error; 1326 1326 bool r1bio_existed = !!r1_bio; 1327 1327 1328 1328 /* ··· 1383 1383 if (max_sectors < bio_sectors(bio)) { 1384 1384 struct bio *split = bio_split(bio, max_sectors, 1385 1385 gfp, &conf->bio_split); 1386 + 1387 + if (IS_ERR(split)) { 1388 + error = PTR_ERR(split); 1389 + goto err_handle; 1390 + } 1386 1391 bio_chain(split, bio); 1387 1392 submit_bio_noacct(bio); 1388 1393 bio = split; ··· 1415 1410 read_bio->bi_private = r1_bio; 1416 1411 mddev_trace_remap(mddev, read_bio, r1_bio->sector); 1417 1412 submit_bio_noacct(read_bio); 1413 + return; 1414 + 1415 + err_handle: 1416 + atomic_dec(&mirror->rdev->nr_pending); 1417 + bio->bi_status = errno_to_blk_status(error); 1418 + set_bit(R1BIO_Uptodate, &r1_bio->state); 1419 + raid_end_bio_io(r1_bio); 1420 + } 1421 + 1422 + static bool wait_blocked_rdev(struct mddev *mddev, struct bio *bio) 1423 + { 1424 + struct r1conf *conf = mddev->private; 1425 + int disks = conf->raid_disks * 2; 1426 + int i; 1427 + 1428 + retry: 1429 + for (i = 0; i < disks; i++) { 1430 + struct md_rdev *rdev = conf->mirrors[i].rdev; 1431 + 1432 + if (!rdev) 1433 + continue; 1434 + 1435 + /* don't write here until the bad block is acknowledged */ 1436 + if (test_bit(WriteErrorSeen, &rdev->flags) && 1437 + rdev_has_badblock(rdev, bio->bi_iter.bi_sector, 1438 + bio_sectors(bio)) < 0) 1439 + set_bit(BlockedBadBlocks, &rdev->flags); 1440 + 1441 + if (rdev_blocked(rdev)) { 1442 + if (bio->bi_opf & REQ_NOWAIT) 1443 + return false; 1444 + 1445 + mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked", 1446 + rdev->raid_disk); 1447 + atomic_inc(&rdev->nr_pending); 1448 + md_wait_for_blocked_rdev(rdev, rdev->mddev); 1449 + goto retry; 1450 + } 1451 + } 1452 + 1453 + return true; 1418 1454 } 1419 1455 1420 1456 static void raid1_write_request(struct mddev *mddev, struct 
bio *bio, ··· 1463 1417 { 1464 1418 struct r1conf *conf = mddev->private; 1465 1419 struct r1bio *r1_bio; 1466 - int i, disks; 1420 + int i, disks, k, error; 1467 1421 unsigned long flags; 1468 - struct md_rdev *blocked_rdev; 1469 1422 int first_clone; 1470 1423 int max_sectors; 1471 1424 bool write_behind = false; ··· 1502 1457 return; 1503 1458 } 1504 1459 1505 - retry_write: 1460 + if (!wait_blocked_rdev(mddev, bio)) { 1461 + bio_wouldblock_error(bio); 1462 + return; 1463 + } 1464 + 1506 1465 r1_bio = alloc_r1bio(mddev, bio); 1507 1466 r1_bio->sectors = max_write_sectors; 1508 1467 ··· 1522 1473 */ 1523 1474 1524 1475 disks = conf->raid_disks * 2; 1525 - blocked_rdev = NULL; 1526 1476 max_sectors = r1_bio->sectors; 1527 1477 for (i = 0; i < disks; i++) { 1528 1478 struct md_rdev *rdev = conf->mirrors[i].rdev; ··· 1534 1486 if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags)) 1535 1487 write_behind = true; 1536 1488 1537 - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1538 - atomic_inc(&rdev->nr_pending); 1539 - blocked_rdev = rdev; 1540 - break; 1541 - } 1542 1489 r1_bio->bios[i] = NULL; 1543 1490 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1544 1491 if (i < conf->raid_disks) ··· 1549 1506 1550 1507 is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, 1551 1508 &first_bad, &bad_sectors); 1552 - if (is_bad < 0) { 1553 - /* mustn't write here until the bad block is 1554 - * acknowledged*/ 1555 - set_bit(BlockedBadBlocks, &rdev->flags); 1556 - blocked_rdev = rdev; 1557 - break; 1558 - } 1559 1509 if (is_bad && first_bad <= r1_bio->sector) { 1560 1510 /* Cannot write here at all */ 1561 1511 bad_sectors -= (r1_bio->sector - first_bad); ··· 1579 1543 r1_bio->bios[i] = bio; 1580 1544 } 1581 1545 1582 - if (unlikely(blocked_rdev)) { 1583 - /* Wait for this device to become unblocked */ 1584 - int j; 1585 - 1586 - for (j = 0; j < i; j++) 1587 - if (r1_bio->bios[j]) 1588 - rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1589 - 
mempool_free(r1_bio, &conf->r1bio_pool); 1590 - allow_barrier(conf, bio->bi_iter.bi_sector); 1591 - 1592 - if (bio->bi_opf & REQ_NOWAIT) { 1593 - bio_wouldblock_error(bio); 1594 - return; 1595 - } 1596 - mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked", 1597 - blocked_rdev->raid_disk); 1598 - md_wait_for_blocked_rdev(blocked_rdev, mddev); 1599 - wait_barrier(conf, bio->bi_iter.bi_sector, false); 1600 - goto retry_write; 1601 - } 1602 - 1603 1546 /* 1604 1547 * When using a bitmap, we may call alloc_behind_master_bio below. 1605 1548 * alloc_behind_master_bio allocates a copy of the data payload a page ··· 1591 1576 if (max_sectors < bio_sectors(bio)) { 1592 1577 struct bio *split = bio_split(bio, max_sectors, 1593 1578 GFP_NOIO, &conf->bio_split); 1579 + 1580 + if (IS_ERR(split)) { 1581 + error = PTR_ERR(split); 1582 + goto err_handle; 1583 + } 1594 1584 bio_chain(split, bio); 1595 1585 submit_bio_noacct(bio); 1596 1586 bio = split; ··· 1680 1660 1681 1661 /* In case raid1d snuck in to freeze_array */ 1682 1662 wake_up_barrier(conf); 1663 + return; 1664 + err_handle: 1665 + for (k = 0; k < i; k++) { 1666 + if (r1_bio->bios[k]) { 1667 + rdev_dec_pending(conf->mirrors[k].rdev, mddev); 1668 + r1_bio->bios[k] = NULL; 1669 + } 1670 + } 1671 + 1672 + bio->bi_status = errno_to_blk_status(error); 1673 + set_bit(R1BIO_Uptodate, &r1_bio->state); 1674 + raid_end_bio_io(r1_bio); 1683 1675 } 1684 1676 1685 1677 static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
+64 -23
drivers/md/raid10.c
··· 1159 1159 int slot = r10_bio->read_slot; 1160 1160 struct md_rdev *err_rdev = NULL; 1161 1161 gfp_t gfp = GFP_NOIO; 1162 + int error; 1162 1163 1163 1164 if (slot >= 0 && r10_bio->devs[slot].rdev) { 1164 1165 /* ··· 1207 1206 if (max_sectors < bio_sectors(bio)) { 1208 1207 struct bio *split = bio_split(bio, max_sectors, 1209 1208 gfp, &conf->bio_split); 1209 + if (IS_ERR(split)) { 1210 + error = PTR_ERR(split); 1211 + goto err_handle; 1212 + } 1210 1213 bio_chain(split, bio); 1211 1214 allow_barrier(conf); 1212 1215 submit_bio_noacct(bio); ··· 1241 1236 mddev_trace_remap(mddev, read_bio, r10_bio->sector); 1242 1237 submit_bio_noacct(read_bio); 1243 1238 return; 1239 + err_handle: 1240 + atomic_dec(&rdev->nr_pending); 1241 + bio->bi_status = errno_to_blk_status(error); 1242 + set_bit(R10BIO_Uptodate, &r10_bio->state); 1243 + raid_end_bio_io(r10_bio); 1244 1244 } 1245 1245 1246 1246 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, ··· 1295 1285 1296 1286 static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) 1297 1287 { 1298 - int i; 1299 1288 struct r10conf *conf = mddev->private; 1300 1289 struct md_rdev *blocked_rdev; 1290 + int i; 1301 1291 1302 1292 retry_wait: 1303 1293 blocked_rdev = NULL; ··· 1305 1295 struct md_rdev *rdev, *rrdev; 1306 1296 1307 1297 rdev = conf->mirrors[i].rdev; 1308 - rrdev = conf->mirrors[i].replacement; 1309 - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1310 - atomic_inc(&rdev->nr_pending); 1311 - blocked_rdev = rdev; 1312 - break; 1313 - } 1314 - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { 1315 - atomic_inc(&rrdev->nr_pending); 1316 - blocked_rdev = rrdev; 1317 - break; 1318 - } 1319 - 1320 - if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1298 + if (rdev) { 1321 1299 sector_t dev_sector = r10_bio->devs[i].addr; 1322 1300 1323 1301 /* 1324 1302 * Discard request doesn't care the write result 1325 1303 * so it doesn't need to wait blocked disk here. 
1326 1304 */ 1327 - if (!r10_bio->sectors) 1328 - continue; 1329 - 1330 - if (rdev_has_badblock(rdev, dev_sector, 1331 - r10_bio->sectors) < 0) { 1305 + if (test_bit(WriteErrorSeen, &rdev->flags) && 1306 + r10_bio->sectors && 1307 + rdev_has_badblock(rdev, dev_sector, 1308 + r10_bio->sectors) < 0) 1332 1309 /* 1333 - * Mustn't write here until the bad block 1334 - * is acknowledged 1310 + * Mustn't write here until the bad 1311 + * block is acknowledged 1335 1312 */ 1336 - atomic_inc(&rdev->nr_pending); 1337 1313 set_bit(BlockedBadBlocks, &rdev->flags); 1314 + 1315 + if (rdev_blocked(rdev)) { 1338 1316 blocked_rdev = rdev; 1317 + atomic_inc(&rdev->nr_pending); 1339 1318 break; 1340 1319 } 1320 + } 1321 + 1322 + rrdev = conf->mirrors[i].replacement; 1323 + if (rrdev && rdev_blocked(rrdev)) { 1324 + atomic_inc(&rrdev->nr_pending); 1325 + blocked_rdev = rrdev; 1326 + break; 1341 1327 } 1342 1328 } 1343 1329 ··· 1353 1347 struct r10bio *r10_bio) 1354 1348 { 1355 1349 struct r10conf *conf = mddev->private; 1356 - int i; 1350 + int i, k; 1357 1351 sector_t sectors; 1358 1352 int max_sectors; 1353 + int error; 1359 1354 1360 1355 if ((mddev_is_clustered(mddev) && 1361 1356 md_cluster_ops->area_resyncing(mddev, WRITE, ··· 1489 1482 if (r10_bio->sectors < bio_sectors(bio)) { 1490 1483 struct bio *split = bio_split(bio, r10_bio->sectors, 1491 1484 GFP_NOIO, &conf->bio_split); 1485 + if (IS_ERR(split)) { 1486 + error = PTR_ERR(split); 1487 + goto err_handle; 1488 + } 1492 1489 bio_chain(split, bio); 1493 1490 allow_barrier(conf); 1494 1491 submit_bio_noacct(bio); ··· 1514 1503 raid10_write_one_disk(mddev, r10_bio, bio, true, i); 1515 1504 } 1516 1505 one_write_done(r10_bio); 1506 + return; 1507 + err_handle: 1508 + for (k = 0; k < i; k++) { 1509 + int d = r10_bio->devs[k].devnum; 1510 + struct md_rdev *rdev = conf->mirrors[d].rdev; 1511 + struct md_rdev *rrdev = conf->mirrors[d].replacement; 1512 + 1513 + if (r10_bio->devs[k].bio) { 1514 + rdev_dec_pending(rdev, mddev); 1515 
+ r10_bio->devs[k].bio = NULL; 1516 + } 1517 + if (r10_bio->devs[k].repl_bio) { 1518 + rdev_dec_pending(rrdev, mddev); 1519 + r10_bio->devs[k].repl_bio = NULL; 1520 + } 1521 + } 1522 + 1523 + bio->bi_status = errno_to_blk_status(error); 1524 + set_bit(R10BIO_Uptodate, &r10_bio->state); 1525 + raid_end_bio_io(r10_bio); 1517 1526 } 1518 1527 1519 1528 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) ··· 1675 1644 if (remainder) { 1676 1645 split_size = stripe_size - remainder; 1677 1646 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1647 + if (IS_ERR(split)) { 1648 + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); 1649 + bio_endio(bio); 1650 + return 0; 1651 + } 1678 1652 bio_chain(split, bio); 1679 1653 allow_barrier(conf); 1680 1654 /* Resend the fist split part */ ··· 1690 1654 if (remainder) { 1691 1655 split_size = bio_sectors(bio) - remainder; 1692 1656 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1657 + if (IS_ERR(split)) { 1658 + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); 1659 + bio_endio(bio); 1660 + return 0; 1661 + } 1693 1662 bio_chain(split, bio); 1694 1663 allow_barrier(conf); 1695 1664 /* Resend the second split part */
+1 -1
drivers/md/raid5-ppl.c
··· 258 258 memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); 259 259 pplhdr->signature = cpu_to_le32(ppl_conf->signature); 260 260 261 - io->seq = atomic64_add_return(1, &ppl_conf->seq); 261 + io->seq = atomic64_inc_return(&ppl_conf->seq); 262 262 pplhdr->generation = cpu_to_le64(io->seq); 263 263 264 264 return io;
+10 -7
drivers/md/raid5.c
··· 4724 4724 if (rdev) { 4725 4725 is_bad = rdev_has_badblock(rdev, sh->sector, 4726 4726 RAID5_STRIPE_SECTORS(conf)); 4727 - if (s->blocked_rdev == NULL 4728 - && (test_bit(Blocked, &rdev->flags) 4729 - || is_bad < 0)) { 4727 + if (s->blocked_rdev == NULL) { 4730 4728 if (is_bad < 0) 4731 - set_bit(BlockedBadBlocks, 4732 - &rdev->flags); 4733 - s->blocked_rdev = rdev; 4734 - atomic_inc(&rdev->nr_pending); 4729 + set_bit(BlockedBadBlocks, &rdev->flags); 4730 + if (rdev_blocked(rdev)) { 4731 + s->blocked_rdev = rdev; 4732 + atomic_inc(&rdev->nr_pending); 4733 + } 4735 4734 } 4736 4735 } 4737 4736 clear_bit(R5_Insync, &dev->flags); ··· 7176 7177 err = mddev_suspend_and_lock(mddev); 7177 7178 if (err) 7178 7179 return err; 7180 + raid5_quiesce(mddev, true); 7181 + 7179 7182 conf = mddev->private; 7180 7183 if (!conf) 7181 7184 err = -ENODEV; ··· 7199 7198 kfree(old_groups); 7200 7199 } 7201 7200 } 7201 + 7202 + raid5_quiesce(mddev, false); 7202 7203 mddev_unlock_and_resume(mddev); 7203 7204 7204 7205 return err ?: len;
+1 -1
drivers/md/raid5.h
··· 633 633 * two caches. 634 634 */ 635 635 int active_name; 636 - char cache_name[2][32]; 636 + char cache_name[2][48]; 637 637 struct kmem_cache *slab_cache; /* for allocating stripes */ 638 638 struct mutex cache_size_mutex; /* Protect changes to cache size */ 639 639
+54 -1
drivers/mmc/core/block.c
··· 2501 2501 !(card->csd.cmdclass & CCC_BLOCK_WRITE); 2502 2502 } 2503 2503 2504 + /* 2505 + * Search for a declared partitions node for the disk in the mmc-card related node. 2506 + * 2507 + * This is to permit support for a partition table defined in DT, in the special 2508 + * case where a partition table is not written on the disk and is expected to be 2509 + * passed from the running system. 2510 + * 2511 + * For the user disk, the "partitions" node is searched. 2512 + * For a special HW disk, a "partitions-" node with the appended name is used, 2513 + * following this conversion table (to adhere to JEDEC naming): 2514 + * - boot0 -> partitions-boot1 2515 + * - boot1 -> partitions-boot2 2516 + * - gp0 -> partitions-gp1 2517 + * - gp1 -> partitions-gp2 2518 + * - gp2 -> partitions-gp3 2519 + * - gp3 -> partitions-gp4 2520 + */ 2521 + static struct fwnode_handle *mmc_blk_get_partitions_node(struct device *mmc_dev, 2522 + const char *subname) 2523 + { 2524 + const char *node_name = "partitions"; 2525 + 2526 + if (subname) { 2527 + mmc_dev = mmc_dev->parent; 2528 + 2529 + /* 2530 + * Check if we are allocating the boot0/1 BOOT disk. 2531 + * In DT we use the JEDEC naming boot1/2. 2532 + */ 2533 + if (!strcmp(subname, "boot0")) 2534 + node_name = "partitions-boot1"; 2535 + if (!strcmp(subname, "boot1")) 2536 + node_name = "partitions-boot2"; 2537 + /* 2538 + * Check if we are allocating a gp0/1/2/3 GP disk. 2539 + * In DT we use the JEDEC naming gp1/2/3/4. 
2540 + */ 2541 + if (!strcmp(subname, "gp0")) 2542 + node_name = "partitions-gp1"; 2543 + if (!strcmp(subname, "gp1")) 2544 + node_name = "partitions-gp2"; 2545 + if (!strcmp(subname, "gp2")) 2546 + node_name = "partitions-gp3"; 2547 + if (!strcmp(subname, "gp3")) 2548 + node_name = "partitions-gp4"; 2549 + } 2550 + 2551 + return device_get_named_child_node(mmc_dev, node_name); 2552 + } 2553 + 2504 2554 static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, 2505 2555 struct device *parent, 2506 2556 sector_t size, ··· 2559 2509 int area_type, 2560 2510 unsigned int part_type) 2561 2511 { 2512 + struct fwnode_handle *disk_fwnode; 2562 2513 struct mmc_blk_data *md; 2563 2514 int devidx, ret; 2564 2515 char cap_str[10]; ··· 2661 2610 /* used in ->open, must be set before add_disk: */ 2662 2611 if (area_type == MMC_BLK_DATA_AREA_MAIN) 2663 2612 dev_set_drvdata(&card->dev, md); 2664 - ret = device_add_disk(md->parent, md->disk, mmc_disk_attr_groups); 2613 + disk_fwnode = mmc_blk_get_partitions_node(parent, subname); 2614 + ret = add_disk_fwnode(md->parent, md->disk, mmc_disk_attr_groups, 2615 + disk_fwnode); 2665 2616 if (ret) 2666 2617 goto err_put_disk; 2667 2618 return md;
drivers/nvme/host/apple.c (+1 -1)
···
 
 	found = apple_nvme_poll_cq(q, &iob);
 
-	if (!rq_list_empty(iob.req_list))
+	if (!rq_list_empty(&iob.req_list))
 		apple_nvme_complete_batch(&iob);
 
 	return found;
+26 -12
drivers/nvme/host/core.c
··· 42 42 bool is_readonly; 43 43 bool is_ready; 44 44 bool is_removed; 45 + bool is_rotational; 46 + bool no_vwc; 45 47 }; 46 48 47 49 unsigned int admin_timeout = 60; ··· 1641 1639 info->is_shared = id->nmic & NVME_NS_NMIC_SHARED; 1642 1640 info->is_readonly = id->nsattr & NVME_NS_ATTR_RO; 1643 1641 info->is_ready = id->nstat & NVME_NSTAT_NRDY; 1642 + info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL; 1643 + info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT; 1644 1644 } 1645 1645 kfree(id); 1646 1646 return ret; ··· 2189 2185 ns->head->ids.csi == NVME_CSI_ZNS) 2190 2186 nvme_update_zone_info(ns, &lim, &zi); 2191 2187 2192 - if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) 2188 + if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc) 2193 2189 lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; 2194 2190 else 2195 2191 lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); 2192 + 2193 + if (info->is_rotational) 2194 + lim.features |= BLK_FEAT_ROTATIONAL; 2196 2195 2197 2196 /* 2198 2197 * Register a metadata profile for PI, or the plain non-integrity NVMe ··· 3643 3636 head->ns_id = info->nsid; 3644 3637 head->ids = info->ids; 3645 3638 head->shared = info->is_shared; 3639 + head->rotational = info->is_rotational; 3646 3640 ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1); 3647 3641 ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE); 3648 3642 kref_init(&head->ref); ··· 4025 4017 { 4026 4018 struct nvme_ns_info info = { .nsid = nsid }; 4027 4019 struct nvme_ns *ns; 4028 - int ret; 4020 + int ret = 1; 4029 4021 4030 4022 if (nvme_identify_ns_descs(ctrl, &info)) 4031 4023 return; ··· 4042 4034 * set up a namespace. If not fall back to the legacy version. 
4043 4035 */ 4044 4036 if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) || 4045 - (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) 4037 + (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) || 4038 + ctrl->vs >= NVME_VS(2, 0, 0)) 4046 4039 ret = nvme_ns_info_from_id_cs_indep(ctrl, &info); 4047 - else 4040 + if (ret > 0) 4048 4041 ret = nvme_ns_info_from_identify(ctrl, &info); 4049 4042 4050 4043 if (info.is_removed) ··· 4904 4895 srcu_idx = srcu_read_lock(&ctrl->srcu); 4905 4896 list_for_each_entry_srcu(ns, &ctrl->namespaces, list, 4906 4897 srcu_read_lock_held(&ctrl->srcu)) 4907 - blk_mq_unfreeze_queue(ns->queue); 4898 + blk_mq_unfreeze_queue_non_owner(ns->queue); 4908 4899 srcu_read_unlock(&ctrl->srcu, srcu_idx); 4909 4900 clear_bit(NVME_CTRL_FROZEN, &ctrl->flags); 4910 4901 } ··· 4949 4940 srcu_idx = srcu_read_lock(&ctrl->srcu); 4950 4941 list_for_each_entry_srcu(ns, &ctrl->namespaces, list, 4951 4942 srcu_read_lock_held(&ctrl->srcu)) 4952 - blk_freeze_queue_start(ns->queue); 4943 + /* 4944 + * Typical non_owner use case is from pci driver, in which 4945 + * start_freeze is called from timeout work function, but 4946 + * unfreeze is done in reset work context 4947 + */ 4948 + blk_freeze_queue_start_non_owner(ns->queue); 4953 4949 srcu_read_unlock(&ctrl->srcu, srcu_idx); 4954 4950 } 4955 4951 EXPORT_SYMBOL_GPL(nvme_start_freeze); ··· 5050 5036 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE); 5051 5037 BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); 5052 5038 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); 5039 + BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512); 5040 + BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512); 5053 5041 BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); 5054 5042 BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64); 5055 5043 BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512); ··· 5060 5044 5061 5045 static int __init nvme_core_init(void) 5062 5046 { 5047 + 
unsigned int wq_flags = WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS; 5063 5048 int result = -ENOMEM; 5064 5049 5065 5050 _nvme_check_size(); 5066 5051 5067 - nvme_wq = alloc_workqueue("nvme-wq", 5068 - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 5052 + nvme_wq = alloc_workqueue("nvme-wq", wq_flags, 0); 5069 5053 if (!nvme_wq) 5070 5054 goto out; 5071 5055 5072 - nvme_reset_wq = alloc_workqueue("nvme-reset-wq", 5073 - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 5056 + nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, 0); 5074 5057 if (!nvme_reset_wq) 5075 5058 goto destroy_wq; 5076 5059 5077 - nvme_delete_wq = alloc_workqueue("nvme-delete-wq", 5078 - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 5060 + nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, 0); 5079 5061 if (!nvme_delete_wq) 5080 5062 goto destroy_reset_wq; 5081 5063
+9 -12
drivers/nvme/host/ioctl.c
··· 114 114 115 115 static int nvme_map_user_request(struct request *req, u64 ubuffer, 116 116 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 117 - u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags) 117 + struct io_uring_cmd *ioucmd, unsigned int flags) 118 118 { 119 119 struct request_queue *q = req->q; 120 120 struct nvme_ns *ns = q->queuedata; ··· 152 152 bio_set_dev(bio, bdev); 153 153 154 154 if (has_metadata) { 155 - ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len, 156 - meta_seed); 155 + ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len); 157 156 if (ret) 158 157 goto out_unmap; 159 158 } ··· 169 170 170 171 static int nvme_submit_user_cmd(struct request_queue *q, 171 172 struct nvme_command *cmd, u64 ubuffer, unsigned bufflen, 172 - void __user *meta_buffer, unsigned meta_len, u32 meta_seed, 173 + void __user *meta_buffer, unsigned meta_len, 173 174 u64 *result, unsigned timeout, unsigned int flags) 174 175 { 175 176 struct nvme_ns *ns = q->queuedata; ··· 186 187 req->timeout = timeout; 187 188 if (ubuffer && bufflen) { 188 189 ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, 189 - meta_len, meta_seed, NULL, flags); 190 + meta_len, NULL, flags); 190 191 if (ret) 191 192 return ret; 192 193 } ··· 267 268 c.rw.lbatm = cpu_to_le16(io.appmask); 268 269 269 270 return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, 270 - meta_len, lower_32_bits(io.slba), NULL, 0, 0); 271 + meta_len, NULL, 0, 0); 271 272 } 272 273 273 274 static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, ··· 322 323 323 324 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 324 325 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 325 - cmd.metadata_len, 0, &result, timeout, 0); 326 + cmd.metadata_len, &result, timeout, 0); 326 327 327 328 if (status >= 0) { 328 329 if (put_user(result, &ucmd->result)) ··· 369 370 370 371 status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, 371 372 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 372 - cmd.metadata_len, 0, &cmd.result, timeout, flags); 373 + cmd.metadata_len, &cmd.result, timeout, flags); 373 374 374 375 if (status >= 0) { 375 376 if (put_user(cmd.result, &ucmd->result)) ··· 401 402 static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( 402 403 struct io_uring_cmd *ioucmd) 403 404 { 404 - return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; 405 + return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu); 405 406 } 406 407 407 408 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, ··· 506 507 if (d.addr && d.data_len) { 507 508 ret = nvme_map_user_request(req, d.addr, 508 509 d.data_len, nvme_to_user_ptr(d.metadata), 509 - d.metadata_len, 0, ioucmd, vec); 510 + d.metadata_len, ioucmd, vec); 510 511 if (ret) 511 512 return ret; 512 513 } ··· 633 634 { 634 635 struct nvme_ctrl *ctrl = ns->ctrl; 635 636 int ret; 636 - 637 - BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); 638 637 639 638 ret = nvme_uring_cmd_checks(issue_flags); 640 639 if (ret)
drivers/nvme/host/multipath.c (-2)
···
 	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL;
 	if (head->ids.csi == NVME_CSI_ZNS)
 		lim.features |= BLK_FEAT_ZONED;
-	else
-		lim.max_zone_append_sectors = 0;
 
 	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
 	if (IS_ERR(head->disk))
drivers/nvme/host/nvme.h (+1)
···
 	struct list_head entry;
 	struct kref ref;
 	bool shared;
+	bool rotational;
 	bool passthru_err_log_enabled;
 	struct nvme_effects_log *effects;
 	u64 nuse;
+82 -38
drivers/nvme/host/pci.c
··· 141 141 struct nvme_ctrl ctrl; 142 142 u32 last_ps; 143 143 bool hmb; 144 + struct sg_table *hmb_sgt; 144 145 145 146 mempool_t *iod_mempool; 146 147 ··· 154 153 /* host memory buffer support: */ 155 154 u64 host_mem_size; 156 155 u32 nr_host_mem_descs; 156 + u32 host_mem_descs_size; 157 157 dma_addr_t host_mem_descs_dma; 158 158 struct nvme_host_mem_buf_desc *host_mem_descs; 159 159 void **host_mem_desc_bufs; ··· 904 902 return BLK_STS_OK; 905 903 } 906 904 907 - static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist) 905 + static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist) 908 906 { 907 + struct request *req; 908 + 909 909 spin_lock(&nvmeq->sq_lock); 910 - while (!rq_list_empty(*rqlist)) { 911 - struct request *req = rq_list_pop(rqlist); 910 + while ((req = rq_list_pop(rqlist))) { 912 911 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 913 912 914 913 nvme_sq_copy_cmd(nvmeq, &iod->cmd); ··· 932 929 return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK; 933 930 } 934 931 935 - static void nvme_queue_rqs(struct request **rqlist) 932 + static void nvme_queue_rqs(struct rq_list *rqlist) 936 933 { 937 - struct request *req, *next, *prev = NULL; 938 - struct request *requeue_list = NULL; 934 + struct rq_list submit_list = { }; 935 + struct rq_list requeue_list = { }; 936 + struct nvme_queue *nvmeq = NULL; 937 + struct request *req; 939 938 940 - rq_list_for_each_safe(rqlist, req, next) { 941 - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; 939 + while ((req = rq_list_pop(rqlist))) { 940 + if (nvmeq && nvmeq != req->mq_hctx->driver_data) 941 + nvme_submit_cmds(nvmeq, &submit_list); 942 + nvmeq = req->mq_hctx->driver_data; 942 943 943 - if (!nvme_prep_rq_batch(nvmeq, req)) { 944 - /* detach 'req' and add to remainder list */ 945 - rq_list_move(rqlist, &requeue_list, req, prev); 946 - 947 - req = prev; 948 - if (!req) 949 - continue; 950 - } 951 - 952 - if (!next || req->mq_hctx != next->mq_hctx) { 953 - /* detach 
rest of list, and submit */ 954 - req->rq_next = NULL; 955 - nvme_submit_cmds(nvmeq, rqlist); 956 - *rqlist = next; 957 - prev = NULL; 958 - } else 959 - prev = req; 944 + if (nvme_prep_rq_batch(nvmeq, req)) 945 + rq_list_add_tail(&submit_list, req); 946 + else 947 + rq_list_add_tail(&requeue_list, req); 960 948 } 961 949 950 + if (nvmeq) 951 + nvme_submit_cmds(nvmeq, &submit_list); 962 952 *rqlist = requeue_list; 963 953 } 964 954 ··· 1079 1083 DEFINE_IO_COMP_BATCH(iob); 1080 1084 1081 1085 if (nvme_poll_cq(nvmeq, &iob)) { 1082 - if (!rq_list_empty(iob.req_list)) 1086 + if (!rq_list_empty(&iob.req_list)) 1083 1087 nvme_pci_complete_batch(&iob); 1084 1088 return IRQ_HANDLED; 1085 1089 } ··· 1947 1951 return ret; 1948 1952 } 1949 1953 1950 - static void nvme_free_host_mem(struct nvme_dev *dev) 1954 + static void nvme_free_host_mem_multi(struct nvme_dev *dev) 1951 1955 { 1952 1956 int i; 1953 1957 ··· 1962 1966 1963 1967 kfree(dev->host_mem_desc_bufs); 1964 1968 dev->host_mem_desc_bufs = NULL; 1965 - dma_free_coherent(dev->dev, 1966 - dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs), 1969 + } 1970 + 1971 + static void nvme_free_host_mem(struct nvme_dev *dev) 1972 + { 1973 + if (dev->hmb_sgt) 1974 + dma_free_noncontiguous(dev->dev, dev->host_mem_size, 1975 + dev->hmb_sgt, DMA_BIDIRECTIONAL); 1976 + else 1977 + nvme_free_host_mem_multi(dev); 1978 + 1979 + dma_free_coherent(dev->dev, dev->host_mem_descs_size, 1967 1980 dev->host_mem_descs, dev->host_mem_descs_dma); 1968 1981 dev->host_mem_descs = NULL; 1982 + dev->host_mem_descs_size = 0; 1969 1983 dev->nr_host_mem_descs = 0; 1970 1984 } 1971 1985 1972 - static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, 1986 + static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size) 1987 + { 1988 + dev->hmb_sgt = dma_alloc_noncontiguous(dev->dev, size, 1989 + DMA_BIDIRECTIONAL, GFP_KERNEL, 0); 1990 + if (!dev->hmb_sgt) 1991 + return -ENOMEM; 1992 + 1993 + dev->host_mem_descs = 
dma_alloc_coherent(dev->dev, 1994 + sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma, 1995 + GFP_KERNEL); 1996 + if (!dev->host_mem_descs) { 1997 + dma_free_noncontiguous(dev->dev, dev->host_mem_size, 1998 + dev->hmb_sgt, DMA_BIDIRECTIONAL); 1999 + dev->hmb_sgt = NULL; 2000 + return -ENOMEM; 2001 + } 2002 + dev->host_mem_size = size; 2003 + dev->host_mem_descs_size = sizeof(*dev->host_mem_descs); 2004 + dev->nr_host_mem_descs = 1; 2005 + 2006 + dev->host_mem_descs[0].addr = 2007 + cpu_to_le64(dev->hmb_sgt->sgl->dma_address); 2008 + dev->host_mem_descs[0].size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE); 2009 + return 0; 2010 + } 2011 + 2012 + static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred, 1973 2013 u32 chunk_size) 1974 2014 { 1975 2015 struct nvme_host_mem_buf_desc *descs; 1976 - u32 max_entries, len; 2016 + u32 max_entries, len, descs_size; 1977 2017 dma_addr_t descs_dma; 1978 2018 int i = 0; 1979 2019 void **bufs; ··· 2022 1990 if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) 2023 1991 max_entries = dev->ctrl.hmmaxd; 2024 1992 2025 - descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs), 2026 - &descs_dma, GFP_KERNEL); 1993 + descs_size = max_entries * sizeof(*descs); 1994 + descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma, 1995 + GFP_KERNEL); 2027 1996 if (!descs) 2028 1997 goto out; 2029 1998 ··· 2053 2020 dev->host_mem_size = size; 2054 2021 dev->host_mem_descs = descs; 2055 2022 dev->host_mem_descs_dma = descs_dma; 2023 + dev->host_mem_descs_size = descs_size; 2056 2024 dev->host_mem_desc_bufs = bufs; 2057 2025 return 0; 2058 2026 ··· 2068 2034 2069 2035 kfree(bufs); 2070 2036 out_free_descs: 2071 - dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs, 2072 - descs_dma); 2037 + dma_free_coherent(dev->dev, descs_size, descs, descs_dma); 2073 2038 out: 2074 2039 dev->host_mem_descs = NULL; 2075 2040 return -ENOMEM; ··· 2080 2047 u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 
2); 2081 2048 u64 chunk_size; 2082 2049 2050 + /* 2051 + * If there is an IOMMU that can merge pages, try a virtually 2052 + * non-contiguous allocation for a single segment first. 2053 + */ 2054 + if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) { 2055 + if (!nvme_alloc_host_mem_single(dev, preferred)) 2056 + return 0; 2057 + } 2058 + 2083 2059 /* start big and work our way down */ 2084 2060 for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { 2085 - if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { 2061 + if (!nvme_alloc_host_mem_multi(dev, preferred, chunk_size)) { 2086 2062 if (!min || dev->host_mem_size >= min) 2087 2063 return 0; 2088 2064 nvme_free_host_mem(dev); ··· 2139 2097 } 2140 2098 2141 2099 dev_info(dev->ctrl.device, 2142 - "allocated %lld MiB host memory buffer.\n", 2143 - dev->host_mem_size >> ilog2(SZ_1M)); 2100 + "allocated %lld MiB host memory buffer (%u segment%s).\n", 2101 + dev->host_mem_size >> ilog2(SZ_1M), 2102 + dev->nr_host_mem_descs, 2103 + str_plural(dev->nr_host_mem_descs)); 2144 2104 } 2145 2105 2146 2106 ret = nvme_set_host_mem(dev, enable_bits);
+52 -6
drivers/nvme/host/trace.c
··· 228 228 229 229 static const char *nvme_trace_resv_reg(struct trace_seq *p, u8 *cdw10) 230 230 { 231 + static const char * const rrega_strs[] = { 232 + [0x00] = "register", 233 + [0x01] = "unregister", 234 + [0x02] = "replace", 235 + }; 231 236 const char *ret = trace_seq_buffer_ptr(p); 232 237 u8 rrega = cdw10[0] & 0x7; 233 238 u8 iekey = (cdw10[0] >> 3) & 0x1; 234 239 u8 ptpl = (cdw10[3] >> 6) & 0x3; 240 + const char *rrega_str; 235 241 236 - trace_seq_printf(p, "rrega=%u, iekey=%u, ptpl=%u", 237 - rrega, iekey, ptpl); 242 + if (rrega < ARRAY_SIZE(rrega_strs) && rrega_strs[rrega]) 243 + rrega_str = rrega_strs[rrega]; 244 + else 245 + rrega_str = "reserved"; 246 + 247 + trace_seq_printf(p, "rrega=%u:%s, iekey=%u, ptpl=%u", 248 + rrega, rrega_str, iekey, ptpl); 238 249 trace_seq_putc(p, 0); 239 250 240 251 return ret; 241 252 } 242 253 254 + static const char * const rtype_strs[] = { 255 + [0x00] = "reserved", 256 + [0x01] = "write exclusive", 257 + [0x02] = "exclusive access", 258 + [0x03] = "write exclusive registrants only", 259 + [0x04] = "exclusive access registrants only", 260 + [0x05] = "write exclusive all registrants", 261 + [0x06] = "exclusive access all registrants", 262 + }; 263 + 243 264 static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10) 244 265 { 266 + static const char * const racqa_strs[] = { 267 + [0x00] = "acquire", 268 + [0x01] = "preempt", 269 + [0x02] = "preempt and abort", 270 + }; 245 271 const char *ret = trace_seq_buffer_ptr(p); 246 272 u8 racqa = cdw10[0] & 0x7; 247 273 u8 iekey = (cdw10[0] >> 3) & 0x1; 248 274 u8 rtype = cdw10[1]; 275 + const char *racqa_str = "reserved"; 276 + const char *rtype_str = "reserved"; 249 277 250 - trace_seq_printf(p, "racqa=%u, iekey=%u, rtype=%u", 251 - racqa, iekey, rtype); 278 + if (racqa < ARRAY_SIZE(racqa_strs) && racqa_strs[racqa]) 279 + racqa_str = racqa_strs[racqa]; 280 + 281 + if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype]) 282 + rtype_str = rtype_strs[rtype]; 283 + 
284 + trace_seq_printf(p, "racqa=%u:%s, iekey=%u, rtype=%u:%s", 285 + racqa, racqa_str, iekey, rtype, rtype_str); 252 286 trace_seq_putc(p, 0); 253 287 254 288 return ret; ··· 290 256 291 257 static const char *nvme_trace_resv_rel(struct trace_seq *p, u8 *cdw10) 292 258 { 259 + static const char * const rrela_strs[] = { 260 + [0x00] = "release", 261 + [0x01] = "clear", 262 + }; 293 263 const char *ret = trace_seq_buffer_ptr(p); 294 264 u8 rrela = cdw10[0] & 0x7; 295 265 u8 iekey = (cdw10[0] >> 3) & 0x1; 296 266 u8 rtype = cdw10[1]; 267 + const char *rrela_str = "reserved"; 268 + const char *rtype_str = "reserved"; 297 269 298 - trace_seq_printf(p, "rrela=%u, iekey=%u, rtype=%u", 299 - rrela, iekey, rtype); 270 + if (rrela < ARRAY_SIZE(rrela_strs) && rrela_strs[rrela]) 271 + rrela_str = rrela_strs[rrela]; 272 + 273 + if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype]) 274 + rtype_str = rtype_strs[rtype]; 275 + 276 + trace_seq_printf(p, "rrela=%u:%s, iekey=%u, rtype=%u:%s", 277 + rrela, rrela_str, iekey, rtype, rtype_str); 300 278 trace_seq_putc(p, 0); 301 279 302 280 return ret;
drivers/nvme/host/zns.c (+1 -1)
···
 	lim->features |= BLK_FEAT_ZONED;
 	lim->max_open_zones = zi->max_open_zones;
 	lim->max_active_zones = zi->max_active_zones;
-	lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
+	lim->max_hw_zone_append_sectors = ns->ctrl->max_zone_append;
 	lim->chunk_sectors = ns->head->zsze =
 		nvme_lba_to_sect(ns->head, zi->zone_size);
 }
drivers/nvme/target/Makefile (+1 -1)
···
 obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
-			discovery.o io-cmd-file.o io-cmd-bdev.o
+			discovery.o io-cmd-file.o io-cmd-bdev.o pr.o
 nvmet-$(CONFIG_NVME_TARGET_DEBUGFS)	+= debugfs.o
 nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
 nvmet-$(CONFIG_BLK_DEV_ZONED)		+= zns.o
+283 -5
drivers/nvme/target/admin-cmd.c
··· 71 71 nvmet_req_complete(req, 0); 72 72 } 73 73 74 + static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req) 75 + { 76 + struct nvme_supported_log *logs; 77 + u16 status; 78 + 79 + logs = kzalloc(sizeof(*logs), GFP_KERNEL); 80 + if (!logs) { 81 + status = NVME_SC_INTERNAL; 82 + goto out; 83 + } 84 + 85 + logs->lids[NVME_LOG_SUPPORTED] = cpu_to_le32(NVME_LIDS_LSUPP); 86 + logs->lids[NVME_LOG_ERROR] = cpu_to_le32(NVME_LIDS_LSUPP); 87 + logs->lids[NVME_LOG_SMART] = cpu_to_le32(NVME_LIDS_LSUPP); 88 + logs->lids[NVME_LOG_FW_SLOT] = cpu_to_le32(NVME_LIDS_LSUPP); 89 + logs->lids[NVME_LOG_CHANGED_NS] = cpu_to_le32(NVME_LIDS_LSUPP); 90 + logs->lids[NVME_LOG_CMD_EFFECTS] = cpu_to_le32(NVME_LIDS_LSUPP); 91 + logs->lids[NVME_LOG_ENDURANCE_GROUP] = cpu_to_le32(NVME_LIDS_LSUPP); 92 + logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP); 93 + logs->lids[NVME_LOG_FEATURES] = cpu_to_le32(NVME_LIDS_LSUPP); 94 + logs->lids[NVME_LOG_RMI] = cpu_to_le32(NVME_LIDS_LSUPP); 95 + logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP); 96 + 97 + status = nvmet_copy_to_sgl(req, 0, logs, sizeof(*logs)); 98 + kfree(logs); 99 + out: 100 + nvmet_req_complete(req, status); 101 + } 102 + 74 103 static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, 75 104 struct nvme_smart_log *slog) 76 105 { ··· 159 130 return NVME_SC_SUCCESS; 160 131 } 161 132 133 + static void nvmet_execute_get_log_page_rmi(struct nvmet_req *req) 134 + { 135 + struct nvme_rotational_media_log *log; 136 + struct gendisk *disk; 137 + u16 status; 138 + 139 + req->cmd->common.nsid = cpu_to_le32(le16_to_cpu( 140 + req->cmd->get_log_page.lsi)); 141 + status = nvmet_req_find_ns(req); 142 + if (status) 143 + goto out; 144 + 145 + if (!req->ns->bdev || bdev_nonrot(req->ns->bdev)) { 146 + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 147 + goto out; 148 + } 149 + 150 + if (req->transfer_len != sizeof(*log)) { 151 + status = NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR; 152 + goto out; 153 + } 154 + 
155 + log = kzalloc(sizeof(*log), GFP_KERNEL); 156 + if (!log) 157 + goto out; 158 + 159 + log->endgid = req->cmd->get_log_page.lsi; 160 + disk = req->ns->bdev->bd_disk; 161 + if (disk && disk->ia_ranges) 162 + log->numa = cpu_to_le16(disk->ia_ranges->nr_ia_ranges); 163 + else 164 + log->numa = cpu_to_le16(1); 165 + 166 + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); 167 + kfree(log); 168 + out: 169 + nvmet_req_complete(req, status); 170 + } 171 + 162 172 static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) 163 173 { 164 174 struct nvme_smart_log *log; ··· 244 176 log->iocs[nvme_cmd_read] = 245 177 log->iocs[nvme_cmd_flush] = 246 178 log->iocs[nvme_cmd_dsm] = 179 + log->iocs[nvme_cmd_resv_acquire] = 180 + log->iocs[nvme_cmd_resv_register] = 181 + log->iocs[nvme_cmd_resv_release] = 182 + log->iocs[nvme_cmd_resv_report] = 247 183 cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); 248 184 log->iocs[nvme_cmd_write] = 249 185 log->iocs[nvme_cmd_write_zeroes] = ··· 344 272 return struct_size(desc, nsids, count); 345 273 } 346 274 275 + static void nvmet_execute_get_log_page_endgrp(struct nvmet_req *req) 276 + { 277 + u64 host_reads, host_writes, data_units_read, data_units_written; 278 + struct nvme_endurance_group_log *log; 279 + u16 status; 280 + 281 + /* 282 + * The target driver emulates each endurance group as its own 283 + * namespace, reusing the nsid as the endurance group identifier. 
284 + */ 285 + req->cmd->common.nsid = cpu_to_le32(le16_to_cpu( 286 + req->cmd->get_log_page.lsi)); 287 + status = nvmet_req_find_ns(req); 288 + if (status) 289 + goto out; 290 + 291 + log = kzalloc(sizeof(*log), GFP_KERNEL); 292 + if (!log) { 293 + status = NVME_SC_INTERNAL; 294 + goto out; 295 + } 296 + 297 + if (!req->ns->bdev) 298 + goto copy; 299 + 300 + host_reads = part_stat_read(req->ns->bdev, ios[READ]); 301 + data_units_read = 302 + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[READ]), 1000); 303 + host_writes = part_stat_read(req->ns->bdev, ios[WRITE]); 304 + data_units_written = 305 + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[WRITE]), 1000); 306 + 307 + put_unaligned_le64(host_reads, &log->hrc[0]); 308 + put_unaligned_le64(data_units_read, &log->dur[0]); 309 + put_unaligned_le64(host_writes, &log->hwc[0]); 310 + put_unaligned_le64(data_units_written, &log->duw[0]); 311 + copy: 312 + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); 313 + kfree(log); 314 + out: 315 + nvmet_req_complete(req, status); 316 + } 317 + 347 318 static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) 348 319 { 349 320 struct nvme_ana_rsp_hdr hdr = { 0, }; ··· 432 317 nvmet_req_complete(req, status); 433 318 } 434 319 320 + static void nvmet_execute_get_log_page_features(struct nvmet_req *req) 321 + { 322 + struct nvme_supported_features_log *features; 323 + u16 status; 324 + 325 + features = kzalloc(sizeof(*features), GFP_KERNEL); 326 + if (!features) { 327 + status = NVME_SC_INTERNAL; 328 + goto out; 329 + } 330 + 331 + features->fis[NVME_FEAT_NUM_QUEUES] = 332 + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE); 333 + features->fis[NVME_FEAT_KATO] = 334 + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE); 335 + features->fis[NVME_FEAT_ASYNC_EVENT] = 336 + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE); 337 + features->fis[NVME_FEAT_HOST_ID] = 338 + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE); 339 + features->fis[NVME_FEAT_WRITE_PROTECT] = 340 + 
cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_NSCPE); 341 + features->fis[NVME_FEAT_RESV_MASK] = 342 + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_NSCPE); 343 + 344 + status = nvmet_copy_to_sgl(req, 0, features, sizeof(*features)); 345 + kfree(features); 346 + out: 347 + nvmet_req_complete(req, status); 348 + } 349 + 435 350 static void nvmet_execute_get_log_page(struct nvmet_req *req) 436 351 { 437 352 if (!nvmet_check_transfer_len(req, nvmet_get_log_page_len(req->cmd))) 438 353 return; 439 354 440 355 switch (req->cmd->get_log_page.lid) { 356 + case NVME_LOG_SUPPORTED: 357 + return nvmet_execute_get_supported_log_pages(req); 441 358 case NVME_LOG_ERROR: 442 359 return nvmet_execute_get_log_page_error(req); 443 360 case NVME_LOG_SMART: ··· 485 338 return nvmet_execute_get_log_changed_ns(req); 486 339 case NVME_LOG_CMD_EFFECTS: 487 340 return nvmet_execute_get_log_cmd_effects_ns(req); 341 + case NVME_LOG_ENDURANCE_GROUP: 342 + return nvmet_execute_get_log_page_endgrp(req); 488 343 case NVME_LOG_ANA: 489 344 return nvmet_execute_get_log_page_ana(req); 345 + case NVME_LOG_FEATURES: 346 + return nvmet_execute_get_log_page_features(req); 347 + case NVME_LOG_RMI: 348 + return nvmet_execute_get_log_page_rmi(req); 349 + case NVME_LOG_RESERVATION: 350 + return nvmet_execute_get_log_page_resv(req); 490 351 } 491 352 pr_debug("unhandled lid %d on qid %d\n", 492 353 req->cmd->get_log_page.lid, req->sq->qid); ··· 588 433 id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES); 589 434 id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); 590 435 id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | 591 - NVME_CTRL_ONCS_WRITE_ZEROES); 436 + NVME_CTRL_ONCS_WRITE_ZEROES | 437 + NVME_CTRL_ONCS_RESERVATIONS); 592 438 593 439 /* XXX: don't report vwc if the underlying device is write through */ 594 440 id->vwc = NVME_CTRL_VWC_PRESENT; ··· 622 466 id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); 623 467 624 468 id->msdbd = ctrl->ops->msdbd; 469 + 470 + /* 471 + * Endurance group identifier is 16 bits, so we can't 
+	 …let namespaces
+	 * overflow that since we reuse the nsid
+	 */
+	BUILD_BUG_ON(NVMET_MAX_NAMESPACES > USHRT_MAX);
+	id->endgidmax = cpu_to_le16(NVMET_MAX_NAMESPACES);
 
 	id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4);
 	id->anatt = 10; /* random value */
···
 	id->nmic = NVME_NS_NMIC_SHARED;
 	id->anagrpid = cpu_to_le32(req->ns->anagrpid);
 
+	if (req->ns->pr.enable)
+		id->rescap = NVME_PR_SUPPORT_WRITE_EXCLUSIVE |
+			NVME_PR_SUPPORT_EXCLUSIVE_ACCESS |
+			NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY |
+			NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY |
+			NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS |
+			NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS |
+			NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF;
+
+	/*
+	 * Since we don't know any better, every namespace is its own endurance
+	 * group.
+	 */
+	id->endgid = cpu_to_le16(req->ns->nsid);
+
 	memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid));
 
 	id->lbaf[0].ds = req->ns->blksize_shift;
···
 	nvmet_req_complete(req, status);
 }
 
-static void nvmet_execute_identify_nslist(struct nvmet_req *req)
+static void nvmet_execute_identify_endgrp_list(struct nvmet_req *req)
+{
+	u16 min_endgid = le16_to_cpu(req->cmd->identify.cnssid);
+	static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_ns *ns;
+	unsigned long idx;
+	__le16 *list;
+	u16 status;
+	int i = 1;
+
+	list = kzalloc(buf_size, GFP_KERNEL);
+	if (!list) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+		if (ns->nsid <= min_endgid)
+			continue;
+
+		list[i++] = cpu_to_le16(ns->nsid);
+		if (i == buf_size / sizeof(__le16))
+			break;
+	}
+
+	list[0] = cpu_to_le16(i - 1);
+	status = nvmet_copy_to_sgl(req, 0, list, buf_size);
+	kfree(list);
+out:
+	nvmet_req_complete(req, status);
+}
+
+static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css)
 {
 	static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
···
 	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
 		if (ns->nsid <= min_nsid)
+			continue;
+		if (match_css && req->ns->csi != req->cmd->identify.csi)
 			continue;
 		list[i++] = cpu_to_le32(ns->nsid);
 		if (i == buf_size / sizeof(__le32))
···
 		nvmet_zero_sgl(req, 0, sizeof(struct nvme_id_ctrl_nvm)));
 }
 
+static void nvme_execute_identify_ns_nvm(struct nvmet_req *req)
+{
+	u16 status;
+
+	status = nvmet_req_find_ns(req);
+	if (status)
+		goto out;
+
+	status = nvmet_copy_to_sgl(req, 0, ZERO_PAGE(0),
+				   NVME_IDENTIFY_DATA_SIZE);
+out:
+	nvmet_req_complete(req, status);
+}
+
+static void nvmet_execute_id_cs_indep(struct nvmet_req *req)
+{
+	struct nvme_id_ns_cs_indep *id;
+	u16 status;
+
+	status = nvmet_req_find_ns(req);
+	if (status)
+		goto out;
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	id->nstat = NVME_NSTAT_NRDY;
+	id->anagrpid = cpu_to_le32(req->ns->anagrpid);
+	id->nmic = NVME_NS_NMIC_SHARED;
+	if (req->ns->readonly)
+		id->nsattr |= NVME_NS_ATTR_RO;
+	if (req->ns->bdev && !bdev_nonrot(req->ns->bdev))
+		id->nsfeat |= NVME_NS_ROTATIONAL;
+	/*
+	 * We need flush command to flush the file's metadata,
+	 * so report supporting vwc if backend is file, even
+	 * though buffered_io is disable.
+	 */
+	if (req->ns->bdev && !bdev_write_cache(req->ns->bdev))
+		id->nsfeat |= NVME_NS_VWC_NOT_PRESENT;
+
+	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
+	kfree(id);
+out:
+	nvmet_req_complete(req, status);
+}
+
 static void nvmet_execute_identify(struct nvmet_req *req)
 {
 	if (!nvmet_check_transfer_len(req, NVME_IDENTIFY_DATA_SIZE))
···
 		nvmet_execute_identify_ctrl(req);
 		return;
 	case NVME_ID_CNS_NS_ACTIVE_LIST:
-		nvmet_execute_identify_nslist(req);
+		nvmet_execute_identify_nslist(req, false);
 		return;
 	case NVME_ID_CNS_NS_DESC_LIST:
 		nvmet_execute_identify_desclist(req);
···
 	case NVME_ID_CNS_CS_NS:
 		switch (req->cmd->identify.csi) {
 		case NVME_CSI_NVM:
-			/* Not supported */
-			break;
+			nvme_execute_identify_ns_nvm(req);
+			return;
 		case NVME_CSI_ZNS:
 			if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
 				nvmet_execute_identify_ns_zns(req);
···
 			break;
 		}
 		break;
+	case NVME_ID_CNS_NS_ACTIVE_LIST_CS:
+		nvmet_execute_identify_nslist(req, true);
+		return;
+	case NVME_ID_CNS_NS_CS_INDEP:
+		nvmet_execute_id_cs_indep(req);
+		return;
+	case NVME_ID_CNS_ENDGRP_LIST:
+		nvmet_execute_identify_endgrp_list(req);
+		return;
 	}
 
 	pr_debug("unhandled identify cns %d on qid %d\n",
···
 	case NVME_FEAT_WRITE_PROTECT:
 		status = nvmet_set_feat_write_protect(req);
 		break;
+	case NVME_FEAT_RESV_MASK:
+		status = nvmet_set_feat_resv_notif_mask(req, cdw11);
+		break;
 	default:
 		req->error_loc = offsetof(struct nvme_common_command, cdw10);
 		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
···
 		break;
 	case NVME_FEAT_WRITE_PROTECT:
 		status = nvmet_get_feat_write_protect(req);
+		break;
+	case NVME_FEAT_RESV_MASK:
+		status = nvmet_get_feat_resv_notif_mask(req);
 		break;
 	default:
 		req->error_loc =
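The buffer that nvmet_execute_identify_endgrp_list builds is a plain Identify data structure: a little-endian 16-bit entry count in slot 0, followed by the endurance-group IDs greater than the requested minimum, capped at the 4096-byte Identify buffer. A small userspace model of that layout (illustrative Python, not kernel code; `build_endgrp_list` is a made-up helper name):

```python
import struct

NVME_IDENTIFY_DATA_SIZE = 4096  # one Identify data buffer

def build_endgrp_list(endgids, min_endgid):
    """Model the buffer layout: __le16 count in slot 0, then the IDs
    strictly greater than min_endgid, ascending, capped to the buffer."""
    max_slots = NVME_IDENTIFY_DATA_SIZE // 2     # __le16 slots, slot 0 is the count
    ids = [g for g in sorted(endgids) if g > min_endgid][:max_slots - 1]
    buf = struct.pack("<H", len(ids))
    buf += b"".join(struct.pack("<H", g) for g in ids)
    return buf.ljust(NVME_IDENTIFY_DATA_SIZE, b"\x00")

buf = build_endgrp_list([1, 2, 5], min_endgid=1)
count = struct.unpack_from("<H", buf, 0)[0]      # number of returned entries
```

Since every namespace is its own endurance group here, the IDs are simply the nsids above the CNS-specific starting point.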
drivers/nvme/target/configfs.c  (+27)
···
 CONFIGFS_ATTR_WO(nvmet_ns_, revalidate_size);
 
+static ssize_t nvmet_ns_resv_enable_show(struct config_item *item, char *page)
+{
+	return sysfs_emit(page, "%d\n", to_nvmet_ns(item)->pr.enable);
+}
+
+static ssize_t nvmet_ns_resv_enable_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_ns *ns = to_nvmet_ns(item);
+	bool val;
+
+	if (kstrtobool(page, &val))
+		return -EINVAL;
+
+	mutex_lock(&ns->subsys->lock);
+	if (ns->enabled) {
+		pr_err("the ns:%d is already enabled.\n", ns->nsid);
+		mutex_unlock(&ns->subsys->lock);
+		return -EINVAL;
+	}
+	ns->pr.enable = val;
+	mutex_unlock(&ns->subsys->lock);
+	return count;
+}
+CONFIGFS_ATTR(nvmet_ns_, resv_enable);
+
 static struct configfs_attribute *nvmet_ns_attrs[] = {
 	&nvmet_ns_attr_device_path,
 	&nvmet_ns_attr_device_nguid,
···
 	&nvmet_ns_attr_enable,
 	&nvmet_ns_attr_buffered_io,
 	&nvmet_ns_attr_revalidate_size,
+	&nvmet_ns_attr_resv_enable,
 #ifdef CONFIG_PCI_P2PDMA
 	&nvmet_ns_attr_p2pmem,
 #endif
drivers/nvme/target/core.c  (+57 -7)
···
 	if (ret)
 		goto out_restore_subsys_maxnsid;
 
+	if (ns->pr.enable) {
+		ret = nvmet_pr_init_ns(ns);
+		if (ret)
+			goto out_remove_from_subsys;
+	}
+
 	subsys->nr_namespaces++;
 
 	nvmet_ns_changed(subsys, ns->nsid);
···
 	mutex_unlock(&subsys->lock);
 	return ret;
 
+out_remove_from_subsys:
+	xa_erase(&subsys->namespaces, ns->nsid);
 out_restore_subsys_maxnsid:
 	subsys->max_nsid = nvmet_max_nsid(subsys);
 	percpu_ref_exit(&ns->ref);
···
 	synchronize_rcu();
 	wait_for_completion(&ns->disable_done);
 	percpu_ref_exit(&ns->ref);
+
+	if (ns->pr.enable)
+		nvmet_pr_exit_ns(ns);
 
 	mutex_lock(&subsys->lock);
 
···
 static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 {
 	struct nvmet_ns *ns = req->ns;
+	struct nvmet_pr_per_ctrl_ref *pc_ref = req->pc_ref;
 
 	if (!req->sq->sqhd_disabled)
 		nvmet_update_sq_head(req);
···
 	trace_nvmet_req_complete(req);
 
 	req->ops->queue_response(req);
+
+	if (pc_ref)
+		nvmet_pr_put_ns_pc_ref(pc_ref);
 	if (ns)
 		nvmet_put_namespace(ns);
 }
···
 		return ret;
 	}
 
+	if (req->ns->pr.enable) {
+		ret = nvmet_parse_pr_cmd(req);
+		if (!ret)
+			return ret;
+	}
+
 	switch (req->ns->csi) {
 	case NVME_CSI_NVM:
 		if (req->ns->file)
-			return nvmet_file_parse_io_cmd(req);
-		return nvmet_bdev_parse_io_cmd(req);
+			ret = nvmet_file_parse_io_cmd(req);
+		else
+			ret = nvmet_bdev_parse_io_cmd(req);
+		break;
 	case NVME_CSI_ZNS:
 		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
-			return nvmet_bdev_zns_parse_io_cmd(req);
-		return NVME_SC_INVALID_IO_CMD_SET;
+			ret = nvmet_bdev_zns_parse_io_cmd(req);
+		else
+			ret = NVME_SC_INVALID_IO_CMD_SET;
+		break;
 	default:
-		return NVME_SC_INVALID_IO_CMD_SET;
+		ret = NVME_SC_INVALID_IO_CMD_SET;
 	}
+	if (ret)
+		return ret;
+
+	if (req->ns->pr.enable) {
+		ret = nvmet_pr_check_cmd_access(req);
+		if (ret)
+			return ret;
+
+		ret = nvmet_pr_get_ns_pc_ref(req);
+	}
+	return ret;
 }
 
 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
···
 	req->ns = NULL;
 	req->error_loc = NVMET_NO_ERROR_LOC;
 	req->error_slba = 0;
+	req->pc_ref = NULL;
 
 	/* no support for fused commands yet */
 	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
···
 void nvmet_req_uninit(struct nvmet_req *req)
 {
 	percpu_ref_put(&req->sq->ref);
+	if (req->pc_ref)
+		nvmet_pr_put_ns_pc_ref(req->pc_ref);
 	if (req->ns)
 		nvmet_put_namespace(req->ns);
 }
···
 }
 
 u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
-		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
+		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp,
+		uuid_t *hostid)
 {
 	struct nvmet_subsys *subsys;
 	struct nvmet_ctrl *ctrl;
···
 	}
 	ctrl->cntlid = ret;
 
+	uuid_copy(&ctrl->hostid, hostid);
+
 	/*
 	 * Discovery controllers may use some arbitrary high value
 	 * in order to cleanup stale discovery sessions
···
 	nvmet_start_keep_alive_timer(ctrl);
 
 	mutex_lock(&subsys->lock);
+	ret = nvmet_ctrl_init_pr(ctrl);
+	if (ret)
+		goto init_pr_fail;
 	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
 	nvmet_setup_p2p_ns_map(ctrl, req);
 	nvmet_debugfs_ctrl_setup(ctrl);
···
 	*ctrlp = ctrl;
 	return 0;
 
+init_pr_fail:
+	mutex_unlock(&subsys->lock);
+	nvmet_stop_keep_alive_timer(ctrl);
+	ida_free(&cntlid_ida, ctrl->cntlid);
 out_free_sqs:
 	kfree(ctrl->sqs);
 out_free_changed_ns_list:
···
 	struct nvmet_subsys *subsys = ctrl->subsys;
 
 	mutex_lock(&subsys->lock);
+	nvmet_ctrl_destroy_pr(ctrl);
 	nvmet_release_p2p_ns_map(ctrl);
 	list_del(&ctrl->subsys_entry);
 	mutex_unlock(&subsys->lock);
···
 		goto out_free_zbd_work_queue;
 
 	nvmet_wq = alloc_workqueue("nvmet-wq",
-			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+			WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0);
 	if (!nvmet_wq)
 		goto out_free_buffered_work_queue;
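The reworked nvmet_parse_io_cmd flow above has a fixed ordering: reservation commands are parsed first and short-circuit; all other commands are parsed by command set, and, on a PR-enabled namespace, are then gated by the reservation access check before a per-controller ref is taken. A Python sketch of that ordering (status codes and callbacks are stand-ins, not the kernel API; `parse_io_cmd` is a hypothetical name):

```python
NVME_SC_SUCCESS = 0  # zero means success, matching the kernel convention

def parse_io_cmd(req, parse_pr, parse_by_csi, check_access, get_pc_ref):
    """Mirror the control flow: PR commands short-circuit on success;
    everything else is parsed, then gated by the reservation access check."""
    if req["pr_enabled"]:
        ret = parse_pr(req)
        if ret == NVME_SC_SUCCESS:      # a PR command was recognised and parsed
            return ret
    ret = parse_by_csi(req)             # NVM / ZNS dispatch
    if ret != NVME_SC_SUCCESS:
        return ret
    if req["pr_enabled"]:
        ret = check_access(req)         # reservation conflict check
        if ret != NVME_SC_SUCCESS:
            return ret
        ret = get_pc_ref(req)           # pin the per-controller ref
    return ret
```

The point of the restructuring in the diff is exactly this: the old early `return`s in the csi switch would have skipped the trailing PR access check, so they become `ret = ...; break;`.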
drivers/nvme/target/fabrics-cmd.c  (+4 -3)
···
 	case NVME_REG_CSTS:
 		val = ctrl->csts;
 		break;
+	case NVME_REG_CRTO:
+		val = NVME_CAP_TIMEOUT(ctrl->csts);
+		break;
 	default:
 		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
 		break;
···
 	d->subsysnqn[NVMF_NQN_FIELD_LEN - 1] = '\0';
 	d->hostnqn[NVMF_NQN_FIELD_LEN - 1] = '\0';
 	status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
-				  le32_to_cpu(c->kato), &ctrl);
+				  le32_to_cpu(c->kato), &ctrl, &d->hostid);
 	if (status)
 		goto out;
-
-	uuid_copy(&ctrl->hostid, &d->hostid);
 
 	dhchap_status = nvmet_setup_auth(ctrl);
 	if (dhchap_status) {
drivers/nvme/target/nvmet.h  (+65 -2)
···
 #include <linux/blkdev.h>
 #include <linux/radix-tree.h>
 #include <linux/t10-pi.h>
+#include <linux/kfifo.h>
 
-#define NVMET_DEFAULT_VS		NVME_VS(1, 3, 0)
+#define NVMET_DEFAULT_VS		NVME_VS(2, 1, 0)
 
 #define NVMET_ASYNC_EVENTS		4
 #define NVMET_ERROR_LOG_SLOTS		128
···
 #define NVMET_MN_MAX_SIZE		40
 #define NVMET_SN_MAX_SIZE		20
 #define NVMET_FR_MAX_SIZE		8
+#define NVMET_PR_LOG_QUEUE_SIZE		64
 
 /*
  * Supported optional AENs:
···
 	(cpu_to_le32((1 << 16) | (offsetof(struct nvmf_connect_data, x))))
 #define IPO_IATTR_CONNECT_SQE(x)	\
 	(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
+
+struct nvmet_pr_registrant {
+	u64			rkey;
+	uuid_t			hostid;
+	enum nvme_pr_type	rtype;
+	struct list_head	entry;
+	struct rcu_head		rcu;
+};
+
+struct nvmet_pr {
+	bool			enable;
+	unsigned long		notify_mask;
+	atomic_t		generation;
+	struct nvmet_pr_registrant __rcu *holder;
+	/*
+	 * During the execution of the reservation command, mutual
+	 * exclusion is required throughout the process. However,
+	 * while waiting asynchronously for the 'per controller
+	 * percpu_ref' to complete before the 'preempt and abort'
+	 * command finishes, a semaphore is needed to ensure mutual
+	 * exclusion instead of a mutex.
+	 */
+	struct semaphore	pr_sem;
+	struct list_head	registrant_list;
+};
+
+struct nvmet_pr_per_ctrl_ref {
+	struct percpu_ref	ref;
+	struct completion	free_done;
+	struct completion	confirm_done;
+	uuid_t			hostid;
+};
 
 struct nvmet_ns {
 	struct percpu_ref	ref;
···
 	int			pi_type;
 	int			metadata_size;
 	u8			csi;
+	struct nvmet_pr		pr;
+	struct xarray		pr_per_ctrl_refs;
 };
 
 static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
···
 	return nvmet_port_disc_addr_treq_secure_channel(port) == NVMF_TREQ_REQUIRED;
 }
 
+struct nvmet_pr_log_mgr {
+	struct mutex		lock;
+	u64			lost_count;
+	u64			counter;
+	DECLARE_KFIFO(log_queue, struct nvme_pr_log, NVMET_PR_LOG_QUEUE_SIZE);
+};
+
 struct nvmet_ctrl {
 	struct nvmet_subsys	*subsys;
 	struct nvmet_sq		**sqs;
···
 	u8			*dh_key;
 	size_t			dh_keysize;
 #endif
+	struct nvmet_pr_log_mgr pr_log_mgr;
 };
 
 struct nvmet_subsys {
···
 			struct work_struct	zmgmt_work;
 		} z;
 #endif /* CONFIG_BLK_DEV_ZONED */
+		struct {
+			struct work_struct	abort_work;
+		} r;
 	};
 	int			sg_cnt;
 	int			metadata_sg_cnt;
···
 	struct device		*p2p_client;
 	u16			error_loc;
 	u64			error_slba;
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
 };
 
 #define NVMET_MAX_MPOOL_BVEC		16
···
 void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new);
 u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
-		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp);
+		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp,
+		uuid_t *hostid);
 struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn,
 				       const char *hostnqn, u16 cntlid,
 				       struct nvmet_req *req);
···
 static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; }
 #endif
 
+int nvmet_pr_init_ns(struct nvmet_ns *ns);
+u16 nvmet_parse_pr_cmd(struct nvmet_req *req);
+u16 nvmet_pr_check_cmd_access(struct nvmet_req *req);
+int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl);
+void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl);
+void nvmet_pr_exit_ns(struct nvmet_ns *ns);
+void nvmet_execute_get_log_page_resv(struct nvmet_req *req);
+u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask);
+u16 nvmet_get_feat_resv_notif_mask(struct nvmet_req *req);
+u16 nvmet_pr_get_ns_pc_ref(struct nvmet_req *req);
+static inline void nvmet_pr_put_ns_pc_ref(struct nvmet_pr_per_ctrl_ref *pc_ref)
+{
+	percpu_ref_put(&pc_ref->ref);
+}
 #endif /* _NVMET_H */
drivers/nvme/target/pr.c  (+1156)
···
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics Persist Reservation.
+ * Copyright (c) 2024 Guixin Liu, Alibaba Group.
+ * All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/unaligned.h>
+#include "nvmet.h"
+
+#define NVMET_PR_NOTIFI_MASK_ALL \
+	(1 << NVME_PR_NOTIFY_BIT_REG_PREEMPTED | \
+	 1 << NVME_PR_NOTIFY_BIT_RESV_RELEASED | \
+	 1 << NVME_PR_NOTIFY_BIT_RESV_PREEMPTED)
+
+static inline bool nvmet_pr_parse_ignore_key(u32 cdw10)
+{
+	/* Ignore existing key, bit 03. */
+	return (cdw10 >> 3) & 1;
+}
+
+static inline struct nvmet_ns *nvmet_pr_to_ns(struct nvmet_pr *pr)
+{
+	return container_of(pr, struct nvmet_ns, pr);
+}
+
+static struct nvmet_pr_registrant *
+nvmet_pr_find_registrant(struct nvmet_pr *pr, uuid_t *hostid)
+{
+	struct nvmet_pr_registrant *reg;
+
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		if (uuid_equal(&reg->hostid, hostid))
+			return reg;
+	}
+	return NULL;
+}
+
+u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask)
+{
+	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_ns *ns;
+	unsigned long idx;
+	u16 status;
+
+	if (mask & ~(NVMET_PR_NOTIFI_MASK_ALL)) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw11);
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+	}
+
+	if (nsid != U32_MAX) {
+		status = nvmet_req_find_ns(req);
+		if (status)
+			return status;
+		if (!req->ns->pr.enable)
+			return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+
+		WRITE_ONCE(req->ns->pr.notify_mask, mask);
+		goto success;
+	}
+
+	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+		if (ns->pr.enable)
+			WRITE_ONCE(ns->pr.notify_mask, mask);
+	}
+
+success:
+	nvmet_set_result(req, mask);
+	return NVME_SC_SUCCESS;
+}
+
+u16 nvmet_get_feat_resv_notif_mask(struct nvmet_req *req)
+{
+	u16 status;
+
+	status = nvmet_req_find_ns(req);
+	if (status)
+		return status;
+
+	if (!req->ns->pr.enable)
+		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+
+	nvmet_set_result(req, READ_ONCE(req->ns->pr.notify_mask));
+	return status;
+}
+
+void nvmet_execute_get_log_page_resv(struct nvmet_req *req)
+{
+	struct nvmet_pr_log_mgr *log_mgr = &req->sq->ctrl->pr_log_mgr;
+	struct nvme_pr_log next_log = {0};
+	struct nvme_pr_log log = {0};
+	u16 status = NVME_SC_SUCCESS;
+	u64 lost_count;
+	u64 cur_count;
+	u64 next_count;
+
+	mutex_lock(&log_mgr->lock);
+	if (!kfifo_get(&log_mgr->log_queue, &log))
+		goto out;
+
+	/*
+	 * We can't get the last in kfifo.
+	 * Utilize the current count and the count from the next log to
+	 * calculate the number of lost logs, while also addressing cases
+	 * of overflow. If there is no subsequent log, the number of lost
+	 * logs is equal to the lost_count within the nvmet_pr_log_mgr.
+	 */
+	cur_count = le64_to_cpu(log.count);
+	if (kfifo_peek(&log_mgr->log_queue, &next_log)) {
+		next_count = le64_to_cpu(next_log.count);
+		if (next_count > cur_count)
+			lost_count = next_count - cur_count - 1;
+		else
+			lost_count = U64_MAX - cur_count + next_count - 1;
+	} else {
+		lost_count = log_mgr->lost_count;
+	}
+
+	log.count = cpu_to_le64((cur_count + lost_count) == 0 ?
+				1 : (cur_count + lost_count));
+	log_mgr->lost_count -= lost_count;
+
+	log.nr_pages = kfifo_len(&log_mgr->log_queue);
+
+out:
+	status = nvmet_copy_to_sgl(req, 0, &log, sizeof(log));
+	mutex_unlock(&log_mgr->lock);
+	nvmet_req_complete(req, status);
+}
+
+static void nvmet_pr_add_resv_log(struct nvmet_ctrl *ctrl, u8 log_type,
+				  u32 nsid)
+{
+	struct nvmet_pr_log_mgr *log_mgr = &ctrl->pr_log_mgr;
+	struct nvme_pr_log log = {0};
+
+	mutex_lock(&log_mgr->lock);
+	log_mgr->counter++;
+	if (log_mgr->counter == 0)
+		log_mgr->counter = 1;
+
+	log.count = cpu_to_le64(log_mgr->counter);
+	log.type = log_type;
+	log.nsid = cpu_to_le32(nsid);
+
+	if (!kfifo_put(&log_mgr->log_queue, log)) {
+		pr_info("a reservation log lost, cntlid:%d, log_type:%d, nsid:%d\n",
+			ctrl->cntlid, log_type, nsid);
+		log_mgr->lost_count++;
+	}
+
+	mutex_unlock(&log_mgr->lock);
+}
+
+static void nvmet_pr_resv_released(struct nvmet_pr *pr, uuid_t *hostid)
+{
+	struct nvmet_ns *ns = nvmet_pr_to_ns(pr);
+	struct nvmet_subsys *subsys = ns->subsys;
+	struct nvmet_ctrl *ctrl;
+
+	if (test_bit(NVME_PR_NOTIFY_BIT_RESV_RELEASED, &pr->notify_mask))
+		return;
+
+	mutex_lock(&subsys->lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (!uuid_equal(&ctrl->hostid, hostid) &&
+		    nvmet_pr_find_registrant(pr, &ctrl->hostid)) {
+			nvmet_pr_add_resv_log(ctrl,
+				NVME_PR_LOG_RESERVATION_RELEASED, ns->nsid);
+			nvmet_add_async_event(ctrl, NVME_AER_CSS,
+				NVME_AEN_RESV_LOG_PAGE_AVALIABLE,
+				NVME_LOG_RESERVATION);
+		}
+	}
+	mutex_unlock(&subsys->lock);
+}
+
+static void nvmet_pr_send_event_to_host(struct nvmet_pr *pr, uuid_t *hostid,
+					u8 log_type)
+{
+	struct nvmet_ns *ns = nvmet_pr_to_ns(pr);
+	struct nvmet_subsys *subsys = ns->subsys;
+	struct nvmet_ctrl *ctrl;
+
+	mutex_lock(&subsys->lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (uuid_equal(hostid, &ctrl->hostid)) {
+			nvmet_pr_add_resv_log(ctrl, log_type, ns->nsid);
+			nvmet_add_async_event(ctrl, NVME_AER_CSS,
+				NVME_AEN_RESV_LOG_PAGE_AVALIABLE,
+				NVME_LOG_RESERVATION);
+		}
+	}
+	mutex_unlock(&subsys->lock);
+}
+
+static void nvmet_pr_resv_preempted(struct nvmet_pr *pr, uuid_t *hostid)
+{
+	if (test_bit(NVME_PR_NOTIFY_BIT_RESV_PREEMPTED, &pr->notify_mask))
+		return;
+
+	nvmet_pr_send_event_to_host(pr, hostid,
+				    NVME_PR_LOG_RESERVATOIN_PREEMPTED);
+}
+
+static void nvmet_pr_registration_preempted(struct nvmet_pr *pr,
+					    uuid_t *hostid)
+{
+	if (test_bit(NVME_PR_NOTIFY_BIT_REG_PREEMPTED, &pr->notify_mask))
+		return;
+
+	nvmet_pr_send_event_to_host(pr, hostid,
+				    NVME_PR_LOG_REGISTRATION_PREEMPTED);
+}
+
+static inline void nvmet_pr_set_new_holder(struct nvmet_pr *pr, u8 new_rtype,
+					   struct nvmet_pr_registrant *reg)
+{
+	reg->rtype = new_rtype;
+	rcu_assign_pointer(pr->holder, reg);
+}
+
+static u16 nvmet_pr_register(struct nvmet_req *req,
+			     struct nvmet_pr_register_data *d)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr_registrant *new, *reg;
+	struct nvmet_pr *pr = &req->ns->pr;
+	u16 status = NVME_SC_SUCCESS;
+	u64 nrkey = le64_to_cpu(d->nrkey);
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return NVME_SC_INTERNAL;
+
+	down(&pr->pr_sem);
+	reg = nvmet_pr_find_registrant(pr, &ctrl->hostid);
+	if (reg) {
+		if (reg->rkey != nrkey)
+			status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+		kfree(new);
+		goto out;
+	}
+
+	memset(new, 0, sizeof(*new));
+	INIT_LIST_HEAD(&new->entry);
+	new->rkey = nrkey;
+	uuid_copy(&new->hostid, &ctrl->hostid);
+	list_add_tail_rcu(&new->entry, &pr->registrant_list);
+
+out:
+	up(&pr->pr_sem);
+	return status;
+}
+
+static void nvmet_pr_unregister_one(struct nvmet_pr *pr,
+				    struct nvmet_pr_registrant *reg)
+{
+	struct nvmet_pr_registrant *first_reg;
+	struct nvmet_pr_registrant *holder;
+	u8 original_rtype;
+
+	list_del_rcu(&reg->entry);
+
+	holder = rcu_dereference_protected(pr->holder, 1);
+	if (reg != holder)
+		goto out;
+
+	original_rtype = holder->rtype;
+	if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
+	    original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
+		first_reg = list_first_or_null_rcu(&pr->registrant_list,
+				struct nvmet_pr_registrant, entry);
+		if (first_reg)
+			first_reg->rtype = original_rtype;
+		rcu_assign_pointer(pr->holder, first_reg);
+	} else {
+		rcu_assign_pointer(pr->holder, NULL);
+
+		if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_REG_ONLY ||
+		    original_rtype == NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY)
+			nvmet_pr_resv_released(pr, &reg->hostid);
+	}
+out:
+	kfree_rcu(reg, rcu);
+}
+
+static u16 nvmet_pr_unregister(struct nvmet_req *req,
+			       struct nvmet_pr_register_data *d,
+			       bool ignore_key)
+{
+	u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *reg;
+
+	down(&pr->pr_sem);
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		if (uuid_equal(&reg->hostid, &ctrl->hostid)) {
+			if (ignore_key || reg->rkey == le64_to_cpu(d->crkey)) {
+				status = NVME_SC_SUCCESS;
+				nvmet_pr_unregister_one(pr, reg);
+			}
+			break;
+		}
+	}
+	up(&pr->pr_sem);
+
+	return status;
+}
+
+static void nvmet_pr_update_reg_rkey(struct nvmet_pr_registrant *reg,
+				     void *attr)
+{
+	reg->rkey = *(u64 *)attr;
+}
+
+static u16 nvmet_pr_update_reg_attr(struct nvmet_pr *pr,
+			struct nvmet_pr_registrant *reg,
+			void (*change_attr)(struct nvmet_pr_registrant *reg,
+					    void *attr),
+			void *attr)
+{
+	struct nvmet_pr_registrant *holder;
+	struct nvmet_pr_registrant *new;
+
+	holder = rcu_dereference_protected(pr->holder, 1);
+	if (reg != holder) {
+		change_attr(reg, attr);
+		return NVME_SC_SUCCESS;
+	}
+
+	new = kmalloc(sizeof(*new), GFP_ATOMIC);
+	if (!new)
+		return NVME_SC_INTERNAL;
+
+	new->rkey = holder->rkey;
+	new->rtype = holder->rtype;
+	uuid_copy(&new->hostid, &holder->hostid);
+	INIT_LIST_HEAD(&new->entry);
+
+	change_attr(new, attr);
+	list_replace_rcu(&holder->entry, &new->entry);
+	rcu_assign_pointer(pr->holder, new);
+	kfree_rcu(holder, rcu);
+
+	return NVME_SC_SUCCESS;
+}
+
+static u16 nvmet_pr_replace(struct nvmet_req *req,
+			    struct nvmet_pr_register_data *d,
+			    bool ignore_key)
+{
+	u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *reg;
+	u64 nrkey = le64_to_cpu(d->nrkey);
+
+	down(&pr->pr_sem);
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		if (uuid_equal(&reg->hostid, &ctrl->hostid)) {
+			if (ignore_key || reg->rkey == le64_to_cpu(d->crkey))
+				status = nvmet_pr_update_reg_attr(pr, reg,
+						nvmet_pr_update_reg_rkey,
+						&nrkey);
+			break;
+		}
+	}
+	up(&pr->pr_sem);
+	return status;
+}
+
+static void nvmet_execute_pr_register(struct nvmet_req *req)
+{
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+	bool ignore_key = nvmet_pr_parse_ignore_key(cdw10);
+	struct nvmet_pr_register_data *d;
+	u8 reg_act = cdw10 & 0x07; /* Reservation Register Action, bit 02:00 */
+	u16 status;
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+	if (status)
+		goto free_data;
+
+	switch (reg_act) {
+	case NVME_PR_REGISTER_ACT_REG:
+		status = nvmet_pr_register(req, d);
+		break;
+	case NVME_PR_REGISTER_ACT_UNREG:
+		status = nvmet_pr_unregister(req, d, ignore_key);
+		break;
+	case NVME_PR_REGISTER_ACT_REPLACE:
+		status = nvmet_pr_replace(req, d, ignore_key);
+		break;
+	default:
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
+		break;
+	}
+free_data:
+	kfree(d);
+out:
+	if (!status)
+		atomic_inc(&req->ns->pr.generation);
+	nvmet_req_complete(req, status);
+}
+
+static u16 nvmet_pr_acquire(struct nvmet_req *req,
+			    struct nvmet_pr_registrant *reg,
+			    u8 rtype)
+{
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *holder;
+
+	holder = rcu_dereference_protected(pr->holder, 1);
+	if (holder && reg != holder)
+		return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	if (holder && reg == holder) {
+		if (holder->rtype == rtype)
+			return NVME_SC_SUCCESS;
+		return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	}
+
+	nvmet_pr_set_new_holder(pr, rtype, reg);
+	return NVME_SC_SUCCESS;
+}
+
+static void nvmet_pr_confirm_ns_pc_ref(struct percpu_ref *ref)
+{
+	struct nvmet_pr_per_ctrl_ref *pc_ref =
+		container_of(ref, struct nvmet_pr_per_ctrl_ref, ref);
+
+	complete(&pc_ref->confirm_done);
+}
+
+static void nvmet_pr_set_ctrl_to_abort(struct nvmet_req *req, uuid_t *hostid)
+{
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+	struct nvmet_ns *ns = req->ns;
+	unsigned long idx;
+
+	xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+		if (uuid_equal(&pc_ref->hostid, hostid)) {
+			percpu_ref_kill_and_confirm(&pc_ref->ref,
+						nvmet_pr_confirm_ns_pc_ref);
+			wait_for_completion(&pc_ref->confirm_done);
+		}
+	}
+}
+
+static u16 nvmet_pr_unreg_all_host_by_prkey(struct nvmet_req *req, u64 prkey,
+					    uuid_t *send_hostid,
+					    bool abort)
+{
+	u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	struct nvmet_pr_registrant *reg, *tmp;
+	struct nvmet_pr *pr = &req->ns->pr;
+	uuid_t hostid;
+
+	list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+		if (reg->rkey == prkey) {
+			status = NVME_SC_SUCCESS;
+			uuid_copy(&hostid, &reg->hostid);
+			if (abort)
+				nvmet_pr_set_ctrl_to_abort(req, &hostid);
+			nvmet_pr_unregister_one(pr, reg);
+			if (!uuid_equal(&hostid, send_hostid))
+				nvmet_pr_registration_preempted(pr, &hostid);
+		}
+	}
+	return status;
+}
+
+static void nvmet_pr_unreg_all_others_by_prkey(struct nvmet_req *req,
+					       u64 prkey,
+					       uuid_t *send_hostid,
+					       bool abort)
+{
+	struct nvmet_pr_registrant *reg, *tmp;
+	struct nvmet_pr *pr = &req->ns->pr;
+	uuid_t hostid;
+
+	list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+		if (reg->rkey == prkey &&
+		    !uuid_equal(&reg->hostid, send_hostid)) {
+			uuid_copy(&hostid, &reg->hostid);
+			if (abort)
+				nvmet_pr_set_ctrl_to_abort(req, &hostid);
+			nvmet_pr_unregister_one(pr, reg);
+			nvmet_pr_registration_preempted(pr, &hostid);
+		}
+	}
+}
+
+static void nvmet_pr_unreg_all_others(struct nvmet_req *req,
+				      uuid_t *send_hostid,
+				      bool abort)
+{
+	struct nvmet_pr_registrant *reg, *tmp;
+	struct nvmet_pr *pr = &req->ns->pr;
+	uuid_t hostid;
+
+	list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+		if (!uuid_equal(&reg->hostid, send_hostid)) {
+			uuid_copy(&hostid, &reg->hostid);
+			if (abort)
+				nvmet_pr_set_ctrl_to_abort(req, &hostid);
+			nvmet_pr_unregister_one(pr, reg);
+			nvmet_pr_registration_preempted(pr, &hostid);
+		}
+	}
+}
+
+static void nvmet_pr_update_holder_rtype(struct nvmet_pr_registrant *reg,
+					 void *attr)
+{
+	u8 new_rtype = *(u8 *)attr;
+
+	reg->rtype = new_rtype;
+}
+
+static u16 nvmet_pr_preempt(struct nvmet_req *req,
+			    struct nvmet_pr_registrant *reg,
+			    u8 rtype,
+			    struct nvmet_pr_acquire_data *d,
+			    bool abort)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *holder;
+	enum nvme_pr_type original_rtype;
+	u64 prkey = le64_to_cpu(d->prkey);
+	u16 status;
+
+	holder = rcu_dereference_protected(pr->holder, 1);
+	if (!holder)
+		return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
+					&ctrl->hostid, abort);
+
+	original_rtype = holder->rtype;
+	if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
+	    original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
+		if (!prkey) {
+			/*
+			 * To prevent possible access from other hosts, and
+			 * avoid terminate the holder, set the new holder
+			 * first before unregistering.
+			 */
+			nvmet_pr_set_new_holder(pr, rtype, reg);
+			nvmet_pr_unreg_all_others(req, &ctrl->hostid, abort);
+			return NVME_SC_SUCCESS;
+		}
+		return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
+				&ctrl->hostid, abort);
+	}
+
+	if (holder == reg) {
+		status = nvmet_pr_update_reg_attr(pr, holder,
+				nvmet_pr_update_holder_rtype, &rtype);
+		if (!status && original_rtype != rtype)
+			nvmet_pr_resv_released(pr, &reg->hostid);
+		return status;
+	}
+
+	if (prkey == holder->rkey) {
+		/*
+		 * Same as before, set the new holder first.
+		 */
+		nvmet_pr_set_new_holder(pr, rtype, reg);
+		nvmet_pr_unreg_all_others_by_prkey(req, prkey, &ctrl->hostid,
+						abort);
+		if (original_rtype != rtype)
+			nvmet_pr_resv_released(pr, &reg->hostid);
+		return NVME_SC_SUCCESS;
+	}
+
+	if (prkey)
+		return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
+					&ctrl->hostid, abort);
+	return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+}
+
+static void nvmet_pr_do_abort(struct work_struct *w)
+{
+	struct nvmet_req *req = container_of(w, struct nvmet_req, r.abort_work);
+	struct nvmet_pr_per_ctrl_ref *pc_ref;
+	struct nvmet_ns *ns = req->ns;
+	unsigned long idx;
+
+	/*
+	 * The target does not support abort, just wait per-controller ref to 0.
+	 */
+	xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+		if (percpu_ref_is_dying(&pc_ref->ref)) {
+			wait_for_completion(&pc_ref->free_done);
+			reinit_completion(&pc_ref->confirm_done);
+			reinit_completion(&pc_ref->free_done);
+			percpu_ref_resurrect(&pc_ref->ref);
+		}
+	}
+
+	up(&ns->pr.pr_sem);
+	nvmet_req_complete(req, NVME_SC_SUCCESS);
+}
+
+static u16 __nvmet_execute_pr_acquire(struct nvmet_req *req,
+				      struct nvmet_pr_registrant *reg,
+				      u8 acquire_act,
+				      u8 rtype,
+				      struct nvmet_pr_acquire_data *d)
+{
+	u16 status;
+
+	switch (acquire_act) {
+	case NVME_PR_ACQUIRE_ACT_ACQUIRE:
+		status = nvmet_pr_acquire(req, reg, rtype);
+		goto out;
+	case NVME_PR_ACQUIRE_ACT_PREEMPT:
+		status = nvmet_pr_preempt(req, reg, rtype, d, false);
+		goto inc_gen;
+	case NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT:
+		status = nvmet_pr_preempt(req, reg, rtype, d, true);
+		goto inc_gen;
+	default:
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
+		status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
+		goto out;
+	}
+inc_gen:
+	if (!status)
+		atomic_inc(&req->ns->pr.generation);
+out:
+	return status;
+}
+
+static void nvmet_execute_pr_acquire(struct nvmet_req *req)
+{
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+	bool ignore_key = nvmet_pr_parse_ignore_key(cdw10);
+	/* Reservation type, bit 15:08 */
+	u8 rtype = (u8)((cdw10 >> 8) & 0xff);
+	/* Reservation acquire action, bit 02:00 */
+	u8 acquire_act = cdw10 & 0x07;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmet_pr_acquire_data *d = NULL;
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant *reg;
+	u16 status = NVME_SC_SUCCESS;
+
+	if (ignore_key ||
+	    rtype < NVME_PR_WRITE_EXCLUSIVE ||
+	    rtype > NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		goto out;
+	}
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+	if (status)
+		goto free_data;
+
+	status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+	down(&pr->pr_sem);
+	list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+		if (uuid_equal(&reg->hostid, &ctrl->hostid) &&
+		    reg->rkey == le64_to_cpu(d->crkey)) {
+			status = __nvmet_execute_pr_acquire(req, reg,
+					acquire_act, rtype, d);
+			break;
+		}
+	}
+
+	if (!status && acquire_act == NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT) {
+		kfree(d);
+		INIT_WORK(&req->r.abort_work, nvmet_pr_do_abort);
+		queue_work(nvmet_wq, &req->r.abort_work);
+		return;
+	}
+
+	up(&pr->pr_sem);
+
+free_data:
+	kfree(d);
+out:
+	nvmet_req_complete(req, status);
+}
+
+static u16 nvmet_pr_release(struct nvmet_req *req,
+			    struct nvmet_pr_registrant *reg,
+			    u8 rtype)
+{
+	struct nvmet_pr *pr = &req->ns->pr;
+	struct nvmet_pr_registrant
*holder; 707 + u8 original_rtype; 708 + 709 + holder = rcu_dereference_protected(pr->holder, 1); 710 + if (!holder || reg != holder) 711 + return NVME_SC_SUCCESS; 712 + 713 + original_rtype = holder->rtype; 714 + if (original_rtype != rtype) 715 + return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; 716 + 717 + rcu_assign_pointer(pr->holder, NULL); 718 + 719 + if (original_rtype != NVME_PR_WRITE_EXCLUSIVE && 720 + original_rtype != NVME_PR_EXCLUSIVE_ACCESS) 721 + nvmet_pr_resv_released(pr, &reg->hostid); 722 + 723 + return NVME_SC_SUCCESS; 724 + } 725 + 726 + static void nvmet_pr_clear(struct nvmet_req *req) 727 + { 728 + struct nvmet_pr_registrant *reg, *tmp; 729 + struct nvmet_pr *pr = &req->ns->pr; 730 + 731 + rcu_assign_pointer(pr->holder, NULL); 732 + 733 + list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) { 734 + list_del_rcu(&reg->entry); 735 + if (!uuid_equal(&req->sq->ctrl->hostid, &reg->hostid)) 736 + nvmet_pr_resv_preempted(pr, &reg->hostid); 737 + kfree_rcu(reg, rcu); 738 + } 739 + 740 + atomic_inc(&pr->generation); 741 + } 742 + 743 + static u16 __nvmet_execute_pr_release(struct nvmet_req *req, 744 + struct nvmet_pr_registrant *reg, 745 + u8 release_act, u8 rtype) 746 + { 747 + switch (release_act) { 748 + case NVME_PR_RELEASE_ACT_RELEASE: 749 + return nvmet_pr_release(req, reg, rtype); 750 + case NVME_PR_RELEASE_ACT_CLEAR: 751 + nvmet_pr_clear(req); 752 + return NVME_SC_SUCCESS; 753 + default: 754 + req->error_loc = offsetof(struct nvme_common_command, cdw10); 755 + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; 756 + } 757 + } 758 + 759 + static void nvmet_execute_pr_release(struct nvmet_req *req) 760 + { 761 + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); 762 + bool ignore_key = nvmet_pr_parse_ignore_key(cdw10); 763 + u8 rtype = (u8)((cdw10 >> 8) & 0xff); /* Reservation type, bit 15:08 */ 764 + u8 release_act = cdw10 & 0x07; /* Reservation release action, bit 02:00 */ 765 + struct nvmet_ctrl *ctrl = req->sq->ctrl; 766 + struct 
nvmet_pr *pr = &req->ns->pr; 767 + struct nvmet_pr_release_data *d; 768 + struct nvmet_pr_registrant *reg; 769 + u16 status; 770 + 771 + if (ignore_key) { 772 + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 773 + goto out; 774 + } 775 + 776 + d = kmalloc(sizeof(*d), GFP_KERNEL); 777 + if (!d) { 778 + status = NVME_SC_INTERNAL; 779 + goto out; 780 + } 781 + 782 + status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); 783 + if (status) 784 + goto free_data; 785 + 786 + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; 787 + down(&pr->pr_sem); 788 + list_for_each_entry_rcu(reg, &pr->registrant_list, entry) { 789 + if (uuid_equal(&reg->hostid, &ctrl->hostid) && 790 + reg->rkey == le64_to_cpu(d->crkey)) { 791 + status = __nvmet_execute_pr_release(req, reg, 792 + release_act, rtype); 793 + break; 794 + } 795 + } 796 + up(&pr->pr_sem); 797 + free_data: 798 + kfree(d); 799 + out: 800 + nvmet_req_complete(req, status); 801 + } 802 + 803 + static void nvmet_execute_pr_report(struct nvmet_req *req) 804 + { 805 + u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); 806 + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); 807 + u32 num_bytes = 4 * (cdw10 + 1); /* cdw10 is number of dwords */ 808 + u8 eds = cdw11 & 1; /* Extended data structure, bit 00 */ 809 + struct nvme_registered_ctrl_ext *ctrl_eds; 810 + struct nvme_reservation_status_ext *data; 811 + struct nvmet_pr *pr = &req->ns->pr; 812 + struct nvmet_pr_registrant *holder; 813 + struct nvmet_pr_registrant *reg; 814 + u16 num_ctrls = 0; 815 + u16 status; 816 + u8 rtype; 817 + 818 + /* nvmet hostid(uuid_t) is 128 bit. 
*/ 819 + if (!eds) { 820 + req->error_loc = offsetof(struct nvme_common_command, cdw11); 821 + status = NVME_SC_HOST_ID_INCONSIST | NVME_STATUS_DNR; 822 + goto out; 823 + } 824 + 825 + if (num_bytes < sizeof(struct nvme_reservation_status_ext)) { 826 + req->error_loc = offsetof(struct nvme_common_command, cdw10); 827 + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; 828 + goto out; 829 + } 830 + 831 + data = kmalloc(num_bytes, GFP_KERNEL); 832 + if (!data) { 833 + status = NVME_SC_INTERNAL; 834 + goto out; 835 + } 836 + memset(data, 0, num_bytes); 837 + data->gen = cpu_to_le32(atomic_read(&pr->generation)); 838 + data->ptpls = 0; 839 + ctrl_eds = data->regctl_eds; 840 + 841 + rcu_read_lock(); 842 + holder = rcu_dereference(pr->holder); 843 + rtype = holder ? holder->rtype : 0; 844 + data->rtype = rtype; 845 + 846 + list_for_each_entry_rcu(reg, &pr->registrant_list, entry) { 847 + num_ctrls++; 848 + /* 849 + * continue to get the number of all registrans. 850 + */ 851 + if (((void *)ctrl_eds + sizeof(*ctrl_eds)) > 852 + ((void *)data + num_bytes)) 853 + continue; 854 + /* 855 + * Dynamic controller, set cntlid to 0xffff. 
856 + */ 857 + ctrl_eds->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); 858 + if (rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS || 859 + rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) 860 + ctrl_eds->rcsts = 1; 861 + if (reg == holder) 862 + ctrl_eds->rcsts = 1; 863 + uuid_copy((uuid_t *)&ctrl_eds->hostid, &reg->hostid); 864 + ctrl_eds->rkey = cpu_to_le64(reg->rkey); 865 + ctrl_eds++; 866 + } 867 + rcu_read_unlock(); 868 + 869 + put_unaligned_le16(num_ctrls, data->regctl); 870 + status = nvmet_copy_to_sgl(req, 0, data, num_bytes); 871 + kfree(data); 872 + out: 873 + nvmet_req_complete(req, status); 874 + } 875 + 876 + u16 nvmet_parse_pr_cmd(struct nvmet_req *req) 877 + { 878 + struct nvme_command *cmd = req->cmd; 879 + 880 + switch (cmd->common.opcode) { 881 + case nvme_cmd_resv_register: 882 + req->execute = nvmet_execute_pr_register; 883 + break; 884 + case nvme_cmd_resv_acquire: 885 + req->execute = nvmet_execute_pr_acquire; 886 + break; 887 + case nvme_cmd_resv_release: 888 + req->execute = nvmet_execute_pr_release; 889 + break; 890 + case nvme_cmd_resv_report: 891 + req->execute = nvmet_execute_pr_report; 892 + break; 893 + default: 894 + return 1; 895 + } 896 + return NVME_SC_SUCCESS; 897 + } 898 + 899 + static bool nvmet_is_req_write_cmd_group(struct nvmet_req *req) 900 + { 901 + u8 opcode = req->cmd->common.opcode; 902 + 903 + if (req->sq->qid) { 904 + switch (opcode) { 905 + case nvme_cmd_flush: 906 + case nvme_cmd_write: 907 + case nvme_cmd_write_zeroes: 908 + case nvme_cmd_dsm: 909 + case nvme_cmd_zone_append: 910 + case nvme_cmd_zone_mgmt_send: 911 + return true; 912 + default: 913 + return false; 914 + } 915 + } 916 + return false; 917 + } 918 + 919 + static bool nvmet_is_req_read_cmd_group(struct nvmet_req *req) 920 + { 921 + u8 opcode = req->cmd->common.opcode; 922 + 923 + if (req->sq->qid) { 924 + switch (opcode) { 925 + case nvme_cmd_read: 926 + case nvme_cmd_zone_mgmt_recv: 927 + return true; 928 + default: 929 + return false; 930 + } 931 + } 932 + return 
false; 933 + } 934 + 935 + u16 nvmet_pr_check_cmd_access(struct nvmet_req *req) 936 + { 937 + struct nvmet_ctrl *ctrl = req->sq->ctrl; 938 + struct nvmet_pr_registrant *holder; 939 + struct nvmet_ns *ns = req->ns; 940 + struct nvmet_pr *pr = &ns->pr; 941 + u16 status = NVME_SC_SUCCESS; 942 + 943 + rcu_read_lock(); 944 + holder = rcu_dereference(pr->holder); 945 + if (!holder) 946 + goto unlock; 947 + if (uuid_equal(&ctrl->hostid, &holder->hostid)) 948 + goto unlock; 949 + 950 + /* 951 + * The Reservation command group is checked in executing, 952 + * allow it here. 953 + */ 954 + switch (holder->rtype) { 955 + case NVME_PR_WRITE_EXCLUSIVE: 956 + if (nvmet_is_req_write_cmd_group(req)) 957 + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; 958 + break; 959 + case NVME_PR_EXCLUSIVE_ACCESS: 960 + if (nvmet_is_req_read_cmd_group(req) || 961 + nvmet_is_req_write_cmd_group(req)) 962 + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; 963 + break; 964 + case NVME_PR_WRITE_EXCLUSIVE_REG_ONLY: 965 + case NVME_PR_WRITE_EXCLUSIVE_ALL_REGS: 966 + if ((nvmet_is_req_write_cmd_group(req)) && 967 + !nvmet_pr_find_registrant(pr, &ctrl->hostid)) 968 + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; 969 + break; 970 + case NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY: 971 + case NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS: 972 + if ((nvmet_is_req_read_cmd_group(req) || 973 + nvmet_is_req_write_cmd_group(req)) && 974 + !nvmet_pr_find_registrant(pr, &ctrl->hostid)) 975 + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; 976 + break; 977 + default: 978 + pr_warn("the reservation type is set wrong, type:%d\n", 979 + holder->rtype); 980 + break; 981 + } 982 + 983 + unlock: 984 + rcu_read_unlock(); 985 + if (status) 986 + req->error_loc = offsetof(struct nvme_common_command, opcode); 987 + return status; 988 + } 989 + 990 + u16 nvmet_pr_get_ns_pc_ref(struct nvmet_req *req) 991 + { 992 + struct nvmet_pr_per_ctrl_ref *pc_ref; 993 + 994 + pc_ref = xa_load(&req->ns->pr_per_ctrl_refs, 
995 + req->sq->ctrl->cntlid); 996 + if (unlikely(!percpu_ref_tryget_live(&pc_ref->ref))) 997 + return NVME_SC_INTERNAL; 998 + req->pc_ref = pc_ref; 999 + return NVME_SC_SUCCESS; 1000 + } 1001 + 1002 + static void nvmet_pr_ctrl_ns_all_cmds_done(struct percpu_ref *ref) 1003 + { 1004 + struct nvmet_pr_per_ctrl_ref *pc_ref = 1005 + container_of(ref, struct nvmet_pr_per_ctrl_ref, ref); 1006 + 1007 + complete(&pc_ref->free_done); 1008 + } 1009 + 1010 + static int nvmet_pr_alloc_and_insert_pc_ref(struct nvmet_ns *ns, 1011 + unsigned long idx, 1012 + uuid_t *hostid) 1013 + { 1014 + struct nvmet_pr_per_ctrl_ref *pc_ref; 1015 + int ret; 1016 + 1017 + pc_ref = kmalloc(sizeof(*pc_ref), GFP_ATOMIC); 1018 + if (!pc_ref) 1019 + return -ENOMEM; 1020 + 1021 + ret = percpu_ref_init(&pc_ref->ref, nvmet_pr_ctrl_ns_all_cmds_done, 1022 + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); 1023 + if (ret) 1024 + goto free; 1025 + 1026 + init_completion(&pc_ref->free_done); 1027 + init_completion(&pc_ref->confirm_done); 1028 + uuid_copy(&pc_ref->hostid, hostid); 1029 + 1030 + ret = xa_insert(&ns->pr_per_ctrl_refs, idx, pc_ref, GFP_KERNEL); 1031 + if (ret) 1032 + goto exit; 1033 + return ret; 1034 + exit: 1035 + percpu_ref_exit(&pc_ref->ref); 1036 + free: 1037 + kfree(pc_ref); 1038 + return ret; 1039 + } 1040 + 1041 + int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl) 1042 + { 1043 + struct nvmet_subsys *subsys = ctrl->subsys; 1044 + struct nvmet_pr_per_ctrl_ref *pc_ref; 1045 + struct nvmet_ns *ns = NULL; 1046 + unsigned long idx; 1047 + int ret; 1048 + 1049 + ctrl->pr_log_mgr.counter = 0; 1050 + ctrl->pr_log_mgr.lost_count = 0; 1051 + mutex_init(&ctrl->pr_log_mgr.lock); 1052 + INIT_KFIFO(ctrl->pr_log_mgr.log_queue); 1053 + 1054 + /* 1055 + * Here we are under subsys lock, if an ns not in subsys->namespaces, 1056 + * we can make sure that ns is not enabled, and not call 1057 + * nvmet_pr_init_ns(), see more details in nvmet_ns_enable(). 1058 + * So just check ns->pr.enable. 
1059 + */ 1060 + xa_for_each(&subsys->namespaces, idx, ns) { 1061 + if (ns->pr.enable) { 1062 + ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid, 1063 + &ctrl->hostid); 1064 + if (ret) 1065 + goto free_per_ctrl_refs; 1066 + } 1067 + } 1068 + return 0; 1069 + 1070 + free_per_ctrl_refs: 1071 + xa_for_each(&subsys->namespaces, idx, ns) { 1072 + if (ns->pr.enable) { 1073 + pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); 1074 + if (pc_ref) 1075 + percpu_ref_exit(&pc_ref->ref); 1076 + kfree(pc_ref); 1077 + } 1078 + } 1079 + return ret; 1080 + } 1081 + 1082 + void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl) 1083 + { 1084 + struct nvmet_pr_per_ctrl_ref *pc_ref; 1085 + struct nvmet_ns *ns; 1086 + unsigned long idx; 1087 + 1088 + kfifo_free(&ctrl->pr_log_mgr.log_queue); 1089 + mutex_destroy(&ctrl->pr_log_mgr.lock); 1090 + 1091 + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { 1092 + if (ns->pr.enable) { 1093 + pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); 1094 + if (pc_ref) 1095 + percpu_ref_exit(&pc_ref->ref); 1096 + kfree(pc_ref); 1097 + } 1098 + } 1099 + } 1100 + 1101 + int nvmet_pr_init_ns(struct nvmet_ns *ns) 1102 + { 1103 + struct nvmet_subsys *subsys = ns->subsys; 1104 + struct nvmet_pr_per_ctrl_ref *pc_ref; 1105 + struct nvmet_ctrl *ctrl = NULL; 1106 + unsigned long idx; 1107 + int ret; 1108 + 1109 + ns->pr.holder = NULL; 1110 + atomic_set(&ns->pr.generation, 0); 1111 + sema_init(&ns->pr.pr_sem, 1); 1112 + INIT_LIST_HEAD(&ns->pr.registrant_list); 1113 + ns->pr.notify_mask = 0; 1114 + 1115 + xa_init(&ns->pr_per_ctrl_refs); 1116 + 1117 + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { 1118 + ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid, 1119 + &ctrl->hostid); 1120 + if (ret) 1121 + goto free_per_ctrl_refs; 1122 + } 1123 + return 0; 1124 + 1125 + free_per_ctrl_refs: 1126 + xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) { 1127 + xa_erase(&ns->pr_per_ctrl_refs, idx); 1128 + percpu_ref_exit(&pc_ref->ref); 1129 + 
kfree(pc_ref); 1130 + } 1131 + return ret; 1132 + } 1133 + 1134 + void nvmet_pr_exit_ns(struct nvmet_ns *ns) 1135 + { 1136 + struct nvmet_pr_registrant *reg, *tmp; 1137 + struct nvmet_pr_per_ctrl_ref *pc_ref; 1138 + struct nvmet_pr *pr = &ns->pr; 1139 + unsigned long idx; 1140 + 1141 + list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) { 1142 + list_del(&reg->entry); 1143 + kfree(reg); 1144 + } 1145 + 1146 + xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) { 1147 + /* 1148 + * No command on ns here, we can safely free pc_ref. 1149 + */ 1150 + pc_ref = xa_erase(&ns->pr_per_ctrl_refs, idx); 1151 + percpu_ref_exit(&pc_ref->ref); 1152 + kfree(pc_ref); 1153 + } 1154 + 1155 + xa_destroy(&ns->pr_per_ctrl_refs); 1156 + }
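The PR handlers above all decode the same command Dword 10 layout: the action in bits 02:00, the Ignore Existing Key flag in bit 03, and the reservation type in bits 15:08. A minimal userspace sketch of that decoding (the helper names are invented for illustration, not the kernel's):

```c
#include <assert.h>
#include <stdint.h>

/* Hypothetical standalone helpers mirroring how the nvmet PR code parses
 * cdw10: action in bits 02:00, IEKEY flag in bit 03, rtype in bits 15:08. */
static uint8_t pr_action(uint32_t cdw10)     { return cdw10 & 0x07; }
static int     pr_ignore_key(uint32_t cdw10) { return (cdw10 >> 3) & 0x1; }
static uint8_t pr_rtype(uint32_t cdw10)      { return (cdw10 >> 8) & 0xff; }
```

The same bit positions are used by the tracing side below, which reads them out of the raw cdw10 byte array instead of a host-endian u32.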
+108
drivers/nvme/target/trace.c
···
	return ret;
}

+static const char *nvmet_trace_resv_reg(struct trace_seq *p, u8 *cdw10)
+{
+	static const char * const rrega_strs[] = {
+		[0x00] = "register",
+		[0x01] = "unregister",
+		[0x02] = "replace",
+	};
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 rrega = cdw10[0] & 0x7;
+	u8 iekey = (cdw10[0] >> 3) & 0x1;
+	u8 ptpl = (cdw10[3] >> 6) & 0x3;
+	const char *rrega_str;
+
+	if (rrega < ARRAY_SIZE(rrega_strs) && rrega_strs[rrega])
+		rrega_str = rrega_strs[rrega];
+	else
+		rrega_str = "reserved";
+
+	trace_seq_printf(p, "rrega=%u:%s, iekey=%u, ptpl=%u",
+			 rrega, rrega_str, iekey, ptpl);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
+static const char * const rtype_strs[] = {
+	[0x00] = "reserved",
+	[0x01] = "write exclusive",
+	[0x02] = "exclusive access",
+	[0x03] = "write exclusive registrants only",
+	[0x04] = "exclusive access registrants only",
+	[0x05] = "write exclusive all registrants",
+	[0x06] = "exclusive access all registrants",
+};
+
+static const char *nvmet_trace_resv_acq(struct trace_seq *p, u8 *cdw10)
+{
+	static const char * const racqa_strs[] = {
+		[0x00] = "acquire",
+		[0x01] = "preempt",
+		[0x02] = "preempt and abort",
+	};
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 racqa = cdw10[0] & 0x7;
+	u8 iekey = (cdw10[0] >> 3) & 0x1;
+	u8 rtype = cdw10[1];
+	const char *racqa_str = "reserved";
+	const char *rtype_str = "reserved";
+
+	if (racqa < ARRAY_SIZE(racqa_strs) && racqa_strs[racqa])
+		racqa_str = racqa_strs[racqa];
+
+	if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype])
+		rtype_str = rtype_strs[rtype];
+
+	trace_seq_printf(p, "racqa=%u:%s, iekey=%u, rtype=%u:%s",
+			 racqa, racqa_str, iekey, rtype, rtype_str);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
+static const char *nvmet_trace_resv_rel(struct trace_seq *p, u8 *cdw10)
+{
+	static const char * const rrela_strs[] = {
+		[0x00] = "release",
+		[0x01] = "clear",
+	};
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 rrela = cdw10[0] & 0x7;
+	u8 iekey = (cdw10[0] >> 3) & 0x1;
+	u8 rtype = cdw10[1];
+	const char *rrela_str = "reserved";
+	const char *rtype_str = "reserved";
+
+	if (rrela < ARRAY_SIZE(rrela_strs) && rrela_strs[rrela])
+		rrela_str = rrela_strs[rrela];
+
+	if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype])
+		rtype_str = rtype_strs[rtype];
+
+	trace_seq_printf(p, "rrela=%u:%s, iekey=%u, rtype=%u:%s",
+			 rrela, rrela_str, iekey, rtype, rtype_str);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
+static const char *nvmet_trace_resv_report(struct trace_seq *p, u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u32 numd = get_unaligned_le32(cdw10);
+	u8 eds = cdw10[4] & 0x1;
+
+	trace_seq_printf(p, "numd=%u, eds=%u", numd, eds);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p,
				      u8 opcode, u8 *cdw10)
{
···
		return nvmet_trace_zone_mgmt_send(p, cdw10);
	case nvme_cmd_zone_mgmt_recv:
		return nvmet_trace_zone_mgmt_recv(p, cdw10);
+	case nvme_cmd_resv_register:
+		return nvmet_trace_resv_reg(p, cdw10);
+	case nvme_cmd_resv_acquire:
+		return nvmet_trace_resv_acq(p, cdw10);
+	case nvme_cmd_resv_release:
+		return nvmet_trace_resv_rel(p, cdw10);
+	case nvme_cmd_resv_report:
+		return nvmet_trace_resv_report(p, cdw10);
	default:
		return nvmet_trace_common(p, cdw10);
	}
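The tracing helpers above share one pattern: a designated-initializer string table indexed by the field value, with a bounds check and a "reserved" fallback for values outside the table. A self-contained sketch of that lookup (same table contents as the diff; the function name is invented):

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Reservation type strings, as defined in the tracing code above. */
static const char * const rtype_strs[] = {
	[0x00] = "reserved",
	[0x01] = "write exclusive",
	[0x02] = "exclusive access",
	[0x03] = "write exclusive registrants only",
	[0x04] = "exclusive access registrants only",
	[0x05] = "write exclusive all registrants",
	[0x06] = "exclusive access all registrants",
};

/* Bounds-checked lookup; out-of-range or unset entries fall back to
 * "reserved", matching the kernel helpers' behavior. */
static const char *rtype_to_str(uint8_t rtype)
{
	if (rtype < sizeof(rtype_strs) / sizeof(rtype_strs[0]) &&
	    rtype_strs[rtype])
		return rtype_strs[rtype];
	return "reserved";
}
```

The NULL check matters because a sparse designated initializer leaves any skipped slots as NULL pointers.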
+13 -8
drivers/nvme/target/zns.c
···
	u16 status = NVME_SC_SUCCESS;
	unsigned int total_len = 0;
	struct scatterlist *sg;
+	u32 data_len = nvmet_rw_data_len(req);
	struct bio *bio;
	int sg_cnt;

	/* Request is completed on len mismatch in nvmet_check_transter_len() */
	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
		return;
+
+	if (data_len >
+	    bdev_max_zone_append_sectors(req->ns->bdev) << SECTOR_SHIFT) {
+		req->error_loc = offsetof(struct nvme_rw_command, length);
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+		goto out;
+	}

	if (!req->sg_cnt) {
		nvmet_req_complete(req, 0);
···
		bio->bi_opf |= REQ_FUA;

	for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
-		struct page *p = sg_page(sg);
-		unsigned int l = sg->length;
-		unsigned int o = sg->offset;
-		unsigned int ret;
+		unsigned int len = sg->length;

-		ret = bio_add_zone_append_page(bio, p, l, o);
-		if (ret != sg->length) {
+		if (bio_add_pc_page(bdev_get_queue(bio->bi_bdev), bio,
+				    sg_page(sg), len, sg->offset) != len) {
			status = NVME_SC_INTERNAL;
			goto out_put_bio;
		}
-		total_len += sg->length;
+		total_len += len;
	}

-	if (total_len != nvmet_rw_data_len(req)) {
+	if (total_len != data_len) {
		status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
		goto out_put_bio;
	}
+1 -1
drivers/s390/block/dasd.c
···
	case DASD_CQR_IN_IO:
		rc = device->discipline->term_IO(cqr);
		if (rc) {
-			/* unable to terminate requeust */
+			/* unable to terminate request */
			dev_err(&device->cdev->dev,
				"Flushing the DASD request queue failed\n");
			/* stop flush processing */
+1 -1
drivers/s390/block/dasd_devmap.c
···
	dev_set_drvdata(&device->cdev->dev, NULL);
	spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);

-	/* Removve copy relation */
+	/* Remove copy relation */
	dasd_devmap_delete_copy_relation_device(device);
	/*
	 * Drop ref_count by 3, one for the devmap reference, one for
+1 -1
drivers/s390/block/dasd_eckd.c
···
	}

	if (count_area != NULL && count_area->kl == 0) {
-		/* we found notthing violating our disk layout */
+		/* we found nothing violating our disk layout */
		if (dasd_check_blocksize(count_area->dl) == 0)
			block->bp_block = count_area->dl;
	}
+5
drivers/s390/block/dasd_proc.c
···
	remove_proc_entry("devices", dasd_proc_root_entry);
out_nodevices:
	remove_proc_entry("dasd", NULL);
+	dasd_proc_root_entry = NULL;
out_nodasd:
	return -ENOENT;
}
···
void
dasd_proc_exit(void)
{
+	if (!dasd_proc_root_entry)
+		return;
+
	remove_proc_entry("devices", dasd_proc_root_entry);
	remove_proc_entry("statistics", dasd_proc_root_entry);
	remove_proc_entry("dasd", NULL);
+	dasd_proc_root_entry = NULL;
}
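The dasd_proc change above is the classic idempotent-teardown pattern: NULL the handle on every teardown path and bail out of exit when init never completed, so exit can be called after a failed init (or twice) without touching freed state. A userspace sketch under those assumptions (names are illustrative, not the driver's):

```c
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

/* Stand-in for dasd_proc_root_entry: non-NULL only while initialized. */
static void *proc_root;

static int demo_init(int fail)
{
	proc_root = malloc(1);
	if (!proc_root)
		return -1;
	if (fail) {
		free(proc_root);
		proc_root = NULL;	/* error path clears the handle */
		return -1;
	}
	return 0;
}

static void demo_exit(void)
{
	if (!proc_root)			/* exit after failed init is a no-op */
		return;
	free(proc_root);
	proc_root = NULL;		/* makes a second exit call safe */
}
```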
+3 -3
drivers/scsi/sd.c
···
	if (!sdkp->rscs)
		return 0;

-	return min3((u32)rq->write_hint, (u32)sdkp->permanent_stream_count,
-		    0x3fu);
+	return min3((u32)rq->bio->bi_write_hint,
+		    (u32)sdkp->permanent_stream_count, 0x3fu);
}

static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write,
···
		ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks,
					 protect | fua, dld);
	} else if ((nr_blocks > 0xff) || (lba > 0x1fffff) ||
-		   sdp->use_10_for_rw || protect || rq->write_hint) {
+		   sdp->use_10_for_rw || protect || rq->bio->bi_write_hint) {
		ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks,
					 protect | fua);
	} else {
-2
drivers/scsi/sd_zbc.c
···
	lim->max_open_zones = sdkp->zones_max_open;
	lim->max_active_zones = 0;
	lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks);
-	/* Enable block layer zone append emulation */
-	lim->max_zone_append_sectors = 0;

	return 0;
+8 -5
fs/btrfs/zoned.c
···
	 * zoned mode. In this case, we don't have a valid max zone
	 * append size.
	 */
-	if (bdev_is_zoned(device->bdev)) {
-		blk_stack_limits(lim,
-				 &bdev_get_queue(device->bdev)->limits,
-				 0);
-	}
+	if (bdev_is_zoned(device->bdev))
+		blk_stack_limits(lim, bdev_limits(device->bdev), 0);
+	}
+
+	ret = blk_validate_limits(lim);
+	if (ret) {
+		btrfs_err(fs_info, "zoned: failed to validate queue limits");
+		return ret;
	}

	/*
+2 -2
include/linux/bio-integrity.h
···
			      unsigned int nr);
int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len,
			   unsigned int offset);
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed);
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len);
void bio_integrity_unmap_user(struct bio *bio);
bool bio_integrity_prep(struct bio *bio);
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done);
···
}

static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
-					 ssize_t len, u32 seed)
+					 ssize_t len)
{
	return -EINVAL;
}
+17 -2
include/linux/bio.h
···
			       size_t len, size_t off);
extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
			   unsigned int, unsigned int);
-int bio_add_zone_append_page(struct bio *bio, struct page *page,
-			     unsigned int len, unsigned int offset);
void __bio_add_page(struct bio *bio, struct page *page,
		    unsigned int len, unsigned int off);
void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
···
static inline void bio_clear_polled(struct bio *bio)
{
	bio->bi_opf &= ~REQ_POLLED;
+}
+
+/**
+ * bio_is_zone_append - is this a zone append bio?
+ * @bio: bio to check
+ *
+ * Check if @bio is a zone append operation. Core block layer code and end_io
+ * handlers must use this instead of an open coded REQ_OP_ZONE_APPEND check
+ * because the block layer can rewrite REQ_OP_ZONE_APPEND to REQ_OP_WRITE if
+ * it is not natively supported.
+ */
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+		return false;
+	return bio_op(bio) == REQ_OP_ZONE_APPEND ||
+		bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
}

struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+2 -3
include/linux/blk-integrity.h
···
int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
-			      ssize_t bytes, u32 seed);
+			      ssize_t bytes);

static inline bool
blk_integrity_queue_supports_integrity(struct request_queue *q)
···
}
static inline int blk_rq_integrity_map_user(struct request *rq,
					    void __user *ubuf,
-					    ssize_t bytes,
-					    u32 seed)
+					    ssize_t bytes)
{
	return -EINVAL;
}
+59 -66
include/linux/blk-mq.h
··· 156 156 struct blk_crypto_keyslot *crypt_keyslot; 157 157 #endif 158 158 159 - enum rw_hint write_hint; 160 - unsigned short ioprio; 161 - 162 159 enum mq_rq_state state; 163 160 atomic_t ref; 164 161 ··· 219 222 220 223 static inline unsigned short req_get_ioprio(struct request *req) 221 224 { 222 - return req->ioprio; 225 + if (req->bio) 226 + return req->bio->bi_ioprio; 227 + return 0; 223 228 } 224 229 225 230 #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) ··· 229 230 #define rq_dma_dir(rq) \ 230 231 (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) 231 232 232 - #define rq_list_add(listptr, rq) do { \ 233 - (rq)->rq_next = *(listptr); \ 234 - *(listptr) = rq; \ 235 - } while (0) 236 - 237 - #define rq_list_add_tail(lastpptr, rq) do { \ 238 - (rq)->rq_next = NULL; \ 239 - **(lastpptr) = rq; \ 240 - *(lastpptr) = &rq->rq_next; \ 241 - } while (0) 242 - 243 - #define rq_list_pop(listptr) \ 244 - ({ \ 245 - struct request *__req = NULL; \ 246 - if ((listptr) && *(listptr)) { \ 247 - __req = *(listptr); \ 248 - *(listptr) = __req->rq_next; \ 249 - } \ 250 - __req; \ 251 - }) 252 - 253 - #define rq_list_peek(listptr) \ 254 - ({ \ 255 - struct request *__req = NULL; \ 256 - if ((listptr) && *(listptr)) \ 257 - __req = *(listptr); \ 258 - __req; \ 259 - }) 260 - 261 - #define rq_list_for_each(listptr, pos) \ 262 - for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) 263 - 264 - #define rq_list_for_each_safe(listptr, pos, nxt) \ 265 - for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos); \ 266 - pos; pos = nxt, nxt = pos ? 
rq_list_next(pos) : NULL) 267 - 268 - #define rq_list_next(rq) (rq)->rq_next 269 - #define rq_list_empty(list) ((list) == (struct request *) NULL) 270 - 271 - /** 272 - * rq_list_move() - move a struct request from one list to another 273 - * @src: The source list @rq is currently in 274 - * @dst: The destination list that @rq will be appended to 275 - * @rq: The request to move 276 - * @prev: The request preceding @rq in @src (NULL if @rq is the head) 277 - */ 278 - static inline void rq_list_move(struct request **src, struct request **dst, 279 - struct request *rq, struct request *prev) 233 + static inline int rq_list_empty(const struct rq_list *rl) 280 234 { 281 - if (prev) 282 - prev->rq_next = rq->rq_next; 283 - else 284 - *src = rq->rq_next; 285 - rq_list_add(dst, rq); 235 + return rl->head == NULL; 286 236 } 237 + 238 + static inline void rq_list_init(struct rq_list *rl) 239 + { 240 + rl->head = NULL; 241 + rl->tail = NULL; 242 + } 243 + 244 + static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq) 245 + { 246 + rq->rq_next = NULL; 247 + if (rl->tail) 248 + rl->tail->rq_next = rq; 249 + else 250 + rl->head = rq; 251 + rl->tail = rq; 252 + } 253 + 254 + static inline void rq_list_add_head(struct rq_list *rl, struct request *rq) 255 + { 256 + rq->rq_next = rl->head; 257 + rl->head = rq; 258 + if (!rl->tail) 259 + rl->tail = rq; 260 + } 261 + 262 + static inline struct request *rq_list_pop(struct rq_list *rl) 263 + { 264 + struct request *rq = rl->head; 265 + 266 + if (rq) { 267 + rl->head = rl->head->rq_next; 268 + if (!rl->head) 269 + rl->tail = NULL; 270 + rq->rq_next = NULL; 271 + } 272 + 273 + return rq; 274 + } 275 + 276 + static inline struct request *rq_list_peek(struct rq_list *rl) 277 + { 278 + return rl->head; 279 + } 280 + 281 + #define rq_list_for_each(rl, pos) \ 282 + for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next) 283 + 284 + #define rq_list_for_each_safe(rl, pos, nxt) \ 285 + for (pos = rq_list_peek((rl)), nxt = 
pos->rq_next; \ 286 + pos; pos = nxt, nxt = pos ? pos->rq_next : NULL) 287 287 288 288 /** 289 289 * enum blk_eh_timer_return - How the timeout handler should proceed ··· 575 577 * empty the @rqlist completely, then the rest will be queued 576 578 * individually by the block layer upon return. 577 579 */ 578 - void (*queue_rqs)(struct request **rqlist); 580 + void (*queue_rqs)(struct rq_list *rqlist); 579 581 580 582 /** 581 583 * @get_budget: Reserve budget before queue request, once .queue_rq is ··· 855 857 */ 856 858 static inline bool blk_mq_need_time_stamp(struct request *rq) 857 859 { 858 - /* 859 - * passthrough io doesn't use iostat accounting, cgroup stats 860 - * and io scheduler functionalities. 861 - */ 862 - if (blk_rq_is_passthrough(rq)) 863 - return false; 864 860 return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); 865 861 } 866 862 ··· 884 892 else if (iob->complete != complete) 885 893 return false; 886 894 iob->need_ts |= blk_mq_need_time_stamp(req); 887 - rq_list_add(&iob->req_list, req); 895 + rq_list_add_tail(&iob->req_list, req); 888 896 return true; 889 897 } 890 898 ··· 917 925 void blk_mq_freeze_queue_wait(struct request_queue *q); 918 926 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 919 927 unsigned long timeout); 928 + void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); 929 + void blk_freeze_queue_start_non_owner(struct request_queue *q); 920 930 921 931 void blk_mq_map_queues(struct blk_mq_queue_map *qmap); 922 932 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); ··· 983 989 rq->nr_phys_segments = nr_segs; 984 990 rq->__data_len = bio->bi_iter.bi_size; 985 991 rq->bio = rq->biotail = bio; 986 - rq->ioprio = bio_prio(bio); 987 992 } 988 993 989 994 void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
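The blk-mq.h hunk above replaces the old head-only request list macros with a `struct rq_list` that carries an explicit tail pointer, so appends are O(1) and `->queue_rqs()` sees requests in submission order rather than reversed. A userspace sketch of the same shape (here `struct request` is a stand-in with just the link field and a tag):

```c
#include <assert.h>
#include <stddef.h>

/* Minimal stand-ins: a singly linked request list with head and tail,
 * mirroring the new rq_list helpers in the diff above. */
struct request { struct request *rq_next; int tag; };
struct rq_list { struct request *head, *tail; };

static void rq_list_add_tail(struct rq_list *rl, struct request *rq)
{
	rq->rq_next = NULL;
	if (rl->tail)
		rl->tail->rq_next = rq;
	else
		rl->head = rq;
	rl->tail = rq;		/* tail pointer makes append O(1) */
}

static struct request *rq_list_pop(struct rq_list *rl)
{
	struct request *rq = rl->head;

	if (rq) {
		rl->head = rq->rq_next;
		if (!rl->head)	/* popped the last entry: clear the tail too */
			rl->tail = NULL;
		rq->rq_next = NULL;
	}
	return rq;
}
```

Because add appends at the tail and pop removes from the head, the list is FIFO, which is exactly why the old head-insert `rq_list_add` reversed the order the plug list handed to drivers.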
+68 -27
include/linux/blkdev.h
···
 #include <linux/uuid.h>
 #include <linux/xarray.h>
 #include <linux/file.h>
+#include <linux/lockdep.h>
 
 struct module;
 struct request_queue;
···
	unsigned int		nr_zones;
	unsigned int		zone_capacity;
	unsigned int		last_zone_capacity;
-	unsigned long		*conv_zones_bitmap;
+	unsigned long __rcu	*conv_zones_bitmap;
	unsigned int		zone_wplugs_hash_bits;
	spinlock_t		zone_wplugs_lock;
	struct mempool_s	*zone_wplugs_pool;
···
 /* I/O topology is misaligned */
 #define BLK_FLAG_MISALIGNED	((__force blk_flags_t)(1u << 1))
 
+/* passthrough command IO accounting */
+#define BLK_FLAG_IOSTATS_PASSTHROUGH	((__force blk_flags_t)(1u << 2))
+
 struct queue_limits {
	blk_features_t		features;
	blk_flags_t		flags;
···
	unsigned int		max_user_discard_sectors;
	unsigned int		max_secure_erase_sectors;
	unsigned int		max_write_zeroes_sectors;
+	unsigned int		max_hw_zone_append_sectors;
	unsigned int		max_zone_append_sectors;
	unsigned int		discard_granularity;
	unsigned int		discard_alignment;
···
	struct xarray		hctx_table;
 
	struct percpu_ref	q_usage_counter;
+	struct lock_class_key	io_lock_cls_key;
+	struct lockdep_map	io_lockdep_map;
+
+	struct lock_class_key	q_lock_cls_key;
+	struct lockdep_map	q_lockdep_map;
 
	struct request		*last_merge;
···
	struct throtl_data *td;
 #endif
	struct rcu_head		rcu_head;
+#ifdef CONFIG_LOCKDEP
+	struct task_struct	*mq_freeze_owner;
+	int			mq_freeze_owner_depth;
+#endif
	wait_queue_head_t	mq_freeze_wq;
	/*
	 * Protect concurrent access to q_usage_counter by
···
	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
 #define blk_queue_nonrot(q)	(!((q)->limits.features & BLK_FEAT_ROTATIONAL))
 #define blk_queue_io_stat(q)	((q)->limits.features & BLK_FEAT_IO_STAT)
+#define blk_queue_passthrough_stat(q)	\
+	((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH)
 #define blk_queue_dax(q)	((q)->limits.features & BLK_FEAT_DAX)
 #define blk_queue_pci_p2pdma(q)	((q)->limits.features & BLK_FEAT_PCI_P2PDMA)
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
···
 #define for_each_bio(_bio)		\
	for (; _bio; _bio = _bio->bi_next)
 
+int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
+				 const struct attribute_group **groups,
+				 struct fwnode_handle *fwnode);
 int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
				 const struct attribute_group **groups);
 static inline int __must_check add_disk(struct gendisk *disk)
···
 int queue_limits_commit_update(struct request_queue *q,
		struct queue_limits *lim);
 int queue_limits_set(struct request_queue *q, struct queue_limits *lim);
+int blk_validate_limits(struct queue_limits *lim);
 
 /**
  * queue_limits_cancel_update - cancel an atomic update of queue limits
···
 
 void blk_mark_disk_dead(struct gendisk *disk);
 
+struct rq_list {
+	struct request *head;
+	struct request *tail;
+};
+
 #ifdef CONFIG_BLOCK
 /*
  * blk_plug permits building a queue of related requests by holding the I/O
···
  * blk_flush_plug() is called.
  */
 struct blk_plug {
-	struct request *mq_list; /* blk-mq requests */
+	struct rq_list mq_list; /* blk-mq requests */
 
	/* if ios_left is > 1, we can batch tag/rq allocations */
-	struct request *cached_rq;
+	struct rq_list cached_rqs;
	u64 cur_ktime;
	unsigned short nr_ios;
···
  */
 #define BLK_DEF_MAX_SECTORS_CAP	2560u
 
+static inline struct queue_limits *bdev_limits(struct block_device *bdev)
+{
+	return &bdev_get_queue(bdev)->limits;
+}
+
 static inline unsigned long queue_segment_boundary(const struct request_queue *q)
 {
	return q->limits.seg_boundary_mask;
···
	return q->limits.max_segment_size;
 }
 
-static inline unsigned int
-queue_limits_max_zone_append_sectors(const struct queue_limits *l)
-{
-	unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
-
-	return min_not_zero(l->max_zone_append_sectors, max_sectors);
-}
-
-static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
-{
-	if (!blk_queue_is_zoned(q))
-		return 0;
-
-	return queue_limits_max_zone_append_sectors(&q->limits);
-}
-
 static inline bool queue_emulates_zone_append(struct request_queue *q)
 {
-	return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+	return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors;
 }
 
 static inline bool bdev_emulates_zone_append(struct block_device *bdev)
···
 static inline unsigned int
 bdev_max_zone_append_sectors(struct block_device *bdev)
 {
-	return queue_max_zone_append_sectors(bdev_get_queue(bdev));
+	return bdev_limits(bdev)->max_zone_append_sectors;
 }
 
 static inline unsigned int bdev_max_segments(struct block_device *bdev)
···
 
 static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.max_discard_sectors;
+	return bdev_limits(bdev)->max_discard_sectors;
 }
 
 static inline unsigned int bdev_discard_granularity(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.discard_granularity;
+	return bdev_limits(bdev)->discard_granularity;
 }
 
 static inline unsigned int
 bdev_max_secure_erase_sectors(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.max_secure_erase_sectors;
+	return bdev_limits(bdev)->max_secure_erase_sectors;
 }
 
 static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.max_write_zeroes_sectors;
+	return bdev_limits(bdev)->max_write_zeroes_sectors;
 }
 
 static inline bool bdev_nonrot(struct block_device *bdev)
···
 
 static inline bool bdev_fua(struct block_device *bdev)
 {
-	return bdev_get_queue(bdev)->limits.features & BLK_FEAT_FUA;
+	return bdev_limits(bdev)->features & BLK_FEAT_FUA;
 }
 
 static inline bool bdev_nowait(struct block_device *bdev)
···
						 sector_t sector)
 {
	return bdev_offset_from_zone_start(bdev, sector) == 0;
+}
+
+/**
+ * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
+ * @bdev:	block device to check
+ * @sector:	sector number
+ *
+ * Check if @sector on @bdev is contained in a sequential write required zone.
+ */
+static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
+{
+	bool is_seq = false;
+
+#if IS_ENABLED(CONFIG_BLK_DEV_ZONED)
+	if (bdev_is_zoned(bdev)) {
+		struct gendisk *disk = bdev->bd_disk;
+		unsigned long *bitmap;
+
+		rcu_read_lock();
+		bitmap = rcu_dereference(disk->conv_zones_bitmap);
+		is_seq = !bitmap ||
+			!test_bit(disk_zone_no(disk, sector), bitmap);
+		rcu_read_unlock();
+	}
+#endif
+
+	return is_seq;
 }
 
 static inline int queue_dma_alignment(const struct request_queue *q)
···
 void bdev_fput(struct file *bdev_file);
 
 struct io_comp_batch {
-	struct request *req_list;
+	struct rq_list req_list;
	bool need_ts;
	void (*complete)(struct io_comp_batch *);
 };
+132 -3
include/linux/nvme.h
···
	__le32			sanicap;
	__le32			hmminds;
	__le16			hmmaxd;
-	__u8			rsvd338[4];
+	__le16			nvmsetidmax;
+	__le16			endgidmax;
	__u8			anatt;
	__u8			anacap;
	__le32			anagrpmax;
···
	NVME_ID_CNS_NS_DESC_LIST	= 0x03,
	NVME_ID_CNS_CS_NS		= 0x05,
	NVME_ID_CNS_CS_CTRL		= 0x06,
+	NVME_ID_CNS_NS_ACTIVE_LIST_CS	= 0x07,
	NVME_ID_CNS_NS_CS_INDEP		= 0x08,
	NVME_ID_CNS_NS_PRESENT_LIST	= 0x10,
	NVME_ID_CNS_NS_PRESENT		= 0x11,
···
	NVME_ID_CNS_SCNDRY_CTRL_LIST	= 0x15,
	NVME_ID_CNS_NS_GRANULARITY	= 0x16,
	NVME_ID_CNS_UUID_LIST		= 0x17,
+	NVME_ID_CNS_ENDGRP_LIST		= 0x19,
 };
 
 enum {
···
	NVME_NS_FLBAS_LBA_SHIFT	= 1,
	NVME_NS_FLBAS_META_EXT	= 0x10,
	NVME_NS_NMIC_SHARED	= 1 << 0,
+	NVME_NS_ROTATIONAL	= 1 << 4,
+	NVME_NS_VWC_NOT_PRESENT	= 1 << 5,
	NVME_LBAF_RP_BEST	= 0,
	NVME_LBAF_RP_BETTER	= 1,
	NVME_LBAF_RP_GOOD	= 2,
···
	NVME_NIDT_NGUID		= 0x02,
	NVME_NIDT_UUID		= 0x03,
	NVME_NIDT_CSI		= 0x04,
+};
+
+struct nvme_endurance_group_log {
+	__u8	egcw;
+	__u8	egfeat;
+	__u8	rsvd2;
+	__u8	avsp;
+	__u8	avspt;
+	__u8	pused;
+	__le16	did;
+	__u8	rsvd8[24];
+	__u8	ee[16];
+	__u8	dur[16];
+	__u8	duw[16];
+	__u8	muw[16];
+	__u8	hrc[16];
+	__u8	hwc[16];
+	__u8	mdie[16];
+	__u8	neile[16];
+	__u8	tegcap[16];
+	__u8	uegcap[16];
+	__u8	rsvd192[320];
+};
+
+struct nvme_rotational_media_log {
+	__le16	endgid;
+	__le16	numa;
+	__le16	nrs;
+	__u8	rsvd6[2];
+	__le32	spinc;
+	__le32	fspinc;
+	__le32	ldc;
+	__le32	fldc;
+	__u8	rsvd24[488];
 };
 
 struct nvme_smart_log {
···
	NVME_FEAT_WRITE_PROTECT	= 0x84,
	NVME_FEAT_VENDOR_START	= 0xC0,
	NVME_FEAT_VENDOR_END	= 0xFF,
+	NVME_LOG_SUPPORTED	= 0x00,
	NVME_LOG_ERROR		= 0x01,
	NVME_LOG_SMART		= 0x02,
	NVME_LOG_FW_SLOT	= 0x03,
···
	NVME_LOG_TELEMETRY_CTRL	= 0x08,
	NVME_LOG_ENDURANCE_GROUP = 0x09,
	NVME_LOG_ANA		= 0x0c,
+	NVME_LOG_FEATURES	= 0x12,
+	NVME_LOG_RMI		= 0x16,
	NVME_LOG_DISC		= 0x70,
	NVME_LOG_RESERVATION	= 0x80,
	NVME_FWACT_REPL		= (0 << 3),
	NVME_FWACT_REPL_ACTV	= (1 << 3),
	NVME_FWACT_ACTV		= (2 << 3),
+};
+
+struct nvme_supported_log {
+	__le32	lids[256];
+};
+
+enum {
+	NVME_LIDS_LSUPP	= 1 << 0,
+};
+
+struct nvme_supported_features_log {
+	__le32	fis[256];
+};
+
+enum {
+	NVME_FIS_FSUPP	= 1 << 0,
+	NVME_FIS_NSCPE	= 1 << 20,
+	NVME_FIS_CSCPE	= 1 << 21,
 };
 
 /* NVMe Namespace Write Protect State */
···
	__u8			cns;
	__u8			rsvd3;
	__le16			ctrlid;
-	__u8			rsvd11[3];
+	__le16			cnssid;
+	__u8			rsvd11;
	__u8			csi;
	__u32			rsvd12[4];
 };
···
	__u8			lsp; /* upper 4 bits reserved */
	__le16			numdl;
	__le16			numdu;
-	__u16			rsvd11;
+	__le16			lsi;
	union {
		struct {
			__le32 lpol;
···
 #define NVME_MAJOR(ver)		((ver) >> 16)
 #define NVME_MINOR(ver)		(((ver) >> 8) & 0xff)
 #define NVME_TERTIARY(ver)	((ver) & 0xff)
+
+enum {
+	NVME_AEN_RESV_LOG_PAGE_AVALIABLE	= 0x00,
+};
+
+enum {
+	NVME_PR_LOG_EMPTY_LOG_PAGE		= 0x00,
+	NVME_PR_LOG_REGISTRATION_PREEMPTED	= 0x01,
+	NVME_PR_LOG_RESERVATION_RELEASED	= 0x02,
+	NVME_PR_LOG_RESERVATOIN_PREEMPTED	= 0x03,
+};
+
+enum {
+	NVME_PR_NOTIFY_BIT_REG_PREEMPTED	= 1,
+	NVME_PR_NOTIFY_BIT_RESV_RELEASED	= 2,
+	NVME_PR_NOTIFY_BIT_RESV_PREEMPTED	= 3,
+};
+
+struct nvme_pr_log {
+	__le64			count;
+	__u8			type;
+	__u8			nr_pages;
+	__u8			rsvd1[2];
+	__le32			nsid;
+	__u8			rsvd2[48];
+};
+
+struct nvmet_pr_register_data {
+	__le64	crkey;
+	__le64	nrkey;
+};
+
+struct nvmet_pr_acquire_data {
+	__le64	crkey;
+	__le64	prkey;
+};
+
+struct nvmet_pr_release_data {
+	__le64	crkey;
+};
+
+enum nvme_pr_capabilities {
+	NVME_PR_SUPPORT_PTPL				= 1,
+	NVME_PR_SUPPORT_WRITE_EXCLUSIVE			= 1 << 1,
+	NVME_PR_SUPPORT_EXCLUSIVE_ACCESS		= 1 << 2,
+	NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY	= 1 << 3,
+	NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY	= 1 << 4,
+	NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS	= 1 << 5,
+	NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS	= 1 << 6,
+	NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF		= 1 << 7,
+};
+
+enum nvme_pr_register_action {
+	NVME_PR_REGISTER_ACT_REG		= 0,
+	NVME_PR_REGISTER_ACT_UNREG		= 1,
+	NVME_PR_REGISTER_ACT_REPLACE		= 1 << 1,
+};
+
+enum nvme_pr_acquire_action {
+	NVME_PR_ACQUIRE_ACT_ACQUIRE		= 0,
+	NVME_PR_ACQUIRE_ACT_PREEMPT		= 1,
+	NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT	= 1 << 1,
+};
+
+enum nvme_pr_release_action {
+	NVME_PR_RELEASE_ACT_RELEASE	= 0,
+	NVME_PR_RELEASE_ACT_CLEAR	= 1,
+};
 
 #endif /* _LINUX_NVME_H */
+1
include/linux/sed-opal.h
···
	case IOC_OPAL_GET_GEOMETRY:
	case IOC_OPAL_DISCOVERY:
	case IOC_OPAL_REVERT_LSP:
+	case IOC_OPAL_SET_SID_PW:
		return true;
	}
	return false;
+3 -3
include/trace/events/block.h
···
		__entry->dev	   = rq->q->disk ? disk_devt(rq->q->disk) : 0;
		__entry->sector    = blk_rq_trace_sector(rq);
		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
-		__entry->ioprio    = rq->ioprio;
+		__entry->ioprio    = req_get_ioprio(rq);
 
		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
		__get_str(cmd)[0] = '\0';
···
		__entry->sector    = blk_rq_pos(rq);
		__entry->nr_sector = nr_bytes >> 9;
		__entry->error     = blk_status_to_errno(error);
-		__entry->ioprio    = rq->ioprio;
+		__entry->ioprio    = req_get_ioprio(rq);
 
		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
		__get_str(cmd)[0] = '\0';
···
		__entry->sector    = blk_rq_trace_sector(rq);
		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
		__entry->bytes     = blk_rq_bytes(rq);
-		__entry->ioprio    = rq->ioprio;
+		__entry->ioprio    = req_get_ioprio(rq);
 
		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
		__get_str(cmd)[0] = '\0';
+1
include/uapi/linux/sed-opal.h
···
 #define IOC_OPAL_GET_GEOMETRY	_IOR('p', 238, struct opal_geometry)
 #define IOC_OPAL_DISCOVERY	_IOW('p', 239, struct opal_discovery)
 #define IOC_OPAL_REVERT_LSP	_IOW('p', 240, struct opal_revert_lsp)
+#define IOC_OPAL_SET_SID_PW	_IOW('p', 241, struct opal_new_pw)
 
 #endif /* _UAPI_SED_OPAL_H */
+18
include/uapi/linux/ublk_cmd.h
···
  */
 #define UBLK_F_NEED_GET_DATA (1UL << 2)
 
+/*
+ * - Block devices are recoverable if ublk server exits and restarts
+ * - Outstanding I/O when ublk server exits is met with errors
+ * - I/O issued while there is no ublk server queues
+ */
 #define UBLK_F_USER_RECOVERY	(1UL << 3)
 
+/*
+ * - Block devices are recoverable if ublk server exits and restarts
+ * - Outstanding I/O when ublk server exits is reissued
+ * - I/O issued while there is no ublk server queues
+ */
 #define UBLK_F_USER_RECOVERY_REISSUE	(1UL << 4)
 
 /*
···
  */
 #define UBLK_F_ZONED (1ULL << 8)
 
+/*
+ * - Block devices are recoverable if ublk server exits and restarts
+ * - Outstanding I/O when ublk server exits is met with errors
+ * - I/O issued while there is no ublk server is met with errors
+ */
+#define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
 #define UBLK_S_DEV_QUIESCED	2
+#define UBLK_S_DEV_FAIL_IO	3
 
 /* shipped via sqe->cmd of io_uring command */
 struct ublksrv_ctrl_cmd {
+2 -2
io_uring/rw.c
···
		poll_flags |= BLK_POLL_ONESHOT;
 
		/* iopoll may have completed current req */
-		if (!rq_list_empty(iob.req_list) ||
+		if (!rq_list_empty(&iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}
 
-	if (!rq_list_empty(iob.req_list))
+	if (!rq_list_empty(&iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;
+46 -22
lib/iov_iter.c
···
 }
 
 /*
- * Extract a list of contiguous pages from an ITER_BVEC iterator.  This does
- * not get references on the pages, nor does it get a pin on them.
+ * Extract a list of virtually contiguous pages from an ITER_BVEC iterator.
+ * This does not get references on the pages, nor does it get a pin on them.
  */
 static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
···
					   iov_iter_extraction_t extraction_flags,
					   size_t *offset0)
 {
-	struct page **p, *page;
-	size_t skip = i->iov_offset, offset, size;
-	int k;
+	size_t skip = i->iov_offset, size = 0;
+	struct bvec_iter bi;
+	int k = 0;
 
-	for (;;) {
-		if (i->nr_segs == 0)
-			return 0;
-		size = min(maxsize, i->bvec->bv_len - skip);
-		if (size)
-			break;
+	if (i->nr_segs == 0)
+		return 0;
+
+	if (i->iov_offset == i->bvec->bv_len) {
		i->iov_offset = 0;
		i->nr_segs--;
		i->bvec++;
		skip = 0;
	}
+	bi.bi_idx = 0;
+	bi.bi_size = maxsize;
+	bi.bi_bvec_done = skip;
 
-	skip += i->bvec->bv_offset;
-	page = i->bvec->bv_page + skip / PAGE_SIZE;
-	offset = skip % PAGE_SIZE;
-	*offset0 = offset;
+	maxpages = want_pages_array(pages, maxsize, skip, maxpages);
 
-	maxpages = want_pages_array(pages, size, offset, maxpages);
-	if (!maxpages)
-		return -ENOMEM;
-	p = *pages;
-	for (k = 0; k < maxpages; k++)
-		p[k] = page + k;
+	while (bi.bi_size && bi.bi_idx < i->nr_segs) {
+		struct bio_vec bv = bvec_iter_bvec(i->bvec, bi);
 
-	size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
+		/*
+		 * The iov_iter_extract_pages interface only allows an offset
+		 * into the first page.  Break out of the loop if we see an
+		 * offset into subsequent pages, the caller will have to call
+		 * iov_iter_extract_pages again for the reminder.
+		 */
+		if (k) {
+			if (bv.bv_offset)
+				break;
+		} else {
+			*offset0 = bv.bv_offset;
+		}
+
+		(*pages)[k++] = bv.bv_page;
+		size += bv.bv_len;
+
+		if (k >= maxpages)
+			break;
+
+		/*
+		 * We are done when the end of the bvec doesn't align to a page
+		 * boundary as that would create a hole in the returned space.
+		 * The caller will handle this with another call to
+		 * iov_iter_extract_pages.
+		 */
+		if (bv.bv_offset + bv.bv_len != PAGE_SIZE)
+			break;
+
+		bvec_iter_advance_single(i->bvec, &bi, bv.bv_len);
+	}
+
	iov_iter_advance(i, size);
	return size;
 }