Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

- NVMe pull requests via Christoph:
- handle number of queue changes in the TCP and RDMA drivers
(Daniel Wagner)
- allow changing the number of queues in nvmet (Daniel Wagner)
- also consider host_iface when checking ip options (Daniel
Wagner)
- don't map pages which can't come from HIGHMEM (Fabio M. De
Francesco)
- avoid unnecessary flush bios in nvmet (Guixin Liu)
- shrink and better pack the nvme_iod structure (Keith Busch)
- add comment for unaligned "fake" nqn (Linjun Bao)
- print actual source IP address through sysfs "address" attr
(Martin Belanger)
- various cleanups (Jackie Liu, Wolfram Sang, Genjian Zhang)
- handle effects after freeing the request (Keith Busch)
- copy firmware_rev on each init (Keith Busch)
- restrict management ioctls to admin (Keith Busch)
- ensure subsystem reset is single threaded (Keith Busch)
- report the actual number of tagset maps in nvme-pci (Keith
Busch)
- small fabrics authentication fixups (Christoph Hellwig)
- add common code for tagset allocation and freeing (Christoph
Hellwig)
- stop using the request_queue in nvmet (Christoph Hellwig)
- set min_align_mask before calculating max_hw_sectors (Rishabh
Bhatnagar)
- send a rediscover uevent when a persistent discovery controller
reconnects (Sagi Grimberg)
- misc nvmet-tcp fixes (Varun Prakash, zhenwei pi)

- MD pull request via Song:
- Various raid5 fix and clean up, by Logan Gunthorpe and David
Sloan.
- Raid10 performance optimization, by Yu Kuai.

- sbitmap wakeup hang fixes (Hugh, Keith, Jan, Yu)

- IO scheduler switching quiesce fix (Keith)

- s390/dasd block driver updates (Stefan)

- support for recovery for the ublk driver (ZiyangZhang)

- rnbd drivers fixes and updates (Guoqing, Santosh, ye, Christoph)

- blk-mq and null_blk map fixes (Bart)

- various bcache fixes (Coly, Jilin, Jules)

- nbd signal hang fix (Shigeru)

- block writeback throttling fix (Yu)

- optimize the passthrough mapping handling (me)

- prepare block cgroups to being gendisk based (Christoph)

- get rid of an old PSI hack in the block layer, moving it to the
callers instead where it belongs (Christoph)

- blk-throttle fixes and cleanups (Yu)

- misc fixes and cleanups (Liu Shixin, Liu Song, Miaohe, Pankaj,
Ping-Xiang, Wolfram, Saurabh, Li Jinlin, Li Lei, Lin, Li zeming,
Miaohe, Bart, Coly, Gaosheng)

* tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux: (162 commits)
sbitmap: fix lockup while swapping
block: add rationale for not using blk_mq_plug() when applicable
block: adapt blk_mq_plug() to not plug for writes that require a zone lock
s390/dasd: use blk_mq_alloc_disk
blk-cgroup: don't update the blkg lookup hint in blkg_conf_prep
nvmet: don't look at the request_queue in nvmet_bdev_set_limits
nvmet: don't look at the request_queue in nvmet_bdev_zone_mgmt_emulate_all
blk-mq: use quiesced elevator switch when reinitializing queues
block: replace blk_queue_nowait with bdev_nowait
nvme: remove nvme_ctrl_init_connect_q
nvme-loop: use the tagset alloc/free helpers
nvme-loop: store the generic nvme_ctrl in set->driver_data
nvme-loop: initialize sqsize later
nvme-fc: use the tagset alloc/free helpers
nvme-fc: store the generic nvme_ctrl in set->driver_data
nvme-fc: keep ctrl->sqsize in sync with opts->queue_size
nvme-rdma: use the tagset alloc/free helpers
nvme-rdma: store the generic nvme_ctrl in set->driver_data
nvme-tcp: use the tagset alloc/free helpers
nvme-tcp: store the generic nvme_ctrl in set->driver_data
...

+3209 -1649
+9
MAINTAINERS
··· 14582 14582 F: include/linux/nvme* 14583 14583 F: include/uapi/linux/nvme_ioctl.h 14584 14584 14585 + NVM EXPRESS FABRICS AUTHENTICATION 14586 + M: Hannes Reinecke <hare@suse.de> 14587 + L: linux-nvme@lists.infradead.org 14588 + S: Supported 14589 + F: drivers/nvme/host/auth.c 14590 + F: drivers/nvme/target/auth.c 14591 + F: drivers/nvme/target/fabrics-cmd-auth.c 14592 + F: include/linux/nvme-auth.h 14593 + 14585 14594 NVM EXPRESS FC TRANSPORT DRIVERS 14586 14595 M: James Smart <james.smart@broadcom.com> 14587 14596 L: linux-nvme@lists.infradead.org
+5
arch/s390/include/asm/scsw.h
··· 215 215 #define SNS2_ENV_DATA_PRESENT 0x10 216 216 #define SNS2_INPRECISE_END 0x04 217 217 218 + /* 219 + * architectured values for PPRC errors 220 + */ 221 + #define SNS7_INVALID_ON_SEC 0x0e 222 + 218 223 /** 219 224 * scsw_is_tm - check for transport mode scsw 220 225 * @scsw: pointer to scsw
+14
arch/s390/include/uapi/asm/dasd.h
··· 183 183 } format_data_t; 184 184 185 185 /* 186 + * struct dasd_copypair_swap_data_t 187 + * represents all data necessary to issue a swap of the copy pair relation 188 + */ 189 + struct dasd_copypair_swap_data_t { 190 + char primary[20]; /* BUSID of primary */ 191 + char secondary[20]; /* BUSID of secondary */ 192 + 193 + /* Reserved for future updates. */ 194 + __u8 reserved[64]; 195 + }; 196 + 197 + /* 186 198 * values to be used for format_data_t.intensity 187 199 * 0/8: normal format 188 200 * 1/9: also write record zero ··· 338 326 #define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) 339 327 /* Release Allocated Space */ 340 328 #define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t) 329 + /* Swap copy pair relation */ 330 + #define BIODASDCOPYPAIRSWAP _IOW(DASD_IOCTL_LETTER, 4, struct dasd_copypair_swap_data_t) 341 331 342 332 /* Get Sense Path Group ID (SNID) data */ 343 333 #define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data)
-5
block/bfq-cgroup.c
··· 254 254 255 255 #else /* CONFIG_BFQ_CGROUP_DEBUG */ 256 256 257 - void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, 258 - blk_opf_t opf) { } 259 257 void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } 260 258 void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } 261 259 void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, 262 260 u64 io_start_time_ns, blk_opf_t opf) { } 263 261 void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } 264 - void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } 265 - void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } 266 262 void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } 267 - void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } 268 263 269 264 #endif /* CONFIG_BFQ_CGROUP_DEBUG */ 270 265
+6 -8
block/bfq-iosched.c
··· 1925 1925 bfqq->service_from_backlogged = 0; 1926 1926 bfq_clear_bfqq_softrt_update(bfqq); 1927 1927 1928 - bfq_add_bfqq_busy(bfqd, bfqq); 1928 + bfq_add_bfqq_busy(bfqq); 1929 1929 1930 1930 /* 1931 1931 * Expire in-service queue if preemption may be needed for ··· 2419 2419 bfqq->next_rq = NULL; 2420 2420 2421 2421 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { 2422 - bfq_del_bfqq_busy(bfqd, bfqq, false); 2422 + bfq_del_bfqq_busy(bfqq, false); 2423 2423 /* 2424 2424 * bfqq emptied. In normal operation, when 2425 2425 * bfqq is empty, bfqq->entity.service and ··· 3098 3098 */ 3099 3099 if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && 3100 3100 bfqq != bfqd->in_service_queue) 3101 - bfq_del_bfqq_busy(bfqd, bfqq, false); 3101 + bfq_del_bfqq_busy(bfqq, false); 3102 3102 3103 3103 bfq_reassign_last_bfqq(bfqq, NULL); 3104 3104 ··· 3908 3908 */ 3909 3909 bfqq->budget_timeout = jiffies; 3910 3910 3911 - bfq_del_bfqq_busy(bfqd, bfqq, true); 3911 + bfq_del_bfqq_busy(bfqq, true); 3912 3912 } else { 3913 3913 bfq_requeue_bfqq(bfqd, bfqq, true); 3914 3914 /* ··· 5255 5255 struct hlist_node *n; 5256 5256 struct bfq_group *bfqg = bfqq_group(bfqq); 5257 5257 5258 - if (bfqq->bfqd) 5259 - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", 5260 - bfqq, bfqq->ref); 5258 + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); 5261 5259 5262 5260 bfqq->ref--; 5263 5261 if (bfqq->ref) ··· 5319 5321 hlist_del_init(&item->woken_list_node); 5320 5322 } 5321 5323 5322 - if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq) 5324 + if (bfqq->bfqd->last_completed_rq_bfqq == bfqq) 5323 5325 bfqq->bfqd->last_completed_rq_bfqq = NULL; 5324 5326 5325 5327 kmem_cache_free(bfq_pool, bfqq);
+10 -8
block/bfq-iosched.h
··· 993 993 /* ---------------- cgroups-support interface ---------------- */ 994 994 995 995 void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); 996 - void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, 997 - blk_opf_t opf); 998 996 void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf); 999 997 void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf); 1000 998 void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, 1001 999 u64 io_start_time_ns, blk_opf_t opf); 1002 1000 void bfqg_stats_update_dequeue(struct bfq_group *bfqg); 1003 - void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); 1004 - void bfqg_stats_update_idle_time(struct bfq_group *bfqg); 1005 1001 void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); 1006 - void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); 1007 1002 void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1008 1003 struct bfq_group *bfqg); 1004 + 1005 + #ifdef CONFIG_BFQ_CGROUP_DEBUG 1006 + void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, 1007 + blk_opf_t opf); 1008 + void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); 1009 + void bfqg_stats_update_idle_time(struct bfq_group *bfqg); 1010 + void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); 1011 + #endif 1009 1012 1010 1013 void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); 1011 1014 void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); ··· 1080 1077 void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); 1081 1078 void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1082 1079 bool expiration); 1083 - void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1084 - bool expiration); 1085 - void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); 1080 + void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool 
expiration); 1081 + void bfq_add_bfqq_busy(struct bfq_queue *bfqq); 1086 1082 1087 1083 /* --------------- end of interface of B-WF2Q+ ---------------- */ 1088 1084
+6 -3
block/bfq-wf2q.c
··· 1651 1651 * the service tree. As a special case, it can be invoked during an 1652 1652 * expiration. 1653 1653 */ 1654 - void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1655 - bool expiration) 1654 + void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) 1656 1655 { 1656 + struct bfq_data *bfqd = bfqq->bfqd; 1657 + 1657 1658 bfq_log_bfqq(bfqd, bfqq, "del from busy"); 1658 1659 1659 1660 bfq_clear_bfqq_busy(bfqq); ··· 1675 1674 /* 1676 1675 * Called when an inactive queue receives a new request. 1677 1676 */ 1678 - void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) 1677 + void bfq_add_bfqq_busy(struct bfq_queue *bfqq) 1679 1678 { 1679 + struct bfq_data *bfqd = bfqq->bfqd; 1680 + 1680 1681 bfq_log_bfqq(bfqd, bfqq, "add to busy"); 1681 1682 1682 1683 bfq_activate_bfqq(bfqd, bfqq);
+2 -11
block/bio.c
··· 760 760 static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) 761 761 { 762 762 bio_set_flag(bio, BIO_CLONED); 763 - if (bio_flagged(bio_src, BIO_THROTTLED)) 764 - bio_set_flag(bio, BIO_THROTTLED); 765 763 bio->bi_ioprio = bio_src->bi_ioprio; 766 764 bio->bi_iter = bio_src->bi_iter; 767 765 ··· 1063 1065 1064 1066 bio->bi_iter.bi_size += len; 1065 1067 bio->bi_vcnt++; 1066 - 1067 - if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) 1068 - bio_set_flag(bio, BIO_WORKINGSET); 1069 1068 } 1070 1069 EXPORT_SYMBOL_GPL(__bio_add_page); 1071 1070 ··· 1271 1276 * fit into the bio, or are requested in @iter, whatever is smaller. If 1272 1277 * MM encounters an error pinning the requested pages, it stops. Error 1273 1278 * is returned only if 0 pages could be pinned. 1274 - * 1275 - * It's intended for direct IO, so doesn't do PSI tracking, the caller is 1276 - * responsible for setting BIO_WORKINGSET if necessary. 1277 1279 */ 1278 1280 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) 1279 1281 { ··· 1286 1294 ret = __bio_iov_iter_get_pages(bio, iter); 1287 1295 } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); 1288 1296 1289 - /* don't account direct I/O as memory stall */ 1290 - bio_clear_flag(bio, BIO_WORKINGSET); 1291 1297 return bio->bi_vcnt ? 0 : ret; 1292 1298 } 1293 1299 EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); ··· 1744 1754 cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, 1745 1755 bio_cpu_dead); 1746 1756 1747 - if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) 1757 + if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, 1758 + BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) 1748 1759 panic("bio: can't allocate bios\n"); 1749 1760 1750 1761 if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
+66 -117
block/blk-cgroup.c
··· 202 202 /** 203 203 * blkg_alloc - allocate a blkg 204 204 * @blkcg: block cgroup the new blkg is associated with 205 - * @q: request_queue the new blkg is associated with 205 + * @disk: gendisk the new blkg is associated with 206 206 * @gfp_mask: allocation mask to use 207 207 * 208 208 * Allocate a new blkg assocating @blkcg and @q. 209 209 */ 210 - static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, 210 + static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, 211 211 gfp_t gfp_mask) 212 212 { 213 213 struct blkcg_gq *blkg; 214 214 int i, cpu; 215 215 216 216 /* alloc and init base part */ 217 - blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); 217 + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); 218 218 if (!blkg) 219 219 return NULL; 220 220 ··· 225 225 if (!blkg->iostat_cpu) 226 226 goto err_free; 227 227 228 - if (!blk_get_queue(q)) 228 + if (!blk_get_queue(disk->queue)) 229 229 goto err_free; 230 230 231 - blkg->q = q; 231 + blkg->q = disk->queue; 232 232 INIT_LIST_HEAD(&blkg->q_node); 233 233 spin_lock_init(&blkg->async_bio_lock); 234 234 bio_list_init(&blkg->async_bios); ··· 243 243 struct blkcg_policy *pol = blkcg_policy[i]; 244 244 struct blkg_policy_data *pd; 245 245 246 - if (!blkcg_policy_enabled(q, pol)) 246 + if (!blkcg_policy_enabled(disk->queue, pol)) 247 247 continue; 248 248 249 249 /* alloc per-policy data and attach it to blkg */ 250 - pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); 250 + pd = pol->pd_alloc_fn(gfp_mask, disk->queue, blkcg); 251 251 if (!pd) 252 252 goto err_free; 253 253 ··· 263 263 return NULL; 264 264 } 265 265 266 - struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, 267 - struct request_queue *q, bool update_hint) 268 - { 269 - struct blkcg_gq *blkg; 270 - 271 - /* 272 - * Hint didn't match. Look up from the radix tree. 
Note that the 273 - * hint can only be updated under queue_lock as otherwise @blkg 274 - * could have already been removed from blkg_tree. The caller is 275 - * responsible for grabbing queue_lock if @update_hint. 276 - */ 277 - blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); 278 - if (blkg && blkg->q == q) { 279 - if (update_hint) { 280 - lockdep_assert_held(&q->queue_lock); 281 - rcu_assign_pointer(blkcg->blkg_hint, blkg); 282 - } 283 - return blkg; 284 - } 285 - 286 - return NULL; 287 - } 288 - EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); 289 - 290 266 /* 291 267 * If @new_blkg is %NULL, this function tries to allocate a new one as 292 268 * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 293 269 */ 294 - static struct blkcg_gq *blkg_create(struct blkcg *blkcg, 295 - struct request_queue *q, 270 + static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, 296 271 struct blkcg_gq *new_blkg) 297 272 { 298 273 struct blkcg_gq *blkg; 299 274 int i, ret; 300 275 301 - lockdep_assert_held(&q->queue_lock); 276 + lockdep_assert_held(&disk->queue->queue_lock); 302 277 303 278 /* request_queue is dying, do not create/recreate a blkg */ 304 - if (blk_queue_dying(q)) { 279 + if (blk_queue_dying(disk->queue)) { 305 280 ret = -ENODEV; 306 281 goto err_free_blkg; 307 282 } ··· 289 314 290 315 /* allocate */ 291 316 if (!new_blkg) { 292 - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); 317 + new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); 293 318 if (unlikely(!new_blkg)) { 294 319 ret = -ENOMEM; 295 320 goto err_put_css; ··· 299 324 300 325 /* link parent */ 301 326 if (blkcg_parent(blkcg)) { 302 - blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); 327 + blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); 303 328 if (WARN_ON_ONCE(!blkg->parent)) { 304 329 ret = -ENODEV; 305 330 goto err_put_css; ··· 317 342 318 343 /* insert */ 319 344 spin_lock(&blkcg->lock); 320 - ret = 
radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 345 + ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); 321 346 if (likely(!ret)) { 322 347 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 323 - list_add(&blkg->q_node, &q->blkg_list); 348 + list_add(&blkg->q_node, &disk->queue->blkg_list); 324 349 325 350 for (i = 0; i < BLKCG_MAX_POLS; i++) { 326 351 struct blkcg_policy *pol = blkcg_policy[i]; ··· 349 374 /** 350 375 * blkg_lookup_create - lookup blkg, try to create one if not there 351 376 * @blkcg: blkcg of interest 352 - * @q: request_queue of interest 377 + * @disk: gendisk of interest 353 378 * 354 - * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to 379 + * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to 355 380 * create one. blkg creation is performed recursively from blkcg_root such 356 381 * that all non-root blkg's have access to the parent blkg. This function 357 - * should be called under RCU read lock and takes @q->queue_lock. 382 + * should be called under RCU read lock and takes @disk->queue->queue_lock. 358 383 * 359 384 * Returns the blkg or the closest blkg if blkg_create() fails as it walks 360 385 * down from root. 
361 386 */ 362 387 static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 363 - struct request_queue *q) 388 + struct gendisk *disk) 364 389 { 390 + struct request_queue *q = disk->queue; 365 391 struct blkcg_gq *blkg; 366 392 unsigned long flags; 367 393 ··· 373 397 return blkg; 374 398 375 399 spin_lock_irqsave(&q->queue_lock, flags); 376 - blkg = __blkg_lookup(blkcg, q, true); 377 - if (blkg) 400 + blkg = blkg_lookup(blkcg, q); 401 + if (blkg) { 402 + if (blkcg != &blkcg_root && 403 + blkg != rcu_dereference(blkcg->blkg_hint)) 404 + rcu_assign_pointer(blkcg->blkg_hint, blkg); 378 405 goto found; 406 + } 379 407 380 408 /* 381 409 * Create blkgs walking down from blkcg_root to @blkcg, so that all ··· 392 412 struct blkcg_gq *ret_blkg = q->root_blkg; 393 413 394 414 while (parent) { 395 - blkg = __blkg_lookup(parent, q, false); 415 + blkg = blkg_lookup(parent, q); 396 416 if (blkg) { 397 417 /* remember closest blkg */ 398 418 ret_blkg = blkg; ··· 402 422 parent = blkcg_parent(parent); 403 423 } 404 424 405 - blkg = blkg_create(pos, q, NULL); 425 + blkg = blkg_create(pos, disk, NULL); 406 426 if (IS_ERR(blkg)) { 407 427 blkg = ret_blkg; 408 428 break; ··· 456 476 percpu_ref_kill(&blkg->refcnt); 457 477 } 458 478 459 - /** 460 - * blkg_destroy_all - destroy all blkgs associated with a request_queue 461 - * @q: request_queue of interest 462 - * 463 - * Destroy all blkgs associated with @q. 464 - */ 465 - static void blkg_destroy_all(struct request_queue *q) 479 + static void blkg_destroy_all(struct gendisk *disk) 466 480 { 481 + struct request_queue *q = disk->queue; 467 482 struct blkcg_gq *blkg, *n; 468 483 int count = BLKG_DESTROY_BATCH_SIZE; 469 484 ··· 591 616 } 592 617 EXPORT_SYMBOL_GPL(__blkg_prfill_u64); 593 618 594 - /* Performs queue bypass and policy enabled checks then looks up blkg. 
*/ 595 - static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, 596 - const struct blkcg_policy *pol, 597 - struct request_queue *q) 598 - { 599 - WARN_ON_ONCE(!rcu_read_lock_held()); 600 - lockdep_assert_held(&q->queue_lock); 601 - 602 - if (!blkcg_policy_enabled(q, pol)) 603 - return ERR_PTR(-EOPNOTSUPP); 604 - return __blkg_lookup(blkcg, q, true /* update_hint */); 605 - } 606 - 607 619 /** 608 620 * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update 609 621 * @inputp: input string pointer ··· 646 684 __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) 647 685 { 648 686 struct block_device *bdev; 687 + struct gendisk *disk; 649 688 struct request_queue *q; 650 689 struct blkcg_gq *blkg; 651 690 int ret; ··· 654 691 bdev = blkcg_conf_open_bdev(&input); 655 692 if (IS_ERR(bdev)) 656 693 return PTR_ERR(bdev); 657 - 658 - q = bdev_get_queue(bdev); 694 + disk = bdev->bd_disk; 695 + q = disk->queue; 659 696 660 697 /* 661 698 * blkcg_deactivate_policy() requires queue to be frozen, we can grab ··· 668 705 rcu_read_lock(); 669 706 spin_lock_irq(&q->queue_lock); 670 707 671 - blkg = blkg_lookup_check(blkcg, pol, q); 672 - if (IS_ERR(blkg)) { 673 - ret = PTR_ERR(blkg); 708 + if (!blkcg_policy_enabled(q, pol)) { 709 + ret = -EOPNOTSUPP; 674 710 goto fail_unlock; 675 711 } 676 712 713 + blkg = blkg_lookup(blkcg, q); 677 714 if (blkg) 678 715 goto success; 679 716 ··· 687 724 struct blkcg_gq *new_blkg; 688 725 689 726 parent = blkcg_parent(blkcg); 690 - while (parent && !__blkg_lookup(parent, q, false)) { 727 + while (parent && !blkg_lookup(parent, q)) { 691 728 pos = parent; 692 729 parent = blkcg_parent(parent); 693 730 } ··· 696 733 spin_unlock_irq(&q->queue_lock); 697 734 rcu_read_unlock(); 698 735 699 - new_blkg = blkg_alloc(pos, q, GFP_KERNEL); 736 + new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); 700 737 if (unlikely(!new_blkg)) { 701 738 ret = -ENOMEM; 702 739 goto fail_exit_queue; ··· 711 748 rcu_read_lock(); 712 749 
spin_lock_irq(&q->queue_lock); 713 750 714 - blkg = blkg_lookup_check(pos, pol, q); 715 - if (IS_ERR(blkg)) { 716 - ret = PTR_ERR(blkg); 751 + if (!blkcg_policy_enabled(q, pol)) { 717 752 blkg_free(new_blkg); 753 + ret = -EOPNOTSUPP; 718 754 goto fail_preloaded; 719 755 } 720 756 757 + blkg = blkg_lookup(pos, q); 721 758 if (blkg) { 722 759 blkg_free(new_blkg); 723 760 } else { 724 - blkg = blkg_create(pos, q, new_blkg); 761 + blkg = blkg_create(pos, disk, new_blkg); 725 762 if (IS_ERR(blkg)) { 726 763 ret = PTR_ERR(blkg); 727 764 goto fail_preloaded; ··· 878 915 class_dev_iter_init(&iter, &block_class, NULL, &disk_type); 879 916 while ((dev = class_dev_iter_next(&iter))) { 880 917 struct block_device *bdev = dev_to_bdev(dev); 881 - struct blkcg_gq *blkg = 882 - blk_queue_root_blkg(bdev_get_queue(bdev)); 918 + struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; 883 919 struct blkg_iostat tmp; 884 920 int cpu; 885 921 unsigned long flags; ··· 1217 1255 return 0; 1218 1256 } 1219 1257 1220 - /** 1221 - * blkcg_init_queue - initialize blkcg part of request queue 1222 - * @q: request_queue to initialize 1223 - * 1224 - * Called from blk_alloc_queue(). Responsible for initializing blkcg 1225 - * part of new request_queue @q. 1226 - * 1227 - * RETURNS: 1228 - * 0 on success, -errno on failure. 1229 - */ 1230 - int blkcg_init_queue(struct request_queue *q) 1258 + int blkcg_init_disk(struct gendisk *disk) 1231 1259 { 1260 + struct request_queue *q = disk->queue; 1232 1261 struct blkcg_gq *new_blkg, *blkg; 1233 1262 bool preloaded; 1234 1263 int ret; 1235 1264 1236 1265 INIT_LIST_HEAD(&q->blkg_list); 1237 1266 1238 - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); 1267 + new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); 1239 1268 if (!new_blkg) 1240 1269 return -ENOMEM; 1241 1270 ··· 1235 1282 /* Make sure the root blkg exists. */ 1236 1283 /* spin_lock_irq can serve as RCU read-side critical section. 
*/ 1237 1284 spin_lock_irq(&q->queue_lock); 1238 - blkg = blkg_create(&blkcg_root, q, new_blkg); 1285 + blkg = blkg_create(&blkcg_root, disk, new_blkg); 1239 1286 if (IS_ERR(blkg)) 1240 1287 goto err_unlock; 1241 1288 q->root_blkg = blkg; ··· 1244 1291 if (preloaded) 1245 1292 radix_tree_preload_end(); 1246 1293 1247 - ret = blk_ioprio_init(q); 1294 + ret = blk_ioprio_init(disk); 1248 1295 if (ret) 1249 1296 goto err_destroy_all; 1250 1297 1251 - ret = blk_throtl_init(q); 1298 + ret = blk_throtl_init(disk); 1252 1299 if (ret) 1253 - goto err_destroy_all; 1300 + goto err_ioprio_exit; 1254 1301 1255 - ret = blk_iolatency_init(q); 1256 - if (ret) { 1257 - blk_throtl_exit(q); 1258 - blk_ioprio_exit(q); 1259 - goto err_destroy_all; 1260 - } 1302 + ret = blk_iolatency_init(disk); 1303 + if (ret) 1304 + goto err_throtl_exit; 1261 1305 1262 1306 return 0; 1263 1307 1308 + err_throtl_exit: 1309 + blk_throtl_exit(disk); 1310 + err_ioprio_exit: 1311 + blk_ioprio_exit(disk); 1264 1312 err_destroy_all: 1265 - blkg_destroy_all(q); 1313 + blkg_destroy_all(disk); 1266 1314 return ret; 1267 1315 err_unlock: 1268 1316 spin_unlock_irq(&q->queue_lock); ··· 1272 1318 return PTR_ERR(blkg); 1273 1319 } 1274 1320 1275 - /** 1276 - * blkcg_exit_queue - exit and release blkcg part of request_queue 1277 - * @q: request_queue being released 1278 - * 1279 - * Called from blk_exit_queue(). Responsible for exiting blkcg part. 
1280 - */ 1281 - void blkcg_exit_queue(struct request_queue *q) 1321 + void blkcg_exit_disk(struct gendisk *disk) 1282 1322 { 1283 - blkg_destroy_all(q); 1284 - blk_throtl_exit(q); 1323 + blkg_destroy_all(disk); 1324 + blk_throtl_exit(disk); 1285 1325 } 1286 1326 1287 1327 static void blkcg_bind(struct cgroup_subsys_state *root_css) ··· 1784 1836 1785 1837 /** 1786 1838 * blkcg_schedule_throttle - this task needs to check for throttling 1787 - * @q: the request queue IO was submitted on 1839 + * @gendisk: disk to throttle 1788 1840 * @use_memdelay: do we charge this to memory delay for PSI 1789 1841 * 1790 1842 * This is called by the IO controller when we know there's delay accumulated 1791 1843 * for the blkg for this task. We do not pass the blkg because there are places 1792 1844 * we call this that may not have that information, the swapping code for 1793 - * instance will only have a request_queue at that point. This set's the 1845 + * instance will only have a block_device at that point. This set's the 1794 1846 * notify_resume for the task to check and see if it requires throttling before 1795 1847 * returning to user space. 1796 1848 * ··· 1799 1851 * throttle once. If the task needs to be throttled again it'll need to be 1800 1852 * re-set at the next time we see the task. 1801 1853 */ 1802 - void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) 1854 + void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) 1803 1855 { 1856 + struct request_queue *q = disk->queue; 1857 + 1804 1858 if (unlikely(current->flags & PF_KTHREAD)) 1805 1859 return; 1806 1860 ··· 1852 1902 struct blkcg_gq *blkg, *ret_blkg = NULL; 1853 1903 1854 1904 rcu_read_lock(); 1855 - blkg = blkg_lookup_create(css_to_blkcg(css), 1856 - bdev_get_queue(bio->bi_bdev)); 1905 + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); 1857 1906 while (blkg) { 1858 1907 if (blkg_tryget(blkg)) { 1859 1908 ret_blkg = blkg;
+20 -48
block/blk-cgroup.h
··· 178 178 extern struct blkcg blkcg_root; 179 179 extern bool blkcg_debug_stats; 180 180 181 - struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, 182 - struct request_queue *q, bool update_hint); 183 - int blkcg_init_queue(struct request_queue *q); 184 - void blkcg_exit_queue(struct request_queue *q); 181 + int blkcg_init_disk(struct gendisk *disk); 182 + void blkcg_exit_disk(struct gendisk *disk); 185 183 186 184 /* Blkio controller policy registration */ 187 185 int blkcg_policy_register(struct blkcg_policy *pol); ··· 225 227 } 226 228 227 229 /** 228 - * __blkg_lookup - internal version of blkg_lookup() 230 + * blkg_lookup - lookup blkg for the specified blkcg - q pair 229 231 * @blkcg: blkcg of interest 230 232 * @q: request_queue of interest 231 - * @update_hint: whether to update lookup hint with the result or not 232 233 * 233 - * This is internal version and shouldn't be used by policy 234 - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of 235 - * @q's bypass state. If @update_hint is %true, the caller should be 236 - * holding @q->queue_lock and lookup hint is updated on success. 234 + * Lookup blkg for the @blkcg - @q pair. 235 + 236 + * Must be called in a RCU critical section. 237 237 */ 238 - static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 239 - struct request_queue *q, 240 - bool update_hint) 238 + static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, 239 + struct request_queue *q) 241 240 { 242 241 struct blkcg_gq *blkg; 242 + 243 + WARN_ON_ONCE(!rcu_read_lock_held()); 243 244 244 245 if (blkcg == &blkcg_root) 245 246 return q->root_blkg; ··· 247 250 if (blkg && blkg->q == q) 248 251 return blkg; 249 252 250 - return blkg_lookup_slowpath(blkcg, q, update_hint); 251 - } 252 - 253 - /** 254 - * blkg_lookup - lookup blkg for the specified blkcg - q pair 255 - * @blkcg: blkcg of interest 256 - * @q: request_queue of interest 257 - * 258 - * Lookup blkg for the @blkcg - @q pair. 
This function should be called 259 - * under RCU read lock. 260 - */ 261 - static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, 262 - struct request_queue *q) 263 - { 264 - WARN_ON_ONCE(!rcu_read_lock_held()); 265 - return __blkg_lookup(blkcg, q, false); 266 - } 267 - 268 - /** 269 - * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair 270 - * @q: request_queue of interest 271 - * 272 - * Lookup blkg for @q at the root level. See also blkg_lookup(). 273 - */ 274 - static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) 275 - { 276 - return q->root_blkg; 253 + blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); 254 + if (blkg && blkg->q != q) 255 + blkg = NULL; 256 + return blkg; 277 257 } 278 258 279 259 /** ··· 347 373 */ 348 374 #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ 349 375 css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ 350 - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ 351 - (p_blkg)->q, false))) 376 + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ 377 + (p_blkg)->q))) 352 378 353 379 /** 354 380 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants ··· 362 388 */ 363 389 #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ 364 390 css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ 365 - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ 366 - (p_blkg)->q, false))) 391 + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ 392 + (p_blkg)->q))) 367 393 368 394 bool __blkcg_punt_bio_submit(struct bio *bio); 369 395 ··· 481 507 }; 482 508 483 509 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } 484 - static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) 485 - { return NULL; } 486 - static inline int blkcg_init_queue(struct request_queue *q) { return 0; } 487 - static inline void blkcg_exit_queue(struct request_queue *q) { } 510 + static inline int 
blkcg_init_disk(struct gendisk *disk) { return 0; } 511 + static inline void blkcg_exit_disk(struct gendisk *disk) { } 488 512 static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } 489 513 static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } 490 514 static inline int blkcg_activate_policy(struct request_queue *q,
+11 -26
block/blk-core.c
··· 37 37 #include <linux/t10-pi.h> 38 38 #include <linux/debugfs.h> 39 39 #include <linux/bpf.h> 40 - #include <linux/psi.h> 41 40 #include <linux/part_stat.h> 42 41 #include <linux/sched/sysctl.h> 43 42 #include <linux/blk-crypto.h> ··· 486 487 late_initcall(fail_make_request_debugfs); 487 488 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 488 489 489 - static inline bool bio_check_ro(struct bio *bio) 490 + static inline void bio_check_ro(struct bio *bio) 490 491 { 491 492 if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { 492 493 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) 493 - return false; 494 + return; 494 495 pr_warn("Trying to write to read-only block-device %pg\n", 495 496 bio->bi_bdev); 496 497 /* Older lvm-tools actually trigger this */ 497 - return false; 498 498 } 499 - 500 - return false; 501 499 } 502 500 503 501 static noinline int should_fail_bio(struct bio *bio) ··· 713 717 * For a REQ_NOWAIT based request, return -EOPNOTSUPP 714 718 * if queue does not support NOWAIT. 715 719 */ 716 - if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) 720 + if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) 717 721 goto not_supported; 718 722 719 723 if (should_fail_bio(bio)) 720 724 goto end_io; 721 - if (unlikely(bio_check_ro(bio))) 722 - goto end_io; 725 + bio_check_ro(bio); 723 726 if (!bio_flagged(bio, BIO_REMAPPED)) { 724 727 if (unlikely(bio_check_eod(bio))) 725 728 goto end_io; ··· 809 814 * 810 815 * The success/failure status of the request, along with notification of 811 816 * completion, is delivered asynchronously through the ->bi_end_io() callback 812 - * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has 817 + * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has 813 818 * been called. 
814 819 */ 815 820 void submit_bio(struct bio *bio) ··· 822 827 count_vm_events(PGPGIN, bio_sectors(bio)); 823 828 } else if (bio_op(bio) == REQ_OP_WRITE) { 824 829 count_vm_events(PGPGOUT, bio_sectors(bio)); 825 - } 826 - 827 - /* 828 - * If we're reading data that is part of the userspace workingset, count 829 - * submission time as memory stall. When the device is congested, or 830 - * the submitting cgroup IO-throttled, submission can be a significant 831 - * part of overall IO time. 832 - */ 833 - if (unlikely(bio_op(bio) == REQ_OP_READ && 834 - bio_flagged(bio, BIO_WORKINGSET))) { 835 - unsigned long pflags; 836 - 837 - psi_memstall_enter(&pflags); 838 - submit_bio_noacct(bio); 839 - psi_memstall_leave(&pflags); 840 - return; 841 830 } 842 831 843 832 submit_bio_noacct(bio); ··· 850 871 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) 851 872 return 0; 852 873 874 + /* 875 + * As the requests that require a zone lock are not plugged in the 876 + * first place, directly accessing the plug instead of using 877 + * blk_mq_plug() should not have any consequences during flushing for 878 + * zoned devices. 879 + */ 853 880 blk_flush_plug(current->plug, false); 854 881 855 882 if (bio_queue_enter(bio))
+19 -20
block/blk-iocost.c
··· 664 664 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); 665 665 } 666 666 667 - static const char *q_name(struct request_queue *q) 668 - { 669 - if (blk_queue_registered(q)) 670 - return kobject_name(q->kobj.parent); 671 - else 672 - return "<unknown>"; 673 - } 674 - 675 667 static const char __maybe_unused *ioc_name(struct ioc *ioc) 676 668 { 677 - return q_name(ioc->rqos.q); 669 + struct gendisk *disk = ioc->rqos.q->disk; 670 + 671 + if (!disk) 672 + return "<unknown>"; 673 + return disk->disk_name; 678 674 } 679 675 680 676 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) ··· 1426 1430 int flags, void *key) 1427 1431 { 1428 1432 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); 1429 - struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; 1433 + struct iocg_wake_ctx *ctx = key; 1430 1434 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); 1431 1435 1432 1436 ctx->vbudget -= cost; ··· 2636 2640 if (use_debt) { 2637 2641 iocg_incur_debt(iocg, abs_cost, &now); 2638 2642 if (iocg_kick_delay(iocg, &now)) 2639 - blkcg_schedule_throttle(rqos->q, 2643 + blkcg_schedule_throttle(rqos->q->disk, 2640 2644 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); 2641 2645 iocg_unlock(iocg, ioc_locked, &flags); 2642 2646 return; ··· 2737 2741 if (likely(!list_empty(&iocg->active_list))) { 2738 2742 iocg_incur_debt(iocg, abs_cost, &now); 2739 2743 if (iocg_kick_delay(iocg, &now)) 2740 - blkcg_schedule_throttle(rqos->q, 2744 + blkcg_schedule_throttle(rqos->q->disk, 2741 2745 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); 2742 2746 } else { 2743 2747 iocg_commit_bio(iocg, bio, abs_cost, cost); ··· 2828 2832 .exit = ioc_rqos_exit, 2829 2833 }; 2830 2834 2831 - static int blk_iocost_init(struct request_queue *q) 2835 + static int blk_iocost_init(struct gendisk *disk) 2832 2836 { 2837 + struct request_queue *q = disk->queue; 2833 2838 struct ioc *ioc; 2834 2839 struct rq_qos *rqos; 2835 2840 int i, cpu, ret; ··· 3167 3170 size_t nbytes, loff_t off) 3168 3171 
{ 3169 3172 struct block_device *bdev; 3173 + struct gendisk *disk; 3170 3174 struct ioc *ioc; 3171 3175 u32 qos[NR_QOS_PARAMS]; 3172 3176 bool enable, user; ··· 3178 3180 if (IS_ERR(bdev)) 3179 3181 return PTR_ERR(bdev); 3180 3182 3181 - ioc = q_to_ioc(bdev_get_queue(bdev)); 3183 + disk = bdev->bd_disk; 3184 + ioc = q_to_ioc(disk->queue); 3182 3185 if (!ioc) { 3183 - ret = blk_iocost_init(bdev_get_queue(bdev)); 3186 + ret = blk_iocost_init(disk); 3184 3187 if (ret) 3185 3188 goto err; 3186 - ioc = q_to_ioc(bdev_get_queue(bdev)); 3189 + ioc = q_to_ioc(disk->queue); 3187 3190 } 3188 3191 3189 3192 spin_lock_irq(&ioc->lock); ··· 3261 3262 spin_lock_irq(&ioc->lock); 3262 3263 3263 3264 if (enable) { 3264 - blk_stat_enable_accounting(ioc->rqos.q); 3265 - blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); 3265 + blk_stat_enable_accounting(disk->queue); 3266 + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); 3266 3267 ioc->enabled = true; 3267 3268 } else { 3268 - blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); 3269 + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); 3269 3270 ioc->enabled = false; 3270 3271 } 3271 3272 ··· 3348 3349 3349 3350 ioc = q_to_ioc(bdev_get_queue(bdev)); 3350 3351 if (!ioc) { 3351 - ret = blk_iocost_init(bdev_get_queue(bdev)); 3352 + ret = blk_iocost_init(bdev->bd_disk); 3352 3353 if (ret) 3353 3354 goto err; 3354 3355 ioc = q_to_ioc(bdev_get_queue(bdev));
+3 -2
block/blk-iolatency.c
··· 292 292 unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); 293 293 294 294 if (use_delay) 295 - blkcg_schedule_throttle(rqos->q, use_memdelay); 295 + blkcg_schedule_throttle(rqos->q->disk, use_memdelay); 296 296 297 297 /* 298 298 * To avoid priority inversions we want to just take a slot if we are ··· 756 756 } 757 757 } 758 758 759 - int blk_iolatency_init(struct request_queue *q) 759 + int blk_iolatency_init(struct gendisk *disk) 760 760 { 761 + struct request_queue *q = disk->queue; 761 762 struct blk_iolatency *blkiolat; 762 763 struct rq_qos *rqos; 763 764 int ret;
+4 -4
block/blk-ioprio.c
··· 202 202 bio->bi_ioprio = prio; 203 203 } 204 204 205 - void blk_ioprio_exit(struct request_queue *q) 205 + void blk_ioprio_exit(struct gendisk *disk) 206 206 { 207 - blkcg_deactivate_policy(q, &ioprio_policy); 207 + blkcg_deactivate_policy(disk->queue, &ioprio_policy); 208 208 } 209 209 210 - int blk_ioprio_init(struct request_queue *q) 210 + int blk_ioprio_init(struct gendisk *disk) 211 211 { 212 - return blkcg_activate_policy(q, &ioprio_policy); 212 + return blkcg_activate_policy(disk->queue, &ioprio_policy); 213 213 } 214 214 215 215 static int __init ioprio_init(void)
+4 -4
block/blk-ioprio.h
··· 9 9 struct bio; 10 10 11 11 #ifdef CONFIG_BLK_CGROUP_IOPRIO 12 - int blk_ioprio_init(struct request_queue *q); 13 - void blk_ioprio_exit(struct request_queue *q); 12 + int blk_ioprio_init(struct gendisk *disk); 13 + void blk_ioprio_exit(struct gendisk *disk); 14 14 void blkcg_set_ioprio(struct bio *bio); 15 15 #else 16 - static inline int blk_ioprio_init(struct request_queue *q) 16 + static inline int blk_ioprio_init(struct gendisk *disk) 17 17 { 18 18 return 0; 19 19 } 20 - static inline void blk_ioprio_exit(struct request_queue *q) 20 + static inline void blk_ioprio_exit(struct gendisk *disk) 21 21 { 22 22 } 23 23 static inline void blkcg_set_ioprio(struct bio *bio)
+38 -14
block/blk-map.c
··· 158 158 bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); 159 159 160 160 if (map_data) { 161 - nr_pages = 1 << map_data->page_order; 161 + nr_pages = 1U << map_data->page_order; 162 162 i = map_data->offset / PAGE_SIZE; 163 163 } 164 164 while (len) { ··· 231 231 return ret; 232 232 } 233 233 234 + static void bio_map_put(struct bio *bio) 235 + { 236 + if (bio->bi_opf & REQ_ALLOC_CACHE) { 237 + bio_put(bio); 238 + } else { 239 + bio_uninit(bio); 240 + kfree(bio); 241 + } 242 + } 243 + 234 244 static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, 235 245 gfp_t gfp_mask) 236 246 { ··· 253 243 if (!iov_iter_count(iter)) 254 244 return -EINVAL; 255 245 256 - bio = bio_kmalloc(nr_vecs, gfp_mask); 257 - if (!bio) 258 - return -ENOMEM; 259 - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); 246 + if (rq->cmd_flags & REQ_POLLED) { 247 + blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE; 248 + 249 + bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask, 250 + &fs_bio_set); 251 + if (!bio) 252 + return -ENOMEM; 253 + } else { 254 + bio = bio_kmalloc(nr_vecs, gfp_mask); 255 + if (!bio) 256 + return -ENOMEM; 257 + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); 258 + } 260 259 261 260 while (iov_iter_count(iter)) { 262 - struct page **pages; 261 + struct page **pages, *stack_pages[UIO_FASTIOV]; 263 262 ssize_t bytes; 264 - size_t offs, added = 0; 263 + size_t offs; 265 264 int npages; 266 265 267 - bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs); 266 + if (nr_vecs <= ARRAY_SIZE(stack_pages)) { 267 + pages = stack_pages; 268 + bytes = iov_iter_get_pages2(iter, pages, LONG_MAX, 269 + nr_vecs, &offs); 270 + } else { 271 + bytes = iov_iter_get_pages_alloc2(iter, &pages, 272 + LONG_MAX, &offs); 273 + } 268 274 if (unlikely(bytes <= 0)) { 269 275 ret = bytes ? 
bytes : -EFAULT; 270 276 goto out_unmap; ··· 306 280 break; 307 281 } 308 282 309 - added += n; 310 283 bytes -= n; 311 284 offs = 0; 312 285 } ··· 315 290 */ 316 291 while (j < npages) 317 292 put_page(pages[j++]); 318 - kvfree(pages); 293 + if (pages != stack_pages) 294 + kvfree(pages); 319 295 /* couldn't stuff something into bio? */ 320 296 if (bytes) { 321 297 iov_iter_revert(iter, bytes); ··· 331 305 332 306 out_unmap: 333 307 bio_release_pages(bio, false); 334 - bio_uninit(bio); 335 - kfree(bio); 308 + bio_map_put(bio); 336 309 return ret; 337 310 } 338 311 ··· 636 611 637 612 next_bio = bio; 638 613 bio = bio->bi_next; 639 - bio_uninit(next_bio); 640 - kfree(next_bio); 614 + bio_map_put(next_bio); 641 615 } 642 616 643 617 return ret;
+1 -3
block/blk-mq-cpumap.c
··· 32 32 return cpu; 33 33 } 34 34 35 - int blk_mq_map_queues(struct blk_mq_queue_map *qmap) 35 + void blk_mq_map_queues(struct blk_mq_queue_map *qmap) 36 36 { 37 37 unsigned int *map = qmap->mq_map; 38 38 unsigned int nr_queues = qmap->nr_queues; ··· 70 70 map[cpu] = map[first_sibling]; 71 71 } 72 72 } 73 - 74 - return 0; 75 73 } 76 74 EXPORT_SYMBOL_GPL(blk_mq_map_queues); 77 75
-2
block/blk-mq-debugfs.c
··· 807 807 return "latency"; 808 808 case RQ_QOS_COST: 809 809 return "cost"; 810 - case RQ_QOS_IOPRIO: 811 - return "ioprio"; 812 810 } 813 811 return "unknown"; 814 812 }
+3 -4
block/blk-mq-pci.c
··· 23 23 * that maps a queue to the CPUs that have irq affinity for the corresponding 24 24 * vector. 25 25 */ 26 - int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, 27 - int offset) 26 + void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, 27 + int offset) 28 28 { 29 29 const struct cpumask *mask; 30 30 unsigned int queue, cpu; ··· 38 38 qmap->mq_map[cpu] = qmap->queue_offset + queue; 39 39 } 40 40 41 - return 0; 41 + return; 42 42 43 43 fallback: 44 44 WARN_ON_ONCE(qmap->nr_queues > 1); 45 45 blk_mq_clear_mq_map(qmap); 46 - return 0; 47 46 } 48 47 EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
+3 -3
block/blk-mq-rdma.c
··· 21 21 * @set->nr_hw_queues, or @dev does not provide an affinity mask for a 22 22 * vector, we fallback to the naive mapping. 23 23 */ 24 - int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, 24 + void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, 25 25 struct ib_device *dev, int first_vec) 26 26 { 27 27 const struct cpumask *mask; ··· 36 36 map->mq_map[cpu] = map->queue_offset + queue; 37 37 } 38 38 39 - return 0; 39 + return; 40 40 41 41 fallback: 42 - return blk_mq_map_queues(map); 42 + blk_mq_map_queues(map); 43 43 } 44 44 EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues);
+1 -1
block/blk-mq-tag.c
··· 196 196 * other allocations on previous queue won't be starved. 197 197 */ 198 198 if (bt != bt_prev) 199 - sbitmap_queue_wake_up(bt_prev); 199 + sbitmap_queue_wake_up(bt_prev, 1); 200 200 201 201 ws = bt_wait_ptr(bt, data->hctx); 202 202 } while (1);
+4 -3
block/blk-mq-virtio.c
··· 21 21 * that maps a queue to the CPUs that have irq affinity for the corresponding 22 22 * vector. 23 23 */ 24 - int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, 24 + void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, 25 25 struct virtio_device *vdev, int first_vec) 26 26 { 27 27 const struct cpumask *mask; ··· 39 39 qmap->mq_map[cpu] = qmap->queue_offset + queue; 40 40 } 41 41 42 - return 0; 42 + return; 43 + 43 44 fallback: 44 - return blk_mq_map_queues(qmap); 45 + blk_mq_map_queues(qmap); 45 46 } 46 47 EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
+19 -13
block/blk-mq.c
··· 1093 1093 WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); 1094 1094 1095 1095 /* 1096 - * For a polled request, always complete locally, it's pointless 1097 - * to redirect the completion. 1096 + * For request which hctx has only one ctx mapping, 1097 + * or a polled request, always complete locally, 1098 + * it's pointless to redirect the completion. 1098 1099 */ 1099 - if (rq->cmd_flags & REQ_POLLED) 1100 + if (rq->mq_hctx->nr_ctx == 1 || 1101 + rq->cmd_flags & REQ_POLLED) 1100 1102 return false; 1101 1103 1102 1104 if (blk_mq_complete_need_ipi(rq)) { ··· 1215 1213 WARN_ON(!blk_rq_is_passthrough(rq)); 1216 1214 1217 1215 blk_account_io_start(rq); 1216 + 1217 + /* 1218 + * As plugging can be enabled for passthrough requests on a zoned 1219 + * device, directly accessing the plug instead of using blk_mq_plug() 1220 + * should not have any consequences. 1221 + */ 1218 1222 if (current->plug) 1219 1223 blk_add_rq_to_plug(current->plug, rq); 1220 1224 else ··· 2001 1993 if (!needs_restart || 2002 1994 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) 2003 1995 blk_mq_run_hw_queue(hctx, true); 2004 - else if (needs_restart && needs_resource) 1996 + else if (needs_resource) 2005 1997 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); 2006 1998 2007 1999 blk_mq_update_dispatch_busy(hctx, true); ··· 4200 4192 return 0; 4201 4193 } 4202 4194 4203 - static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 4195 + static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) 4204 4196 { 4205 4197 /* 4206 4198 * blk_mq_map_queues() and multiple .map_queues() implementations ··· 4230 4222 for (i = 0; i < set->nr_maps; i++) 4231 4223 blk_mq_clear_mq_map(&set->map[i]); 4232 4224 4233 - return set->ops->map_queues(set); 4225 + set->ops->map_queues(set); 4234 4226 } else { 4235 4227 BUG_ON(set->nr_maps > 1); 4236 - return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 4228 + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 4237 4229 } 4238 4230 } 4239 4231 ··· 
4332 4324 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; 4333 4325 } 4334 4326 4335 - ret = blk_mq_update_queue_map(set); 4336 - if (ret) 4337 - goto out_free_mq_map; 4327 + blk_mq_update_queue_map(set); 4338 4328 4339 4329 ret = blk_mq_alloc_set_map_and_rqs(set); 4340 4330 if (ret) ··· 4480 4474 list_add(&qe->node, head); 4481 4475 4482 4476 /* 4483 - * After elevator_switch_mq, the previous elevator_queue will be 4477 + * After elevator_switch, the previous elevator_queue will be 4484 4478 * released by elevator_release. The reference of the io scheduler 4485 4479 * module get by elevator_get will also be put. So we need to get 4486 4480 * a reference of the io scheduler module here to prevent it to be 4487 4481 * removed. 4488 4482 */ 4489 4483 __module_get(qe->type->elevator_owner); 4490 - elevator_switch_mq(q, NULL); 4484 + elevator_switch(q, NULL); 4491 4485 mutex_unlock(&q->sysfs_lock); 4492 4486 4493 4487 return true; ··· 4519 4513 kfree(qe); 4520 4514 4521 4515 mutex_lock(&q->sysfs_lock); 4522 - elevator_switch_mq(q, t); 4516 + elevator_switch(q, t); 4523 4517 mutex_unlock(&q->sysfs_lock); 4524 4518 } 4525 4519
+2 -1
block/blk-mq.h
··· 312 312 static inline struct blk_plug *blk_mq_plug( struct bio *bio) 313 313 { 314 314 /* Zoned block device write operation case: do not plug the BIO */ 315 - if (bdev_is_zoned(bio->bi_bdev) && op_is_write(bio_op(bio))) 315 + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && 316 + bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) 316 317 return NULL; 317 318 318 319 /*
-1
block/blk-rq-qos.h
··· 17 17 RQ_QOS_WBT, 18 18 RQ_QOS_LATENCY, 19 19 RQ_QOS_COST, 20 - RQ_QOS_IOPRIO, 21 20 }; 22 21 23 22 struct rq_wait {
+1 -1
block/blk-sysfs.c
··· 844 844 845 845 blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); 846 846 wbt_enable_default(q); 847 - blk_throtl_register_queue(q); 847 + blk_throtl_register(disk); 848 848 849 849 /* Now everything is ready and send out KOBJ_ADD uevent */ 850 850 kobject_uevent(&q->kobj, KOBJ_ADD);
+177 -107
block/blk-throttle.c
··· 329 329 /* init a service_queue, assumes the caller zeroed it */ 330 330 static void throtl_service_queue_init(struct throtl_service_queue *sq) 331 331 { 332 - INIT_LIST_HEAD(&sq->queued[0]); 333 - INIT_LIST_HEAD(&sq->queued[1]); 332 + INIT_LIST_HEAD(&sq->queued[READ]); 333 + INIT_LIST_HEAD(&sq->queued[WRITE]); 334 334 sq->pending_tree = RB_ROOT_CACHED; 335 335 timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); 336 336 } ··· 420 420 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); 421 421 struct throtl_data *td = tg->td; 422 422 int rw; 423 - int has_iops_limit = 0; 424 423 425 424 for (rw = READ; rw <= WRITE; rw++) { 426 - unsigned int iops_limit = tg_iops_limit(tg, rw); 427 - 428 - tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || 425 + tg->has_rules_iops[rw] = 426 + (parent_tg && parent_tg->has_rules_iops[rw]) || 429 427 (td->limit_valid[td->limit_index] && 430 - (tg_bps_limit(tg, rw) != U64_MAX || 431 - iops_limit != UINT_MAX)); 432 - 433 - if (iops_limit != UINT_MAX) 434 - has_iops_limit = 1; 428 + tg_iops_limit(tg, rw) != UINT_MAX); 429 + tg->has_rules_bps[rw] = 430 + (parent_tg && parent_tg->has_rules_bps[rw]) || 431 + (td->limit_valid[td->limit_index] && 432 + (tg_bps_limit(tg, rw) != U64_MAX)); 435 433 } 436 - 437 - if (has_iops_limit) 438 - tg->flags |= THROTL_TG_HAS_IOPS_LIMIT; 439 - else 440 - tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT; 441 434 } 442 435 443 436 static void throtl_pd_online(struct blkg_policy_data *pd) ··· 513 520 { 514 521 rb_erase_cached(n, &parent_sq->pending_tree); 515 522 RB_CLEAR_NODE(n); 516 - --parent_sq->nr_pending; 517 523 } 518 524 519 525 static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) ··· 564 572 static void throtl_dequeue_tg(struct throtl_grp *tg) 565 573 { 566 574 if (tg->flags & THROTL_TG_PENDING) { 567 - throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); 575 + struct throtl_service_queue *parent_sq = 576 + tg->service_queue.parent_sq; 577 
+ 578 + throtl_rb_erase(&tg->rb_node, parent_sq); 579 + --parent_sq->nr_pending; 568 580 tg->flags &= ~THROTL_TG_PENDING; 569 581 } 570 582 } ··· 635 639 { 636 640 tg->bytes_disp[rw] = 0; 637 641 tg->io_disp[rw] = 0; 642 + tg->carryover_bytes[rw] = 0; 643 + tg->carryover_ios[rw] = 0; 638 644 639 645 /* 640 646 * Previous slice has expired. We must have trimmed it after last ··· 654 656 tg->slice_end[rw], jiffies); 655 657 } 656 658 657 - static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) 659 + static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, 660 + bool clear_carryover) 658 661 { 659 662 tg->bytes_disp[rw] = 0; 660 663 tg->io_disp[rw] = 0; 661 664 tg->slice_start[rw] = jiffies; 662 665 tg->slice_end[rw] = jiffies + tg->td->throtl_slice; 666 + if (clear_carryover) { 667 + tg->carryover_bytes[rw] = 0; 668 + tg->carryover_ios[rw] = 0; 669 + } 663 670 664 671 throtl_log(&tg->service_queue, 665 672 "[%c] new slice start=%lu end=%lu jiffies=%lu", ··· 757 754 tg->slice_start[rw], tg->slice_end[rw], jiffies); 758 755 } 759 756 760 - static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, 761 - u32 iops_limit, unsigned long *wait) 757 + static unsigned int calculate_io_allowed(u32 iops_limit, 758 + unsigned long jiffy_elapsed) 759 + { 760 + unsigned int io_allowed; 761 + u64 tmp; 762 + 763 + /* 764 + * jiffy_elapsed should not be a big value as minimum iops can be 765 + * 1 then at max jiffy elapsed should be equivalent of 1 second as we 766 + * will allow dispatch after 1 second and after that slice should 767 + * have been trimmed. 
768 + */ 769 + 770 + tmp = (u64)iops_limit * jiffy_elapsed; 771 + do_div(tmp, HZ); 772 + 773 + if (tmp > UINT_MAX) 774 + io_allowed = UINT_MAX; 775 + else 776 + io_allowed = tmp; 777 + 778 + return io_allowed; 779 + } 780 + 781 + static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) 782 + { 783 + return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); 784 + } 785 + 786 + static void __tg_update_carryover(struct throtl_grp *tg, bool rw) 787 + { 788 + unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; 789 + u64 bps_limit = tg_bps_limit(tg, rw); 790 + u32 iops_limit = tg_iops_limit(tg, rw); 791 + 792 + /* 793 + * If config is updated while bios are still throttled, calculate and 794 + * accumulate how many bytes/ios are waited across changes. And 795 + * carryover_bytes/ios will be used to calculate new wait time under new 796 + * configuration. 797 + */ 798 + if (bps_limit != U64_MAX) 799 + tg->carryover_bytes[rw] += 800 + calculate_bytes_allowed(bps_limit, jiffy_elapsed) - 801 + tg->bytes_disp[rw]; 802 + if (iops_limit != UINT_MAX) 803 + tg->carryover_ios[rw] += 804 + calculate_io_allowed(iops_limit, jiffy_elapsed) - 805 + tg->io_disp[rw]; 806 + } 807 + 808 + static void tg_update_carryover(struct throtl_grp *tg) 809 + { 810 + if (tg->service_queue.nr_queued[READ]) 811 + __tg_update_carryover(tg, READ); 812 + if (tg->service_queue.nr_queued[WRITE]) 813 + __tg_update_carryover(tg, WRITE); 814 + 815 + /* see comments in struct throtl_grp for meaning of these fields. 
*/ 816 + throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__, 817 + tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], 818 + tg->carryover_ios[READ], tg->carryover_ios[WRITE]); 819 + } 820 + 821 + static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, 822 + u32 iops_limit, unsigned long *wait) 762 823 { 763 824 bool rw = bio_data_dir(bio); 764 825 unsigned int io_allowed; 765 826 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 766 - u64 tmp; 767 827 768 828 if (iops_limit == UINT_MAX) { 769 829 if (wait) ··· 838 772 839 773 /* Round up to the next throttle slice, wait time must be nonzero */ 840 774 jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); 841 - 842 - /* 843 - * jiffy_elapsed_rnd should not be a big value as minimum iops can be 844 - * 1 then at max jiffy elapsed should be equivalent of 1 second as we 845 - * will allow dispatch after 1 second and after that slice should 846 - * have been trimmed. 847 - */ 848 - 849 - tmp = (u64)iops_limit * jiffy_elapsed_rnd; 850 - do_div(tmp, HZ); 851 - 852 - if (tmp > UINT_MAX) 853 - io_allowed = UINT_MAX; 854 - else 855 - io_allowed = tmp; 856 - 775 + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + 776 + tg->carryover_ios[rw]; 857 777 if (tg->io_disp[rw] + 1 <= io_allowed) { 858 778 if (wait) 859 779 *wait = 0; ··· 854 802 return false; 855 803 } 856 804 857 - static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, 858 - u64 bps_limit, unsigned long *wait) 805 + static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, 806 + u64 bps_limit, unsigned long *wait) 859 807 { 860 808 bool rw = bio_data_dir(bio); 861 - u64 bytes_allowed, extra_bytes, tmp; 809 + u64 bytes_allowed, extra_bytes; 862 810 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 863 811 unsigned int bio_size = throtl_bio_data_size(bio); 864 812 865 813 /* no need to throttle if this bio's bytes have been accounted */ 866 - if 
(bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) { 814 + if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { 867 815 if (wait) 868 816 *wait = 0; 869 817 return true; ··· 876 824 jiffy_elapsed_rnd = tg->td->throtl_slice; 877 825 878 826 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); 879 - 880 - tmp = bps_limit * jiffy_elapsed_rnd; 881 - do_div(tmp, HZ); 882 - bytes_allowed = tmp; 883 - 827 + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + 828 + tg->carryover_bytes[rw]; 884 829 if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { 885 830 if (wait) 886 831 *wait = 0; ··· 938 889 * slice and it should be extended instead. 939 890 */ 940 891 if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) 941 - throtl_start_new_slice(tg, rw); 892 + throtl_start_new_slice(tg, rw, true); 942 893 else { 943 894 if (time_before(tg->slice_end[rw], 944 895 jiffies + tg->td->throtl_slice)) ··· 946 897 jiffies + tg->td->throtl_slice); 947 898 } 948 899 949 - if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && 950 - tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { 900 + if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) && 901 + tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) { 951 902 if (wait) 952 903 *wait = 0; 953 904 return true; ··· 970 921 unsigned int bio_size = throtl_bio_data_size(bio); 971 922 972 923 /* Charge the bio to the group */ 973 - if (!bio_flagged(bio, BIO_THROTTLED)) { 924 + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { 974 925 tg->bytes_disp[rw] += bio_size; 975 926 tg->last_bytes_disp[rw] += bio_size; 976 927 } 977 928 978 929 tg->io_disp[rw]++; 979 930 tg->last_io_disp[rw]++; 980 - 981 - /* 982 - * BIO_THROTTLED is used to prevent the same bio to be throttled 983 - * more than once as a throttled bio will go through blk-throtl the 984 - * second time when it eventually gets issued. Set it when a bio 985 - * is being charged to a tg. 
986 - */ 987 - if (!bio_flagged(bio, BIO_THROTTLED)) 988 - bio_set_flag(bio, BIO_THROTTLED); 989 931 } 990 932 991 933 /** ··· 1030 990 disptime = jiffies + min_wait; 1031 991 1032 992 /* Update dispatch time */ 1033 - throtl_dequeue_tg(tg); 993 + throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); 1034 994 tg->disptime = disptime; 1035 - throtl_enqueue_tg(tg); 995 + tg_service_queue_add(tg); 1036 996 1037 997 /* see throtl_add_bio_tg() */ 1038 998 tg->flags &= ~THROTL_TG_WAS_EMPTY; ··· 1066 1026 sq->nr_queued[rw]--; 1067 1027 1068 1028 throtl_charge_bio(tg, bio); 1029 + bio_set_flag(bio, BIO_BPS_THROTTLED); 1069 1030 1070 1031 /* 1071 1032 * If our parent is another tg, we just need to transfer @bio to ··· 1142 1101 if (time_before(jiffies, tg->disptime)) 1143 1102 break; 1144 1103 1145 - throtl_dequeue_tg(tg); 1146 - 1147 1104 nr_disp += throtl_dispatch_tg(tg); 1148 1105 1149 1106 sq = &tg->service_queue; 1150 - if (sq->nr_queued[0] || sq->nr_queued[1]) 1107 + if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) 1151 1108 tg_update_disptime(tg); 1109 + else 1110 + throtl_dequeue_tg(tg); 1152 1111 1153 1112 if (nr_disp >= THROTL_QUANTUM) 1154 1113 break; ··· 1362 1321 * that a group's limit are dropped suddenly and we don't want to 1363 1322 * account recently dispatched IO with new low rate. 
1364 1323 */ 1365 - throtl_start_new_slice(tg, READ); 1366 - throtl_start_new_slice(tg, WRITE); 1324 + throtl_start_new_slice(tg, READ, false); 1325 + throtl_start_new_slice(tg, WRITE, false); 1367 1326 1368 1327 if (tg->flags & THROTL_TG_PENDING) { 1369 1328 tg_update_disptime(tg); ··· 1391 1350 v = U64_MAX; 1392 1351 1393 1352 tg = blkg_to_tg(ctx.blkg); 1353 + tg_update_carryover(tg); 1394 1354 1395 1355 if (is_u64) 1396 1356 *(u64 *)((void *)tg + of_cft(of)->private) = v; ··· 1578 1536 return ret; 1579 1537 1580 1538 tg = blkg_to_tg(ctx.blkg); 1539 + tg_update_carryover(tg); 1581 1540 1582 1541 v[0] = tg->bps_conf[READ][index]; 1583 1542 v[1] = tg->bps_conf[WRITE][index]; ··· 1716 1673 .pd_free_fn = throtl_pd_free, 1717 1674 }; 1718 1675 1676 + void blk_throtl_cancel_bios(struct gendisk *disk) 1677 + { 1678 + struct request_queue *q = disk->queue; 1679 + struct cgroup_subsys_state *pos_css; 1680 + struct blkcg_gq *blkg; 1681 + 1682 + spin_lock_irq(&q->queue_lock); 1683 + /* 1684 + * queue_lock is held, rcu lock is not needed here technically. 1685 + * However, rcu lock is still held to emphasize that following 1686 + * path need RCU protection and to prevent warning from lockdep. 1687 + */ 1688 + rcu_read_lock(); 1689 + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { 1690 + struct throtl_grp *tg = blkg_to_tg(blkg); 1691 + struct throtl_service_queue *sq = &tg->service_queue; 1692 + 1693 + /* 1694 + * Set the flag to make sure throtl_pending_timer_fn() won't 1695 + * stop until all throttled bios are dispatched. 1696 + */ 1697 + blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; 1698 + /* 1699 + * Update disptime after setting the above flag to make sure 1700 + * throtl_select_dispatch() won't exit without dispatching. 
1701 + */ 1702 + tg_update_disptime(tg); 1703 + 1704 + throtl_schedule_pending_timer(sq, jiffies + 1); 1705 + } 1706 + rcu_read_unlock(); 1707 + spin_unlock_irq(&q->queue_lock); 1708 + } 1709 + 1710 + #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 1719 1711 static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) 1720 1712 { 1721 1713 unsigned long rtime = jiffies, wtime = jiffies; ··· 1853 1775 return false; 1854 1776 } 1855 1777 return false; 1856 - } 1857 - 1858 - void blk_throtl_cancel_bios(struct request_queue *q) 1859 - { 1860 - struct cgroup_subsys_state *pos_css; 1861 - struct blkcg_gq *blkg; 1862 - 1863 - spin_lock_irq(&q->queue_lock); 1864 - /* 1865 - * queue_lock is held, rcu lock is not needed here technically. 1866 - * However, rcu lock is still held to emphasize that following 1867 - * path need RCU protection and to prevent warning from lockdep. 1868 - */ 1869 - rcu_read_lock(); 1870 - blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { 1871 - struct throtl_grp *tg = blkg_to_tg(blkg); 1872 - struct throtl_service_queue *sq = &tg->service_queue; 1873 - 1874 - /* 1875 - * Set the flag to make sure throtl_pending_timer_fn() won't 1876 - * stop until all throttled bios are dispatched. 1877 - */ 1878 - blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; 1879 - /* 1880 - * Update disptime after setting the above flag to make sure 1881 - * throtl_select_dispatch() won't exit without dispatching. 
1882 - */ 1883 - tg_update_disptime(tg); 1884 - 1885 - throtl_schedule_pending_timer(sq, jiffies + 1); 1886 - } 1887 - rcu_read_unlock(); 1888 - spin_unlock_irq(&q->queue_lock); 1889 1778 } 1890 1779 1891 1780 static bool throtl_can_upgrade(struct throtl_data *td, ··· 2050 2005 tg->checked_last_finish_time = last_finish_time; 2051 2006 } 2052 2007 2053 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 2054 2008 static void throtl_update_latency_buckets(struct throtl_data *td) 2055 2009 { 2056 2010 struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; ··· 2130 2086 static inline void throtl_update_latency_buckets(struct throtl_data *td) 2131 2087 { 2132 2088 } 2089 + 2090 + static void blk_throtl_update_idletime(struct throtl_grp *tg) 2091 + { 2092 + } 2093 + 2094 + static void throtl_downgrade_check(struct throtl_grp *tg) 2095 + { 2096 + } 2097 + 2098 + static void throtl_upgrade_check(struct throtl_grp *tg) 2099 + { 2100 + } 2101 + 2102 + static bool throtl_can_upgrade(struct throtl_data *td, 2103 + struct throtl_grp *this_tg) 2104 + { 2105 + return false; 2106 + } 2107 + 2108 + static void throtl_upgrade_state(struct throtl_data *td) 2109 + { 2110 + } 2133 2111 #endif 2134 2112 2135 2113 bool __blk_throtl_bio(struct bio *bio) ··· 2225 2159 qn = &tg->qnode_on_parent[rw]; 2226 2160 sq = sq->parent_sq; 2227 2161 tg = sq_to_tg(sq); 2228 - if (!tg) 2162 + if (!tg) { 2163 + bio_set_flag(bio, BIO_BPS_THROTTLED); 2229 2164 goto out_unlock; 2165 + } 2230 2166 } 2231 2167 2232 2168 /* out-of-limit, queue to @tg */ ··· 2257 2189 } 2258 2190 2259 2191 out_unlock: 2260 - bio_set_flag(bio, BIO_THROTTLED); 2261 - 2262 2192 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 2263 2193 if (throttled || !td->track_bio_latency) 2264 2194 bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; ··· 2352 2286 } 2353 2287 #endif 2354 2288 2355 - int blk_throtl_init(struct request_queue *q) 2289 + int blk_throtl_init(struct gendisk *disk) 2356 2290 { 2291 + struct request_queue *q = disk->queue; 2357 2292 
struct throtl_data *td; 2358 2293 int ret; 2359 2294 ··· 2396 2329 return ret; 2397 2330 } 2398 2331 2399 - void blk_throtl_exit(struct request_queue *q) 2332 + void blk_throtl_exit(struct gendisk *disk) 2400 2333 { 2334 + struct request_queue *q = disk->queue; 2335 + 2401 2336 BUG_ON(!q->td); 2402 2337 del_timer_sync(&q->td->service_queue.pending_timer); 2403 2338 throtl_shutdown_wq(q); ··· 2409 2340 kfree(q->td); 2410 2341 } 2411 2342 2412 - void blk_throtl_register_queue(struct request_queue *q) 2343 + void blk_throtl_register(struct gendisk *disk) 2413 2344 { 2345 + struct request_queue *q = disk->queue; 2414 2346 struct throtl_data *td; 2415 2347 int i; 2416 2348
+36 -17
block/blk-throttle.h
··· 55 55 enum tg_state_flags { 56 56 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ 57 57 THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ 58 - THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */ 59 - THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ 58 + THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ 60 59 }; 61 60 62 61 enum { ··· 98 99 unsigned int flags; 99 100 100 101 /* are there any throtl rules between this group and td? */ 101 - bool has_rules[2]; 102 + bool has_rules_bps[2]; 103 + bool has_rules_iops[2]; 102 104 103 105 /* internally used bytes per second rate limits */ 104 106 uint64_t bps[2][LIMIT_CNT]; ··· 120 120 121 121 uint64_t last_bytes_disp[2]; 122 122 unsigned int last_io_disp[2]; 123 + 124 + /* 125 + * The following two fields are updated when new configuration is 126 + * submitted while some bios are still throttled, they record how many 127 + * bytes/ios are waited already in previous configuration, and they will 128 + * be used to calculate wait time under new configuration. 
129 + */ 130 + uint64_t carryover_bytes[2]; 131 + unsigned int carryover_ios[2]; 123 132 124 133 unsigned long last_check_time; 125 134 ··· 168 159 * Internal throttling interface 169 160 */ 170 161 #ifndef CONFIG_BLK_DEV_THROTTLING 171 - static inline int blk_throtl_init(struct request_queue *q) { return 0; } 172 - static inline void blk_throtl_exit(struct request_queue *q) { } 173 - static inline void blk_throtl_register_queue(struct request_queue *q) { } 162 + static inline int blk_throtl_init(struct gendisk *disk) { return 0; } 163 + static inline void blk_throtl_exit(struct gendisk *disk) { } 164 + static inline void blk_throtl_register(struct gendisk *disk) { } 174 165 static inline bool blk_throtl_bio(struct bio *bio) { return false; } 175 - static inline void blk_throtl_cancel_bios(struct request_queue *q) { } 166 + static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } 176 167 #else /* CONFIG_BLK_DEV_THROTTLING */ 177 - int blk_throtl_init(struct request_queue *q); 178 - void blk_throtl_exit(struct request_queue *q); 179 - void blk_throtl_register_queue(struct request_queue *q); 168 + int blk_throtl_init(struct gendisk *disk); 169 + void blk_throtl_exit(struct gendisk *disk); 170 + void blk_throtl_register(struct gendisk *disk); 180 171 bool __blk_throtl_bio(struct bio *bio); 181 - void blk_throtl_cancel_bios(struct request_queue *q); 182 - static inline bool blk_throtl_bio(struct bio *bio) 172 + void blk_throtl_cancel_bios(struct gendisk *disk); 173 + 174 + static inline bool blk_should_throtl(struct bio *bio) 183 175 { 184 176 struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); 177 + int rw = bio_data_dir(bio); 185 178 186 - /* no need to throttle bps any more if the bio has been throttled */ 187 - if (bio_flagged(bio, BIO_THROTTLED) && 188 - !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT)) 189 - return false; 179 + /* iops limit is always counted */ 180 + if (tg->has_rules_iops[rw]) 181 + return true; 190 182 191 - if 
(!tg->has_rules[bio_data_dir(bio)]) 183 + if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED)) 184 + return true; 185 + 186 + return false; 187 + } 188 + 189 + static inline bool blk_throtl_bio(struct bio *bio) 190 + { 191 + 192 + if (!blk_should_throtl(bio)) 192 193 return false; 193 194 194 195 return __blk_throtl_bio(bio);
+4 -5
block/blk-wbt.c
··· 843 843 rwb->enable_state = WBT_STATE_ON_DEFAULT; 844 844 rwb->wc = 1; 845 845 rwb->rq_depth.default_depth = RWB_DEF_DEPTH; 846 + rwb->min_lat_nsec = wbt_default_latency_nsec(q); 847 + 848 + wbt_queue_depth_changed(&rwb->rqos); 849 + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); 846 850 847 851 /* 848 852 * Assign rwb and add the stats callback. ··· 856 852 goto err_free; 857 853 858 854 blk_stat_add_callback(q, rwb->cb); 859 - 860 - rwb->min_lat_nsec = wbt_default_latency_nsec(q); 861 - 862 - wbt_queue_depth_changed(&rwb->rqos); 863 - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); 864 855 865 856 return 0; 866 857
+3 -6
block/blk-zoned.c
··· 63 63 if (!rq->q->disk->seq_zones_wlock) 64 64 return false; 65 65 66 - switch (req_op(rq)) { 67 - case REQ_OP_WRITE_ZEROES: 68 - case REQ_OP_WRITE: 66 + if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq))) 69 67 return blk_rq_zone_is_seq(rq); 70 - default: 71 - return false; 72 - } 68 + 69 + return false; 73 70 } 74 71 EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); 75 72
+3 -4
block/blk.h
··· 270 270 271 271 void blk_insert_flush(struct request *rq); 272 272 273 - int elevator_switch_mq(struct request_queue *q, 274 - struct elevator_type *new_e); 273 + int elevator_switch(struct request_queue *q, struct elevator_type *new_e); 275 274 void elevator_exit(struct request_queue *q); 276 275 int elv_register_queue(struct request_queue *q, bool uevent); 277 276 void elv_unregister_queue(struct request_queue *q); ··· 388 389 } 389 390 390 391 #ifdef CONFIG_BLK_CGROUP_IOLATENCY 391 - extern int blk_iolatency_init(struct request_queue *q); 392 + int blk_iolatency_init(struct gendisk *disk); 392 393 #else 393 - static inline int blk_iolatency_init(struct request_queue *q) { return 0; } 394 + static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; 394 395 #endif 395 396 396 397 #ifdef CONFIG_BLK_DEV_ZONED
+2 -2
block/elevator.c
··· 588 588 } 589 589 EXPORT_SYMBOL_GPL(elv_unregister); 590 590 591 - int elevator_switch_mq(struct request_queue *q, 591 + static int elevator_switch_mq(struct request_queue *q, 592 592 struct elevator_type *new_e) 593 593 { 594 594 int ret; ··· 723 723 * need for the new one. this way we have a chance of going back to the old 724 724 * one, if the new one fails init for some reason. 725 725 */ 726 - static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 726 + int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 727 727 { 728 728 int err; 729 729
+4 -3
block/genhd.c
··· 627 627 628 628 blk_mq_freeze_queue_wait(q); 629 629 630 - blk_throtl_cancel_bios(disk->queue); 630 + blk_throtl_cancel_bios(disk); 631 631 632 632 blk_sync_queue(q); 633 633 blk_flush_integrity(); ··· 1151 1151 !test_bit(GD_ADDED, &disk->state)) 1152 1152 blk_mq_exit_queue(disk->queue); 1153 1153 1154 - blkcg_exit_queue(disk->queue); 1154 + blkcg_exit_disk(disk); 1155 + 1155 1156 bioset_exit(&disk->bio_split); 1156 1157 1157 1158 disk_release_events(disk); ··· 1365 1364 if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) 1366 1365 goto out_destroy_part_tbl; 1367 1366 1368 - if (blkcg_init_queue(q)) 1367 + if (blkcg_init_disk(disk)) 1369 1368 goto out_erase_part0; 1370 1369 1371 1370 rand_initialize_disk(disk);
+5
block/opal_proto.h
··· 39 39 #define FIRST_TPER_SESSION_NUM 4096 40 40 41 41 #define TPER_SYNC_SUPPORTED 0x01 42 + /* FC_LOCKING features */ 43 + #define LOCKING_SUPPORTED_MASK 0x01 44 + #define LOCKING_ENABLED_MASK 0x02 45 + #define LOCKED_MASK 0x04 42 46 #define MBR_ENABLED_MASK 0x10 47 + #define MBR_DONE_MASK 0x20 43 48 44 49 #define TINY_ATOM_DATA_MASK 0x3F 45 50 #define TINY_ATOM_SIGNED 0x40
+77 -12
block/sed-opal.c
··· 74 74 }; 75 75 76 76 struct opal_dev { 77 - bool supported; 78 - bool mbr_enabled; 77 + u32 flags; 79 78 80 79 void *data; 81 80 sec_send_recv *send_recv; ··· 279 280 return true; 280 281 } 281 282 283 + static bool check_lcksuppt(const void *data) 284 + { 285 + const struct d0_locking_features *lfeat = data; 286 + u8 sup_feat = lfeat->supported_features; 287 + 288 + return !!(sup_feat & LOCKING_SUPPORTED_MASK); 289 + } 290 + 291 + static bool check_lckenabled(const void *data) 292 + { 293 + const struct d0_locking_features *lfeat = data; 294 + u8 sup_feat = lfeat->supported_features; 295 + 296 + return !!(sup_feat & LOCKING_ENABLED_MASK); 297 + } 298 + 299 + static bool check_locked(const void *data) 300 + { 301 + const struct d0_locking_features *lfeat = data; 302 + u8 sup_feat = lfeat->supported_features; 303 + 304 + return !!(sup_feat & LOCKED_MASK); 305 + } 306 + 282 307 static bool check_mbrenabled(const void *data) 283 308 { 284 309 const struct d0_locking_features *lfeat = data; 285 310 u8 sup_feat = lfeat->supported_features; 286 311 287 312 return !!(sup_feat & MBR_ENABLED_MASK); 313 + } 314 + 315 + static bool check_mbrdone(const void *data) 316 + { 317 + const struct d0_locking_features *lfeat = data; 318 + u8 sup_feat = lfeat->supported_features; 319 + 320 + return !!(sup_feat & MBR_DONE_MASK); 288 321 } 289 322 290 323 static bool check_sum(const void *data) ··· 466 435 u32 hlen = be32_to_cpu(hdr->length); 467 436 468 437 print_buffer(dev->resp, hlen); 469 - dev->mbr_enabled = false; 438 + dev->flags &= OPAL_FL_SUPPORTED; 470 439 471 440 if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { 472 441 pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n", ··· 492 461 check_geometry(dev, body); 493 462 break; 494 463 case FC_LOCKING: 495 - dev->mbr_enabled = check_mbrenabled(body->features); 464 + if (check_lcksuppt(body->features)) 465 + dev->flags |= OPAL_FL_LOCKING_SUPPORTED; 466 + if (check_lckenabled(body->features)) 467 + dev->flags |= 
OPAL_FL_LOCKING_ENABLED; 468 + if (check_locked(body->features)) 469 + dev->flags |= OPAL_FL_LOCKED; 470 + if (check_mbrenabled(body->features)) 471 + dev->flags |= OPAL_FL_MBR_ENABLED; 472 + if (check_mbrdone(body->features)) 473 + dev->flags |= OPAL_FL_MBR_DONE; 496 474 break; 497 475 case FC_ENTERPRISE: 498 476 case FC_DATASTORE: ··· 2149 2109 mutex_lock(&dev->dev_lock); 2150 2110 setup_opal_dev(dev); 2151 2111 ret = opal_discovery0_step(dev); 2152 - dev->supported = !ret; 2112 + if (!ret) 2113 + dev->flags |= OPAL_FL_SUPPORTED; 2153 2114 mutex_unlock(&dev->dev_lock); 2154 2115 2155 2116 return ret; ··· 2189 2148 2190 2149 INIT_LIST_HEAD(&dev->unlk_lst); 2191 2150 mutex_init(&dev->dev_lock); 2151 + dev->flags = 0; 2192 2152 dev->data = data; 2193 2153 dev->send_recv = send_recv; 2194 2154 if (check_opal_support(dev) != 0) { ··· 2570 2528 if (!dev) 2571 2529 return false; 2572 2530 2573 - if (!dev->supported) 2531 + if (!(dev->flags & OPAL_FL_SUPPORTED)) 2574 2532 return false; 2575 2533 2576 2534 mutex_lock(&dev->dev_lock); ··· 2588 2546 was_failure = true; 2589 2547 } 2590 2548 2591 - if (dev->mbr_enabled) { 2549 + if (dev->flags & OPAL_FL_MBR_ENABLED) { 2592 2550 ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); 2593 2551 if (ret) 2594 2552 pr_debug("Failed to set MBR Done in S3 resume\n"); ··· 2662 2620 return ret; 2663 2621 } 2664 2622 2623 + static int opal_get_status(struct opal_dev *dev, void __user *data) 2624 + { 2625 + struct opal_status sts = {0}; 2626 + 2627 + /* 2628 + * check_opal_support() error is not fatal, 2629 + * !dev->supported is a valid condition 2630 + */ 2631 + if (!check_opal_support(dev)) 2632 + sts.flags = dev->flags; 2633 + if (copy_to_user(data, &sts, sizeof(sts))) { 2634 + pr_debug("Error copying status to userspace\n"); 2635 + return -EFAULT; 2636 + } 2637 + return 0; 2638 + } 2639 + 2665 2640 int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) 2666 2641 { 2667 2642 void *p; ··· 2688 2629 return 
-EACCES; 2689 2630 if (!dev) 2690 2631 return -ENOTSUPP; 2691 - if (!dev->supported) 2632 + if (!(dev->flags & OPAL_FL_SUPPORTED)) 2692 2633 return -ENOTSUPP; 2693 2634 2694 - p = memdup_user(arg, _IOC_SIZE(cmd)); 2695 - if (IS_ERR(p)) 2696 - return PTR_ERR(p); 2635 + if (cmd & IOC_IN) { 2636 + p = memdup_user(arg, _IOC_SIZE(cmd)); 2637 + if (IS_ERR(p)) 2638 + return PTR_ERR(p); 2639 + } 2697 2640 2698 2641 switch (cmd) { 2699 2642 case IOC_OPAL_SAVE: ··· 2746 2685 case IOC_OPAL_GENERIC_TABLE_RW: 2747 2686 ret = opal_generic_read_write_table(dev, p); 2748 2687 break; 2688 + case IOC_OPAL_GET_STATUS: 2689 + ret = opal_get_status(dev, arg); 2690 + break; 2749 2691 default: 2750 2692 break; 2751 2693 } 2752 2694 2753 - kfree(p); 2695 + if (cmd & IOC_IN) 2696 + kfree(p); 2754 2697 return ret; 2755 2698 } 2756 2699 EXPORT_SYMBOL_GPL(sed_ioctl);
+2 -13
drivers/block/aoe/aoeblk.c
··· 108 108 return sysfs_emit(page, "%lu\n", d->maxbcnt); 109 109 } 110 110 111 - static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) 111 + static int aoe_debugfs_show(struct seq_file *s, void *ignored) 112 112 { 113 113 struct aoedev *d; 114 114 struct aoetgt **t, **te; ··· 151 151 152 152 return 0; 153 153 } 154 - 155 - static int aoe_debugfs_open(struct inode *inode, struct file *file) 156 - { 157 - return single_open(file, aoedisk_debugfs_show, inode->i_private); 158 - } 154 + DEFINE_SHOW_ATTRIBUTE(aoe_debugfs); 159 155 160 156 static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL); 161 157 static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL); ··· 178 182 static const struct attribute_group *aoe_attr_groups[] = { 179 183 &aoe_attr_group, 180 184 NULL, 181 - }; 182 - 183 - static const struct file_operations aoe_debugfs_fops = { 184 - .open = aoe_debugfs_open, 185 - .read = seq_read, 186 - .llseek = seq_lseek, 187 - .release = single_release, 188 185 }; 189 186 190 187 static void
+1 -1
drivers/block/brd.c
··· 397 397 disk->minors = max_part; 398 398 disk->fops = &brd_fops; 399 399 disk->private_data = brd; 400 - strlcpy(disk->disk_name, buf, DISK_NAME_LEN); 400 + strscpy(disk->disk_name, buf, DISK_NAME_LEN); 401 401 set_capacity(disk, rd_size * 2); 402 402 403 403 /*
-1
drivers/block/drbd/drbd_int.h
··· 1529 1529 extern int w_e_reissue(struct drbd_work *, int); 1530 1530 extern int w_restart_disk_io(struct drbd_work *, int); 1531 1531 extern int w_send_out_of_sync(struct drbd_work *, int); 1532 - extern int w_start_resync(struct drbd_work *, int); 1533 1532 1534 1533 extern void resync_timer_fn(struct timer_list *t); 1535 1534 extern void start_resync_timer_fn(struct timer_list *t);
+1 -1
drivers/block/drbd/drbd_nl.c
··· 4752 4752 struct drbd_genlmsghdr *dh; 4753 4753 int err; 4754 4754 4755 - strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); 4755 + strscpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); 4756 4756 helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name)); 4757 4757 helper_info.helper_status = status; 4758 4758
-3
drivers/block/drbd/drbd_receiver.c
··· 2113 2113 if (unlikely(!req)) 2114 2114 return -EIO; 2115 2115 2116 - /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid 2117 - * special casing it there for the various failure cases. 2118 - * still no race with drbd_fail_pending_reads */ 2119 2116 err = recv_dless_read(peer_device, req, sector, pi->size); 2120 2117 if (!err) 2121 2118 req_mod(req, DATA_RECEIVED);
-2
drivers/block/drbd/drbd_req.h
··· 266 266 267 267 extern void start_new_tl_epoch(struct drbd_connection *connection); 268 268 extern void drbd_req_destroy(struct kref *kref); 269 - extern void _req_may_be_done(struct drbd_request *req, 270 - struct bio_and_error *m); 271 269 extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, 272 270 struct bio_and_error *m); 273 271 extern void complete_master_bio(struct drbd_device *device,
+6 -6
drivers/block/mtip32xx/mtip32xx.c
··· 1397 1397 if (!port->identify_valid) 1398 1398 return; 1399 1399 1400 - strlcpy(cbuf, (char *)(port->identify+10), 21); 1400 + strscpy(cbuf, (char *)(port->identify + 10), 21); 1401 1401 dev_info(&port->dd->pdev->dev, 1402 1402 "Serial No.: %s\n", cbuf); 1403 1403 1404 - strlcpy(cbuf, (char *)(port->identify+23), 9); 1404 + strscpy(cbuf, (char *)(port->identify + 23), 9); 1405 1405 dev_info(&port->dd->pdev->dev, 1406 1406 "Firmware Ver.: %s\n", cbuf); 1407 1407 1408 - strlcpy(cbuf, (char *)(port->identify+27), 41); 1408 + strscpy(cbuf, (char *)(port->identify + 27), 41); 1409 1409 dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); 1410 1410 1411 1411 dev_info(&port->dd->pdev->dev, "Security: %04x %s\n", ··· 1421 1421 pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid); 1422 1422 switch (revid & 0xFF) { 1423 1423 case 0x1: 1424 - strlcpy(cbuf, "A0", 3); 1424 + strscpy(cbuf, "A0", 3); 1425 1425 break; 1426 1426 case 0x3: 1427 - strlcpy(cbuf, "A2", 3); 1427 + strscpy(cbuf, "A2", 3); 1428 1428 break; 1429 1429 default: 1430 - strlcpy(cbuf, "?", 2); 1430 + strscpy(cbuf, "?", 2); 1431 1431 break; 1432 1432 } 1433 1433 dev_info(&port->dd->pdev->dev,
+4 -2
drivers/block/nbd.c
··· 1413 1413 mutex_unlock(&nbd->config_lock); 1414 1414 ret = wait_event_interruptible(config->recv_wq, 1415 1415 atomic_read(&config->recv_threads) == 0); 1416 - if (ret) 1416 + if (ret) { 1417 1417 sock_shutdown(nbd); 1418 - flush_workqueue(nbd->recv_workq); 1418 + nbd_clear_que(nbd); 1419 + } 1419 1420 1421 + flush_workqueue(nbd->recv_workq); 1420 1422 mutex_lock(&nbd->config_lock); 1421 1423 nbd_bdev_reset(nbd); 1422 1424 /* user requested, ignore socket errors */
+4 -4
drivers/block/null_blk/main.c
··· 1528 1528 return false; 1529 1529 } 1530 1530 1531 - static int null_map_queues(struct blk_mq_tag_set *set) 1531 + static void null_map_queues(struct blk_mq_tag_set *set) 1532 1532 { 1533 1533 struct nullb *nullb = set->driver_data; 1534 1534 int i, qoff; ··· 1555 1555 } else { 1556 1556 pr_warn("tag set has unexpected nr_hw_queues: %d\n", 1557 1557 set->nr_hw_queues); 1558 - return -EINVAL; 1558 + WARN_ON_ONCE(true); 1559 + submit_queues = 1; 1560 + poll_queues = 0; 1559 1561 } 1560 1562 } 1561 1563 ··· 1579 1577 qoff += map->nr_queues; 1580 1578 blk_mq_map_queues(map); 1581 1579 } 1582 - 1583 - return 0; 1584 1580 } 1585 1581 1586 1582 static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
+1 -1
drivers/block/ps3vram.c
··· 745 745 gendisk->flags |= GENHD_FL_NO_PART; 746 746 gendisk->fops = &ps3vram_fops; 747 747 gendisk->private_data = dev; 748 - strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); 748 + strscpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); 749 749 set_capacity(gendisk, priv->size >> 9); 750 750 blk_queue_max_segments(gendisk->queue, BLK_MAX_SEGMENTS); 751 751 blk_queue_max_segment_size(gendisk->queue, BLK_MAX_SEGMENT_SIZE);
+4 -2
drivers/block/rnbd/Makefile
··· 6 6 rnbd-clt-sysfs.o \ 7 7 rnbd-common.o 8 8 9 + CFLAGS_rnbd-srv-trace.o = -I$(src) 10 + 9 11 rnbd-server-y := rnbd-common.o \ 10 12 rnbd-srv.o \ 11 - rnbd-srv-dev.o \ 12 - rnbd-srv-sysfs.o 13 + rnbd-srv-sysfs.o \ 14 + rnbd-srv-trace.o 13 15 14 16 obj-$(CONFIG_BLK_DEV_RNBD_CLIENT) += rnbd-client.o 15 17 obj-$(CONFIG_BLK_DEV_RNBD_SERVER) += rnbd-server.o
+2 -6
drivers/block/rnbd/rnbd-clt.c
··· 1159 1159 { 1160 1160 struct rnbd_queue *q = hctx->driver_data; 1161 1161 struct rnbd_clt_dev *dev = q->dev; 1162 - int cnt; 1163 1162 1164 - cnt = rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); 1165 - return cnt; 1163 + return rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); 1166 1164 } 1167 1165 1168 - static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set) 1166 + static void rnbd_rdma_map_queues(struct blk_mq_tag_set *set) 1169 1167 { 1170 1168 struct rnbd_clt_session *sess = set->driver_data; 1171 1169 ··· 1192 1194 set->map[HCTX_TYPE_DEFAULT].nr_queues, 1193 1195 set->map[HCTX_TYPE_READ].nr_queues); 1194 1196 } 1195 - 1196 - return 0; 1197 1197 } 1198 1198 1199 1199 static struct blk_mq_ops rnbd_mq_ops = {
-43
drivers/block/rnbd/rnbd-srv-dev.c
··· 1 - // SPDX-License-Identifier: GPL-2.0-or-later 2 - /* 3 - * RDMA Network Block Driver 4 - * 5 - * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. 6 - * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. 7 - * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. 8 - */ 9 - #undef pr_fmt 10 - #define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt 11 - 12 - #include "rnbd-srv-dev.h" 13 - #include "rnbd-log.h" 14 - 15 - struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags) 16 - { 17 - struct rnbd_dev *dev; 18 - int ret; 19 - 20 - dev = kzalloc(sizeof(*dev), GFP_KERNEL); 21 - if (!dev) 22 - return ERR_PTR(-ENOMEM); 23 - 24 - dev->blk_open_flags = flags; 25 - dev->bdev = blkdev_get_by_path(path, flags, THIS_MODULE); 26 - ret = PTR_ERR_OR_ZERO(dev->bdev); 27 - if (ret) 28 - goto err; 29 - 30 - dev->blk_open_flags = flags; 31 - 32 - return dev; 33 - 34 - err: 35 - kfree(dev); 36 - return ERR_PTR(ret); 37 - } 38 - 39 - void rnbd_dev_close(struct rnbd_dev *dev) 40 - { 41 - blkdev_put(dev->bdev, dev->blk_open_flags); 42 - kfree(dev); 43 - }
-64
drivers/block/rnbd/rnbd-srv-dev.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 - /* 3 - * RDMA Network Block Driver 4 - * 5 - * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. 6 - * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. 7 - * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. 8 - */ 9 - #ifndef RNBD_SRV_DEV_H 10 - #define RNBD_SRV_DEV_H 11 - 12 - #include <linux/fs.h> 13 - #include "rnbd-proto.h" 14 - 15 - struct rnbd_dev { 16 - struct block_device *bdev; 17 - fmode_t blk_open_flags; 18 - }; 19 - 20 - /** 21 - * rnbd_dev_open() - Open a device 22 - * @path: path to open 23 - * @flags: open flags 24 - */ 25 - struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags); 26 - 27 - /** 28 - * rnbd_dev_close() - Close a device 29 - */ 30 - void rnbd_dev_close(struct rnbd_dev *dev); 31 - 32 - void rnbd_endio(void *priv, int error); 33 - 34 - static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev) 35 - { 36 - return queue_max_segments(bdev_get_queue(dev->bdev)); 37 - } 38 - 39 - static inline int rnbd_dev_get_max_hw_sects(const struct rnbd_dev *dev) 40 - { 41 - return queue_max_hw_sectors(bdev_get_queue(dev->bdev)); 42 - } 43 - 44 - static inline int rnbd_dev_get_secure_discard(const struct rnbd_dev *dev) 45 - { 46 - return bdev_max_secure_erase_sectors(dev->bdev); 47 - } 48 - 49 - static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev) 50 - { 51 - return bdev_max_discard_sectors(dev->bdev); 52 - } 53 - 54 - static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) 55 - { 56 - return bdev_get_queue(dev->bdev)->limits.discard_granularity; 57 - } 58 - 59 - static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev) 60 - { 61 - return bdev_discard_alignment(dev->bdev); 62 - } 63 - 64 - #endif /* RNBD_SRV_DEV_H */
+17
drivers/block/rnbd/rnbd-srv-trace.c
··· 1 + // SPDX-License-Identifier: GPL-2.0+ 2 + /* 3 + * RDMA Network Block Driver 4 + * 5 + * Copyright (c) 2022 1&1 IONOS SE. All rights reserved. 6 + */ 7 + #include "rtrs.h" 8 + #include "rtrs-srv.h" 9 + #include "rnbd-srv.h" 10 + #include "rnbd-proto.h" 11 + 12 + /* 13 + * We include this last to have the helpers above available for the trace 14 + * event implementations. 15 + */ 16 + #define CREATE_TRACE_POINTS 17 + #include "rnbd-srv-trace.h"
+207
drivers/block/rnbd/rnbd-srv-trace.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0+ */ 2 + /* 3 + * RDMA Network Block Driver 4 + * 5 + * Copyright (c) 2022 1&1 IONOS SE. All rights reserved. 6 + */ 7 + #undef TRACE_SYSTEM 8 + #define TRACE_SYSTEM rnbd_srv 9 + 10 + #if !defined(_TRACE_RNBD_SRV_H) || defined(TRACE_HEADER_MULTI_READ) 11 + #define _TRACE_RNBD_SRV_H 12 + 13 + #include <linux/tracepoint.h> 14 + 15 + struct rnbd_srv_session; 16 + struct rtrs_srv_op; 17 + 18 + DECLARE_EVENT_CLASS(rnbd_srv_link_class, 19 + TP_PROTO(struct rnbd_srv_session *srv), 20 + 21 + TP_ARGS(srv), 22 + 23 + TP_STRUCT__entry( 24 + __field(int, qdepth) 25 + __string(sessname, srv->sessname) 26 + ), 27 + 28 + TP_fast_assign( 29 + __entry->qdepth = srv->queue_depth; 30 + __assign_str(sessname, srv->sessname); 31 + ), 32 + 33 + TP_printk("sessname: %s qdepth: %d", 34 + __get_str(sessname), 35 + __entry->qdepth 36 + ) 37 + ); 38 + 39 + #define DEFINE_LINK_EVENT(name) \ 40 + DEFINE_EVENT(rnbd_srv_link_class, name, \ 41 + TP_PROTO(struct rnbd_srv_session *srv), \ 42 + TP_ARGS(srv)) 43 + 44 + DEFINE_LINK_EVENT(create_sess); 45 + DEFINE_LINK_EVENT(destroy_sess); 46 + 47 + TRACE_DEFINE_ENUM(RNBD_OP_READ); 48 + TRACE_DEFINE_ENUM(RNBD_OP_WRITE); 49 + TRACE_DEFINE_ENUM(RNBD_OP_FLUSH); 50 + TRACE_DEFINE_ENUM(RNBD_OP_DISCARD); 51 + TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE); 52 + TRACE_DEFINE_ENUM(RNBD_F_SYNC); 53 + TRACE_DEFINE_ENUM(RNBD_F_FUA); 54 + 55 + #define show_rnbd_rw_flags(x) \ 56 + __print_flags(x, "|", \ 57 + { RNBD_OP_READ, "READ" }, \ 58 + { RNBD_OP_WRITE, "WRITE" }, \ 59 + { RNBD_OP_FLUSH, "FLUSH" }, \ 60 + { RNBD_OP_DISCARD, "DISCARD" }, \ 61 + { RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \ 62 + { RNBD_F_SYNC, "SYNC" }, \ 63 + { RNBD_F_FUA, "FUA" }) 64 + 65 + TRACE_EVENT(process_rdma, 66 + TP_PROTO(struct rnbd_srv_session *srv, 67 + const struct rnbd_msg_io *msg, 68 + struct rtrs_srv_op *id, 69 + u32 datalen, 70 + size_t usrlen), 71 + 72 + TP_ARGS(srv, msg, id, datalen, usrlen), 73 + 74 + TP_STRUCT__entry( 75 + 
__string(sessname, srv->sessname) 76 + __field(u8, dir) 77 + __field(u8, ver) 78 + __field(u32, device_id) 79 + __field(u64, sector) 80 + __field(u32, flags) 81 + __field(u32, bi_size) 82 + __field(u16, ioprio) 83 + __field(u32, datalen) 84 + __field(size_t, usrlen) 85 + ), 86 + 87 + TP_fast_assign( 88 + __assign_str(sessname, srv->sessname); 89 + __entry->dir = id->dir; 90 + __entry->ver = srv->ver; 91 + __entry->device_id = le32_to_cpu(msg->device_id); 92 + __entry->sector = le64_to_cpu(msg->sector); 93 + __entry->bi_size = le32_to_cpu(msg->bi_size); 94 + __entry->flags = le32_to_cpu(msg->rw); 95 + __entry->ioprio = le16_to_cpu(msg->prio); 96 + __entry->datalen = datalen; 97 + __entry->usrlen = usrlen; 98 + ), 99 + 100 + TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu", 101 + __get_str(sessname), 102 + __print_symbolic(__entry->dir, 103 + { READ, "READ" }, 104 + { WRITE, "WRITE" }), 105 + __entry->ver, 106 + __entry->device_id, 107 + __entry->sector, 108 + __entry->bi_size, 109 + show_rnbd_rw_flags(__entry->flags), 110 + __entry->ioprio, 111 + __entry->datalen, 112 + __entry->usrlen 113 + ) 114 + ); 115 + 116 + TRACE_EVENT(process_msg_sess_info, 117 + TP_PROTO(struct rnbd_srv_session *srv, 118 + const struct rnbd_msg_sess_info *msg), 119 + 120 + TP_ARGS(srv, msg), 121 + 122 + TP_STRUCT__entry( 123 + __field(u8, proto_ver) 124 + __field(u8, clt_ver) 125 + __field(u8, srv_ver) 126 + __string(sessname, srv->sessname) 127 + ), 128 + 129 + TP_fast_assign( 130 + __entry->proto_ver = srv->ver; 131 + __entry->clt_ver = msg->ver; 132 + __entry->srv_ver = RNBD_PROTO_VER_MAJOR; 133 + __assign_str(sessname, srv->sessname); 134 + ), 135 + 136 + TP_printk("Session %s using proto-ver %d (clt-ver: %d, srv-ver: %d)", 137 + __get_str(sessname), 138 + __entry->proto_ver, 139 + __entry->clt_ver, 140 + __entry->srv_ver 141 + ) 142 + ); 143 + 144 + TRACE_DEFINE_ENUM(RNBD_ACCESS_RO); 145 + 
TRACE_DEFINE_ENUM(RNBD_ACCESS_RW); 146 + TRACE_DEFINE_ENUM(RNBD_ACCESS_MIGRATION); 147 + 148 + #define show_rnbd_access_mode(x) \ 149 + __print_symbolic(x, \ 150 + { RNBD_ACCESS_RO, "RO" }, \ 151 + { RNBD_ACCESS_RW, "RW" }, \ 152 + { RNBD_ACCESS_MIGRATION, "MIGRATION" }) 153 + 154 + TRACE_EVENT(process_msg_open, 155 + TP_PROTO(struct rnbd_srv_session *srv, 156 + const struct rnbd_msg_open *msg), 157 + 158 + TP_ARGS(srv, msg), 159 + 160 + TP_STRUCT__entry( 161 + __field(u8, access_mode) 162 + __string(sessname, srv->sessname) 163 + __string(dev_name, msg->dev_name) 164 + ), 165 + 166 + TP_fast_assign( 167 + __entry->access_mode = msg->access_mode; 168 + __assign_str(sessname, srv->sessname); 169 + __assign_str(dev_name, msg->dev_name); 170 + ), 171 + 172 + TP_printk("Open message received: session='%s' path='%s' access_mode=%s", 173 + __get_str(sessname), 174 + __get_str(dev_name), 175 + show_rnbd_access_mode(__entry->access_mode) 176 + ) 177 + ); 178 + 179 + TRACE_EVENT(process_msg_close, 180 + TP_PROTO(struct rnbd_srv_session *srv, 181 + const struct rnbd_msg_close *msg), 182 + 183 + TP_ARGS(srv, msg), 184 + 185 + TP_STRUCT__entry( 186 + __field(u32, device_id) 187 + __string(sessname, srv->sessname) 188 + ), 189 + 190 + TP_fast_assign( 191 + __entry->device_id = le32_to_cpu(msg->device_id); 192 + __assign_str(sessname, srv->sessname); 193 + ), 194 + 195 + TP_printk("Close message received: session='%s' device id='%d'", 196 + __get_str(sessname), 197 + __entry->device_id 198 + ) 199 + ); 200 + 201 + #endif /* _TRACE_RNBD_SRV_H */ 202 + 203 + #undef TRACE_INCLUDE_PATH 204 + #define TRACE_INCLUDE_PATH . 205 + #define TRACE_INCLUDE_FILE rnbd-srv-trace 206 + #include <trace/define_trace.h> 207 +
+59 -64
drivers/block/rnbd/rnbd-srv.c
··· 13 13 #include <linux/blkdev.h> 14 14 15 15 #include "rnbd-srv.h" 16 - #include "rnbd-srv-dev.h" 16 + #include "rnbd-srv-trace.h" 17 17 18 18 MODULE_DESCRIPTION("RDMA Network Block Device Server"); 19 19 MODULE_LICENSE("GPL"); ··· 84 84 kref_put(&sess_dev->kref, rnbd_sess_dev_release); 85 85 } 86 86 87 - void rnbd_endio(void *priv, int error) 88 - { 89 - struct rnbd_io_private *rnbd_priv = priv; 90 - struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; 91 - 92 - rnbd_put_sess_dev(sess_dev); 93 - 94 - rtrs_srv_resp_rdma(rnbd_priv->id, error); 95 - 96 - kfree(priv); 97 - } 98 - 99 87 static struct rnbd_srv_sess_dev * 100 88 rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) 101 89 { ··· 104 116 105 117 static void rnbd_dev_bi_end_io(struct bio *bio) 106 118 { 107 - rnbd_endio(bio->bi_private, blk_status_to_errno(bio->bi_status)); 119 + struct rnbd_io_private *rnbd_priv = bio->bi_private; 120 + struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; 121 + 122 + rnbd_put_sess_dev(sess_dev); 123 + rtrs_srv_resp_rdma(rnbd_priv->id, blk_status_to_errno(bio->bi_status)); 124 + 125 + kfree(rnbd_priv); 108 126 bio_put(bio); 109 127 } 110 128 ··· 125 131 int err; 126 132 struct bio *bio; 127 133 short prio; 134 + 135 + trace_process_rdma(srv_sess, msg, id, datalen, usrlen); 128 136 129 137 priv = kmalloc(sizeof(*priv), GFP_KERNEL); 130 138 if (!priv) ··· 145 149 priv->sess_dev = sess_dev; 146 150 priv->id = id; 147 151 148 - bio = bio_alloc(sess_dev->rnbd_dev->bdev, 1, 152 + bio = bio_alloc(sess_dev->bdev, 1, 149 153 rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); 150 154 if (bio_add_page(bio, virt_to_page(data), datalen, 151 155 offset_in_page(data)) != datalen) { ··· 219 223 rnbd_put_sess_dev(sess_dev); 220 224 wait_for_completion(&dc); /* wait for inflights to drop to zero */ 221 225 222 - rnbd_dev_close(sess_dev->rnbd_dev); 226 + blkdev_put(sess_dev->bdev, sess_dev->open_flags); 223 227 mutex_lock(&sess_dev->dev->lock); 224 228 
list_del(&sess_dev->dev_list); 225 229 if (sess_dev->open_flags & FMODE_WRITE) ··· 239 243 240 244 if (xa_empty(&srv_sess->index_idr)) 241 245 goto out; 246 + 247 + trace_destroy_sess(srv_sess); 242 248 243 249 mutex_lock(&srv_sess->lock); 244 250 xa_for_each(&srv_sess->index_idr, index, sess_dev) ··· 288 290 289 291 rtrs_srv_set_sess_priv(rtrs, srv_sess); 290 292 293 + trace_create_sess(srv_sess); 294 + 291 295 return 0; 292 296 } 293 297 ··· 332 332 mutex_unlock(&sess->lock); 333 333 } 334 334 335 - static int process_msg_close(struct rnbd_srv_session *srv_sess, 335 + static void process_msg_close(struct rnbd_srv_session *srv_sess, 336 336 void *data, size_t datalen, const void *usr, 337 337 size_t usrlen) 338 338 { 339 339 const struct rnbd_msg_close *close_msg = usr; 340 340 struct rnbd_srv_sess_dev *sess_dev; 341 341 342 + trace_process_msg_close(srv_sess, close_msg); 343 + 342 344 sess_dev = rnbd_get_sess_dev(le32_to_cpu(close_msg->device_id), 343 345 srv_sess); 344 346 if (IS_ERR(sess_dev)) 345 - return 0; 347 + return; 346 348 347 349 rnbd_put_sess_dev(sess_dev); 348 350 mutex_lock(&srv_sess->lock); 349 351 rnbd_srv_destroy_dev_session_sysfs(sess_dev); 350 352 mutex_unlock(&srv_sess->lock); 351 - return 0; 352 353 } 353 354 354 355 static int process_msg_open(struct rnbd_srv_session *srv_sess, ··· 379 378 case RNBD_MSG_IO: 380 379 return process_rdma(srv_sess, id, data, datalen, usr, usrlen); 381 380 case RNBD_MSG_CLOSE: 382 - ret = process_msg_close(srv_sess, data, datalen, usr, usrlen); 381 + process_msg_close(srv_sess, data, datalen, usr, usrlen); 383 382 break; 384 383 case RNBD_MSG_OPEN: 385 384 ret = process_msg_open(srv_sess, usr, usrlen, data, datalen); ··· 394 393 return -EINVAL; 395 394 } 396 395 396 + /* 397 + * Since ret is passed to rtrs to handle the failure case, we 398 + * just return 0 at the end otherwise callers in rtrs would call 399 + * send_io_resp_imm again to print redundant err message. 
400 + */ 397 401 rtrs_srv_resp_rdma(id, ret); 398 402 return 0; 399 403 } ··· 510 504 } 511 505 512 506 static struct rnbd_srv_dev * 513 - rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, 507 + rnbd_srv_get_or_create_srv_dev(struct block_device *bdev, 514 508 struct rnbd_srv_session *srv_sess, 515 509 enum rnbd_access_mode access_mode) 516 510 { 517 511 int ret; 518 512 struct rnbd_srv_dev *new_dev, *dev; 519 513 520 - new_dev = rnbd_srv_init_srv_dev(rnbd_dev->bdev); 514 + new_dev = rnbd_srv_init_srv_dev(bdev); 521 515 if (IS_ERR(new_dev)) 522 516 return new_dev; 523 517 ··· 537 531 static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, 538 532 struct rnbd_srv_sess_dev *sess_dev) 539 533 { 540 - struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev; 534 + struct block_device *bdev = sess_dev->bdev; 541 535 542 536 rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); 543 - rsp->device_id = 544 - cpu_to_le32(sess_dev->device_id); 545 - rsp->nsectors = 546 - cpu_to_le64(get_capacity(rnbd_dev->bdev->bd_disk)); 547 - rsp->logical_block_size = 548 - cpu_to_le16(bdev_logical_block_size(rnbd_dev->bdev)); 549 - rsp->physical_block_size = 550 - cpu_to_le16(bdev_physical_block_size(rnbd_dev->bdev)); 551 - rsp->max_segments = 552 - cpu_to_le16(rnbd_dev_get_max_segs(rnbd_dev)); 537 + rsp->device_id = cpu_to_le32(sess_dev->device_id); 538 + rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev)); 539 + rsp->logical_block_size = cpu_to_le16(bdev_logical_block_size(bdev)); 540 + rsp->physical_block_size = cpu_to_le16(bdev_physical_block_size(bdev)); 541 + rsp->max_segments = cpu_to_le16(bdev_max_segments(bdev)); 553 542 rsp->max_hw_sectors = 554 - cpu_to_le32(rnbd_dev_get_max_hw_sects(rnbd_dev)); 543 + cpu_to_le32(queue_max_hw_sectors(bdev_get_queue(bdev))); 555 544 rsp->max_write_same_sectors = 0; 556 - rsp->max_discard_sectors = 557 - cpu_to_le32(rnbd_dev_get_max_discard_sects(rnbd_dev)); 558 - rsp->discard_granularity = 559 - 
cpu_to_le32(rnbd_dev_get_discard_granularity(rnbd_dev)); 560 - rsp->discard_alignment = 561 - cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev)); 562 - rsp->secure_discard = 563 - cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); 545 + rsp->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev)); 546 + rsp->discard_granularity = cpu_to_le32(bdev_discard_granularity(bdev)); 547 + rsp->discard_alignment = cpu_to_le32(bdev_discard_alignment(bdev)); 548 + rsp->secure_discard = cpu_to_le16(bdev_max_secure_erase_sectors(bdev)); 564 549 rsp->cache_policy = 0; 565 - if (bdev_write_cache(rnbd_dev->bdev)) 550 + if (bdev_write_cache(bdev)) 566 551 rsp->cache_policy |= RNBD_WRITEBACK; 567 - if (bdev_fua(rnbd_dev->bdev)) 552 + if (bdev_fua(bdev)) 568 553 rsp->cache_policy |= RNBD_FUA; 569 554 } 570 555 571 556 static struct rnbd_srv_sess_dev * 572 557 rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, 573 558 const struct rnbd_msg_open *open_msg, 574 - struct rnbd_dev *rnbd_dev, fmode_t open_flags, 559 + struct block_device *bdev, fmode_t open_flags, 575 560 struct rnbd_srv_dev *srv_dev) 576 561 { 577 562 struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess); ··· 574 577 575 578 strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); 576 579 577 - sdev->rnbd_dev = rnbd_dev; 580 + sdev->bdev = bdev; 578 581 sdev->sess = srv_sess; 579 582 sdev->dev = srv_dev; 580 583 sdev->open_flags = open_flags; ··· 640 643 struct rnbd_msg_sess_info_rsp *rsp = data; 641 644 642 645 srv_sess->ver = min_t(u8, sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); 643 - pr_debug("Session %s using protocol version %d (client version: %d, server version: %d)\n", 644 - srv_sess->sessname, srv_sess->ver, 645 - sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); 646 + 647 + trace_process_msg_sess_info(srv_sess, sess_info_msg); 646 648 647 649 rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); 648 650 rsp->ver = srv_sess->ver; ··· 681 685 struct rnbd_srv_dev *srv_dev; 682 
686 struct rnbd_srv_sess_dev *srv_sess_dev; 683 687 const struct rnbd_msg_open *open_msg = msg; 688 + struct block_device *bdev; 684 689 fmode_t open_flags; 685 690 char *full_path; 686 - struct rnbd_dev *rnbd_dev; 687 691 struct rnbd_msg_open_rsp *rsp = data; 688 692 689 - pr_debug("Open message received: session='%s' path='%s' access_mode=%d\n", 690 - srv_sess->sessname, open_msg->dev_name, 691 - open_msg->access_mode); 693 + trace_process_msg_open(srv_sess, open_msg); 694 + 692 695 open_flags = FMODE_READ; 693 696 if (open_msg->access_mode != RNBD_ACCESS_RO) 694 697 open_flags |= FMODE_WRITE; ··· 720 725 goto reject; 721 726 } 722 727 723 - rnbd_dev = rnbd_dev_open(full_path, open_flags); 724 - if (IS_ERR(rnbd_dev)) { 725 - pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %ld\n", 726 - full_path, srv_sess->sessname, PTR_ERR(rnbd_dev)); 727 - ret = PTR_ERR(rnbd_dev); 728 + bdev = blkdev_get_by_path(full_path, open_flags, THIS_MODULE); 729 + if (IS_ERR(bdev)) { 730 + ret = PTR_ERR(bdev); 731 + pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n", 732 + full_path, srv_sess->sessname, ret); 728 733 goto free_path; 729 734 } 730 735 731 - srv_dev = rnbd_srv_get_or_create_srv_dev(rnbd_dev, srv_sess, 736 + srv_dev = rnbd_srv_get_or_create_srv_dev(bdev, srv_sess, 732 737 open_msg->access_mode); 733 738 if (IS_ERR(srv_dev)) { 734 739 pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n", 735 740 full_path, srv_sess->sessname, PTR_ERR(srv_dev)); 736 741 ret = PTR_ERR(srv_dev); 737 - goto rnbd_dev_close; 742 + goto blkdev_put; 738 743 } 739 744 740 745 srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, 741 - rnbd_dev, open_flags, 746 + bdev, open_flags, 742 747 srv_dev); 743 748 if (IS_ERR(srv_sess_dev)) { 744 749 pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n", ··· 753 758 */ 754 759 
mutex_lock(&srv_dev->lock); 755 760 if (!srv_dev->dev_kobj.state_in_sysfs) { 756 - ret = rnbd_srv_create_dev_sysfs(srv_dev, rnbd_dev->bdev); 761 + ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev); 757 762 if (ret) { 758 763 mutex_unlock(&srv_dev->lock); 759 764 rnbd_srv_err(srv_sess_dev, ··· 795 800 mutex_unlock(&srv_dev->lock); 796 801 } 797 802 rnbd_put_srv_dev(srv_dev); 798 - rnbd_dev_close: 799 - rnbd_dev_close(rnbd_dev); 803 + blkdev_put: 804 + blkdev_put(bdev, open_flags); 800 805 free_path: 801 806 kfree(full_path); 802 807 reject:
+1 -1
drivers/block/rnbd/rnbd-srv.h
··· 46 46 struct rnbd_srv_sess_dev { 47 47 /* Entry inside rnbd_srv_dev struct */ 48 48 struct list_head dev_list; 49 - struct rnbd_dev *rnbd_dev; 49 + struct block_device *bdev; 50 50 struct rnbd_srv_session *sess; 51 51 struct rnbd_srv_dev *dev; 52 52 struct kobject kobj;
+285 -17
drivers/block/ublk_drv.c
··· 49 49 /* All UBLK_F_* have to be included into UBLK_F_ALL */ 50 50 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ 51 51 | UBLK_F_URING_CMD_COMP_IN_TASK \ 52 - | UBLK_F_NEED_GET_DATA) 52 + | UBLK_F_NEED_GET_DATA \ 53 + | UBLK_F_USER_RECOVERY \ 54 + | UBLK_F_USER_RECOVERY_REISSUE) 53 55 54 56 /* All UBLK_PARAM_TYPE_* should be included here */ 55 57 #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) ··· 121 119 122 120 unsigned long io_addr; /* mapped vm address */ 123 121 unsigned int max_io_sz; 124 - bool abort_work_pending; 122 + bool force_abort; 125 123 unsigned short nr_io_ready; /* how many ios setup */ 126 124 struct ublk_device *dev; 127 125 struct ublk_io ios[0]; ··· 163 161 * monitor each queue's daemon periodically 164 162 */ 165 163 struct delayed_work monitor_work; 164 + struct work_struct quiesce_work; 166 165 struct work_struct stop_work; 167 166 }; 168 167 ··· 324 321 325 322 return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc), 326 323 PAGE_SIZE); 324 + } 325 + 326 + static inline bool ublk_queue_can_use_recovery_reissue( 327 + struct ublk_queue *ubq) 328 + { 329 + if ((ubq->flags & UBLK_F_USER_RECOVERY) && 330 + (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) 331 + return true; 332 + return false; 333 + } 334 + 335 + static inline bool ublk_queue_can_use_recovery( 336 + struct ublk_queue *ubq) 337 + { 338 + if (ubq->flags & UBLK_F_USER_RECOVERY) 339 + return true; 340 + return false; 341 + } 342 + 343 + static inline bool ublk_can_use_recovery(struct ublk_device *ub) 344 + { 345 + if (ub->dev_info.flags & UBLK_F_USER_RECOVERY) 346 + return true; 347 + return false; 327 348 } 328 349 329 350 static void ublk_free_disk(struct gendisk *disk) ··· 639 612 * Also aborting may not be started yet, keep in mind that one failed 640 613 * request may be issued by block layer again. 
641 614 */ 642 - static void __ublk_fail_req(struct ublk_io *io, struct request *req) 615 + static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, 616 + struct request *req) 643 617 { 644 618 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); 645 619 646 620 if (!(io->flags & UBLK_IO_FLAG_ABORTED)) { 647 621 io->flags |= UBLK_IO_FLAG_ABORTED; 648 - blk_mq_end_request(req, BLK_STS_IOERR); 622 + if (ublk_queue_can_use_recovery_reissue(ubq)) 623 + blk_mq_requeue_request(req, false); 624 + else 625 + blk_mq_end_request(req, BLK_STS_IOERR); 649 626 } 650 627 } 651 628 ··· 670 639 671 640 #define UBLK_REQUEUE_DELAY_MS 3 672 641 642 + static inline void __ublk_abort_rq(struct ublk_queue *ubq, 643 + struct request *rq) 644 + { 645 + /* We cannot process this rq so just requeue it. */ 646 + if (ublk_queue_can_use_recovery(ubq)) 647 + blk_mq_requeue_request(rq, false); 648 + else 649 + blk_mq_end_request(rq, BLK_STS_IOERR); 650 + 651 + mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); 652 + } 653 + 673 654 static inline void __ublk_rq_task_work(struct request *req) 674 655 { 675 656 struct ublk_queue *ubq = req->mq_hctx->driver_data; 676 - struct ublk_device *ub = ubq->dev; 677 657 int tag = req->tag; 678 658 struct ublk_io *io = &ubq->ios[tag]; 679 - bool task_exiting = current != ubq->ubq_daemon || ubq_daemon_is_dying(ubq); 680 659 unsigned int mapped_bytes; 681 660 682 661 pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", 683 662 __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, 684 663 ublk_get_iod(ubq, req->tag)->addr); 685 664 686 - if (unlikely(task_exiting)) { 687 - blk_mq_end_request(req, BLK_STS_IOERR); 688 - mod_delayed_work(system_wq, &ub->monitor_work, 0); 665 + /* 666 + * Task is exiting if either: 667 + * 668 + * (1) current != ubq_daemon. 669 + * io_uring_cmd_complete_in_task() tries to run task_work 670 + * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING. 
671 + * 672 + * (2) current->flags & PF_EXITING. 673 + */ 674 + if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { 675 + __ublk_abort_rq(ubq, req); 689 676 return; 690 677 } 691 678 ··· 788 739 res = ublk_setup_iod(ubq, rq); 789 740 if (unlikely(res != BLK_STS_OK)) 790 741 return BLK_STS_IOERR; 742 + /* With recovery feature enabled, force_abort is set in 743 + * ublk_stop_dev() before calling del_gendisk(). We have to 744 + * abort all requeued and new rqs here to let del_gendisk() 745 + * move on. Besides, we cannot not call io_uring_cmd_complete_in_task() 746 + * to avoid UAF on io_uring ctx. 747 + * 748 + * Note: force_abort is guaranteed to be seen because it is set 749 + * before request queue is unqiuesced. 750 + */ 751 + if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort)) 752 + return BLK_STS_IOERR; 791 753 792 754 blk_mq_start_request(bd->rq); 793 755 794 756 if (unlikely(ubq_daemon_is_dying(ubq))) { 795 757 fail: 796 - mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); 797 - return BLK_STS_IOERR; 758 + __ublk_abort_rq(ubq, rq); 759 + return BLK_STS_OK; 798 760 } 799 761 800 762 if (ublk_can_use_task_work(ubq)) { ··· 976 916 */ 977 917 rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); 978 918 if (rq) 979 - __ublk_fail_req(io, rq); 919 + __ublk_fail_req(ubq, io, rq); 980 920 } 981 921 } 982 922 ublk_put_device(ub); ··· 992 932 struct ublk_queue *ubq = ublk_get_queue(ub, i); 993 933 994 934 if (ubq_daemon_is_dying(ubq)) { 995 - schedule_work(&ub->stop_work); 935 + if (ublk_queue_can_use_recovery(ubq)) 936 + schedule_work(&ub->quiesce_work); 937 + else 938 + schedule_work(&ub->stop_work); 996 939 997 940 /* abort queue is for making forward progress */ 998 941 ublk_abort_queue(ub, ubq); ··· 1003 940 } 1004 941 1005 942 /* 1006 - * We can't schedule monitor work after ublk_remove() is started. 943 + * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE. 
944 + * after ublk_remove() or __ublk_quiesce_dev() is started. 1007 945 * 1008 946 * No need ub->mutex, monitor work are canceled after state is marked 1009 - * as DEAD, so DEAD state is observed reliably. 947 + * as not LIVE, so new state is observed reliably. 1010 948 */ 1011 - if (ub->dev_info.state != UBLK_S_DEV_DEAD) 949 + if (ub->dev_info.state == UBLK_S_DEV_LIVE) 1012 950 schedule_delayed_work(&ub->monitor_work, 1013 951 UBLK_DAEMON_MONITOR_PERIOD); 1014 952 } ··· 1046 982 ublk_cancel_queue(ublk_get_queue(ub, i)); 1047 983 } 1048 984 1049 - static void ublk_stop_dev(struct ublk_device *ub) 985 + static bool ublk_check_inflight_rq(struct request *rq, void *data) 1050 986 { 987 + bool *idle = data; 988 + 989 + if (blk_mq_request_started(rq)) { 990 + *idle = false; 991 + return false; 992 + } 993 + return true; 994 + } 995 + 996 + static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) 997 + { 998 + bool idle; 999 + 1000 + WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue)); 1001 + while (true) { 1002 + idle = true; 1003 + blk_mq_tagset_busy_iter(&ub->tag_set, 1004 + ublk_check_inflight_rq, &idle); 1005 + if (idle) 1006 + break; 1007 + msleep(UBLK_REQUEUE_DELAY_MS); 1008 + } 1009 + } 1010 + 1011 + static void __ublk_quiesce_dev(struct ublk_device *ub) 1012 + { 1013 + pr_devel("%s: quiesce ub: dev_id %d state %s\n", 1014 + __func__, ub->dev_info.dev_id, 1015 + ub->dev_info.state == UBLK_S_DEV_LIVE ? 1016 + "LIVE" : "QUIESCED"); 1017 + blk_mq_quiesce_queue(ub->ub_disk->queue); 1018 + ublk_wait_tagset_rqs_idle(ub); 1019 + ub->dev_info.state = UBLK_S_DEV_QUIESCED; 1020 + ublk_cancel_dev(ub); 1021 + /* we are going to release task_struct of ubq_daemon and resets 1022 + * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF. 1023 + * Besides, monitor_work is not necessary in QUIESCED state since we have 1024 + * already scheduled quiesce_work and quiesced all ubqs. 
1025 + * 1026 + * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel 1027 + * it here and re-schedule it in END_USER_RECOVERY to avoid UAF. 1028 + */ 1029 + cancel_delayed_work_sync(&ub->monitor_work); 1030 + } 1031 + 1032 + static void ublk_quiesce_work_fn(struct work_struct *work) 1033 + { 1034 + struct ublk_device *ub = 1035 + container_of(work, struct ublk_device, quiesce_work); 1036 + 1051 1037 mutex_lock(&ub->mutex); 1052 1038 if (ub->dev_info.state != UBLK_S_DEV_LIVE) 1053 1039 goto unlock; 1040 + __ublk_quiesce_dev(ub); 1041 + unlock: 1042 + mutex_unlock(&ub->mutex); 1043 + } 1054 1044 1045 + static void ublk_unquiesce_dev(struct ublk_device *ub) 1046 + { 1047 + int i; 1048 + 1049 + pr_devel("%s: unquiesce ub: dev_id %d state %s\n", 1050 + __func__, ub->dev_info.dev_id, 1051 + ub->dev_info.state == UBLK_S_DEV_LIVE ? 1052 + "LIVE" : "QUIESCED"); 1053 + /* quiesce_work has run. We let requeued rqs be aborted 1054 + * before running fallback_wq. "force_abort" must be seen 1055 + * after request queue is unqiuesced. Then del_gendisk() 1056 + * can move on. 
1057 + */ 1058 + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1059 + ublk_get_queue(ub, i)->force_abort = true; 1060 + 1061 + blk_mq_unquiesce_queue(ub->ub_disk->queue); 1062 + /* We may have requeued some rqs in ublk_quiesce_queue() */ 1063 + blk_mq_kick_requeue_list(ub->ub_disk->queue); 1064 + } 1065 + 1066 + static void ublk_stop_dev(struct ublk_device *ub) 1067 + { 1068 + mutex_lock(&ub->mutex); 1069 + if (ub->dev_info.state == UBLK_S_DEV_DEAD) 1070 + goto unlock; 1071 + if (ublk_can_use_recovery(ub)) { 1072 + if (ub->dev_info.state == UBLK_S_DEV_LIVE) 1073 + __ublk_quiesce_dev(ub); 1074 + ublk_unquiesce_dev(ub); 1075 + } 1055 1076 del_gendisk(ub->ub_disk); 1056 1077 ub->dev_info.state = UBLK_S_DEV_DEAD; 1057 1078 ub->dev_info.ublksrv_pid = -1; ··· 1460 1311 { 1461 1312 ublk_stop_dev(ub); 1462 1313 cancel_work_sync(&ub->stop_work); 1314 + cancel_work_sync(&ub->quiesce_work); 1463 1315 cdev_device_del(&ub->cdev, &ub->cdev_dev); 1464 1316 put_device(&ub->cdev_dev); 1465 1317 } ··· 1637 1487 goto out_unlock; 1638 1488 mutex_init(&ub->mutex); 1639 1489 spin_lock_init(&ub->mm_lock); 1490 + INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); 1640 1491 INIT_WORK(&ub->stop_work, ublk_stop_work_fn); 1641 1492 INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work); 1642 1493 ··· 1758 1607 1759 1608 ublk_stop_dev(ub); 1760 1609 cancel_work_sync(&ub->stop_work); 1610 + cancel_work_sync(&ub->quiesce_work); 1761 1611 1762 1612 ublk_put_device(ub); 1763 1613 return 0; ··· 1861 1709 return ret; 1862 1710 } 1863 1711 1712 + static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) 1713 + { 1714 + int i; 1715 + 1716 + WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); 1717 + /* All old ioucmds have to be completed */ 1718 + WARN_ON_ONCE(ubq->nr_io_ready); 1719 + /* old daemon is PF_EXITING, put it now */ 1720 + put_task_struct(ubq->ubq_daemon); 1721 + /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ 1722 + 
ubq->ubq_daemon = NULL; 1723 + 1724 + for (i = 0; i < ubq->q_depth; i++) { 1725 + struct ublk_io *io = &ubq->ios[i]; 1726 + 1727 + /* forget everything now and be ready for new FETCH_REQ */ 1728 + io->flags = 0; 1729 + io->cmd = NULL; 1730 + io->addr = 0; 1731 + } 1732 + } 1733 + 1734 + static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) 1735 + { 1736 + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; 1737 + struct ublk_device *ub; 1738 + int ret = -EINVAL; 1739 + int i; 1740 + 1741 + ub = ublk_get_device_from_id(header->dev_id); 1742 + if (!ub) 1743 + return ret; 1744 + 1745 + mutex_lock(&ub->mutex); 1746 + if (!ublk_can_use_recovery(ub)) 1747 + goto out_unlock; 1748 + /* 1749 + * START_RECOVERY is only allowd after: 1750 + * 1751 + * (1) UB_STATE_OPEN is not set, which means the dying process is exited 1752 + * and related io_uring ctx is freed so file struct of /dev/ublkcX is 1753 + * released. 1754 + * 1755 + * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: 1756 + * (a)has quiesced request queue 1757 + * (b)has requeued every inflight rqs whose io_flags is ACTIVE 1758 + * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE 1759 + * (d)has completed/camceled all ioucmds owned by ther dying process 1760 + */ 1761 + if (test_bit(UB_STATE_OPEN, &ub->state) || 1762 + ub->dev_info.state != UBLK_S_DEV_QUIESCED) { 1763 + ret = -EBUSY; 1764 + goto out_unlock; 1765 + } 1766 + pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); 1767 + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 1768 + ublk_queue_reinit(ub, ublk_get_queue(ub, i)); 1769 + /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ 1770 + ub->mm = NULL; 1771 + ub->nr_queues_ready = 0; 1772 + init_completion(&ub->completion); 1773 + ret = 0; 1774 + out_unlock: 1775 + mutex_unlock(&ub->mutex); 1776 + ublk_put_device(ub); 1777 + return ret; 1778 + } 1779 + 1780 + static int ublk_ctrl_end_recovery(struct 
io_uring_cmd *cmd) 1781 + { 1782 + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; 1783 + int ublksrv_pid = (int)header->data[0]; 1784 + struct ublk_device *ub; 1785 + int ret = -EINVAL; 1786 + 1787 + ub = ublk_get_device_from_id(header->dev_id); 1788 + if (!ub) 1789 + return ret; 1790 + 1791 + pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", 1792 + __func__, ub->dev_info.nr_hw_queues, header->dev_id); 1793 + /* wait until new ubq_daemon sending all FETCH_REQ */ 1794 + wait_for_completion_interruptible(&ub->completion); 1795 + pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", 1796 + __func__, ub->dev_info.nr_hw_queues, header->dev_id); 1797 + 1798 + mutex_lock(&ub->mutex); 1799 + if (!ublk_can_use_recovery(ub)) 1800 + goto out_unlock; 1801 + 1802 + if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { 1803 + ret = -EBUSY; 1804 + goto out_unlock; 1805 + } 1806 + ub->dev_info.ublksrv_pid = ublksrv_pid; 1807 + pr_devel("%s: new ublksrv_pid %d, dev id %d\n", 1808 + __func__, ublksrv_pid, header->dev_id); 1809 + blk_mq_unquiesce_queue(ub->ub_disk->queue); 1810 + pr_devel("%s: queue unquiesced, dev id %d.\n", 1811 + __func__, header->dev_id); 1812 + blk_mq_kick_requeue_list(ub->ub_disk->queue); 1813 + ub->dev_info.state = UBLK_S_DEV_LIVE; 1814 + schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); 1815 + ret = 0; 1816 + out_unlock: 1817 + mutex_unlock(&ub->mutex); 1818 + ublk_put_device(ub); 1819 + return ret; 1820 + } 1821 + 1864 1822 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, 1865 1823 unsigned int issue_flags) 1866 1824 { ··· 2011 1749 break; 2012 1750 case UBLK_CMD_SET_PARAMS: 2013 1751 ret = ublk_ctrl_set_params(cmd); 1752 + break; 1753 + case UBLK_CMD_START_USER_RECOVERY: 1754 + ret = ublk_ctrl_start_recovery(cmd); 1755 + break; 1756 + case UBLK_CMD_END_USER_RECOVERY: 1757 + ret = ublk_ctrl_end_recovery(cmd); 2014 1758 break; 2015 1759 default: 2016 1760 break;
+1 -3
drivers/block/virtio_blk.c
··· 801 801 NULL, 802 802 }; 803 803 804 - static int virtblk_map_queues(struct blk_mq_tag_set *set) 804 + static void virtblk_map_queues(struct blk_mq_tag_set *set) 805 805 { 806 806 struct virtio_blk *vblk = set->driver_data; 807 807 int i, qoff; ··· 826 826 else 827 827 blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0); 828 828 } 829 - 830 - return 0; 831 829 } 832 830 833 831 static void virtblk_complete_batch(struct io_comp_batch *iob)
+3 -3
drivers/block/zram/zram_drv.c
··· 499 499 goto out; 500 500 } 501 501 502 - strlcpy(file_name, buf, PATH_MAX); 502 + strscpy(file_name, buf, PATH_MAX); 503 503 /* ignore trailing newline */ 504 504 sz = strlen(file_name); 505 505 if (sz > 0 && file_name[sz - 1] == '\n') ··· 1031 1031 char compressor[ARRAY_SIZE(zram->compressor)]; 1032 1032 size_t sz; 1033 1033 1034 - strlcpy(compressor, buf, sizeof(compressor)); 1034 + strscpy(compressor, buf, sizeof(compressor)); 1035 1035 /* ignore trailing newline */ 1036 1036 sz = strlen(compressor); 1037 1037 if (sz > 0 && compressor[sz - 1] == '\n') ··· 1974 1974 if (ret) 1975 1975 goto out_cleanup_disk; 1976 1976 1977 - strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); 1977 + strscpy(zram->compressor, default_compressor, sizeof(zram->compressor)); 1978 1978 1979 1979 zram_debugfs_register(zram); 1980 1980 pr_info("Added device: %s\n", zram->disk->disk_name);
+1 -1
drivers/md/bcache/bcache.h
··· 107 107 * 108 108 * BTREE NODES: 109 109 * 110 - * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and 110 + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and 111 111 * free smaller than a bucket - so, that's how big our btree nodes are. 112 112 * 113 113 * (If buckets are really big we'll only use part of the bucket for a btree node
+1 -1
drivers/md/bcache/bset.c
··· 1264 1264 * 1265 1265 * Don't worry event 'out' is allocated from mempool, it can 1266 1266 * still be swapped here. Because state->pool is a page mempool 1267 - * creaated by by mempool_init_page_pool(), which allocates 1267 + * created by mempool_init_page_pool(), which allocates 1268 1268 * pages by alloc_pages() indeed. 1269 1269 */ 1270 1270
-1
drivers/md/bcache/stats.h
··· 54 54 55 55 void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, 56 56 bool hit, bool bypass); 57 - void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d); 58 57 void bch_mark_cache_miss_collision(struct cache_set *c, 59 58 struct bcache_device *d); 60 59 void bch_mark_sectors_bypassed(struct cache_set *c,
+54 -24
drivers/md/bcache/writeback.c
··· 157 157 dc->writeback_rate_target = target; 158 158 } 159 159 160 + static bool idle_counter_exceeded(struct cache_set *c) 161 + { 162 + int counter, dev_nr; 163 + 164 + /* 165 + * If c->idle_counter is overflow (idel for really long time), 166 + * reset as 0 and not set maximum rate this time for code 167 + * simplicity. 168 + */ 169 + counter = atomic_inc_return(&c->idle_counter); 170 + if (counter <= 0) { 171 + atomic_set(&c->idle_counter, 0); 172 + return false; 173 + } 174 + 175 + dev_nr = atomic_read(&c->attached_dev_nr); 176 + if (dev_nr == 0) 177 + return false; 178 + 179 + /* 180 + * c->idle_counter is increased by writeback thread of all 181 + * attached backing devices, in order to represent a rough 182 + * time period, counter should be divided by dev_nr. 183 + * Otherwise the idle time cannot be larger with more backing 184 + * device attached. 185 + * The following calculation equals to checking 186 + * (counter / dev_nr) < (dev_nr * 6) 187 + */ 188 + if (counter < (dev_nr * dev_nr * 6)) 189 + return false; 190 + 191 + return true; 192 + } 193 + 194 + /* 195 + * Idle_counter is increased every time when update_writeback_rate() is 196 + * called. If all backing devices attached to the same cache set have 197 + * identical dc->writeback_rate_update_seconds values, it is about 6 198 + * rounds of update_writeback_rate() on each backing device before 199 + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set 200 + * to each dc->writeback_rate.rate. 201 + * In order to avoid extra locking cost for counting exact dirty cached 202 + * devices number, c->attached_dev_nr is used to calculate the idle 203 + * throushold. It might be bigger if not all cached device are in write- 204 + * back mode, but it still works well with limited extra rounds of 205 + * update_writeback_rate(). 
206 + */ 160 207 static bool set_at_max_writeback_rate(struct cache_set *c, 161 208 struct cached_dev *dc) 162 209 { ··· 214 167 /* Don't set max writeback rate if gc is running */ 215 168 if (!c->gc_mark_valid) 216 169 return false; 217 - /* 218 - * Idle_counter is increased everytime when update_writeback_rate() is 219 - * called. If all backing devices attached to the same cache set have 220 - * identical dc->writeback_rate_update_seconds values, it is about 6 221 - * rounds of update_writeback_rate() on each backing device before 222 - * c->at_max_writeback_rate is set to 1, and then max wrteback rate set 223 - * to each dc->writeback_rate.rate. 224 - * In order to avoid extra locking cost for counting exact dirty cached 225 - * devices number, c->attached_dev_nr is used to calculate the idle 226 - * throushold. It might be bigger if not all cached device are in write- 227 - * back mode, but it still works well with limited extra rounds of 228 - * update_writeback_rate(). 229 - */ 230 - if (atomic_inc_return(&c->idle_counter) < 231 - atomic_read(&c->attached_dev_nr) * 6) 170 + 171 + if (!idle_counter_exceeded(c)) 232 172 return false; 233 173 234 174 if (atomic_read(&c->at_max_writeback_rate) != 1) ··· 229 195 dc->writeback_rate_change = 0; 230 196 231 197 /* 232 - * Check c->idle_counter and c->at_max_writeback_rate agagain in case 233 - * new I/O arrives during before set_at_max_writeback_rate() returns. 234 - * Then the writeback rate is set to 1, and its new value should be 235 - * decided via __update_writeback_rate(). 198 + * In case new I/O arrives during before 199 + * set_at_max_writeback_rate() returns. 
236 200 */ 237 - if ((atomic_read(&c->idle_counter) < 238 - atomic_read(&c->attached_dev_nr) * 6) || 201 + if (!idle_counter_exceeded(c) || 239 202 !atomic_read(&c->at_max_writeback_rate)) 240 203 return false; 241 204 ··· 832 801 } 833 802 } 834 803 835 - if (dc->writeback_write_wq) { 836 - flush_workqueue(dc->writeback_write_wq); 804 + if (dc->writeback_write_wq) 837 805 destroy_workqueue(dc->writeback_write_wq); 838 - } 806 + 839 807 cached_dev_put(dc); 840 808 wait_for_kthread_stop(); 841 809
+1 -3
drivers/md/dm-table.c
··· 1856 1856 static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, 1857 1857 sector_t start, sector_t len, void *data) 1858 1858 { 1859 - struct request_queue *q = bdev_get_queue(dev->bdev); 1860 - 1861 - return !blk_queue_nowait(q); 1859 + return !bdev_nowait(dev->bdev); 1862 1860 } 1863 1861 1864 1862 static bool dm_table_supports_nowait(struct dm_table *t)
+2 -3
drivers/md/md.c
··· 5845 5845 } 5846 5846 } 5847 5847 sysfs_notify_dirent_safe(rdev->sysfs_state); 5848 - nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev)); 5848 + nowait = nowait && bdev_nowait(rdev->bdev); 5849 5849 } 5850 5850 5851 5851 if (!bioset_initialized(&mddev->bio_set)) { ··· 6982 6982 * If the new disk does not support REQ_NOWAIT, 6983 6983 * disable on the whole MD. 6984 6984 */ 6985 - if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) { 6985 + if (!bdev_nowait(rdev->bdev)) { 6986 6986 pr_info("%s: Disabling nowait because %pg does not support nowait\n", 6987 6987 mdname(mddev), rdev->bdev); 6988 6988 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); ··· 8156 8156 list_for_each(tmp,&all_mddevs) 8157 8157 if (!l--) { 8158 8158 mddev = list_entry(tmp, struct mddev, all_mddevs); 8159 - mddev_get(mddev); 8160 8159 if (!mddev_get(mddev)) 8161 8160 continue; 8162 8161 spin_unlock(&all_mddevs_lock);
+1 -1
drivers/md/raid0.c
··· 47 47 int len = 0; 48 48 49 49 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 50 - len += snprintf(line+len, 200-len, "%s%pg", k?"/":"", 50 + len += scnprintf(line+len, 200-len, "%s%pg", k?"/":"", 51 51 conf->devlist[j * raid_disks + k]->bdev); 52 52 pr_debug("md: zone%d=[%s]\n", j, line); 53 53
+96 -55
drivers/md/raid10.c
··· 79 79 80 80 #include "raid1-10.c" 81 81 82 + #define NULL_CMD 83 + #define cmd_before(conf, cmd) \ 84 + do { \ 85 + write_sequnlock_irq(&(conf)->resync_lock); \ 86 + cmd; \ 87 + } while (0) 88 + #define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock) 89 + 90 + #define wait_event_barrier_cmd(conf, cond, cmd) \ 91 + wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \ 92 + cmd_after(conf)) 93 + 94 + #define wait_event_barrier(conf, cond) \ 95 + wait_event_barrier_cmd(conf, cond, NULL_CMD) 96 + 82 97 /* 83 98 * for resync bio, r10bio pointer can be retrieved from the per-bio 84 99 * 'struct resync_pages'. ··· 287 272 mempool_free(r10_bio, &conf->r10buf_pool); 288 273 289 274 lower_barrier(conf); 275 + } 276 + 277 + static void wake_up_barrier(struct r10conf *conf) 278 + { 279 + if (wq_has_sleeper(&conf->wait_barrier)) 280 + wake_up(&conf->wait_barrier); 290 281 } 291 282 292 283 static void reschedule_retry(struct r10bio *r10_bio) ··· 951 930 952 931 static void raise_barrier(struct r10conf *conf, int force) 953 932 { 933 + write_seqlock_irq(&conf->resync_lock); 954 934 BUG_ON(force && !conf->barrier); 955 - spin_lock_irq(&conf->resync_lock); 956 935 957 936 /* Wait until no block IO is waiting (unless 'force') */ 958 - wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, 959 - conf->resync_lock); 937 + wait_event_barrier(conf, force || !conf->nr_waiting); 960 938 961 939 /* block any new IO from starting */ 962 - conf->barrier++; 940 + WRITE_ONCE(conf->barrier, conf->barrier + 1); 963 941 964 942 /* Now wait for all pending IO to complete */ 965 - wait_event_lock_irq(conf->wait_barrier, 966 - !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH, 967 - conf->resync_lock); 943 + wait_event_barrier(conf, !atomic_read(&conf->nr_pending) && 944 + conf->barrier < RESYNC_DEPTH); 968 945 969 - spin_unlock_irq(&conf->resync_lock); 946 + write_sequnlock_irq(&conf->resync_lock); 970 947 } 971 948 972 949 static void 
lower_barrier(struct r10conf *conf) 973 950 { 974 951 unsigned long flags; 975 - spin_lock_irqsave(&conf->resync_lock, flags); 976 - conf->barrier--; 977 - spin_unlock_irqrestore(&conf->resync_lock, flags); 952 + 953 + write_seqlock_irqsave(&conf->resync_lock, flags); 954 + WRITE_ONCE(conf->barrier, conf->barrier - 1); 955 + write_sequnlock_irqrestore(&conf->resync_lock, flags); 978 956 wake_up(&conf->wait_barrier); 957 + } 958 + 959 + static bool stop_waiting_barrier(struct r10conf *conf) 960 + { 961 + struct bio_list *bio_list = current->bio_list; 962 + 963 + /* barrier is dropped */ 964 + if (!conf->barrier) 965 + return true; 966 + 967 + /* 968 + * If there are already pending requests (preventing the barrier from 969 + * rising completely), and the pre-process bio queue isn't empty, then 970 + * don't wait, as we need to empty that queue to get the nr_pending 971 + * count down. 972 + */ 973 + if (atomic_read(&conf->nr_pending) && bio_list && 974 + (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) 975 + return true; 976 + 977 + /* move on if recovery thread is blocked by us */ 978 + if (conf->mddev->thread->tsk == current && 979 + test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) && 980 + conf->nr_queued > 0) 981 + return true; 982 + 983 + return false; 984 + } 985 + 986 + static bool wait_barrier_nolock(struct r10conf *conf) 987 + { 988 + unsigned int seq = read_seqbegin(&conf->resync_lock); 989 + 990 + if (READ_ONCE(conf->barrier)) 991 + return false; 992 + 993 + atomic_inc(&conf->nr_pending); 994 + if (!read_seqretry(&conf->resync_lock, seq)) 995 + return true; 996 + 997 + if (atomic_dec_and_test(&conf->nr_pending)) 998 + wake_up_barrier(conf); 999 + 1000 + return false; 979 1001 } 980 1002 981 1003 static bool wait_barrier(struct r10conf *conf, bool nowait) 982 1004 { 983 1005 bool ret = true; 984 1006 985 - spin_lock_irq(&conf->resync_lock); 1007 + if (wait_barrier_nolock(conf)) 1008 + return true; 1009 + 1010 + 
write_seqlock_irq(&conf->resync_lock); 986 1011 if (conf->barrier) { 987 - struct bio_list *bio_list = current->bio_list; 988 - conf->nr_waiting++; 989 - /* Wait for the barrier to drop. 990 - * However if there are already pending 991 - * requests (preventing the barrier from 992 - * rising completely), and the 993 - * pre-process bio queue isn't empty, 994 - * then don't wait, as we need to empty 995 - * that queue to get the nr_pending 996 - * count down. 997 - */ 998 1012 /* Return false when nowait flag is set */ 999 1013 if (nowait) { 1000 1014 ret = false; 1001 1015 } else { 1016 + conf->nr_waiting++; 1002 1017 raid10_log(conf->mddev, "wait barrier"); 1003 - wait_event_lock_irq(conf->wait_barrier, 1004 - !conf->barrier || 1005 - (atomic_read(&conf->nr_pending) && 1006 - bio_list && 1007 - (!bio_list_empty(&bio_list[0]) || 1008 - !bio_list_empty(&bio_list[1]))) || 1009 - /* move on if recovery thread is 1010 - * blocked by us 1011 - */ 1012 - (conf->mddev->thread->tsk == current && 1013 - test_bit(MD_RECOVERY_RUNNING, 1014 - &conf->mddev->recovery) && 1015 - conf->nr_queued > 0), 1016 - conf->resync_lock); 1018 + wait_event_barrier(conf, stop_waiting_barrier(conf)); 1019 + conf->nr_waiting--; 1017 1020 } 1018 - conf->nr_waiting--; 1019 1021 if (!conf->nr_waiting) 1020 1022 wake_up(&conf->wait_barrier); 1021 1023 } 1022 1024 /* Only increment nr_pending when we wait */ 1023 1025 if (ret) 1024 1026 atomic_inc(&conf->nr_pending); 1025 - spin_unlock_irq(&conf->resync_lock); 1027 + write_sequnlock_irq(&conf->resync_lock); 1026 1028 return ret; 1027 1029 } 1028 1030 ··· 1053 1009 { 1054 1010 if ((atomic_dec_and_test(&conf->nr_pending)) || 1055 1011 (conf->array_freeze_pending)) 1056 - wake_up(&conf->wait_barrier); 1012 + wake_up_barrier(conf); 1057 1013 } 1058 1014 1059 1015 static void freeze_array(struct r10conf *conf, int extra) ··· 1070 1026 * must match the number of pending IOs (nr_pending) before 1071 1027 * we continue. 
1072 1028 */ 1073 - spin_lock_irq(&conf->resync_lock); 1029 + write_seqlock_irq(&conf->resync_lock); 1074 1030 conf->array_freeze_pending++; 1075 - conf->barrier++; 1031 + WRITE_ONCE(conf->barrier, conf->barrier + 1); 1076 1032 conf->nr_waiting++; 1077 - wait_event_lock_irq_cmd(conf->wait_barrier, 1078 - atomic_read(&conf->nr_pending) == conf->nr_queued+extra, 1079 - conf->resync_lock, 1080 - flush_pending_writes(conf)); 1081 - 1033 + wait_event_barrier_cmd(conf, atomic_read(&conf->nr_pending) == 1034 + conf->nr_queued + extra, flush_pending_writes(conf)); 1082 1035 conf->array_freeze_pending--; 1083 - spin_unlock_irq(&conf->resync_lock); 1036 + write_sequnlock_irq(&conf->resync_lock); 1084 1037 } 1085 1038 1086 1039 static void unfreeze_array(struct r10conf *conf) 1087 1040 { 1088 1041 /* reverse the effect of the freeze */ 1089 - spin_lock_irq(&conf->resync_lock); 1090 - conf->barrier--; 1042 + write_seqlock_irq(&conf->resync_lock); 1043 + WRITE_ONCE(conf->barrier, conf->barrier - 1); 1091 1044 conf->nr_waiting--; 1092 1045 wake_up(&conf->wait_barrier); 1093 - spin_unlock_irq(&conf->resync_lock); 1046 + write_sequnlock_irq(&conf->resync_lock); 1094 1047 } 1095 1048 1096 1049 static sector_t choose_data_offset(struct r10bio *r10_bio, ··· 1926 1885 __make_request(mddev, bio, sectors); 1927 1886 1928 1887 /* In case raid10d snuck in to freeze_array */ 1929 - wake_up(&conf->wait_barrier); 1888 + wake_up_barrier(conf); 1930 1889 return true; 1931 1890 } 1932 1891 ··· 2021 1980 * Otherwise, it must be degraded: 2022 1981 * - recovery is interrupted. 2023 1982 * - &mddev->degraded is bumped. 2024 - 1983 + * 2025 1984 * @rdev is marked as &Faulty excluding case when array is failed and 2026 1985 * &mddev->fail_last_dev is off. 
2027 1986 */ ··· 4073 4032 INIT_LIST_HEAD(&conf->retry_list); 4074 4033 INIT_LIST_HEAD(&conf->bio_end_io_list); 4075 4034 4076 - spin_lock_init(&conf->resync_lock); 4035 + seqlock_init(&conf->resync_lock); 4077 4036 init_waitqueue_head(&conf->wait_barrier); 4078 4037 atomic_set(&conf->nr_pending, 0); 4079 4038 ··· 4392 4351 rdev->new_raid_disk = rdev->raid_disk * 2; 4393 4352 rdev->sectors = size; 4394 4353 } 4395 - conf->barrier = 1; 4354 + WRITE_ONCE(conf->barrier, 1); 4396 4355 } 4397 4356 4398 4357 return conf;
+1 -1
drivers/md/raid10.h
··· 76 76 /* queue pending writes and submit them on unplug */ 77 77 struct bio_list pending_bio_list; 78 78 79 - spinlock_t resync_lock; 79 + seqlock_t resync_lock; 80 80 atomic_t nr_pending; 81 81 int nr_waiting; 82 82 int nr_queued;
+6 -5
drivers/md/raid5-cache.c
··· 125 125 * reclaimed. if it's 0, reclaim spaces 126 126 * used by io_units which are in 127 127 * IO_UNIT_STRIPE_END state (eg, reclaim 128 - * dones't wait for specific io_unit 128 + * doesn't wait for specific io_unit 129 129 * switching to IO_UNIT_STRIPE_END 130 130 * state) */ 131 131 wait_queue_head_t iounit_wait; ··· 1327 1327 * superblock is updated to new log tail. Updating superblock (either 1328 1328 * directly call md_update_sb() or depend on md thread) must hold 1329 1329 * reconfig mutex. On the other hand, raid5_quiesce is called with 1330 - * reconfig_mutex hold. The first step of raid5_quiesce() is waitting 1331 - * for all IO finish, hence waitting for reclaim thread, while reclaim 1332 - * thread is calling this function and waitting for reconfig mutex. So 1330 + * reconfig_mutex hold. The first step of raid5_quiesce() is waiting 1331 + * for all IO finish, hence waiting for reclaim thread, while reclaim 1332 + * thread is calling this function and waiting for reconfig mutex. So 1333 1333 * there is a deadlock. We workaround this issue with a trylock. 1334 1334 * FIXME: we could miss discard if we can't take reconfig mutex 1335 1335 */ ··· 1923 1923 { 1924 1924 struct stripe_head *sh; 1925 1925 1926 - sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0); 1926 + sh = raid5_get_active_stripe(conf, NULL, stripe_sect, 1927 + noblock ? R5_GAS_NOBLOCK : 0); 1927 1928 if (!sh) 1928 1929 return NULL; /* no more stripe available */ 1929 1930
+82 -69
drivers/md/raid5.c
··· 36 36 */ 37 37 38 38 #include <linux/blkdev.h> 39 + #include <linux/delay.h> 39 40 #include <linux/kthread.h> 40 41 #include <linux/raid/pq.h> 41 42 #include <linux/async_tx.h> ··· 790 789 */ 791 790 static bool is_inactive_blocked(struct r5conf *conf, int hash) 792 791 { 793 - int active = atomic_read(&conf->active_stripes); 794 - 795 792 if (list_empty(conf->inactive_list + hash)) 796 793 return false; 797 794 798 795 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 799 796 return true; 800 797 801 - return active < (conf->max_nr_stripes * 3 / 4); 798 + return (atomic_read(&conf->active_stripes) < 799 + (conf->max_nr_stripes * 3 / 4)); 802 800 } 803 801 804 - static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf, 802 + struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, 805 803 struct stripe_request_ctx *ctx, sector_t sector, 806 - bool previous, bool noblock, bool noquiesce) 804 + unsigned int flags) 807 805 { 808 806 struct stripe_head *sh; 809 807 int hash = stripe_hash_locks_hash(conf, sector); 808 + int previous = !!(flags & R5_GAS_PREVIOUS); 810 809 811 810 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 812 811 813 812 spin_lock_irq(conf->hash_locks + hash); 814 813 815 - retry: 816 - if (!noquiesce && conf->quiesce) { 817 - /* 818 - * Must release the reference to batch_last before waiting, 819 - * on quiesce, otherwise the batch_last will hold a reference 820 - * to a stripe and raid5_quiesce() will deadlock waiting for 821 - * active_stripes to go to zero. 
822 - */ 823 - if (ctx && ctx->batch_last) { 824 - raid5_release_stripe(ctx->batch_last); 825 - ctx->batch_last = NULL; 814 + for (;;) { 815 + if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) { 816 + /* 817 + * Must release the reference to batch_last before 818 + * waiting, on quiesce, otherwise the batch_last will 819 + * hold a reference to a stripe and raid5_quiesce() 820 + * will deadlock waiting for active_stripes to go to 821 + * zero. 822 + */ 823 + if (ctx && ctx->batch_last) { 824 + raid5_release_stripe(ctx->batch_last); 825 + ctx->batch_last = NULL; 826 + } 827 + 828 + wait_event_lock_irq(conf->wait_for_quiescent, 829 + !conf->quiesce, 830 + *(conf->hash_locks + hash)); 826 831 } 827 832 828 - wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce, 833 + sh = find_get_stripe(conf, sector, conf->generation - previous, 834 + hash); 835 + if (sh) 836 + break; 837 + 838 + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 839 + sh = get_free_stripe(conf, hash); 840 + if (sh) { 841 + r5c_check_stripe_cache_usage(conf); 842 + init_stripe(sh, sector, previous); 843 + atomic_inc(&sh->count); 844 + break; 845 + } 846 + 847 + if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) 848 + set_bit(R5_ALLOC_MORE, &conf->cache_state); 849 + } 850 + 851 + if (flags & R5_GAS_NOBLOCK) 852 + break; 853 + 854 + set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); 855 + r5l_wake_reclaim(conf->log, 0); 856 + wait_event_lock_irq(conf->wait_for_stripe, 857 + is_inactive_blocked(conf, hash), 829 858 *(conf->hash_locks + hash)); 859 + clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); 830 860 } 831 861 832 - sh = find_get_stripe(conf, sector, conf->generation - previous, hash); 833 - if (sh) 834 - goto out; 835 - 836 - if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 837 - goto wait_for_stripe; 838 - 839 - sh = get_free_stripe(conf, hash); 840 - if (sh) { 841 - r5c_check_stripe_cache_usage(conf); 842 - init_stripe(sh, sector, previous); 843 - atomic_inc(&sh->count); 
844 - goto out; 845 - } 846 - 847 - if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) 848 - set_bit(R5_ALLOC_MORE, &conf->cache_state); 849 - 850 - wait_for_stripe: 851 - if (noblock) 852 - goto out; 853 - 854 - set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); 855 - r5l_wake_reclaim(conf->log, 0); 856 - wait_event_lock_irq(conf->wait_for_stripe, 857 - is_inactive_blocked(conf, hash), 858 - *(conf->hash_locks + hash)); 859 - clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); 860 - goto retry; 861 - 862 - out: 863 862 spin_unlock_irq(conf->hash_locks + hash); 864 863 return sh; 865 - } 866 - 867 - struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, 868 - sector_t sector, bool previous, bool noblock, bool noquiesce) 869 - { 870 - return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock, 871 - noquiesce); 872 864 } 873 865 874 866 static bool is_full_stripe_write(struct stripe_head *sh) ··· 4041 4047 * back cache (prexor with orig_page, and then xor with 4042 4048 * page) in the read path 4043 4049 */ 4044 - if (s->injournal && s->failed) { 4050 + if (s->to_read && s->injournal && s->failed) { 4045 4051 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 4046 4052 r5c_make_stripe_write_out(sh); 4047 4053 goto out; ··· 4630 4636 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4631 4637 sector_t s = raid5_compute_sector(conf, bn, 0, 4632 4638 &dd_idx, NULL); 4633 - sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4639 + sh2 = raid5_get_active_stripe(conf, NULL, s, 4640 + R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); 4634 4641 if (sh2 == NULL) 4635 4642 /* so far only the early blocks of this stripe 4636 4643 * have been requested. 
When later blocks ··· 5268 5273 /* Finish reconstruct operations initiated by the expansion process */ 5269 5274 if (sh->reconstruct_state == reconstruct_state_result) { 5270 5275 struct stripe_head *sh_src 5271 - = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 5276 + = raid5_get_active_stripe(conf, NULL, sh->sector, 5277 + R5_GAS_PREVIOUS | R5_GAS_NOBLOCK | 5278 + R5_GAS_NOQUIESCE); 5272 5279 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 5273 5280 /* sh cannot be written until sh_src has been read. 5274 5281 * so arrange for sh to be delayed a little ··· 5539 5542 5540 5543 if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, 5541 5544 &bad_sectors)) { 5542 - bio_put(raid_bio); 5543 5545 rdev_dec_pending(rdev, mddev); 5544 5546 return 0; 5545 5547 } ··· 5819 5823 DEFINE_WAIT(w); 5820 5824 int d; 5821 5825 again: 5822 - sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5826 + sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); 5823 5827 prepare_to_wait(&conf->wait_for_overlap, &w, 5824 5828 TASK_UNINTERRUPTIBLE); 5825 5829 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); ··· 5974 5978 enum stripe_result ret; 5975 5979 struct stripe_head *sh; 5976 5980 sector_t new_sector; 5977 - int previous = 0; 5981 + int previous = 0, flags = 0; 5978 5982 int seq, dd_idx; 5979 5983 5980 5984 seq = read_seqcount_begin(&conf->gen_lock); ··· 6008 6012 pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, 6009 6013 new_sector, logical_sector); 6010 6014 6011 - sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous, 6012 - (bi->bi_opf & REQ_RAHEAD), 0); 6015 + if (previous) 6016 + flags |= R5_GAS_PREVIOUS; 6017 + if (bi->bi_opf & REQ_RAHEAD) 6018 + flags |= R5_GAS_NOBLOCK; 6019 + sh = raid5_get_active_stripe(conf, ctx, new_sector, flags); 6013 6020 if (unlikely(!sh)) { 6014 6021 /* cannot get stripe, just give-up */ 6015 6022 bi->bi_status = BLK_STS_IOERR; ··· 6361 6362 for (i = 0; i < reshape_sectors; i += 
RAID5_STRIPE_SECTORS(conf)) { 6362 6363 int j; 6363 6364 int skipped_disk = 0; 6364 - sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 6365 + sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i, 6366 + R5_GAS_NOQUIESCE); 6365 6367 set_bit(STRIPE_EXPANDING, &sh->state); 6366 6368 atomic_inc(&conf->reshape_stripes); 6367 6369 /* If any of this stripe is beyond the end of the old ··· 6411 6411 if (last_sector >= mddev->dev_sectors) 6412 6412 last_sector = mddev->dev_sectors - 1; 6413 6413 while (first_sector <= last_sector) { 6414 - sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 6414 + sh = raid5_get_active_stripe(conf, NULL, first_sector, 6415 + R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE); 6415 6416 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 6416 6417 set_bit(STRIPE_HANDLE, &sh->state); 6417 6418 raid5_release_stripe(sh); ··· 6532 6531 6533 6532 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 6534 6533 6535 - sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); 6534 + sh = raid5_get_active_stripe(conf, NULL, sector_nr, 6535 + R5_GAS_NOBLOCK); 6536 6536 if (sh == NULL) { 6537 - sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); 6537 + sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0); 6538 6538 /* make sure we don't swamp the stripe cache if someone else 6539 6539 * is trying to get access 6540 6540 */ ··· 6598 6596 /* already done this stripe */ 6599 6597 continue; 6600 6598 6601 - sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 6602 - 6599 + sh = raid5_get_active_stripe(conf, NULL, sector, 6600 + R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); 6603 6601 if (!sh) { 6604 6602 /* failed to get a stripe - must wait */ 6605 6603 conf->retry_read_aligned = raid_bio; ··· 6783 6781 spin_unlock_irq(&conf->device_lock); 6784 6782 md_check_recovery(mddev); 6785 6783 spin_lock_irq(&conf->device_lock); 6784 + 6785 + /* 6786 + * Waiting on MD_SB_CHANGE_PENDING below may deadlock 6787 + * seeing md_check_recovery() is needed to clear 6788 + * the 
flag when using mdmon. 6789 + */ 6790 + continue; 6786 6791 } 6792 + 6793 + wait_event_lock_irq(mddev->sb_wait, 6794 + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6795 + conf->device_lock); 6787 6796 } 6788 6797 pr_debug("%d stripes handled\n", handled); 6789 6798
+20 -12
drivers/md/raid5.h
··· 803 803 } 804 804 #endif 805 805 806 - extern void md_raid5_kick_device(struct r5conf *conf); 807 - extern int raid5_set_cache_size(struct mddev *mddev, int size); 808 - extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); 809 - extern void raid5_release_stripe(struct stripe_head *sh); 810 - extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 811 - int previous, int *dd_idx, 812 - struct stripe_head *sh); 813 - extern struct stripe_head * 814 - raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 815 - bool previous, bool noblock, bool noquiesce); 816 - extern int raid5_calc_degraded(struct r5conf *conf); 817 - extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); 806 + void md_raid5_kick_device(struct r5conf *conf); 807 + int raid5_set_cache_size(struct mddev *mddev, int size); 808 + sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); 809 + void raid5_release_stripe(struct stripe_head *sh); 810 + sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 811 + int previous, int *dd_idx, struct stripe_head *sh); 812 + 813 + struct stripe_request_ctx; 814 + /* get stripe from previous generation (when reshaping) */ 815 + #define R5_GAS_PREVIOUS (1 << 0) 816 + /* do not block waiting for a free stripe */ 817 + #define R5_GAS_NOBLOCK (1 << 1) 818 + /* do not block waiting for quiesce to be released */ 819 + #define R5_GAS_NOQUIESCE (1 << 2) 820 + struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, 821 + struct stripe_request_ctx *ctx, sector_t sector, 822 + unsigned int flags); 823 + 824 + int raid5_calc_degraded(struct r5conf *conf); 825 + int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); 818 826 #endif
+126 -14
drivers/nvme/host/core.c
··· 1111 1111 return effects; 1112 1112 } 1113 1113 1114 - static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, 1115 - struct nvme_command *cmd, int status) 1114 + void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, 1115 + struct nvme_command *cmd, int status) 1116 1116 { 1117 1117 if (effects & NVME_CMD_EFFECTS_CSE_MASK) { 1118 1118 nvme_unfreeze(ctrl); ··· 1148 1148 break; 1149 1149 } 1150 1150 } 1151 + EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU); 1151 1152 1152 - int nvme_execute_passthru_rq(struct request *rq) 1153 + int nvme_execute_passthru_rq(struct request *rq, u32 *effects) 1153 1154 { 1154 1155 struct nvme_command *cmd = nvme_req(rq)->cmd; 1155 1156 struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; 1156 1157 struct nvme_ns *ns = rq->q->queuedata; 1157 - u32 effects; 1158 - int ret; 1159 1158 1160 - effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); 1161 - ret = nvme_execute_rq(rq, false); 1162 - if (effects) /* nothing to be done for zero cmd effects */ 1163 - nvme_passthru_end(ctrl, effects, cmd, ret); 1164 - 1165 - return ret; 1159 + *effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); 1160 + return nvme_execute_rq(rq, false); 1166 1161 } 1167 1162 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); 1168 1163 ··· 2691 2696 if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) { 2692 2697 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); 2693 2698 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { 2694 - strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); 2699 + strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); 2695 2700 return; 2696 2701 } 2697 2702 ··· 2699 2704 dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); 2700 2705 } 2701 2706 2702 - /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ 2707 + /* 2708 + * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe 2709 + * Base Specification 2.0. 
It is slightly different from the format 2710 + * specified there due to historic reasons, and we can't change it now. 2711 + */ 2703 2712 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, 2704 2713 "nqn.2014.08.org.nvmexpress:%04x%04x", 2705 2714 le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); ··· 2893 2894 nvme_init_subnqn(subsys, ctrl, id); 2894 2895 memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); 2895 2896 memcpy(subsys->model, id->mn, sizeof(subsys->model)); 2896 - memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); 2897 2897 subsys->vendor_id = le16_to_cpu(id->vid); 2898 2898 subsys->cmic = id->cmic; 2899 2899 ··· 3111 3113 ctrl->quirks |= core_quirks[i].quirks; 3112 3114 } 3113 3115 } 3116 + memcpy(ctrl->subsys->firmware_rev, id->fr, 3117 + sizeof(ctrl->subsys->firmware_rev)); 3114 3118 3115 3119 if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { 3116 3120 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); ··· 4805 4805 } 4806 4806 EXPORT_SYMBOL_GPL(nvme_complete_async_event); 4807 4807 4808 + int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, 4809 + const struct blk_mq_ops *ops, unsigned int flags, 4810 + unsigned int cmd_size) 4811 + { 4812 + int ret; 4813 + 4814 + memset(set, 0, sizeof(*set)); 4815 + set->ops = ops; 4816 + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; 4817 + if (ctrl->ops->flags & NVME_F_FABRICS) 4818 + set->reserved_tags = NVMF_RESERVED_TAGS; 4819 + set->numa_node = ctrl->numa_node; 4820 + set->flags = flags; 4821 + set->cmd_size = cmd_size; 4822 + set->driver_data = ctrl; 4823 + set->nr_hw_queues = 1; 4824 + set->timeout = NVME_ADMIN_TIMEOUT; 4825 + ret = blk_mq_alloc_tag_set(set); 4826 + if (ret) 4827 + return ret; 4828 + 4829 + ctrl->admin_q = blk_mq_init_queue(set); 4830 + if (IS_ERR(ctrl->admin_q)) { 4831 + ret = PTR_ERR(ctrl->admin_q); 4832 + goto out_free_tagset; 4833 + } 4834 + 4835 + if (ctrl->ops->flags & 
NVME_F_FABRICS) { 4836 + ctrl->fabrics_q = blk_mq_init_queue(set); 4837 + if (IS_ERR(ctrl->fabrics_q)) { 4838 + ret = PTR_ERR(ctrl->fabrics_q); 4839 + goto out_cleanup_admin_q; 4840 + } 4841 + } 4842 + 4843 + ctrl->admin_tagset = set; 4844 + return 0; 4845 + 4846 + out_cleanup_admin_q: 4847 + blk_mq_destroy_queue(ctrl->fabrics_q); 4848 + out_free_tagset: 4849 + blk_mq_free_tag_set(ctrl->admin_tagset); 4850 + return ret; 4851 + } 4852 + EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); 4853 + 4854 + void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) 4855 + { 4856 + blk_mq_destroy_queue(ctrl->admin_q); 4857 + if (ctrl->ops->flags & NVME_F_FABRICS) 4858 + blk_mq_destroy_queue(ctrl->fabrics_q); 4859 + blk_mq_free_tag_set(ctrl->admin_tagset); 4860 + } 4861 + EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set); 4862 + 4863 + int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, 4864 + const struct blk_mq_ops *ops, unsigned int flags, 4865 + unsigned int cmd_size) 4866 + { 4867 + int ret; 4868 + 4869 + memset(set, 0, sizeof(*set)); 4870 + set->ops = ops; 4871 + set->queue_depth = ctrl->sqsize + 1; 4872 + set->reserved_tags = NVMF_RESERVED_TAGS; 4873 + set->numa_node = ctrl->numa_node; 4874 + set->flags = flags; 4875 + set->cmd_size = cmd_size, 4876 + set->driver_data = ctrl; 4877 + set->nr_hw_queues = ctrl->queue_count - 1; 4878 + set->timeout = NVME_IO_TIMEOUT; 4879 + if (ops->map_queues) 4880 + set->nr_maps = ctrl->opts->nr_poll_queues ? 
HCTX_MAX_TYPES : 2; 4881 + ret = blk_mq_alloc_tag_set(set); 4882 + if (ret) 4883 + return ret; 4884 + 4885 + if (ctrl->ops->flags & NVME_F_FABRICS) { 4886 + ctrl->connect_q = blk_mq_init_queue(set); 4887 + if (IS_ERR(ctrl->connect_q)) { 4888 + ret = PTR_ERR(ctrl->connect_q); 4889 + goto out_free_tag_set; 4890 + } 4891 + } 4892 + 4893 + ctrl->tagset = set; 4894 + return 0; 4895 + 4896 + out_free_tag_set: 4897 + blk_mq_free_tag_set(set); 4898 + return ret; 4899 + } 4900 + EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set); 4901 + 4902 + void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl) 4903 + { 4904 + if (ctrl->ops->flags & NVME_F_FABRICS) 4905 + blk_mq_destroy_queue(ctrl->connect_q); 4906 + blk_mq_free_tag_set(ctrl->tagset); 4907 + } 4908 + EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set); 4909 + 4808 4910 void nvme_stop_ctrl(struct nvme_ctrl *ctrl) 4809 4911 { 4810 4912 nvme_mpath_stop(ctrl); ··· 4925 4823 nvme_start_keep_alive(ctrl); 4926 4824 4927 4825 nvme_enable_aen(ctrl); 4826 + 4827 + /* 4828 + * persistent discovery controllers need to send indication to userspace 4829 + * to re-read the discovery log page to learn about possible changes 4830 + * that were missed. We identify persistent discovery controllers by 4831 + * checking that they started once before, hence are reconnecting back. 4832 + */ 4833 + if (test_and_set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) && 4834 + nvme_discovery_ctrl(ctrl)) 4835 + nvme_change_uevent(ctrl, "NVME_EVENT=rediscover"); 4928 4836 4929 4837 if (ctrl->queue_count > 1) { 4930 4838 nvme_queue_scan(ctrl);
+19 -6
drivers/nvme/host/fabrics.c
··· 49 49 goto out_unlock; 50 50 51 51 kref_init(&host->ref); 52 - strlcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); 52 + strscpy(host->nqn, hostnqn, NVMF_NQN_SIZE); 53 53 54 54 list_add_tail(&host->list, &nvmf_hosts); 55 55 out_unlock: ··· 971 971 return false; 972 972 973 973 /* 974 - * Checking the local address is rough. In most cases, none is specified 975 - * and the host port is selected by the stack. 974 + * Checking the local address or host interfaces is rough. 975 + * 976 + * In most cases, none is specified and the host port or 977 + * host interface is selected by the stack. 976 978 * 977 979 * Assume no match if: 978 - * - local address is specified and address is not the same 979 - * - local address is not specified but remote is, or vice versa 980 - * (admin using specific host_traddr when it matters). 980 + * - local address or host interface is specified and address 981 + * or host interface is not the same 982 + * - local address or host interface is not specified but 983 + * remote is, or vice versa (admin using specific 984 + * host_traddr/host_iface when it matters). 981 985 */ 982 986 if ((opts->mask & NVMF_OPT_HOST_TRADDR) && 983 987 (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)) { ··· 989 985 return false; 990 986 } else if ((opts->mask & NVMF_OPT_HOST_TRADDR) || 991 987 (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)) { 988 + return false; 989 + } 990 + 991 + if ((opts->mask & NVMF_OPT_HOST_IFACE) && 992 + (ctrl->opts->mask & NVMF_OPT_HOST_IFACE)) { 993 + if (strcmp(opts->host_iface, ctrl->opts->host_iface)) 994 + return false; 995 + } else if ((opts->mask & NVMF_OPT_HOST_IFACE) || 996 + (ctrl->opts->mask & NVMF_OPT_HOST_IFACE)) { 992 997 return false; 993 998 } 994 999
+29 -95
drivers/nvme/host/fc.c
··· 1829 1829 { 1830 1830 struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); 1831 1831 1832 - return __nvme_fc_exit_request(set->driver_data, op); 1832 + return __nvme_fc_exit_request(to_fc_ctrl(set->driver_data), op); 1833 1833 } 1834 1834 1835 1835 static int ··· 2135 2135 nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, 2136 2136 unsigned int hctx_idx, unsigned int numa_node) 2137 2137 { 2138 - struct nvme_fc_ctrl *ctrl = set->driver_data; 2138 + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(set->driver_data); 2139 2139 struct nvme_fcp_op_w_sgl *op = blk_mq_rq_to_pdu(rq); 2140 2140 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; 2141 2141 struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; ··· 2206 2206 } 2207 2207 } 2208 2208 2209 - static inline void 2210 - __nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl, 2211 - unsigned int qidx) 2209 + static inline int 2210 + __nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int qidx) 2212 2211 { 2212 + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(data); 2213 2213 struct nvme_fc_queue *queue = &ctrl->queues[qidx]; 2214 2214 2215 2215 hctx->driver_data = queue; 2216 2216 queue->hctx = hctx; 2217 + return 0; 2217 2218 } 2218 2219 2219 2220 static int 2220 - nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 2221 - unsigned int hctx_idx) 2221 + nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) 2222 2222 { 2223 - struct nvme_fc_ctrl *ctrl = data; 2224 - 2225 - __nvme_fc_init_hctx(hctx, ctrl, hctx_idx + 1); 2226 - 2227 - return 0; 2223 + return __nvme_fc_init_hctx(hctx, data, hctx_idx + 1); 2228 2224 } 2229 2225 2230 2226 static int 2231 2227 nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, 2232 2228 unsigned int hctx_idx) 2233 2229 { 2234 - struct nvme_fc_ctrl *ctrl = data; 2235 - 2236 - __nvme_fc_init_hctx(hctx, ctrl, hctx_idx); 2237 - 2238 - return 0; 2230 + return __nvme_fc_init_hctx(hctx, data, hctx_idx); 
2239 2231 } 2240 2232 2241 2233 static void ··· 2383 2391 container_of(ref, struct nvme_fc_ctrl, ref); 2384 2392 unsigned long flags; 2385 2393 2386 - if (ctrl->ctrl.tagset) { 2387 - blk_mq_destroy_queue(ctrl->ctrl.connect_q); 2388 - blk_mq_free_tag_set(&ctrl->tag_set); 2389 - } 2394 + if (ctrl->ctrl.tagset) 2395 + nvme_remove_io_tag_set(&ctrl->ctrl); 2390 2396 2391 2397 /* remove from rport list */ 2392 2398 spin_lock_irqsave(&ctrl->rport->lock, flags); ··· 2392 2402 spin_unlock_irqrestore(&ctrl->rport->lock, flags); 2393 2403 2394 2404 nvme_start_admin_queue(&ctrl->ctrl); 2395 - blk_mq_destroy_queue(ctrl->ctrl.admin_q); 2396 - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); 2397 - blk_mq_free_tag_set(&ctrl->admin_tag_set); 2405 + nvme_remove_admin_tag_set(&ctrl->ctrl); 2398 2406 2399 2407 kfree(ctrl->queues); 2400 2408 ··· 2848 2860 nvme_fc_ctrl_put(ctrl); 2849 2861 } 2850 2862 2851 - static int nvme_fc_map_queues(struct blk_mq_tag_set *set) 2863 + static void nvme_fc_map_queues(struct blk_mq_tag_set *set) 2852 2864 { 2853 - struct nvme_fc_ctrl *ctrl = set->driver_data; 2865 + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(set->driver_data); 2854 2866 int i; 2855 2867 2856 2868 for (i = 0; i < set->nr_maps; i++) { ··· 2868 2880 else 2869 2881 blk_mq_map_queues(map); 2870 2882 } 2871 - return 0; 2872 2883 } 2873 2884 2874 2885 static const struct blk_mq_ops nvme_fc_mq_ops = { ··· 2902 2915 2903 2916 nvme_fc_init_io_queues(ctrl); 2904 2917 2905 - memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); 2906 - ctrl->tag_set.ops = &nvme_fc_mq_ops; 2907 - ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; 2908 - ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS; 2909 - ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; 2910 - ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 2911 - ctrl->tag_set.cmd_size = 2912 - struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, 2913 - ctrl->lport->ops->fcprqst_priv_sz); 2914 - ctrl->tag_set.driver_data = ctrl; 2915 - ctrl->tag_set.nr_hw_queues = 
ctrl->ctrl.queue_count - 1; 2916 - ctrl->tag_set.timeout = NVME_IO_TIMEOUT; 2917 - 2918 - ret = blk_mq_alloc_tag_set(&ctrl->tag_set); 2918 + ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, 2919 + &nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE, 2920 + struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, 2921 + ctrl->lport->ops->fcprqst_priv_sz)); 2919 2922 if (ret) 2920 2923 return ret; 2921 2924 2922 - ctrl->ctrl.tagset = &ctrl->tag_set; 2923 - 2924 - ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); 2925 - if (ret) 2926 - goto out_free_tag_set; 2927 - 2928 2925 ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); 2929 2926 if (ret) 2930 - goto out_cleanup_blk_queue; 2927 + goto out_cleanup_tagset; 2931 2928 2932 2929 ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); 2933 2930 if (ret) ··· 2923 2952 2924 2953 out_delete_hw_queues: 2925 2954 nvme_fc_delete_hw_io_queues(ctrl); 2926 - out_cleanup_blk_queue: 2927 - blk_mq_destroy_queue(ctrl->ctrl.connect_q); 2928 - out_free_tag_set: 2929 - blk_mq_free_tag_set(&ctrl->tag_set); 2955 + out_cleanup_tagset: 2956 + nvme_remove_io_tag_set(&ctrl->ctrl); 2930 2957 nvme_fc_free_io_queues(ctrl); 2931 2958 2932 2959 /* force put free routine to ignore io queues */ ··· 3135 3166 "to maxcmd\n", 3136 3167 opts->queue_size, ctrl->ctrl.maxcmd); 3137 3168 opts->queue_size = ctrl->ctrl.maxcmd; 3138 - } 3139 - 3140 - if (opts->queue_size > ctrl->ctrl.sqsize + 1) { 3141 - /* warn if sqsize is lower than queue_size */ 3142 - dev_warn(ctrl->ctrl.device, 3143 - "queue_size %zu > ctrl sqsize %u, reducing " 3144 - "to sqsize\n", 3145 - opts->queue_size, ctrl->ctrl.sqsize + 1); 3146 - opts->queue_size = ctrl->ctrl.sqsize + 1; 3169 + ctrl->ctrl.sqsize = opts->queue_size - 1; 3147 3170 } 3148 3171 3149 3172 ret = nvme_fc_init_aen_ops(ctrl); ··· 3508 3547 3509 3548 nvme_fc_init_queue(ctrl, 0); 3510 3549 3511 - memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); 3512 - ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; 
3513 - ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; 3514 - ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS; 3515 - ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; 3516 - ctrl->admin_tag_set.cmd_size = 3517 - struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, 3518 - ctrl->lport->ops->fcprqst_priv_sz); 3519 - ctrl->admin_tag_set.driver_data = ctrl; 3520 - ctrl->admin_tag_set.nr_hw_queues = 1; 3521 - ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; 3522 - ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; 3523 - 3524 - ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); 3550 + ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, 3551 + &nvme_fc_admin_mq_ops, BLK_MQ_F_NO_SCHED, 3552 + struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, 3553 + ctrl->lport->ops->fcprqst_priv_sz)); 3525 3554 if (ret) 3526 3555 goto out_free_queues; 3527 - ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; 3528 - 3529 - ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); 3530 - if (IS_ERR(ctrl->ctrl.fabrics_q)) { 3531 - ret = PTR_ERR(ctrl->ctrl.fabrics_q); 3532 - goto out_free_admin_tag_set; 3533 - } 3534 - 3535 - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); 3536 - if (IS_ERR(ctrl->ctrl.admin_q)) { 3537 - ret = PTR_ERR(ctrl->ctrl.admin_q); 3538 - goto out_cleanup_fabrics_q; 3539 - } 3540 3556 3541 3557 /* 3542 3558 * Would have been nice to init io queues tag set as well. 
··· 3524 3586 3525 3587 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0); 3526 3588 if (ret) 3527 - goto out_cleanup_admin_q; 3589 + goto out_cleanup_tagset; 3528 3590 3529 3591 /* at this point, teardown path changes to ref counting on nvme ctrl */ 3530 3592 ··· 3579 3641 3580 3642 return ERR_PTR(-EIO); 3581 3643 3582 - out_cleanup_admin_q: 3583 - blk_mq_destroy_queue(ctrl->ctrl.admin_q); 3584 - out_cleanup_fabrics_q: 3585 - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); 3586 - out_free_admin_tag_set: 3587 - blk_mq_free_tag_set(&ctrl->admin_tag_set); 3644 + out_cleanup_tagset: 3645 + nvme_remove_admin_tag_set(&ctrl->ctrl); 3588 3646 out_free_queues: 3589 3647 kfree(ctrl->queues); 3590 3648 out_free_ida:
+14 -1
drivers/nvme/host/ioctl.c
··· 136 136 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 137 137 u32 meta_seed, u64 *result, unsigned timeout, bool vec) 138 138 { 139 + struct nvme_ctrl *ctrl; 139 140 struct request *req; 140 141 void *meta = NULL; 141 142 struct bio *bio; 143 + u32 effects; 142 144 int ret; 143 145 144 146 req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer, ··· 149 147 return PTR_ERR(req); 150 148 151 149 bio = req->bio; 150 + ctrl = nvme_req(req)->ctrl; 152 151 153 - ret = nvme_execute_passthru_rq(req); 152 + ret = nvme_execute_passthru_rq(req, &effects); 154 153 155 154 if (result) 156 155 *result = le64_to_cpu(nvme_req(req)->result.u64); ··· 161 158 if (bio) 162 159 blk_rq_unmap_user(bio); 163 160 blk_mq_free_request(req); 161 + 162 + if (effects) 163 + nvme_passthru_end(ctrl, effects, cmd, ret); 164 + 164 165 return ret; 165 166 } 166 167 ··· 831 824 case NVME_IOCTL_IO_CMD: 832 825 return nvme_dev_user_cmd(ctrl, argp); 833 826 case NVME_IOCTL_RESET: 827 + if (!capable(CAP_SYS_ADMIN)) 828 + return -EACCES; 834 829 dev_warn(ctrl->device, "resetting controller\n"); 835 830 return nvme_reset_ctrl_sync(ctrl); 836 831 case NVME_IOCTL_SUBSYS_RESET: 832 + if (!capable(CAP_SYS_ADMIN)) 833 + return -EACCES; 837 834 return nvme_reset_subsystem(ctrl); 838 835 case NVME_IOCTL_RESCAN: 836 + if (!capable(CAP_SYS_ADMIN)) 837 + return -EACCES; 839 838 nvme_queue_scan(ctrl); 840 839 return 0; 841 840 default:
+30 -14
drivers/nvme/host/nvme.h
··· 233 233 #endif 234 234 }; 235 235 236 + enum nvme_ctrl_flags { 237 + NVME_CTRL_FAILFAST_EXPIRED = 0, 238 + NVME_CTRL_ADMIN_Q_STOPPED = 1, 239 + NVME_CTRL_STARTED_ONCE = 2, 240 + }; 241 + 236 242 struct nvme_ctrl { 237 243 bool comp_seen; 238 244 enum nvme_ctrl_state state; ··· 360 354 u16 maxcmd; 361 355 int nr_reconnects; 362 356 unsigned long flags; 363 - #define NVME_CTRL_FAILFAST_EXPIRED 0 364 - #define NVME_CTRL_ADMIN_Q_STOPPED 1 365 357 struct nvmf_ctrl_options *opts; 366 358 367 359 struct page *discard_page; ··· 606 602 static inline void nvme_should_fail(struct request *req) {} 607 603 #endif 608 604 605 + bool nvme_wait_reset(struct nvme_ctrl *ctrl); 606 + int nvme_try_sched_reset(struct nvme_ctrl *ctrl); 607 + 609 608 static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) 610 609 { 610 + int ret; 611 + 611 612 if (!ctrl->subsystem) 612 613 return -ENOTTY; 613 - return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); 614 + if (!nvme_wait_reset(ctrl)) 615 + return -EBUSY; 616 + 617 + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); 618 + if (ret) 619 + return ret; 620 + 621 + return nvme_try_sched_reset(ctrl); 614 622 } 615 623 616 624 /* ··· 728 712 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); 729 713 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 730 714 enum nvme_ctrl_state new_state); 731 - bool nvme_wait_reset(struct nvme_ctrl *ctrl); 732 715 int nvme_disable_ctrl(struct nvme_ctrl *ctrl); 733 716 int nvme_enable_ctrl(struct nvme_ctrl *ctrl); 734 717 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); ··· 737 722 void nvme_start_ctrl(struct nvme_ctrl *ctrl); 738 723 void nvme_stop_ctrl(struct nvme_ctrl *ctrl); 739 724 int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); 725 + int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, 726 + const struct blk_mq_ops *ops, unsigned int flags, 727 + unsigned int cmd_size); 728 + void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl); 729 + int 
nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, 730 + const struct blk_mq_ops *ops, unsigned int flags, 731 + unsigned int cmd_size); 732 + void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl); 740 733 741 734 void nvme_remove_namespaces(struct nvme_ctrl *ctrl); 742 735 ··· 825 802 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); 826 803 int nvme_reset_ctrl(struct nvme_ctrl *ctrl); 827 804 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); 828 - int nvme_try_sched_reset(struct nvme_ctrl *ctrl); 829 805 int nvme_delete_ctrl(struct nvme_ctrl *ctrl); 830 806 void nvme_queue_scan(struct nvme_ctrl *ctrl); 831 807 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, ··· 994 972 } 995 973 #endif 996 974 997 - static inline int nvme_ctrl_init_connect_q(struct nvme_ctrl *ctrl) 998 - { 999 - ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); 1000 - if (IS_ERR(ctrl->connect_q)) 1001 - return PTR_ERR(ctrl->connect_q); 1002 - return 0; 1003 - } 1004 - 1005 975 static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) 1006 976 { 1007 977 return dev_to_disk(dev)->private_data; ··· 1041 1027 1042 1028 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1043 1029 u8 opcode); 1044 - int nvme_execute_passthru_rq(struct request *rq); 1030 + int nvme_execute_passthru_rq(struct request *rq, u32 *effects); 1031 + void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, 1032 + struct nvme_command *cmd, int status); 1045 1033 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); 1046 1034 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); 1047 1035 void nvme_put_ns(struct nvme_ns *ns);
+40 -38
drivers/nvme/host/pci.c
··· 226 226 struct nvme_iod { 227 227 struct nvme_request req; 228 228 struct nvme_command cmd; 229 - struct nvme_queue *nvmeq; 230 229 bool use_sgl; 231 - int aborted; 232 - int npages; /* In the PRP list. 0 means small pool in use */ 233 - dma_addr_t first_dma; 230 + bool aborted; 231 + s8 nr_allocations; /* PRP list pool allocations. 0 means small 232 + pool in use */ 234 233 unsigned int dma_len; /* length of single DMA segment mapping */ 234 + dma_addr_t first_dma; 235 235 dma_addr_t meta_dma; 236 236 struct sg_table sgt; 237 237 }; ··· 430 430 { 431 431 struct nvme_dev *dev = set->driver_data; 432 432 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 433 - int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; 434 - struct nvme_queue *nvmeq = &dev->queues[queue_idx]; 435 - 436 - BUG_ON(!nvmeq); 437 - iod->nvmeq = nvmeq; 438 433 439 434 nvme_req(req)->ctrl = &dev->ctrl; 440 435 nvme_req(req)->cmd = &iod->cmd; ··· 445 450 return 0; 446 451 } 447 452 448 - static int nvme_pci_map_queues(struct blk_mq_tag_set *set) 453 + static void nvme_pci_map_queues(struct blk_mq_tag_set *set) 449 454 { 450 455 struct nvme_dev *dev = set->driver_data; 451 456 int i, qoff, offset; ··· 472 477 qoff += map->nr_queues; 473 478 offset += map->nr_queues; 474 479 } 475 - 476 - return 0; 477 480 } 478 481 479 482 /* ··· 521 528 522 529 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) 523 530 { 524 - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 531 + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; 525 532 int nseg = blk_rq_nr_phys_segments(req); 526 533 unsigned int avg_seg_size; 527 534 ··· 529 536 530 537 if (!nvme_ctrl_sgl_supported(&dev->ctrl)) 531 538 return false; 532 - if (!iod->nvmeq->qid) 539 + if (!nvmeq->qid) 533 540 return false; 534 541 if (!sgl_threshold || avg_seg_size < sgl_threshold) 535 542 return false; ··· 543 550 dma_addr_t dma_addr = iod->first_dma; 544 551 int i; 545 552 546 - for (i = 0; i < iod->npages; i++) { 553 + for (i = 
0; i < iod->nr_allocations; i++) { 547 554 __le64 *prp_list = nvme_pci_iod_list(req)[i]; 548 555 dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); 549 556 ··· 559 566 dma_addr_t dma_addr = iod->first_dma; 560 567 int i; 561 568 562 - for (i = 0; i < iod->npages; i++) { 569 + for (i = 0; i < iod->nr_allocations; i++) { 563 570 struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i]; 564 571 dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr); 565 572 ··· 582 589 583 590 dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); 584 591 585 - if (iod->npages == 0) 592 + if (iod->nr_allocations == 0) 586 593 dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], 587 594 iod->first_dma); 588 595 else if (iod->use_sgl) ··· 644 651 nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); 645 652 if (nprps <= (256 / 8)) { 646 653 pool = dev->prp_small_pool; 647 - iod->npages = 0; 654 + iod->nr_allocations = 0; 648 655 } else { 649 656 pool = dev->prp_page_pool; 650 - iod->npages = 1; 657 + iod->nr_allocations = 1; 651 658 } 652 659 653 660 prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); 654 661 if (!prp_list) { 655 - iod->npages = -1; 662 + iod->nr_allocations = -1; 656 663 return BLK_STS_RESOURCE; 657 664 } 658 665 list[0] = prp_list; ··· 664 671 prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); 665 672 if (!prp_list) 666 673 goto free_prps; 667 - list[iod->npages++] = prp_list; 674 + list[iod->nr_allocations++] = prp_list; 668 675 prp_list[0] = old_prp_list[i - 1]; 669 676 old_prp_list[i - 1] = cpu_to_le64(prp_dma); 670 677 i = 1; ··· 739 746 740 747 if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { 741 748 pool = dev->prp_small_pool; 742 - iod->npages = 0; 749 + iod->nr_allocations = 0; 743 750 } else { 744 751 pool = dev->prp_page_pool; 745 - iod->npages = 1; 752 + iod->nr_allocations = 1; 746 753 } 747 754 748 755 sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); 749 756 if (!sg_list) { 750 - iod->npages = -1; 757 + 
iod->nr_allocations = -1; 751 758 return BLK_STS_RESOURCE; 752 759 } 753 760 ··· 766 773 goto free_sgls; 767 774 768 775 i = 0; 769 - nvme_pci_iod_list(req)[iod->npages++] = sg_list; 776 + nvme_pci_iod_list(req)[iod->nr_allocations++] = sg_list; 770 777 sg_list[i++] = *link; 771 778 nvme_pci_sgl_set_seg(link, sgl_dma, entries); 772 779 } ··· 826 833 int rc; 827 834 828 835 if (blk_rq_nr_phys_segments(req) == 1) { 836 + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; 829 837 struct bio_vec bv = req_bvec(req); 830 838 831 839 if (!is_pci_p2pdma_page(bv.bv_page)) { ··· 834 840 return nvme_setup_prp_simple(dev, req, 835 841 &cmnd->rw, &bv); 836 842 837 - if (iod->nvmeq->qid && sgl_threshold && 843 + if (nvmeq->qid && sgl_threshold && 838 844 nvme_ctrl_sgl_supported(&dev->ctrl)) 839 845 return nvme_setup_sgl_simple(dev, req, 840 846 &cmnd->rw, &bv); ··· 892 898 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 893 899 blk_status_t ret; 894 900 895 - iod->aborted = 0; 896 - iod->npages = -1; 901 + iod->aborted = false; 902 + iod->nr_allocations = -1; 897 903 iod->sgt.nents = 0; 898 904 899 905 ret = nvme_setup_cmd(req->q->queuedata, req); ··· 1013 1019 1014 1020 static __always_inline void nvme_pci_unmap_rq(struct request *req) 1015 1021 { 1016 - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 1017 - struct nvme_dev *dev = iod->nvmeq->dev; 1022 + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; 1023 + struct nvme_dev *dev = nvmeq->dev; 1018 1024 1019 - if (blk_integrity_rq(req)) 1025 + if (blk_integrity_rq(req)) { 1026 + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 1027 + 1020 1028 dma_unmap_page(dev->dev, iod->meta_dma, 1021 1029 rq_integrity_vec(req)->bv_len, rq_data_dir(req)); 1030 + } 1031 + 1022 1032 if (blk_rq_nr_phys_segments(req)) 1023 1033 nvme_unmap_data(dev, req); 1024 1034 } ··· 1270 1272 1271 1273 static void abort_endio(struct request *req, blk_status_t error) 1272 1274 { 1273 - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 1274 - struct nvme_queue 
*nvmeq = iod->nvmeq; 1275 + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; 1275 1276 1276 1277 dev_warn(nvmeq->dev->ctrl.device, 1277 1278 "Abort status: 0x%x", nvme_req(req)->status); ··· 1332 1335 static enum blk_eh_timer_return nvme_timeout(struct request *req) 1333 1336 { 1334 1337 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 1335 - struct nvme_queue *nvmeq = iod->nvmeq; 1338 + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; 1336 1339 struct nvme_dev *dev = nvmeq->dev; 1337 1340 struct request *abort_req; 1338 1341 struct nvme_command cmd = { }; ··· 1413 1416 atomic_inc(&dev->ctrl.abort_limit); 1414 1417 return BLK_EH_RESET_TIMER; 1415 1418 } 1416 - iod->aborted = 1; 1419 + iod->aborted = true; 1417 1420 1418 1421 cmd.abort.opcode = nvme_admin_abort_cmd; 1419 1422 cmd.abort.cid = nvme_cid(req); ··· 2526 2529 2527 2530 set->ops = &nvme_mq_ops; 2528 2531 set->nr_hw_queues = dev->online_queues - 1; 2529 - set->nr_maps = 2; /* default + read */ 2532 + set->nr_maps = 1; 2533 + if (dev->io_queues[HCTX_TYPE_READ]) 2534 + set->nr_maps = 2; 2530 2535 if (dev->io_queues[HCTX_TYPE_POLL]) 2531 - set->nr_maps++; 2536 + set->nr_maps = 3; 2532 2537 set->timeout = NVME_IO_TIMEOUT; 2533 2538 set->numa_node = dev->ctrl.numa_node; 2534 2539 set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; ··· 2833 2834 nvme_start_admin_queue(&dev->ctrl); 2834 2835 } 2835 2836 2837 + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); 2838 + 2836 2839 /* 2837 2840 * Limit the max command size to prevent iod->sg allocations going 2838 2841 * over a single page. ··· 2847 2846 * Don't limit the IOMMU merged segment size. 
2848 2847 */ 2849 2848 dma_set_max_seg_size(dev->dev, 0xffffffff); 2850 - dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); 2851 2849 2852 2850 mutex_unlock(&dev->shutdown_lock); 2853 2851 ··· 3569 3569 BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); 3570 3570 BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); 3571 3571 BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); 3572 + BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) > 3573 + S8_MAX); 3572 3574 3573 3575 return pci_register_driver(&nvme_driver); 3574 3576 }
+61 -112
drivers/nvme/host/rdma.c
··· 295 295 struct request *rq, unsigned int hctx_idx, 296 296 unsigned int numa_node) 297 297 { 298 - struct nvme_rdma_ctrl *ctrl = set->driver_data; 298 + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data); 299 299 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 300 300 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; 301 301 struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; ··· 320 320 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 321 321 unsigned int hctx_idx) 322 322 { 323 - struct nvme_rdma_ctrl *ctrl = data; 323 + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data); 324 324 struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; 325 325 326 326 BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); ··· 332 332 static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, 333 333 unsigned int hctx_idx) 334 334 { 335 - struct nvme_rdma_ctrl *ctrl = data; 335 + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data); 336 336 struct nvme_rdma_queue *queue = &ctrl->queues[0]; 337 337 338 338 BUG_ON(hctx_idx != 0); ··· 696 696 return ret; 697 697 } 698 698 699 - static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) 699 + static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl, 700 + int first, int last) 700 701 { 701 702 int i, ret = 0; 702 703 703 - for (i = 1; i < ctrl->ctrl.queue_count; i++) { 704 + for (i = first; i < last; i++) { 704 705 ret = nvme_rdma_start_queue(ctrl, i); 705 706 if (ret) 706 707 goto out_stop_queues; ··· 710 709 return 0; 711 710 712 711 out_stop_queues: 713 - for (i--; i >= 1; i--) 712 + for (i--; i >= first; i--) 714 713 nvme_rdma_stop_queue(&ctrl->queues[i]); 715 714 return ret; 716 715 } ··· 788 787 return ret; 789 788 } 790 789 791 - static int nvme_rdma_alloc_admin_tag_set(struct nvme_ctrl *nctrl) 790 + static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl) 792 791 { 793 - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); 794 - struct blk_mq_tag_set 
*set = &ctrl->admin_tag_set; 795 - int ret; 792 + unsigned int cmd_size = sizeof(struct nvme_rdma_request) + 793 + NVME_RDMA_DATA_SGL_SIZE; 796 794 797 - memset(set, 0, sizeof(*set)); 798 - set->ops = &nvme_rdma_admin_mq_ops; 799 - set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; 800 - set->reserved_tags = NVMF_RESERVED_TAGS; 801 - set->numa_node = nctrl->numa_node; 802 - set->cmd_size = sizeof(struct nvme_rdma_request) + 803 - NVME_RDMA_DATA_SGL_SIZE; 804 - set->driver_data = ctrl; 805 - set->nr_hw_queues = 1; 806 - set->timeout = NVME_ADMIN_TIMEOUT; 807 - set->flags = BLK_MQ_F_NO_SCHED; 808 - ret = blk_mq_alloc_tag_set(set); 809 - if (!ret) 810 - ctrl->ctrl.admin_tagset = set; 811 - return ret; 795 + if (ctrl->max_integrity_segments) 796 + cmd_size += sizeof(struct nvme_rdma_sgl) + 797 + NVME_RDMA_METADATA_SGL_SIZE; 798 + 799 + return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set, 800 + &nvme_rdma_mq_ops, BLK_MQ_F_SHOULD_MERGE, cmd_size); 812 801 } 813 802 814 - static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *nctrl) 803 + static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) 815 804 { 816 - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); 817 - struct blk_mq_tag_set *set = &ctrl->tag_set; 818 - int ret; 819 - 820 - memset(set, 0, sizeof(*set)); 821 - set->ops = &nvme_rdma_mq_ops; 822 - set->queue_depth = nctrl->sqsize + 1; 823 - set->reserved_tags = NVMF_RESERVED_TAGS; 824 - set->numa_node = nctrl->numa_node; 825 - set->flags = BLK_MQ_F_SHOULD_MERGE; 826 - set->cmd_size = sizeof(struct nvme_rdma_request) + 827 - NVME_RDMA_DATA_SGL_SIZE; 828 - if (nctrl->max_integrity_segments) 829 - set->cmd_size += sizeof(struct nvme_rdma_sgl) + 830 - NVME_RDMA_METADATA_SGL_SIZE; 831 - set->driver_data = ctrl; 832 - set->nr_hw_queues = nctrl->queue_count - 1; 833 - set->timeout = NVME_IO_TIMEOUT; 834 - set->nr_maps = nctrl->opts->nr_poll_queues ? 
HCTX_MAX_TYPES : 2; 835 - ret = blk_mq_alloc_tag_set(set); 836 - if (!ret) 837 - ctrl->ctrl.tagset = set; 838 - return ret; 839 - } 840 - 841 - static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, 842 - bool remove) 843 - { 844 - if (remove) { 845 - blk_mq_destroy_queue(ctrl->ctrl.admin_q); 846 - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); 847 - blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); 848 - } 849 805 if (ctrl->async_event_sqe.data) { 850 806 cancel_work_sync(&ctrl->ctrl.async_event_work); 851 807 nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, ··· 844 886 goto out_free_queue; 845 887 846 888 if (new) { 847 - error = nvme_rdma_alloc_admin_tag_set(&ctrl->ctrl); 889 + error = nvme_alloc_admin_tag_set(&ctrl->ctrl, 890 + &ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops, 891 + BLK_MQ_F_NO_SCHED, 892 + sizeof(struct nvme_rdma_request) + 893 + NVME_RDMA_DATA_SGL_SIZE); 848 894 if (error) 849 895 goto out_free_async_qe; 850 896 851 - ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); 852 - if (IS_ERR(ctrl->ctrl.fabrics_q)) { 853 - error = PTR_ERR(ctrl->ctrl.fabrics_q); 854 - goto out_free_tagset; 855 - } 856 - 857 - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); 858 - if (IS_ERR(ctrl->ctrl.admin_q)) { 859 - error = PTR_ERR(ctrl->ctrl.admin_q); 860 - goto out_cleanup_fabrics_q; 861 - } 862 897 } 863 898 864 899 error = nvme_rdma_start_queue(ctrl, 0); 865 900 if (error) 866 - goto out_cleanup_queue; 901 + goto out_remove_admin_tag_set; 867 902 868 903 error = nvme_enable_ctrl(&ctrl->ctrl); 869 904 if (error) ··· 883 932 out_stop_queue: 884 933 nvme_rdma_stop_queue(&ctrl->queues[0]); 885 934 nvme_cancel_admin_tagset(&ctrl->ctrl); 886 - out_cleanup_queue: 935 + out_remove_admin_tag_set: 887 936 if (new) 888 - blk_mq_destroy_queue(ctrl->ctrl.admin_q); 889 - out_cleanup_fabrics_q: 890 - if (new) 891 - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); 892 - out_free_tagset: 893 - if (new) 894 - 
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); 937 + nvme_remove_admin_tag_set(&ctrl->ctrl); 895 938 out_free_async_qe: 896 939 if (ctrl->async_event_sqe.data) { 897 940 nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, ··· 897 952 return error; 898 953 } 899 954 900 - static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, 901 - bool remove) 902 - { 903 - if (remove) { 904 - blk_mq_destroy_queue(ctrl->ctrl.connect_q); 905 - blk_mq_free_tag_set(ctrl->ctrl.tagset); 906 - } 907 - nvme_rdma_free_io_queues(ctrl); 908 - } 909 - 910 955 static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) 911 956 { 912 - int ret; 957 + int ret, nr_queues; 913 958 914 959 ret = nvme_rdma_alloc_io_queues(ctrl); 915 960 if (ret) ··· 909 974 ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl); 910 975 if (ret) 911 976 goto out_free_io_queues; 912 - 913 - ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); 914 - if (ret) 915 - goto out_free_tag_set; 916 977 } 917 978 918 - ret = nvme_rdma_start_io_queues(ctrl); 979 + /* 980 + * Only start IO queues for which we have allocated the tagset 981 + * and limitted it to the available queues. On reconnects, the 982 + * queue number might have changed. 983 + */ 984 + nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count); 985 + ret = nvme_rdma_start_io_queues(ctrl, 1, nr_queues); 919 986 if (ret) 920 - goto out_cleanup_connect_q; 987 + goto out_cleanup_tagset; 921 988 922 989 if (!new) { 923 990 nvme_start_queues(&ctrl->ctrl); ··· 937 1000 nvme_unfreeze(&ctrl->ctrl); 938 1001 } 939 1002 1003 + /* 1004 + * If the number of queues has increased (reconnect case) 1005 + * start all new queues now. 
1006 + */ 1007 + ret = nvme_rdma_start_io_queues(ctrl, nr_queues, 1008 + ctrl->tag_set.nr_hw_queues + 1); 1009 + if (ret) 1010 + goto out_wait_freeze_timed_out; 1011 + 940 1012 return 0; 941 1013 942 1014 out_wait_freeze_timed_out: 943 1015 nvme_stop_queues(&ctrl->ctrl); 944 1016 nvme_sync_io_queues(&ctrl->ctrl); 945 1017 nvme_rdma_stop_io_queues(ctrl); 946 - out_cleanup_connect_q: 1018 + out_cleanup_tagset: 947 1019 nvme_cancel_tagset(&ctrl->ctrl); 948 1020 if (new) 949 - blk_mq_destroy_queue(ctrl->ctrl.connect_q); 950 - out_free_tag_set: 951 - if (new) 952 - blk_mq_free_tag_set(ctrl->ctrl.tagset); 1021 + nvme_remove_io_tag_set(&ctrl->ctrl); 953 1022 out_free_io_queues: 954 1023 nvme_rdma_free_io_queues(ctrl); 955 1024 return ret; ··· 968 1025 blk_sync_queue(ctrl->ctrl.admin_q); 969 1026 nvme_rdma_stop_queue(&ctrl->queues[0]); 970 1027 nvme_cancel_admin_tagset(&ctrl->ctrl); 971 - if (remove) 1028 + if (remove) { 972 1029 nvme_start_admin_queue(&ctrl->ctrl); 973 - nvme_rdma_destroy_admin_queue(ctrl, remove); 1030 + nvme_remove_admin_tag_set(&ctrl->ctrl); 1031 + } 1032 + nvme_rdma_destroy_admin_queue(ctrl); 974 1033 } 975 1034 976 1035 static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, ··· 984 1039 nvme_sync_io_queues(&ctrl->ctrl); 985 1040 nvme_rdma_stop_io_queues(ctrl); 986 1041 nvme_cancel_tagset(&ctrl->ctrl); 987 - if (remove) 1042 + if (remove) { 988 1043 nvme_start_queues(&ctrl->ctrl); 989 - nvme_rdma_destroy_io_queues(ctrl, remove); 1044 + nvme_remove_io_tag_set(&ctrl->ctrl); 1045 + } 1046 + nvme_rdma_free_io_queues(ctrl); 990 1047 } 991 1048 } 992 1049 ··· 1110 1163 nvme_sync_io_queues(&ctrl->ctrl); 1111 1164 nvme_rdma_stop_io_queues(ctrl); 1112 1165 nvme_cancel_tagset(&ctrl->ctrl); 1113 - nvme_rdma_destroy_io_queues(ctrl, new); 1166 + if (new) 1167 + nvme_remove_io_tag_set(&ctrl->ctrl); 1168 + nvme_rdma_free_io_queues(ctrl); 1114 1169 } 1115 1170 destroy_admin: 1116 1171 nvme_stop_admin_queue(&ctrl->ctrl); 1117 1172 
blk_sync_queue(ctrl->ctrl.admin_q); 1118 1173 nvme_rdma_stop_queue(&ctrl->queues[0]); 1119 1174 nvme_cancel_admin_tagset(&ctrl->ctrl); 1120 - nvme_rdma_destroy_admin_queue(ctrl, new); 1175 + if (new) 1176 + nvme_remove_admin_tag_set(&ctrl->ctrl); 1177 + nvme_rdma_destroy_admin_queue(ctrl); 1121 1178 return ret; 1122 1179 } 1123 1180 ··· 2139 2188 nvme_complete_rq(rq); 2140 2189 } 2141 2190 2142 - static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) 2191 + static void nvme_rdma_map_queues(struct blk_mq_tag_set *set) 2143 2192 { 2144 - struct nvme_rdma_ctrl *ctrl = set->driver_data; 2193 + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data); 2145 2194 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; 2146 2195 2147 2196 if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { ··· 2182 2231 ctrl->io_queues[HCTX_TYPE_DEFAULT], 2183 2232 ctrl->io_queues[HCTX_TYPE_READ], 2184 2233 ctrl->io_queues[HCTX_TYPE_POLL]); 2185 - 2186 - return 0; 2187 2234 } 2188 2235 2189 2236 static const struct blk_mq_ops nvme_rdma_mq_ops = {
+65 -104
drivers/nvme/host/tcp.c
··· 133 133 /* send state */ 134 134 struct nvme_tcp_request *request; 135 135 136 - int queue_size; 137 136 u32 maxh2cdata; 138 137 size_t cmnd_capsule_len; 139 138 struct nvme_tcp_ctrl *ctrl; ··· 462 463 struct request *rq, unsigned int hctx_idx, 463 464 unsigned int numa_node) 464 465 { 465 - struct nvme_tcp_ctrl *ctrl = set->driver_data; 466 + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); 466 467 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 467 468 struct nvme_tcp_cmd_pdu *pdu; 468 469 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; ··· 486 487 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 487 488 unsigned int hctx_idx) 488 489 { 489 - struct nvme_tcp_ctrl *ctrl = data; 490 + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data); 490 491 struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; 491 492 492 493 hctx->driver_data = queue; ··· 496 497 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, 497 498 unsigned int hctx_idx) 498 499 { 499 - struct nvme_tcp_ctrl *ctrl = data; 500 + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data); 500 501 struct nvme_tcp_queue *queue = &ctrl->queues[0]; 501 502 502 503 hctx->driver_data = queue; ··· 1475 1476 queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); 1476 1477 } 1477 1478 1478 - static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, 1479 - int qid, size_t queue_size) 1479 + static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) 1480 1480 { 1481 1481 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1482 1482 struct nvme_tcp_queue *queue = &ctrl->queues[qid]; ··· 1487 1489 INIT_LIST_HEAD(&queue->send_list); 1488 1490 mutex_init(&queue->send_mutex); 1489 1491 INIT_WORK(&queue->io_work, nvme_tcp_io_work); 1490 - queue->queue_size = queue_size; 1491 1492 1492 1493 if (qid > 0) 1493 1494 queue->cmnd_capsule_len = nctrl->ioccsz * 16; ··· 1684 1687 return ret; 1685 1688 } 1686 1689 1687 - static int 
nvme_tcp_alloc_admin_tag_set(struct nvme_ctrl *nctrl) 1688 - { 1689 - struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1690 - struct blk_mq_tag_set *set = &ctrl->admin_tag_set; 1691 - int ret; 1692 - 1693 - memset(set, 0, sizeof(*set)); 1694 - set->ops = &nvme_tcp_admin_mq_ops; 1695 - set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; 1696 - set->reserved_tags = NVMF_RESERVED_TAGS; 1697 - set->numa_node = nctrl->numa_node; 1698 - set->flags = BLK_MQ_F_BLOCKING; 1699 - set->cmd_size = sizeof(struct nvme_tcp_request); 1700 - set->driver_data = ctrl; 1701 - set->nr_hw_queues = 1; 1702 - set->timeout = NVME_ADMIN_TIMEOUT; 1703 - ret = blk_mq_alloc_tag_set(set); 1704 - if (!ret) 1705 - nctrl->admin_tagset = set; 1706 - return ret; 1707 - } 1708 - 1709 - static int nvme_tcp_alloc_tag_set(struct nvme_ctrl *nctrl) 1710 - { 1711 - struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1712 - struct blk_mq_tag_set *set = &ctrl->tag_set; 1713 - int ret; 1714 - 1715 - memset(set, 0, sizeof(*set)); 1716 - set->ops = &nvme_tcp_mq_ops; 1717 - set->queue_depth = nctrl->sqsize + 1; 1718 - set->reserved_tags = NVMF_RESERVED_TAGS; 1719 - set->numa_node = nctrl->numa_node; 1720 - set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; 1721 - set->cmd_size = sizeof(struct nvme_tcp_request); 1722 - set->driver_data = ctrl; 1723 - set->nr_hw_queues = nctrl->queue_count - 1; 1724 - set->timeout = NVME_IO_TIMEOUT; 1725 - set->nr_maps = nctrl->opts->nr_poll_queues ? 
HCTX_MAX_TYPES : 2; 1726 - ret = blk_mq_alloc_tag_set(set); 1727 - if (!ret) 1728 - nctrl->tagset = set; 1729 - return ret; 1730 - } 1731 - 1732 1690 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) 1733 1691 { 1734 1692 if (to_tcp_ctrl(ctrl)->async_req.pdu) { ··· 1711 1759 nvme_tcp_stop_queue(ctrl, i); 1712 1760 } 1713 1761 1714 - static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) 1762 + static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl, 1763 + int first, int last) 1715 1764 { 1716 1765 int i, ret; 1717 1766 1718 - for (i = 1; i < ctrl->queue_count; i++) { 1767 + for (i = first; i < last; i++) { 1719 1768 ret = nvme_tcp_start_queue(ctrl, i); 1720 1769 if (ret) 1721 1770 goto out_stop_queues; ··· 1725 1772 return 0; 1726 1773 1727 1774 out_stop_queues: 1728 - for (i--; i >= 1; i--) 1775 + for (i--; i >= first; i--) 1729 1776 nvme_tcp_stop_queue(ctrl, i); 1730 1777 return ret; 1731 1778 } ··· 1734 1781 { 1735 1782 int ret; 1736 1783 1737 - ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); 1784 + ret = nvme_tcp_alloc_queue(ctrl, 0); 1738 1785 if (ret) 1739 1786 return ret; 1740 1787 ··· 1754 1801 int i, ret; 1755 1802 1756 1803 for (i = 1; i < ctrl->queue_count; i++) { 1757 - ret = nvme_tcp_alloc_queue(ctrl, i, ctrl->sqsize + 1); 1804 + ret = nvme_tcp_alloc_queue(ctrl, i); 1758 1805 if (ret) 1759 1806 goto out_free_queues; 1760 1807 } ··· 1842 1889 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) 1843 1890 { 1844 1891 nvme_tcp_stop_io_queues(ctrl); 1845 - if (remove) { 1846 - blk_mq_destroy_queue(ctrl->connect_q); 1847 - blk_mq_free_tag_set(ctrl->tagset); 1848 - } 1892 + if (remove) 1893 + nvme_remove_io_tag_set(ctrl); 1849 1894 nvme_tcp_free_io_queues(ctrl); 1850 1895 } 1851 1896 1852 1897 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) 1853 1898 { 1854 - int ret; 1899 + int ret, nr_queues; 1855 1900 1856 1901 ret = nvme_tcp_alloc_io_queues(ctrl); 1857 1902 if (ret) 1858 1903 
return ret; 1859 1904 1860 1905 if (new) { 1861 - ret = nvme_tcp_alloc_tag_set(ctrl); 1906 + ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set, 1907 + &nvme_tcp_mq_ops, 1908 + BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING, 1909 + sizeof(struct nvme_tcp_request)); 1862 1910 if (ret) 1863 1911 goto out_free_io_queues; 1864 - 1865 - ret = nvme_ctrl_init_connect_q(ctrl); 1866 - if (ret) 1867 - goto out_free_tag_set; 1868 1912 } 1869 1913 1870 - ret = nvme_tcp_start_io_queues(ctrl); 1914 + /* 1915 + * Only start IO queues for which we have allocated the tagset 1916 + * and limitted it to the available queues. On reconnects, the 1917 + * queue number might have changed. 1918 + */ 1919 + nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count); 1920 + ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues); 1871 1921 if (ret) 1872 1922 goto out_cleanup_connect_q; 1873 1923 ··· 1890 1934 nvme_unfreeze(ctrl); 1891 1935 } 1892 1936 1937 + /* 1938 + * If the number of queues has increased (reconnect case) 1939 + * start all new queues now. 
1940 + */ 1941 + ret = nvme_tcp_start_io_queues(ctrl, nr_queues, 1942 + ctrl->tagset->nr_hw_queues + 1); 1943 + if (ret) 1944 + goto out_wait_freeze_timed_out; 1945 + 1893 1946 return 0; 1894 1947 1895 1948 out_wait_freeze_timed_out: ··· 1908 1943 out_cleanup_connect_q: 1909 1944 nvme_cancel_tagset(ctrl); 1910 1945 if (new) 1911 - blk_mq_destroy_queue(ctrl->connect_q); 1912 - out_free_tag_set: 1913 - if (new) 1914 - blk_mq_free_tag_set(ctrl->tagset); 1946 + nvme_remove_io_tag_set(ctrl); 1915 1947 out_free_io_queues: 1916 1948 nvme_tcp_free_io_queues(ctrl); 1917 1949 return ret; ··· 1917 1955 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) 1918 1956 { 1919 1957 nvme_tcp_stop_queue(ctrl, 0); 1920 - if (remove) { 1921 - blk_mq_destroy_queue(ctrl->admin_q); 1922 - blk_mq_destroy_queue(ctrl->fabrics_q); 1923 - blk_mq_free_tag_set(ctrl->admin_tagset); 1924 - } 1958 + if (remove) 1959 + nvme_remove_admin_tag_set(ctrl); 1925 1960 nvme_tcp_free_admin_queue(ctrl); 1926 1961 } 1927 1962 ··· 1931 1972 return error; 1932 1973 1933 1974 if (new) { 1934 - error = nvme_tcp_alloc_admin_tag_set(ctrl); 1975 + error = nvme_alloc_admin_tag_set(ctrl, 1976 + &to_tcp_ctrl(ctrl)->admin_tag_set, 1977 + &nvme_tcp_admin_mq_ops, BLK_MQ_F_BLOCKING, 1978 + sizeof(struct nvme_tcp_request)); 1935 1979 if (error) 1936 1980 goto out_free_queue; 1937 - 1938 - ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset); 1939 - if (IS_ERR(ctrl->fabrics_q)) { 1940 - error = PTR_ERR(ctrl->fabrics_q); 1941 - goto out_free_tagset; 1942 - } 1943 - 1944 - ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); 1945 - if (IS_ERR(ctrl->admin_q)) { 1946 - error = PTR_ERR(ctrl->admin_q); 1947 - goto out_cleanup_fabrics_q; 1948 - } 1949 1981 } 1950 1982 1951 1983 error = nvme_tcp_start_queue(ctrl, 0); 1952 1984 if (error) 1953 - goto out_cleanup_queue; 1985 + goto out_cleanup_tagset; 1954 1986 1955 1987 error = nvme_enable_ctrl(ctrl); 1956 1988 if (error) ··· 1961 2011 out_stop_queue: 
1962 2012 nvme_tcp_stop_queue(ctrl, 0); 1963 2013 nvme_cancel_admin_tagset(ctrl); 1964 - out_cleanup_queue: 2014 + out_cleanup_tagset: 1965 2015 if (new) 1966 - blk_mq_destroy_queue(ctrl->admin_q); 1967 - out_cleanup_fabrics_q: 1968 - if (new) 1969 - blk_mq_destroy_queue(ctrl->fabrics_q); 1970 - out_free_tagset: 1971 - if (new) 1972 - blk_mq_free_tag_set(ctrl->admin_tagset); 2016 + nvme_remove_admin_tag_set(ctrl); 1973 2017 out_free_queue: 1974 2018 nvme_tcp_free_admin_queue(ctrl); 1975 2019 return error; ··· 2412 2468 return BLK_STS_OK; 2413 2469 } 2414 2470 2415 - static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) 2471 + static void nvme_tcp_map_queues(struct blk_mq_tag_set *set) 2416 2472 { 2417 - struct nvme_tcp_ctrl *ctrl = set->driver_data; 2473 + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); 2418 2474 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; 2419 2475 2420 2476 if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { ··· 2453 2509 ctrl->io_queues[HCTX_TYPE_DEFAULT], 2454 2510 ctrl->io_queues[HCTX_TYPE_READ], 2455 2511 ctrl->io_queues[HCTX_TYPE_POLL]); 2456 - 2457 - return 0; 2458 2512 } 2459 2513 2460 2514 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) ··· 2469 2527 nvme_tcp_try_recv(queue); 2470 2528 clear_bit(NVME_TCP_Q_POLLING, &queue->flags); 2471 2529 return queue->nr_cqe; 2530 + } 2531 + 2532 + static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) 2533 + { 2534 + struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0]; 2535 + struct sockaddr_storage src_addr; 2536 + int ret, len; 2537 + 2538 + len = nvmf_get_address(ctrl, buf, size); 2539 + 2540 + ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr); 2541 + if (ret > 0) { 2542 + if (len > 0) 2543 + len--; /* strip trailing newline */ 2544 + len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n", 2545 + (len) ? 
"," : "", &src_addr); 2546 + } 2547 + 2548 + return len; 2472 2549 } 2473 2550 2474 2551 static const struct blk_mq_ops nvme_tcp_mq_ops = { ··· 2521 2560 .free_ctrl = nvme_tcp_free_ctrl, 2522 2561 .submit_async_event = nvme_tcp_submit_async_event, 2523 2562 .delete_ctrl = nvme_tcp_delete_ctrl, 2524 - .get_address = nvmf_get_address, 2563 + .get_address = nvme_tcp_get_address, 2525 2564 .stop_ctrl = nvme_tcp_stop_ctrl, 2526 2565 }; 2527 2566
+1 -1
drivers/nvme/target/admin-cmd.c
··· 449 449 if (req->port->inline_data_size) 450 450 id->sgls |= cpu_to_le32(1 << 20); 451 451 452 - strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); 452 + strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); 453 453 454 454 /* 455 455 * Max command capsule size is sqe + in-capsule data size.
+29
drivers/nvme/target/configfs.c
··· 1281 1281 CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable); 1282 1282 #endif 1283 1283 1284 + static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item, 1285 + char *page) 1286 + { 1287 + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->max_qid); 1288 + } 1289 + 1290 + static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, 1291 + const char *page, size_t cnt) 1292 + { 1293 + struct nvmet_port *port = to_nvmet_port(item); 1294 + u16 qid_max; 1295 + 1296 + if (nvmet_is_port_enabled(port, __func__)) 1297 + return -EACCES; 1298 + 1299 + if (sscanf(page, "%hu\n", &qid_max) != 1) 1300 + return -EINVAL; 1301 + 1302 + if (qid_max < 1 || qid_max > NVMET_NR_QUEUES) 1303 + return -EINVAL; 1304 + 1305 + down_write(&nvmet_config_sem); 1306 + to_subsys(item)->max_qid = qid_max; 1307 + up_write(&nvmet_config_sem); 1308 + return cnt; 1309 + } 1310 + CONFIGFS_ATTR(nvmet_subsys_, attr_qid_max); 1311 + 1284 1312 static struct configfs_attribute *nvmet_subsys_attrs[] = { 1285 1313 &nvmet_subsys_attr_attr_allow_any_host, 1286 1314 &nvmet_subsys_attr_attr_version, ··· 1316 1288 &nvmet_subsys_attr_attr_cntlid_min, 1317 1289 &nvmet_subsys_attr_attr_cntlid_max, 1318 1290 &nvmet_subsys_attr_attr_model, 1291 + &nvmet_subsys_attr_attr_qid_max, 1319 1292 #ifdef CONFIG_BLK_DEV_INTEGRITY 1320 1293 &nvmet_subsys_attr_attr_pi_enable, 1321 1294 #endif
+1
drivers/nvme/target/core.c
··· 832 832 } 833 833 init_completion(&sq->free_done); 834 834 init_completion(&sq->confirm_done); 835 + nvmet_auth_sq_init(sq); 835 836 836 837 return 0; 837 838 }
+1 -1
drivers/nvme/target/discovery.c
··· 292 292 293 293 id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL); 294 294 295 - strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); 295 + strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); 296 296 297 297 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); 298 298
+8 -15
drivers/nvme/target/fabrics-cmd-auth.c
··· 23 23 sq->dhchap_tid = -1; 24 24 } 25 25 26 - void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req) 26 + void nvmet_auth_sq_init(struct nvmet_sq *sq) 27 27 { 28 - u32 result = le32_to_cpu(req->cqe->result.u32); 29 - 30 28 /* Initialize in-band authentication */ 31 - INIT_DELAYED_WORK(&req->sq->auth_expired_work, 32 - nvmet_auth_expired_work); 33 - req->sq->authenticated = false; 34 - req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; 35 - result |= (u32)NVME_CONNECT_AUTHREQ_ATR << 16; 36 - req->cqe->result.u32 = cpu_to_le32(result); 29 + INIT_DELAYED_WORK(&sq->auth_expired_work, nvmet_auth_expired_work); 30 + sq->authenticated = false; 31 + sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; 37 32 } 38 33 39 34 static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d) ··· 172 177 return 0; 173 178 } 174 179 175 - static u16 nvmet_auth_failure2(struct nvmet_req *req, void *d) 180 + static u16 nvmet_auth_failure2(void *d) 176 181 { 177 182 struct nvmf_auth_dhchap_failure_data *data = d; 178 183 ··· 224 229 } 225 230 226 231 status = nvmet_copy_from_sgl(req, 0, d, tl); 227 - if (status) { 228 - kfree(d); 229 - goto done; 230 - } 232 + if (status) 233 + goto done_kfree; 231 234 232 235 data = d; 233 236 pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__, ··· 303 310 goto done_kfree; 304 311 break; 305 312 case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2: 306 - status = nvmet_auth_failure2(req, d); 313 + status = nvmet_auth_failure2(d); 307 314 if (status) { 308 315 pr_warn("ctrl %d qid %d: authentication failed (%d)\n", 309 316 ctrl->cntlid, req->sq->qid, status);
+8 -11
drivers/nvme/target/fabrics-cmd.c
··· 198 198 return ret; 199 199 } 200 200 201 + static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl) 202 + { 203 + return (u32)ctrl->cntlid | 204 + (nvmet_has_auth(ctrl) ? NVME_CONNECT_AUTHREQ_ATR : 0); 205 + } 206 + 201 207 static void nvmet_execute_admin_connect(struct nvmet_req *req) 202 208 { 203 209 struct nvmf_connect_command *c = &req->cmd->connect; ··· 275 269 ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, 276 270 ctrl->pi_support ? " T10-PI is enabled" : "", 277 271 nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : ""); 278 - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); 279 - 280 - if (nvmet_has_auth(ctrl)) 281 - nvmet_init_auth(ctrl, req); 272 + req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl)); 282 273 out: 283 274 kfree(d); 284 275 complete: ··· 331 328 if (status) 332 329 goto out_ctrl_put; 333 330 334 - /* pass back cntlid for successful completion */ 335 - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); 336 - 337 331 pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); 338 - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); 339 - if (nvmet_has_auth(ctrl)) 340 - nvmet_init_auth(ctrl, req); 341 - 332 + req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl)); 342 333 out: 343 334 kfree(d); 344 335 complete:
+13 -6
drivers/nvme/target/io-cmd-bdev.c
··· 12 12 13 13 void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) 14 14 { 15 - const struct queue_limits *ql = &bdev_get_queue(bdev)->limits; 16 - /* Number of logical blocks per physical block. */ 17 - const u32 lpp = ql->physical_block_size / ql->logical_block_size; 18 15 /* Logical blocks per physical block, 0's based. */ 19 - const __le16 lpp0b = to0based(lpp); 16 + const __le16 lpp0b = to0based(bdev_physical_block_size(bdev) / 17 + bdev_logical_block_size(bdev)); 20 18 21 19 /* 22 20 * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, ··· 40 42 /* NPWA = Namespace Preferred Write Alignment. 0's based */ 41 43 id->npwa = id->npwg; 42 44 /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ 43 - id->npdg = to0based(ql->discard_granularity / ql->logical_block_size); 45 + id->npdg = to0based(bdev_discard_granularity(bdev) / 46 + bdev_logical_block_size(bdev)); 44 47 /* NPDG = Namespace Preferred Deallocate Alignment */ 45 48 id->npda = id->npdg; 46 49 /* NOWS = Namespace Optimal Write Size */ 47 - id->nows = to0based(ql->io_opt / ql->logical_block_size); 50 + id->nows = to0based(bdev_io_opt(bdev) / bdev_logical_block_size(bdev)); 48 51 } 49 52 50 53 void nvmet_bdev_ns_disable(struct nvmet_ns *ns) ··· 333 334 { 334 335 struct bio *bio = &req->b.inline_bio; 335 336 337 + if (!bdev_write_cache(req->ns->bdev)) { 338 + nvmet_req_complete(req, NVME_SC_SUCCESS); 339 + return; 340 + } 341 + 336 342 if (!nvmet_check_transfer_len(req, 0)) 337 343 return; 338 344 ··· 351 347 352 348 u16 nvmet_bdev_flush(struct nvmet_req *req) 353 349 { 350 + if (!bdev_write_cache(req->ns->bdev)) 351 + return 0; 352 + 354 353 if (blkdev_issue_flush(req->ns->bdev)) 355 354 return NVME_SC_INTERNAL | NVME_SC_DNR; 356 355 return 0;
+23 -68
drivers/nvme/target/loop.c
··· 204 204 struct request *req, unsigned int hctx_idx, 205 205 unsigned int numa_node) 206 206 { 207 - struct nvme_loop_ctrl *ctrl = set->driver_data; 207 + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(set->driver_data); 208 208 struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); 209 209 210 210 nvme_req(req)->ctrl = &ctrl->ctrl; ··· 218 218 static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 219 219 unsigned int hctx_idx) 220 220 { 221 - struct nvme_loop_ctrl *ctrl = data; 221 + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(data); 222 222 struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; 223 223 224 224 BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); ··· 238 238 static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, 239 239 unsigned int hctx_idx) 240 240 { 241 - struct nvme_loop_ctrl *ctrl = data; 241 + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(data); 242 242 struct nvme_loop_queue *queue = &ctrl->queues[0]; 243 243 244 244 BUG_ON(hctx_idx != 0); ··· 266 266 if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags)) 267 267 return; 268 268 nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); 269 - blk_mq_destroy_queue(ctrl->ctrl.admin_q); 270 - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); 271 - blk_mq_free_tag_set(&ctrl->admin_tag_set); 269 + nvme_remove_admin_tag_set(&ctrl->ctrl); 272 270 } 273 271 274 272 static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) ··· 280 282 list_del(&ctrl->list); 281 283 mutex_unlock(&nvme_loop_ctrl_mutex); 282 284 283 - if (nctrl->tagset) { 284 - blk_mq_destroy_queue(ctrl->ctrl.connect_q); 285 - blk_mq_free_tag_set(&ctrl->tag_set); 286 - } 285 + if (nctrl->tagset) 286 + nvme_remove_io_tag_set(nctrl); 287 287 kfree(ctrl->queues); 288 288 nvmf_free_options(nctrl->opts); 289 289 free_ctrl: ··· 346 350 { 347 351 int error; 348 352 349 - memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); 350 - ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; 351 - ctrl->admin_tag_set.queue_depth 
= NVME_AQ_MQ_TAG_DEPTH; 352 - ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS; 353 - ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; 354 - ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + 355 - NVME_INLINE_SG_CNT * sizeof(struct scatterlist); 356 - ctrl->admin_tag_set.driver_data = ctrl; 357 - ctrl->admin_tag_set.nr_hw_queues = 1; 358 - ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; 359 - ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; 360 - 361 353 ctrl->queues[0].ctrl = ctrl; 362 354 error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); 363 355 if (error) 364 356 return error; 365 357 ctrl->ctrl.queue_count = 1; 366 358 367 - error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); 359 + error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, 360 + &nvme_loop_admin_mq_ops, BLK_MQ_F_NO_SCHED, 361 + sizeof(struct nvme_loop_iod) + 362 + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); 368 363 if (error) 369 364 goto out_free_sq; 370 - ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; 371 365 372 - ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); 373 - if (IS_ERR(ctrl->ctrl.fabrics_q)) { 374 - error = PTR_ERR(ctrl->ctrl.fabrics_q); 375 - goto out_free_tagset; 376 - } 377 - 378 - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); 379 - if (IS_ERR(ctrl->ctrl.admin_q)) { 380 - error = PTR_ERR(ctrl->ctrl.admin_q); 381 - goto out_cleanup_fabrics_q; 382 - } 383 366 /* reset stopped state for the fresh admin queue */ 384 367 clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags); 385 368 386 369 error = nvmf_connect_admin_queue(&ctrl->ctrl); 387 370 if (error) 388 - goto out_cleanup_queue; 371 + goto out_cleanup_tagset; 389 372 390 373 set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); 391 374 392 375 error = nvme_enable_ctrl(&ctrl->ctrl); 393 376 if (error) 394 - goto out_cleanup_queue; 377 + goto out_cleanup_tagset; 395 378 396 379 ctrl->ctrl.max_hw_sectors = 397 380 (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); 
··· 379 404 380 405 error = nvme_init_ctrl_finish(&ctrl->ctrl); 381 406 if (error) 382 - goto out_cleanup_queue; 407 + goto out_cleanup_tagset; 383 408 384 409 return 0; 385 410 386 - out_cleanup_queue: 411 + out_cleanup_tagset: 387 412 clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); 388 - blk_mq_destroy_queue(ctrl->ctrl.admin_q); 389 - out_cleanup_fabrics_q: 390 - blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); 391 - out_free_tagset: 392 - blk_mq_free_tag_set(&ctrl->admin_tag_set); 413 + nvme_remove_admin_tag_set(&ctrl->ctrl); 393 414 out_free_sq: 394 415 nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); 395 416 return error; ··· 493 522 if (ret) 494 523 return ret; 495 524 496 - memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); 497 - ctrl->tag_set.ops = &nvme_loop_mq_ops; 498 - ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; 499 - ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS; 500 - ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; 501 - ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 502 - ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + 503 - NVME_INLINE_SG_CNT * sizeof(struct scatterlist); 504 - ctrl->tag_set.driver_data = ctrl; 505 - ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; 506 - ctrl->tag_set.timeout = NVME_IO_TIMEOUT; 507 - ctrl->ctrl.tagset = &ctrl->tag_set; 508 - 509 - ret = blk_mq_alloc_tag_set(&ctrl->tag_set); 525 + ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, 526 + &nvme_loop_mq_ops, BLK_MQ_F_SHOULD_MERGE, 527 + sizeof(struct nvme_loop_iod) + 528 + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); 510 529 if (ret) 511 530 goto out_destroy_queues; 512 531 513 - ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); 514 - if (ret) 515 - goto out_free_tagset; 516 - 517 532 ret = nvme_loop_connect_io_queues(ctrl); 518 533 if (ret) 519 - goto out_cleanup_connect_q; 534 + goto out_cleanup_tagset; 520 535 521 536 return 0; 522 537 523 - out_cleanup_connect_q: 524 - blk_mq_destroy_queue(ctrl->ctrl.connect_q); 525 - out_free_tagset: 
526 - blk_mq_free_tag_set(&ctrl->tag_set); 538 + out_cleanup_tagset: 539 + nvme_remove_io_tag_set(&ctrl->ctrl); 527 540 out_destroy_queues: 528 541 nvme_loop_destroy_io_queues(ctrl); 529 542 return ret; ··· 556 601 557 602 ret = -ENOMEM; 558 603 559 - ctrl->ctrl.sqsize = opts->queue_size - 1; 560 604 ctrl->ctrl.kato = opts->kato; 561 605 ctrl->port = nvme_loop_find_port(&ctrl->ctrl); 562 606 ··· 575 621 opts->queue_size, ctrl->ctrl.maxcmd); 576 622 opts->queue_size = ctrl->ctrl.maxcmd; 577 623 } 624 + ctrl->ctrl.sqsize = opts->queue_size - 1; 578 625 579 626 if (opts->nr_io_queues) { 580 627 ret = nvme_loop_create_io_queues(ctrl);
+4 -3
drivers/nvme/target/nvmet.h
··· 704 704 bool set_ctrl); 705 705 int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); 706 706 int nvmet_setup_auth(struct nvmet_ctrl *ctrl); 707 - void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req); 707 + void nvmet_auth_sq_init(struct nvmet_sq *sq); 708 708 void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); 709 709 void nvmet_auth_sq_free(struct nvmet_sq *sq); 710 710 int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id); ··· 726 726 { 727 727 return 0; 728 728 } 729 - static inline void nvmet_init_auth(struct nvmet_ctrl *ctrl, 730 - struct nvmet_req *req) {}; 729 + static inline void nvmet_auth_sq_init(struct nvmet_sq *sq) 730 + { 731 + } 731 732 static inline void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) {}; 732 733 static inline void nvmet_auth_sq_free(struct nvmet_sq *sq) {}; 733 734 static inline bool nvmet_check_auth_status(struct nvmet_req *req)
+6 -1
drivers/nvme/target/passthru.c
··· 215 215 { 216 216 struct nvmet_req *req = container_of(w, struct nvmet_req, p.work); 217 217 struct request *rq = req->p.rq; 218 + struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; 219 + u32 effects; 218 220 int status; 219 221 220 - status = nvme_execute_passthru_rq(rq); 222 + status = nvme_execute_passthru_rq(rq, &effects); 221 223 222 224 if (status == NVME_SC_SUCCESS && 223 225 req->cmd->common.opcode == nvme_admin_identify) { ··· 240 238 req->cqe->result = nvme_req(rq)->result; 241 239 nvmet_req_complete(req, status); 242 240 blk_mq_free_request(rq); 241 + 242 + if (effects) 243 + nvme_passthru_end(ctrl, effects, req->cmd, status); 243 244 } 244 245 245 246 static void nvmet_passthru_req_done(struct request *rq,
+47 -44
drivers/nvme/target/tcp.c
··· 77 77 u32 pdu_len; 78 78 u32 pdu_recv; 79 79 int sg_idx; 80 - int nr_mapped; 81 80 struct msghdr recv_msg; 82 - struct kvec *iov; 81 + struct bio_vec *iov; 83 82 u32 flags; 84 83 85 84 struct list_head entry; ··· 164 165 static struct workqueue_struct *nvmet_tcp_wq; 165 166 static const struct nvmet_fabrics_ops nvmet_tcp_ops; 166 167 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); 167 - static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); 168 168 static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd); 169 - static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd); 170 169 171 170 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, 172 171 struct nvmet_tcp_cmd *cmd) ··· 298 301 299 302 static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd) 300 303 { 301 - WARN_ON(unlikely(cmd->nr_mapped > 0)); 302 - 303 304 kfree(cmd->iov); 304 305 sgl_free(cmd->req.sg); 305 306 cmd->iov = NULL; 306 307 cmd->req.sg = NULL; 307 308 } 308 309 309 - static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd) 310 + static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) 310 311 { 311 - struct scatterlist *sg; 312 - int i; 313 - 314 - sg = &cmd->req.sg[cmd->sg_idx]; 315 - 316 - for (i = 0; i < cmd->nr_mapped; i++) 317 - kunmap(sg_page(&sg[i])); 318 - 319 - cmd->nr_mapped = 0; 320 - } 321 - 322 - static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) 323 - { 324 - struct kvec *iov = cmd->iov; 312 + struct bio_vec *iov = cmd->iov; 325 313 struct scatterlist *sg; 326 314 u32 length, offset, sg_offset; 315 + int nr_pages; 327 316 328 317 length = cmd->pdu_len; 329 - cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE); 318 + nr_pages = DIV_ROUND_UP(length, PAGE_SIZE); 330 319 offset = cmd->rbytes_done; 331 320 cmd->sg_idx = offset / PAGE_SIZE; 332 321 sg_offset = offset % PAGE_SIZE; ··· 321 338 while (length) { 322 339 u32 iov_len = min_t(u32, length, sg->length - sg_offset); 323 340 324 - iov->iov_base = 
kmap(sg_page(sg)) + sg->offset + sg_offset; 325 - iov->iov_len = iov_len; 341 + iov->bv_page = sg_page(sg); 342 + iov->bv_len = sg->length; 343 + iov->bv_offset = sg->offset + sg_offset; 326 344 327 345 length -= iov_len; 328 346 sg = sg_next(sg); ··· 331 347 sg_offset = 0; 332 348 } 333 349 334 - iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, 335 - cmd->nr_mapped, cmd->pdu_len); 350 + iov_iter_bvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, 351 + nr_pages, cmd->pdu_len); 336 352 } 337 353 338 354 static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) ··· 910 926 } 911 927 912 928 queue->rcv_state = NVMET_TCP_RECV_DATA; 913 - nvmet_tcp_map_pdu_iovec(cmd); 929 + nvmet_tcp_build_pdu_iovec(cmd); 914 930 cmd->flags |= NVMET_TCP_F_INIT_FAILED; 915 931 } 916 932 ··· 919 935 struct nvme_tcp_data_pdu *data = &queue->pdu.data; 920 936 struct nvmet_tcp_cmd *cmd; 921 937 922 - if (likely(queue->nr_cmds)) 938 + if (likely(queue->nr_cmds)) { 939 + if (unlikely(data->ttag >= queue->nr_cmds)) { 940 + pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n", 941 + queue->idx, data->ttag, queue->nr_cmds); 942 + nvmet_tcp_fatal_error(queue); 943 + return -EPROTO; 944 + } 923 945 cmd = &queue->cmds[data->ttag]; 924 - else 946 + } else { 925 947 cmd = &queue->connect; 948 + } 926 949 927 950 if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { 928 951 pr_err("ttag %u unexpected data offset %u (expected %u)\n", ··· 943 952 944 953 cmd->pdu_len = le32_to_cpu(data->data_length); 945 954 cmd->pdu_recv = 0; 946 - nvmet_tcp_map_pdu_iovec(cmd); 955 + nvmet_tcp_build_pdu_iovec(cmd); 947 956 queue->cmd = cmd; 948 957 queue->rcv_state = NVMET_TCP_RECV_DATA; 949 958 ··· 965 974 return -EPROTO; 966 975 } 967 976 return nvmet_tcp_handle_icreq(queue); 977 + } 978 + 979 + if (unlikely(hdr->type == nvme_tcp_icreq)) { 980 + pr_err("queue %d: received icreq pdu in state %d\n", 981 + queue->idx, queue->state); 982 + nvmet_tcp_fatal_error(queue); 983 + return -EPROTO; 968 984 } 
969 985 970 986 if (hdr->type == nvme_tcp_h2c_data) { ··· 1019 1021 if (nvmet_tcp_need_data_in(queue->cmd)) { 1020 1022 if (nvmet_tcp_has_inline_data(queue->cmd)) { 1021 1023 queue->rcv_state = NVMET_TCP_RECV_DATA; 1022 - nvmet_tcp_map_pdu_iovec(queue->cmd); 1024 + nvmet_tcp_build_pdu_iovec(queue->cmd); 1023 1025 return 0; 1024 1026 } 1025 1027 /* send back R2T */ ··· 1139 1141 cmd->rbytes_done += ret; 1140 1142 } 1141 1143 1142 - nvmet_tcp_unmap_pdu_iovec(cmd); 1143 1144 if (queue->data_digest) { 1144 1145 nvmet_tcp_prep_recv_ddgst(cmd); 1145 1146 return 0; ··· 1176 1179 queue->idx, cmd->req.cmd->common.command_id, 1177 1180 queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), 1178 1181 le32_to_cpu(cmd->exp_ddgst)); 1179 - nvmet_tcp_finish_cmd(cmd); 1182 + nvmet_req_uninit(&cmd->req); 1183 + nvmet_tcp_free_cmd_buffers(cmd); 1180 1184 nvmet_tcp_fatal_error(queue); 1181 1185 ret = -EPROTO; 1182 1186 goto out; ··· 1406 1408 write_unlock_bh(&sock->sk->sk_callback_lock); 1407 1409 } 1408 1410 1409 - static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) 1410 - { 1411 - nvmet_req_uninit(&cmd->req); 1412 - nvmet_tcp_unmap_pdu_iovec(cmd); 1413 - nvmet_tcp_free_cmd_buffers(cmd); 1414 - } 1415 - 1416 1411 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) 1417 1412 { 1418 1413 struct nvmet_tcp_cmd *cmd = queue->cmds; ··· 1414 1423 for (i = 0; i < queue->nr_cmds; i++, cmd++) { 1415 1424 if (nvmet_tcp_need_data_in(cmd)) 1416 1425 nvmet_req_uninit(&cmd->req); 1417 - 1418 - nvmet_tcp_unmap_pdu_iovec(cmd); 1419 - nvmet_tcp_free_cmd_buffers(cmd); 1420 1426 } 1421 1427 1422 1428 if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { 1423 1429 /* failed in connect */ 1424 - nvmet_tcp_finish_cmd(&queue->connect); 1430 + nvmet_req_uninit(&queue->connect.req); 1425 1431 } 1432 + } 1433 + 1434 + static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue) 1435 + { 1436 + struct nvmet_tcp_cmd *cmd = queue->cmds; 1437 + int i; 1438 
+ 1439 + for (i = 0; i < queue->nr_cmds; i++, cmd++) { 1440 + if (nvmet_tcp_need_data_in(cmd)) 1441 + nvmet_tcp_free_cmd_buffers(cmd); 1442 + } 1443 + 1444 + if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) 1445 + nvmet_tcp_free_cmd_buffers(&queue->connect); 1426 1446 } 1427 1447 1428 1448 static void nvmet_tcp_release_queue_work(struct work_struct *w) ··· 1454 1452 nvmet_tcp_uninit_data_in_cmds(queue); 1455 1453 nvmet_sq_destroy(&queue->nvme_sq); 1456 1454 cancel_work_sync(&queue->io_work); 1455 + nvmet_tcp_free_cmd_data_in_buffers(queue); 1457 1456 sock_release(queue->sock); 1458 1457 nvmet_tcp_free_cmds(queue); 1459 1458 if (queue->hdr_digest || queue->data_digest)
+1 -2
drivers/nvme/target/zns.c
··· 400 400 { 401 401 struct block_device *bdev = req->ns->bdev; 402 402 unsigned int nr_zones = bdev_nr_zones(bdev); 403 - struct request_queue *q = bdev_get_queue(bdev); 404 403 struct bio *bio = NULL; 405 404 sector_t sector = 0; 406 405 int ret; ··· 408 409 }; 409 410 410 411 d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)), 411 - GFP_NOIO, q->node); 412 + GFP_NOIO, bdev->bd_disk->node_id); 412 413 if (!d.zbitmap) { 413 414 ret = -ENOMEM; 414 415 goto out;
+11 -75
drivers/s390/block/dasd.c
··· 41 41 42 42 #define DASD_DIAG_MOD "dasd_diag_mod" 43 43 44 - static unsigned int queue_depth = 32; 45 - static unsigned int nr_hw_queues = 4; 46 - 47 - module_param(queue_depth, uint, 0444); 48 - MODULE_PARM_DESC(queue_depth, "Default queue depth for new DASD devices"); 49 - 50 - module_param(nr_hw_queues, uint, 0444); 51 - MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD devices"); 52 - 53 44 /* 54 45 * SECTION: exported variables of dasd.c 55 46 */ ··· 59 68 /* 60 69 * SECTION: prototypes for static functions of dasd.c 61 70 */ 62 - static int dasd_alloc_queue(struct dasd_block *); 63 - static void dasd_free_queue(struct dasd_block *); 64 71 static int dasd_flush_block_queue(struct dasd_block *); 65 72 static void dasd_device_tasklet(unsigned long); 66 73 static void dasd_block_tasklet(unsigned long); ··· 187 198 */ 188 199 static int dasd_state_new_to_known(struct dasd_device *device) 189 200 { 190 - int rc; 191 - 192 201 /* 193 202 * As long as the device is not in state DASD_STATE_NEW we want to 194 203 * keep the reference count > 0. 195 204 */ 196 205 dasd_get_device(device); 197 - 198 - if (device->block) { 199 - rc = dasd_alloc_queue(device->block); 200 - if (rc) { 201 - dasd_put_device(device); 202 - return rc; 203 - } 204 - } 205 206 device->state = DASD_STATE_KNOWN; 206 207 return 0; 207 208 } ··· 204 225 /* Disable extended error reporting for this device. */ 205 226 dasd_eer_disable(device); 206 227 device->state = DASD_STATE_NEW; 207 - 208 - if (device->block) 209 - dasd_free_queue(device->block); 210 228 211 229 /* Give up reference we took in dasd_state_new_to_known. 
*/ 212 230 dasd_put_device(device); ··· 1567 1591 dasd_schedule_device_bh(device); 1568 1592 if (device->block) { 1569 1593 dasd_schedule_block_bh(device->block); 1570 - if (device->block->request_queue) 1571 - blk_mq_run_hw_queues(device->block->request_queue, 1572 - true); 1594 + if (device->block->gdp) 1595 + blk_mq_run_hw_queues(device->block->gdp->queue, true); 1573 1596 } 1574 1597 } 1575 1598 EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change); ··· 2666 2691 dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING); 2667 2692 spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags); 2668 2693 dasd_schedule_block_bh(block); 2669 - blk_mq_run_hw_queues(block->request_queue, true); 2694 + blk_mq_run_hw_queues(block->gdp->queue, true); 2670 2695 } 2671 2696 2672 2697 /* ··· 3214 3239 blk_mq_run_hw_queues(req->q, true); 3215 3240 } 3216 3241 3217 - static struct blk_mq_ops dasd_mq_ops = { 3242 + struct blk_mq_ops dasd_mq_ops = { 3218 3243 .queue_rq = do_dasd_request, 3219 3244 .complete = dasd_request_done, 3220 3245 .timeout = dasd_times_out, 3221 3246 .init_hctx = dasd_init_hctx, 3222 3247 .exit_hctx = dasd_exit_hctx, 3223 3248 }; 3224 - 3225 - /* 3226 - * Allocate and initialize request queue and default I/O scheduler. 
3227 - */ 3228 - static int dasd_alloc_queue(struct dasd_block *block) 3229 - { 3230 - int rc; 3231 - 3232 - block->tag_set.ops = &dasd_mq_ops; 3233 - block->tag_set.cmd_size = sizeof(struct dasd_ccw_req); 3234 - block->tag_set.nr_hw_queues = nr_hw_queues; 3235 - block->tag_set.queue_depth = queue_depth; 3236 - block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 3237 - block->tag_set.numa_node = NUMA_NO_NODE; 3238 - 3239 - rc = blk_mq_alloc_tag_set(&block->tag_set); 3240 - if (rc) 3241 - return rc; 3242 - 3243 - block->request_queue = blk_mq_init_queue(&block->tag_set); 3244 - if (IS_ERR(block->request_queue)) 3245 - return PTR_ERR(block->request_queue); 3246 - 3247 - block->request_queue->queuedata = block; 3248 - 3249 - return 0; 3250 - } 3251 - 3252 - /* 3253 - * Deactivate and free request queue. 3254 - */ 3255 - static void dasd_free_queue(struct dasd_block *block) 3256 - { 3257 - if (block->request_queue) { 3258 - blk_mq_destroy_queue(block->request_queue); 3259 - blk_mq_free_tag_set(&block->tag_set); 3260 - block->request_queue = NULL; 3261 - } 3262 - } 3263 3249 3264 3250 static int dasd_open(struct block_device *bdev, fmode_t mode) 3265 3251 { ··· 3698 3762 dasd_schedule_device_bh(device); 3699 3763 if (device->block) { 3700 3764 dasd_schedule_block_bh(device->block); 3701 - if (device->block->request_queue) 3702 - blk_mq_run_hw_queues(device->block->request_queue, 3703 - true); 3704 - } 3765 + if (device->block->gdp) 3766 + blk_mq_run_hw_queues(device->block->gdp->queue, true); 3767 + } 3705 3768 3706 3769 if (!device->stopped) 3707 3770 wake_up(&generic_waitq); ··· 3851 3916 3852 3917 if (device->block) { 3853 3918 dasd_schedule_block_bh(device->block); 3854 - if (device->block->request_queue) 3855 - blk_mq_run_hw_queues(device->block->request_queue, true); 3919 + if (device->block->gdp) 3920 + blk_mq_run_hw_queues(device->block->gdp->queue, true); 3856 3921 } 3857 3922 if (!device->stopped) 3858 3923 wake_up(&generic_waitq); ··· 3862 3927 /* 3863 3928 * clear 
active requests and requeue them to block layer if possible 3864 3929 */ 3865 - static int dasd_generic_requeue_all_requests(struct dasd_device *device) 3930 + int dasd_generic_requeue_all_requests(struct dasd_device *device) 3866 3931 { 3867 3932 struct list_head requeue_queue; 3868 3933 struct dasd_ccw_req *cqr, *n; ··· 3936 4001 dasd_schedule_device_bh(device); 3937 4002 return rc; 3938 4003 } 4004 + EXPORT_SYMBOL_GPL(dasd_generic_requeue_all_requests); 3939 4005 3940 4006 static void do_requeue_requests(struct work_struct *work) 3941 4007 {
+5
drivers/s390/block/dasd_3990_erp.c
··· 1050 1050 dev_err(&device->cdev->dev, "An I/O request was rejected" 1051 1051 " because writing is inhibited\n"); 1052 1052 erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); 1053 + } else if (sense[7] & SNS7_INVALID_ON_SEC) { 1054 + dev_err(&device->cdev->dev, "An I/O request was rejected on a copy pair secondary device\n"); 1055 + /* suppress dump of sense data for this error */ 1056 + set_bit(DASD_CQR_SUPPRESS_CR, &erp->refers->flags); 1057 + erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); 1053 1058 } else { 1054 1059 /* fatal error - set status to FAILED 1055 1060 internal error 09 - Command Reject */
+597 -12
drivers/s390/block/dasd_devmap.c
··· 26 26 27 27 /* This is ugly... */ 28 28 #define PRINTK_HEADER "dasd_devmap:" 29 - #define DASD_BUS_ID_SIZE 20 30 29 #define DASD_MAX_PARAMS 256 31 30 32 31 #include "dasd_int.h" ··· 49 50 unsigned int devindex; 50 51 unsigned short features; 51 52 struct dasd_device *device; 53 + struct dasd_copy_relation *copy; 52 54 }; 53 55 54 56 /* ··· 130 130 /* 131 131 * Read a device busid/devno from a string. 132 132 */ 133 - static int __init dasd_busid(char *str, int *id0, int *id1, int *devno) 133 + static int dasd_busid(char *str, int *id0, int *id1, int *devno) 134 134 { 135 135 unsigned int val; 136 136 char *tok; ··· 438 438 return devmap; 439 439 } 440 440 441 - /* 442 - * Find devmap for device with given bus_id. 443 - */ 444 441 static struct dasd_devmap * 445 - dasd_find_busid(const char *bus_id) 442 + dasd_find_busid_locked(const char *bus_id) 446 443 { 447 444 struct dasd_devmap *devmap, *tmp; 448 445 int hash; 449 446 450 - spin_lock(&dasd_devmap_lock); 451 447 devmap = ERR_PTR(-ENODEV); 452 448 hash = dasd_hash_busid(bus_id); 453 449 list_for_each_entry(tmp, &dasd_hashlists[hash], list) { ··· 452 456 break; 453 457 } 454 458 } 459 + return devmap; 460 + } 461 + 462 + /* 463 + * Find devmap for device with given bus_id. 
464 + */ 465 + static struct dasd_devmap * 466 + dasd_find_busid(const char *bus_id) 467 + { 468 + struct dasd_devmap *devmap; 469 + 470 + spin_lock(&dasd_devmap_lock); 471 + devmap = dasd_find_busid_locked(bus_id); 455 472 spin_unlock(&dasd_devmap_lock); 456 473 return devmap; 457 474 } ··· 594 585 } 595 586 596 587 /* 588 + * allocate a PPRC data structure and call the discipline function to fill 589 + */ 590 + static int dasd_devmap_get_pprc_status(struct dasd_device *device, 591 + struct dasd_pprc_data_sc4 **data) 592 + { 593 + struct dasd_pprc_data_sc4 *temp; 594 + 595 + if (!device->discipline || !device->discipline->pprc_status) { 596 + dev_warn(&device->cdev->dev, "Unable to query copy relation status\n"); 597 + return -EOPNOTSUPP; 598 + } 599 + temp = kzalloc(sizeof(*temp), GFP_KERNEL); 600 + if (!temp) 601 + return -ENOMEM; 602 + 603 + /* get PPRC information from storage */ 604 + if (device->discipline->pprc_status(device, temp)) { 605 + dev_warn(&device->cdev->dev, "Error during copy relation status query\n"); 606 + kfree(temp); 607 + return -EINVAL; 608 + } 609 + *data = temp; 610 + 611 + return 0; 612 + } 613 + 614 + /* 615 + * find an entry in a PPRC device_info array by a given UID 616 + * depending on the primary/secondary state of the device it has to be 617 + * matched with the respective fields 618 + */ 619 + static int dasd_devmap_entry_from_pprc_data(struct dasd_pprc_data_sc4 *data, 620 + struct dasd_uid uid, 621 + bool primary) 622 + { 623 + int i; 624 + 625 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 626 + if (primary) { 627 + if (data->dev_info[i].prim_cu_ssid == uid.ssid && 628 + data->dev_info[i].primary == uid.real_unit_addr) 629 + return i; 630 + } else { 631 + if (data->dev_info[i].sec_cu_ssid == uid.ssid && 632 + data->dev_info[i].secondary == uid.real_unit_addr) 633 + return i; 634 + } 635 + } 636 + return -1; 637 + } 638 + 639 + /* 640 + * check the consistency of a specified copy relation by checking 641 + * the following things: 642 
+ * 643 + * - is the given device part of a copy pair setup 644 + * - does the state of the device match the state in the PPRC status data 645 + * - does the device UID match with the UID in the PPRC status data 646 + * - to prevent misrouted IO check if the given device is present in all 647 + * related PPRC status data 648 + */ 649 + static int dasd_devmap_check_copy_relation(struct dasd_device *device, 650 + struct dasd_copy_entry *entry, 651 + struct dasd_pprc_data_sc4 *data, 652 + struct dasd_copy_relation *copy) 653 + { 654 + struct dasd_pprc_data_sc4 *tmp_dat; 655 + struct dasd_device *tmp_dev; 656 + struct dasd_uid uid; 657 + int i, j; 658 + 659 + if (!device->discipline || !device->discipline->get_uid || 660 + device->discipline->get_uid(device, &uid)) 661 + return 1; 662 + 663 + i = dasd_devmap_entry_from_pprc_data(data, uid, entry->primary); 664 + if (i < 0) { 665 + dev_warn(&device->cdev->dev, "Device not part of a copy relation\n"); 666 + return 1; 667 + } 668 + 669 + /* double check which role the current device has */ 670 + if (entry->primary) { 671 + if (data->dev_info[i].flags & 0x80) { 672 + dev_warn(&device->cdev->dev, "Copy pair secondary is setup as primary\n"); 673 + return 1; 674 + } 675 + if (data->dev_info[i].prim_cu_ssid != uid.ssid || 676 + data->dev_info[i].primary != uid.real_unit_addr) { 677 + dev_warn(&device->cdev->dev, 678 + "Primary device %s does not match copy pair status primary device %04x\n", 679 + dev_name(&device->cdev->dev), 680 + data->dev_info[i].prim_cu_ssid | 681 + data->dev_info[i].primary); 682 + return 1; 683 + } 684 + } else { 685 + if (!(data->dev_info[i].flags & 0x80)) { 686 + dev_warn(&device->cdev->dev, "Copy pair primary is setup as secondary\n"); 687 + return 1; 688 + } 689 + if (data->dev_info[i].sec_cu_ssid != uid.ssid || 690 + data->dev_info[i].secondary != uid.real_unit_addr) { 691 + dev_warn(&device->cdev->dev, 692 + "Secondary device %s does not match copy pair status secondary device %04x\n", 693 + 
dev_name(&device->cdev->dev), 694 + data->dev_info[i].sec_cu_ssid | 695 + data->dev_info[i].secondary); 696 + return 1; 697 + } 698 + } 699 + 700 + /* 701 + * the current device has to be part of the copy relation of all 702 + * entries to prevent misrouted IO to another copy pair 703 + */ 704 + for (j = 0; j < DASD_CP_ENTRIES; j++) { 705 + if (entry == &copy->entry[j]) 706 + tmp_dev = device; 707 + else 708 + tmp_dev = copy->entry[j].device; 709 + 710 + if (!tmp_dev) 711 + continue; 712 + 713 + if (dasd_devmap_get_pprc_status(tmp_dev, &tmp_dat)) 714 + return 1; 715 + 716 + if (dasd_devmap_entry_from_pprc_data(tmp_dat, uid, entry->primary) < 0) { 717 + dev_warn(&tmp_dev->cdev->dev, 718 + "Copy pair relation does not contain device: %s\n", 719 + dev_name(&device->cdev->dev)); 720 + kfree(tmp_dat); 721 + return 1; 722 + } 723 + kfree(tmp_dat); 724 + } 725 + return 0; 726 + } 727 + 728 + /* delete device from copy relation entry */ 729 + static void dasd_devmap_delete_copy_relation_device(struct dasd_device *device) 730 + { 731 + struct dasd_copy_relation *copy; 732 + int i; 733 + 734 + if (!device->copy) 735 + return; 736 + 737 + copy = device->copy; 738 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 739 + if (copy->entry[i].device == device) 740 + copy->entry[i].device = NULL; 741 + } 742 + dasd_put_device(device); 743 + device->copy = NULL; 744 + } 745 + 746 + /* 747 + * read all required information for a copy relation setup and setup the device 748 + * accordingly 749 + */ 750 + int dasd_devmap_set_device_copy_relation(struct ccw_device *cdev, 751 + bool pprc_enabled) 752 + { 753 + struct dasd_pprc_data_sc4 *data = NULL; 754 + struct dasd_copy_entry *entry = NULL; 755 + struct dasd_copy_relation *copy; 756 + struct dasd_devmap *devmap; 757 + struct dasd_device *device; 758 + int i, rc = 0; 759 + 760 + devmap = dasd_devmap_from_cdev(cdev); 761 + if (IS_ERR(devmap)) 762 + return PTR_ERR(devmap); 763 + 764 + device = devmap->device; 765 + if (!device) 766 + return 
-ENODEV; 767 + 768 + copy = devmap->copy; 769 + /* no copy pair setup for this device */ 770 + if (!copy) 771 + goto out; 772 + 773 + rc = dasd_devmap_get_pprc_status(device, &data); 774 + if (rc) 775 + return rc; 776 + 777 + /* print error if PPRC is requested but not enabled on storage server */ 778 + if (!pprc_enabled) { 779 + dev_err(&cdev->dev, "Copy relation not enabled on storage server\n"); 780 + rc = -EINVAL; 781 + goto out; 782 + } 783 + 784 + if (!data->dev_info[0].state) { 785 + dev_warn(&device->cdev->dev, "Copy pair setup requested for device not in copy relation\n"); 786 + rc = -EINVAL; 787 + goto out; 788 + } 789 + /* find entry */ 790 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 791 + if (copy->entry[i].configured && 792 + strncmp(dev_name(&cdev->dev), 793 + copy->entry[i].busid, DASD_BUS_ID_SIZE) == 0) { 794 + entry = &copy->entry[i]; 795 + break; 796 + } 797 + } 798 + if (!entry) { 799 + dev_warn(&device->cdev->dev, "Copy relation entry not found\n"); 800 + rc = -EINVAL; 801 + goto out; 802 + } 803 + /* check if the copy relation is valid */ 804 + if (dasd_devmap_check_copy_relation(device, entry, data, copy)) { 805 + dev_warn(&device->cdev->dev, "Copy relation faulty\n"); 806 + rc = -EINVAL; 807 + goto out; 808 + } 809 + 810 + dasd_get_device(device); 811 + copy->entry[i].device = device; 812 + device->copy = copy; 813 + out: 814 + kfree(data); 815 + return rc; 816 + } 817 + EXPORT_SYMBOL_GPL(dasd_devmap_set_device_copy_relation); 818 + 819 + /* 597 820 * Wait queue for dasd_delete_device waits. 598 821 */ 599 822 static DECLARE_WAIT_QUEUE_HEAD(dasd_delete_wq); ··· 858 617 dev_set_drvdata(&device->cdev->dev, NULL); 859 618 spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); 860 619 620 + /* Removve copy relation */ 621 + dasd_devmap_delete_copy_relation_device(device); 861 622 /* 862 623 * Drop ref_count by 3, one for the devmap reference, one for 863 624 * the cdev reference and one for the passed reference. 
··· 937 694 gdp->private_data = devmap; 938 695 spin_unlock(&dasd_devmap_lock); 939 696 } 697 + EXPORT_SYMBOL(dasd_add_link_to_gendisk); 940 698 941 699 struct dasd_device *dasd_device_from_gendisk(struct gendisk *gdp) 942 700 { ··· 1578 1334 const char *buf, size_t count) 1579 1335 { 1580 1336 struct dasd_device *device; 1581 - struct request_queue *q; 1582 1337 unsigned long val; 1583 1338 1584 1339 device = dasd_device_from_cdev(to_ccwdev(dev)); ··· 1589 1346 dasd_put_device(device); 1590 1347 return -EINVAL; 1591 1348 } 1592 - q = device->block->request_queue; 1593 - if (!q) { 1349 + if (!device->block->gdp) { 1594 1350 dasd_put_device(device); 1595 1351 return -ENODEV; 1596 1352 } 1597 1353 1598 1354 device->blk_timeout = val; 1599 - 1600 - blk_queue_rq_timeout(q, device->blk_timeout * HZ); 1355 + blk_queue_rq_timeout(device->block->gdp->queue, val * HZ); 1601 1356 1602 1357 dasd_put_device(device); 1603 1358 return count; ··· 1924 1683 static struct kobj_attribute path_fcs_attribute = 1925 1684 __ATTR(fc_security, 0444, dasd_path_fcs_show, NULL); 1926 1685 1686 + /* 1687 + * print copy relation in the form 1688 + * primary,secondary[1] primary,secondary[2], ... 
1689 + */ 1690 + static ssize_t 1691 + dasd_copy_pair_show(struct device *dev, 1692 + struct device_attribute *attr, char *buf) 1693 + { 1694 + char prim_busid[DASD_BUS_ID_SIZE]; 1695 + struct dasd_copy_relation *copy; 1696 + struct dasd_devmap *devmap; 1697 + int len = 0; 1698 + int i; 1699 + 1700 + devmap = dasd_find_busid(dev_name(dev)); 1701 + if (IS_ERR(devmap)) 1702 + return -ENODEV; 1703 + 1704 + if (!devmap->copy) 1705 + return -ENODEV; 1706 + 1707 + copy = devmap->copy; 1708 + /* find primary */ 1709 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 1710 + if (copy->entry[i].configured && copy->entry[i].primary) { 1711 + strscpy(prim_busid, copy->entry[i].busid, 1712 + DASD_BUS_ID_SIZE); 1713 + break; 1714 + } 1715 + } 1716 + if (!copy->entry[i].primary) 1717 + goto out; 1718 + 1719 + /* print all secondary */ 1720 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 1721 + if (copy->entry[i].configured && !copy->entry[i].primary) 1722 + len += sysfs_emit_at(buf, len, "%s,%s ", prim_busid, 1723 + copy->entry[i].busid); 1724 + } 1725 + 1726 + len += sysfs_emit_at(buf, len, "\n"); 1727 + out: 1728 + return len; 1729 + } 1730 + 1731 + static int dasd_devmap_set_copy_relation(struct dasd_devmap *devmap, 1732 + struct dasd_copy_relation *copy, 1733 + char *busid, bool primary) 1734 + { 1735 + int i; 1736 + 1737 + /* find free entry */ 1738 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 1739 + /* current bus_id already included, nothing to do */ 1740 + if (copy->entry[i].configured && 1741 + strncmp(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE) == 0) 1742 + return 0; 1743 + 1744 + if (!copy->entry[i].configured) 1745 + break; 1746 + } 1747 + if (i == DASD_CP_ENTRIES) 1748 + return -EINVAL; 1749 + 1750 + copy->entry[i].configured = true; 1751 + strscpy(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE); 1752 + if (primary) { 1753 + copy->active = &copy->entry[i]; 1754 + copy->entry[i].primary = true; 1755 + } 1756 + if (!devmap->copy) 1757 + devmap->copy = copy; 1758 + 1759 + return 0; 
1760 + } 1761 + 1762 + static void dasd_devmap_del_copy_relation(struct dasd_copy_relation *copy, 1763 + char *busid) 1764 + { 1765 + int i; 1766 + 1767 + spin_lock(&dasd_devmap_lock); 1768 + /* find entry */ 1769 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 1770 + if (copy->entry[i].configured && 1771 + strncmp(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE) == 0) 1772 + break; 1773 + } 1774 + if (i == DASD_CP_ENTRIES || !copy->entry[i].configured) { 1775 + spin_unlock(&dasd_devmap_lock); 1776 + return; 1777 + } 1778 + 1779 + copy->entry[i].configured = false; 1780 + memset(copy->entry[i].busid, 0, DASD_BUS_ID_SIZE); 1781 + if (copy->active == &copy->entry[i]) { 1782 + copy->active = NULL; 1783 + copy->entry[i].primary = false; 1784 + } 1785 + spin_unlock(&dasd_devmap_lock); 1786 + } 1787 + 1788 + static int dasd_devmap_clear_copy_relation(struct device *dev) 1789 + { 1790 + struct dasd_copy_relation *copy; 1791 + struct dasd_devmap *devmap; 1792 + int i, rc = 1; 1793 + 1794 + devmap = dasd_devmap_from_cdev(to_ccwdev(dev)); 1795 + if (IS_ERR(devmap)) 1796 + return 1; 1797 + 1798 + spin_lock(&dasd_devmap_lock); 1799 + if (!devmap->copy) 1800 + goto out; 1801 + 1802 + copy = devmap->copy; 1803 + /* first check if all secondary devices are offline*/ 1804 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 1805 + if (!copy->entry[i].configured) 1806 + continue; 1807 + 1808 + if (copy->entry[i].device == copy->active->device) 1809 + continue; 1810 + 1811 + if (copy->entry[i].device) 1812 + goto out; 1813 + } 1814 + /* clear all devmap entries */ 1815 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 1816 + if (strlen(copy->entry[i].busid) == 0) 1817 + continue; 1818 + if (copy->entry[i].device) { 1819 + dasd_put_device(copy->entry[i].device); 1820 + copy->entry[i].device->copy = NULL; 1821 + copy->entry[i].device = NULL; 1822 + } 1823 + devmap = dasd_find_busid_locked(copy->entry[i].busid); 1824 + devmap->copy = NULL; 1825 + memset(copy->entry[i].busid, 0, DASD_BUS_ID_SIZE); 1826 + } 1827 + 
kfree(copy); 1828 + rc = 0; 1829 + out: 1830 + spin_unlock(&dasd_devmap_lock); 1831 + return rc; 1832 + } 1833 + 1834 + /* 1835 + * parse BUSIDs from a copy pair 1836 + */ 1837 + static int dasd_devmap_parse_busid(const char *buf, char *prim_busid, 1838 + char *sec_busid) 1839 + { 1840 + char *primary, *secondary, *tmp, *pt; 1841 + int id0, id1, id2; 1842 + 1843 + pt = kstrdup(buf, GFP_KERNEL); 1844 + tmp = pt; 1845 + if (!tmp) 1846 + return -ENOMEM; 1847 + 1848 + primary = strsep(&tmp, ","); 1849 + if (!primary) { 1850 + kfree(pt); 1851 + return -EINVAL; 1852 + } 1853 + secondary = strsep(&tmp, ","); 1854 + if (!secondary) { 1855 + kfree(pt); 1856 + return -EINVAL; 1857 + } 1858 + if (dasd_busid(primary, &id0, &id1, &id2)) { 1859 + kfree(pt); 1860 + return -EINVAL; 1861 + } 1862 + sprintf(prim_busid, "%01x.%01x.%04x", id0, id1, id2); 1863 + if (dasd_busid(secondary, &id0, &id1, &id2)) { 1864 + kfree(pt); 1865 + return -EINVAL; 1866 + } 1867 + sprintf(sec_busid, "%01x.%01x.%04x", id0, id1, id2); 1868 + kfree(pt); 1869 + 1870 + return 0; 1871 + } 1872 + 1873 + static ssize_t dasd_copy_pair_store(struct device *dev, 1874 + struct device_attribute *attr, 1875 + const char *buf, size_t count) 1876 + { 1877 + struct dasd_devmap *prim_devmap, *sec_devmap; 1878 + char prim_busid[DASD_BUS_ID_SIZE]; 1879 + char sec_busid[DASD_BUS_ID_SIZE]; 1880 + struct dasd_copy_relation *copy; 1881 + struct dasd_device *device; 1882 + bool pprc_enabled; 1883 + int rc; 1884 + 1885 + if (strncmp(buf, "clear", strlen("clear")) == 0) { 1886 + if (dasd_devmap_clear_copy_relation(dev)) 1887 + return -EINVAL; 1888 + return count; 1889 + } 1890 + 1891 + rc = dasd_devmap_parse_busid(buf, prim_busid, sec_busid); 1892 + if (rc) 1893 + return rc; 1894 + 1895 + if (strncmp(dev_name(dev), prim_busid, DASD_BUS_ID_SIZE) != 0 && 1896 + strncmp(dev_name(dev), sec_busid, DASD_BUS_ID_SIZE) != 0) 1897 + return -EINVAL; 1898 + 1899 + /* allocate primary devmap if needed */ 1900 + prim_devmap = 
dasd_find_busid(prim_busid); 1901 + if (IS_ERR(prim_devmap)) 1902 + prim_devmap = dasd_add_busid(prim_busid, DASD_FEATURE_DEFAULT); 1903 + 1904 + /* allocate secondary devmap if needed */ 1905 + sec_devmap = dasd_find_busid(sec_busid); 1906 + if (IS_ERR(sec_devmap)) 1907 + sec_devmap = dasd_add_busid(sec_busid, DASD_FEATURE_DEFAULT); 1908 + 1909 + /* setting copy relation is only allowed for offline secondary */ 1910 + if (sec_devmap->device) 1911 + return -EINVAL; 1912 + 1913 + if (prim_devmap->copy) { 1914 + copy = prim_devmap->copy; 1915 + } else if (sec_devmap->copy) { 1916 + copy = sec_devmap->copy; 1917 + } else { 1918 + copy = kzalloc(sizeof(*copy), GFP_KERNEL); 1919 + if (!copy) 1920 + return -ENOMEM; 1921 + } 1922 + spin_lock(&dasd_devmap_lock); 1923 + rc = dasd_devmap_set_copy_relation(prim_devmap, copy, prim_busid, true); 1924 + if (rc) { 1925 + spin_unlock(&dasd_devmap_lock); 1926 + return rc; 1927 + } 1928 + rc = dasd_devmap_set_copy_relation(sec_devmap, copy, sec_busid, false); 1929 + if (rc) { 1930 + spin_unlock(&dasd_devmap_lock); 1931 + return rc; 1932 + } 1933 + spin_unlock(&dasd_devmap_lock); 1934 + 1935 + /* if primary device is already online call device setup directly */ 1936 + if (prim_devmap->device && !prim_devmap->device->copy) { 1937 + device = prim_devmap->device; 1938 + if (device->discipline->pprc_enabled) { 1939 + pprc_enabled = device->discipline->pprc_enabled(device); 1940 + rc = dasd_devmap_set_device_copy_relation(device->cdev, 1941 + pprc_enabled); 1942 + } else { 1943 + rc = -EOPNOTSUPP; 1944 + } 1945 + } 1946 + if (rc) { 1947 + dasd_devmap_del_copy_relation(copy, prim_busid); 1948 + dasd_devmap_del_copy_relation(copy, sec_busid); 1949 + count = rc; 1950 + } 1951 + 1952 + return count; 1953 + } 1954 + static DEVICE_ATTR(copy_pair, 0644, dasd_copy_pair_show, 1955 + dasd_copy_pair_store); 1956 + 1957 + static ssize_t 1958 + dasd_copy_role_show(struct device *dev, 1959 + struct device_attribute *attr, char *buf) 1960 + { 1961 + 
struct dasd_copy_relation *copy; 1962 + struct dasd_device *device; 1963 + int len, i; 1964 + 1965 + device = dasd_device_from_cdev(to_ccwdev(dev)); 1966 + if (IS_ERR(device)) 1967 + return -ENODEV; 1968 + 1969 + if (!device->copy) { 1970 + len = sysfs_emit(buf, "none\n"); 1971 + goto out; 1972 + } 1973 + copy = device->copy; 1974 + /* only the active device is primary */ 1975 + if (copy->active->device == device) { 1976 + len = sysfs_emit(buf, "primary\n"); 1977 + goto out; 1978 + } 1979 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 1980 + if (copy->entry[i].device == device) { 1981 + len = sysfs_emit(buf, "secondary\n"); 1982 + goto out; 1983 + } 1984 + } 1985 + /* not in the list, no COPY role */ 1986 + len = sysfs_emit(buf, "none\n"); 1987 + out: 1988 + dasd_put_device(device); 1989 + return len; 1990 + } 1991 + static DEVICE_ATTR(copy_role, 0444, dasd_copy_role_show, NULL); 1992 + 1993 + static ssize_t dasd_device_ping(struct device *dev, 1994 + struct device_attribute *attr, 1995 + const char *buf, size_t count) 1996 + { 1997 + struct dasd_device *device; 1998 + size_t rc; 1999 + 2000 + device = dasd_device_from_cdev(to_ccwdev(dev)); 2001 + if (IS_ERR(device)) 2002 + return -ENODEV; 2003 + 2004 + /* 2005 + * do not try during offline processing 2006 + * early check only 2007 + * the sleep_on function itself checks for offline 2008 + * processing again 2009 + */ 2010 + if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) { 2011 + rc = -EBUSY; 2012 + goto out; 2013 + } 2014 + if (!device->discipline || !device->discipline->device_ping) { 2015 + rc = -EOPNOTSUPP; 2016 + goto out; 2017 + } 2018 + rc = device->discipline->device_ping(device); 2019 + if (!rc) 2020 + rc = count; 2021 + out: 2022 + dasd_put_device(device); 2023 + return rc; 2024 + } 2025 + static DEVICE_ATTR(ping, 0200, NULL, dasd_device_ping); 2026 + 1927 2027 #define DASD_DEFINE_ATTR(_name, _func) \ 1928 2028 static ssize_t dasd_##_name##_show(struct device *dev, \ 1929 2029 struct device_attribute *attr, 
\ ··· 2321 1739 &dev_attr_hpf.attr, 2322 1740 &dev_attr_ese.attr, 2323 1741 &dev_attr_fc_security.attr, 1742 + &dev_attr_copy_pair.attr, 1743 + &dev_attr_copy_role.attr, 1744 + &dev_attr_ping.attr, 2324 1745 NULL, 2325 1746 }; 2326 1747
+1 -1
drivers/s390/block/dasd_diag.c
··· 627 627 static void dasd_diag_setup_blk_queue(struct dasd_block *block) 628 628 { 629 629 unsigned int logical_block_size = block->bp_block; 630 - struct request_queue *q = block->request_queue; 630 + struct request_queue *q = block->gdp->queue; 631 631 int max; 632 632 633 633 max = DIAG_MAX_BLOCKS << block->s2b_shift;
+270 -24
drivers/s390/block/dasd_eckd.c
··· 2013 2013 } 2014 2014 2015 2015 /* 2016 + * return if the device is the copy relation primary if a copy relation is active 2017 + */ 2018 + static int dasd_device_is_primary(struct dasd_device *device) 2019 + { 2020 + if (!device->copy) 2021 + return 1; 2022 + 2023 + if (device->copy->active->device == device) 2024 + return 1; 2025 + 2026 + return 0; 2027 + } 2028 + 2029 + static int dasd_eckd_alloc_block(struct dasd_device *device) 2030 + { 2031 + struct dasd_block *block; 2032 + struct dasd_uid temp_uid; 2033 + 2034 + if (!dasd_device_is_primary(device)) 2035 + return 0; 2036 + 2037 + dasd_eckd_get_uid(device, &temp_uid); 2038 + if (temp_uid.type == UA_BASE_DEVICE) { 2039 + block = dasd_alloc_block(); 2040 + if (IS_ERR(block)) { 2041 + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", 2042 + "could not allocate dasd block structure"); 2043 + return PTR_ERR(block); 2044 + } 2045 + device->block = block; 2046 + block->base = device; 2047 + } 2048 + return 0; 2049 + } 2050 + 2051 + static bool dasd_eckd_pprc_enabled(struct dasd_device *device) 2052 + { 2053 + struct dasd_eckd_private *private = device->private; 2054 + 2055 + return private->rdc_data.facilities.PPRC_enabled; 2056 + } 2057 + 2058 + /* 2016 2059 * Check device characteristics. 2017 2060 * If the device is accessible using ECKD discipline, the device is enabled. 
2018 2061 */ ··· 2063 2020 dasd_eckd_check_characteristics(struct dasd_device *device) 2064 2021 { 2065 2022 struct dasd_eckd_private *private = device->private; 2066 - struct dasd_block *block; 2067 - struct dasd_uid temp_uid; 2068 2023 int rc, i; 2069 2024 int readonly; 2070 2025 unsigned long value; ··· 2120 2079 device->default_expires = value; 2121 2080 } 2122 2081 2123 - dasd_eckd_get_uid(device, &temp_uid); 2124 - if (temp_uid.type == UA_BASE_DEVICE) { 2125 - block = dasd_alloc_block(); 2126 - if (IS_ERR(block)) { 2127 - DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", 2128 - "could not allocate dasd " 2129 - "block structure"); 2130 - rc = PTR_ERR(block); 2131 - goto out_err1; 2132 - } 2133 - device->block = block; 2134 - block->base = device; 2082 + /* Read Device Characteristics */ 2083 + rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC, 2084 + &private->rdc_data, 64); 2085 + if (rc) { 2086 + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, 2087 + "Read device characteristic failed, rc=%d", rc); 2088 + goto out_err1; 2135 2089 } 2090 + 2091 + /* setup PPRC for device from devmap */ 2092 + rc = dasd_devmap_set_device_copy_relation(device->cdev, 2093 + dasd_eckd_pprc_enabled(device)); 2094 + if (rc) { 2095 + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, 2096 + "copy relation setup failed, rc=%d", rc); 2097 + goto out_err1; 2098 + } 2099 + 2100 + /* check if block device is needed and allocate in case */ 2101 + rc = dasd_eckd_alloc_block(device); 2102 + if (rc) 2103 + goto out_err1; 2136 2104 2137 2105 /* register lcu with alias handling, enable PAV */ 2138 2106 rc = dasd_alias_make_device_known_to_lcu(device); ··· 2166 2116 2167 2117 /* Read Extent Pool Information */ 2168 2118 dasd_eckd_read_ext_pool_info(device); 2169 - 2170 - /* Read Device Characteristics */ 2171 - rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC, 2172 - &private->rdc_data, 64); 2173 - if (rc) { 2174 - DBF_EVENT_DEVID(DBF_WARNING, device->cdev, 2175 - "Read device characteristic 
failed, rc=%d", rc); 2176 - goto out_err3; 2177 - } 2178 2119 2179 2120 if ((device->features & DASD_FEATURE_USERAW) && 2180 2121 !(private->rdc_data.facilities.RT_in_LR)) { ··· 6119 6078 return 0; 6120 6079 } 6121 6080 6081 + static struct dasd_device 6082 + *copy_relation_find_device(struct dasd_copy_relation *copy, 6083 + char *busid) 6084 + { 6085 + int i; 6086 + 6087 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 6088 + if (copy->entry[i].configured && 6089 + strncmp(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE) == 0) 6090 + return copy->entry[i].device; 6091 + } 6092 + return NULL; 6093 + } 6094 + 6095 + /* 6096 + * set the new active/primary device 6097 + */ 6098 + static void copy_pair_set_active(struct dasd_copy_relation *copy, char *new_busid, 6099 + char *old_busid) 6100 + { 6101 + int i; 6102 + 6103 + for (i = 0; i < DASD_CP_ENTRIES; i++) { 6104 + if (copy->entry[i].configured && 6105 + strncmp(copy->entry[i].busid, new_busid, 6106 + DASD_BUS_ID_SIZE) == 0) { 6107 + copy->active = &copy->entry[i]; 6108 + copy->entry[i].primary = true; 6109 + } else if (copy->entry[i].configured && 6110 + strncmp(copy->entry[i].busid, old_busid, 6111 + DASD_BUS_ID_SIZE) == 0) { 6112 + copy->entry[i].primary = false; 6113 + } 6114 + } 6115 + } 6116 + 6117 + /* 6118 + * The function will swap the role of a given copy pair. 6119 + * During the swap operation the relation of the blockdevice is disconnected 6120 + * from the old primary and connected to the new. 6121 + * 6122 + * IO is paused on the block queue before swap and may be resumed afterwards. 
6123 + */ 6124 + static int dasd_eckd_copy_pair_swap(struct dasd_device *device, char *prim_busid, 6125 + char *sec_busid) 6126 + { 6127 + struct dasd_device *primary, *secondary; 6128 + struct dasd_copy_relation *copy; 6129 + struct dasd_block *block; 6130 + struct gendisk *gdp; 6131 + 6132 + copy = device->copy; 6133 + if (!copy) 6134 + return DASD_COPYPAIRSWAP_INVALID; 6135 + primary = copy->active->device; 6136 + if (!primary) 6137 + return DASD_COPYPAIRSWAP_INVALID; 6138 + /* double check if swap has correct primary */ 6139 + if (strncmp(dev_name(&primary->cdev->dev), prim_busid, DASD_BUS_ID_SIZE) != 0) 6140 + return DASD_COPYPAIRSWAP_PRIMARY; 6141 + 6142 + secondary = copy_relation_find_device(copy, sec_busid); 6143 + if (!secondary) 6144 + return DASD_COPYPAIRSWAP_SECONDARY; 6145 + 6146 + /* 6147 + * usually the device should be quiesced for swap 6148 + * for paranoia stop device and requeue requests again 6149 + */ 6150 + dasd_device_set_stop_bits(primary, DASD_STOPPED_PPRC); 6151 + dasd_device_set_stop_bits(secondary, DASD_STOPPED_PPRC); 6152 + dasd_generic_requeue_all_requests(primary); 6153 + 6154 + /* swap DASD internal device <> block assignment */ 6155 + block = primary->block; 6156 + primary->block = NULL; 6157 + secondary->block = block; 6158 + block->base = secondary; 6159 + /* set new primary device in COPY relation */ 6160 + copy_pair_set_active(copy, sec_busid, prim_busid); 6161 + 6162 + /* swap blocklayer device link */ 6163 + gdp = block->gdp; 6164 + dasd_add_link_to_gendisk(gdp, secondary); 6165 + 6166 + /* re-enable device */ 6167 + dasd_device_remove_stop_bits(primary, DASD_STOPPED_PPRC); 6168 + dasd_device_remove_stop_bits(secondary, DASD_STOPPED_PPRC); 6169 + dasd_schedule_device_bh(secondary); 6170 + 6171 + return DASD_COPYPAIRSWAP_SUCCESS; 6172 + } 6173 + 6174 + /* 6175 + * Perform Subsystem Function - Peer-to-Peer Remote Copy Extended Query 6176 + */ 6177 + static int dasd_eckd_query_pprc_status(struct dasd_device *device, 6178 + 
struct dasd_pprc_data_sc4 *data) 6179 + { 6180 + struct dasd_pprc_data_sc4 *pprc_data; 6181 + struct dasd_psf_prssd_data *prssdp; 6182 + struct dasd_ccw_req *cqr; 6183 + struct ccw1 *ccw; 6184 + int rc; 6185 + 6186 + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, 6187 + sizeof(*prssdp) + sizeof(*pprc_data) + 1, 6188 + device, NULL); 6189 + if (IS_ERR(cqr)) { 6190 + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", 6191 + "Could not allocate query PPRC status request"); 6192 + return PTR_ERR(cqr); 6193 + } 6194 + cqr->startdev = device; 6195 + cqr->memdev = device; 6196 + cqr->block = NULL; 6197 + cqr->retries = 256; 6198 + cqr->expires = 10 * HZ; 6199 + 6200 + /* Prepare for Read Subsystem Data */ 6201 + prssdp = (struct dasd_psf_prssd_data *)cqr->data; 6202 + memset(prssdp, 0, sizeof(struct dasd_psf_prssd_data)); 6203 + prssdp->order = PSF_ORDER_PRSSD; 6204 + prssdp->suborder = PSF_SUBORDER_PPRCEQ; 6205 + prssdp->varies[0] = PPRCEQ_SCOPE_4; 6206 + pprc_data = (struct dasd_pprc_data_sc4 *)(prssdp + 1); 6207 + 6208 + ccw = cqr->cpaddr; 6209 + ccw->cmd_code = DASD_ECKD_CCW_PSF; 6210 + ccw->count = sizeof(struct dasd_psf_prssd_data); 6211 + ccw->flags |= CCW_FLAG_CC; 6212 + ccw->flags |= CCW_FLAG_SLI; 6213 + ccw->cda = (__u32)(addr_t)prssdp; 6214 + 6215 + /* Read Subsystem Data - query host access */ 6216 + ccw++; 6217 + ccw->cmd_code = DASD_ECKD_CCW_RSSD; 6218 + ccw->count = sizeof(*pprc_data); 6219 + ccw->flags |= CCW_FLAG_SLI; 6220 + ccw->cda = (__u32)(addr_t)pprc_data; 6221 + 6222 + cqr->buildclk = get_tod_clock(); 6223 + cqr->status = DASD_CQR_FILLED; 6224 + 6225 + rc = dasd_sleep_on_interruptible(cqr); 6226 + if (rc == 0) { 6227 + *data = *pprc_data; 6228 + } else { 6229 + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, 6230 + "PPRC Extended Query failed with rc=%d\n", 6231 + rc); 6232 + rc = -EOPNOTSUPP; 6233 + } 6234 + 6235 + dasd_sfree_request(cqr, cqr->memdev); 6236 + return rc; 6237 + } 6238 + 6239 + /* 6240 + * ECKD NOP - no operation 
6241 + */ 6242 + static int dasd_eckd_nop(struct dasd_device *device) 6243 + { 6244 + struct dasd_ccw_req *cqr; 6245 + struct ccw1 *ccw; 6246 + int rc; 6247 + 6248 + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 1, device, NULL); 6249 + if (IS_ERR(cqr)) { 6250 + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", 6251 + "Could not allocate NOP request"); 6252 + return PTR_ERR(cqr); 6253 + } 6254 + cqr->startdev = device; 6255 + cqr->memdev = device; 6256 + cqr->block = NULL; 6257 + cqr->retries = 1; 6258 + cqr->expires = 10 * HZ; 6259 + 6260 + ccw = cqr->cpaddr; 6261 + ccw->cmd_code = DASD_ECKD_CCW_NOP; 6262 + ccw->flags |= CCW_FLAG_SLI; 6263 + 6264 + cqr->buildclk = get_tod_clock(); 6265 + cqr->status = DASD_CQR_FILLED; 6266 + 6267 + rc = dasd_sleep_on_interruptible(cqr); 6268 + if (rc != 0) { 6269 + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, 6270 + "NOP failed with rc=%d\n", rc); 6271 + rc = -EOPNOTSUPP; 6272 + } 6273 + dasd_sfree_request(cqr, cqr->memdev); 6274 + return rc; 6275 + } 6276 + 6277 + static int dasd_eckd_device_ping(struct dasd_device *device) 6278 + { 6279 + return dasd_eckd_nop(device); 6280 + } 6281 + 6122 6282 /* 6123 6283 * Perform Subsystem Function - CUIR response 6124 6284 */ ··· 6844 6602 static void dasd_eckd_setup_blk_queue(struct dasd_block *block) 6845 6603 { 6846 6604 unsigned int logical_block_size = block->bp_block; 6847 - struct request_queue *q = block->request_queue; 6605 + struct request_queue *q = block->gdp->queue; 6848 6606 struct dasd_device *device = block->base; 6849 6607 int max; 6850 6608 ··· 6939 6697 .ext_pool_exhaust = dasd_eckd_ext_pool_exhaust, 6940 6698 .ese_format = dasd_eckd_ese_format, 6941 6699 .ese_read = dasd_eckd_ese_read, 6700 + .pprc_status = dasd_eckd_query_pprc_status, 6701 + .pprc_enabled = dasd_eckd_pprc_enabled, 6702 + .copy_pair_swap = dasd_eckd_copy_pair_swap, 6703 + .device_ping = dasd_eckd_device_ping, 6942 6704 }; 6943 6705 6944 6706 static int __init
+8 -1
drivers/s390/block/dasd_eckd.h
··· 13 13 /***************************************************************************** 14 14 * SECTION: CCW Definitions 15 15 ****************************************************************************/ 16 + #define DASD_ECKD_CCW_NOP 0x03 16 17 #define DASD_ECKD_CCW_WRITE 0x05 17 18 #define DASD_ECKD_CCW_READ 0x06 18 19 #define DASD_ECKD_CCW_WRITE_HOME_ADDRESS 0x09 ··· 67 66 * Perform Subsystem Function / Sub-Orders 68 67 */ 69 68 #define PSF_SUBORDER_QHA 0x1C /* Query Host Access */ 69 + #define PSF_SUBORDER_PPRCEQ 0x50 /* PPRC Extended Query */ 70 70 #define PSF_SUBORDER_VSQ 0x52 /* Volume Storage Query */ 71 71 #define PSF_SUBORDER_LCQ 0x53 /* Logical Configuration Query */ 72 + 73 + /* 74 + * PPRC Extended Query Scopes 75 + */ 76 + #define PPRCEQ_SCOPE_4 0x04 /* Scope 4 for PPRC Extended Query */ 72 77 73 78 /* 74 79 * CUIR response condition codes ··· 268 261 unsigned char reserved3:8; 269 262 unsigned char defect_wr:1; 270 263 unsigned char XRC_supported:1; 271 - unsigned char reserved4:1; 264 + unsigned char PPRC_enabled:1; 272 265 unsigned char striping:1; 273 266 unsigned char reserved5:4; 274 267 unsigned char cfw:1;
+1 -1
drivers/s390/block/dasd_fba.c
··· 767 767 static void dasd_fba_setup_blk_queue(struct dasd_block *block) 768 768 { 769 769 unsigned int logical_block_size = block->bp_block; 770 - struct request_queue *q = block->request_queue; 770 + struct request_queue *q = block->gdp->queue; 771 771 unsigned int max_bytes, max_discard_sectors; 772 772 int max; 773 773
+24 -5
drivers/s390/block/dasd_genhd.c
··· 25 25 26 26 #include "dasd_int.h" 27 27 28 - static struct lock_class_key dasd_bio_compl_lkclass; 28 + static unsigned int queue_depth = 32; 29 + static unsigned int nr_hw_queues = 4; 30 + 31 + module_param(queue_depth, uint, 0444); 32 + MODULE_PARM_DESC(queue_depth, "Default queue depth for new DASD devices"); 33 + 34 + module_param(nr_hw_queues, uint, 0444); 35 + MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD devices"); 29 36 30 37 /* 31 38 * Allocate and register gendisk structure for device. ··· 48 41 if (base->devindex >= DASD_PER_MAJOR) 49 42 return -EBUSY; 50 43 51 - gdp = blk_mq_alloc_disk_for_queue(block->request_queue, 52 - &dasd_bio_compl_lkclass); 53 - if (!gdp) 54 - return -ENOMEM; 44 + block->tag_set.ops = &dasd_mq_ops; 45 + block->tag_set.cmd_size = sizeof(struct dasd_ccw_req); 46 + block->tag_set.nr_hw_queues = nr_hw_queues; 47 + block->tag_set.queue_depth = queue_depth; 48 + block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 49 + block->tag_set.numa_node = NUMA_NO_NODE; 50 + rc = blk_mq_alloc_tag_set(&block->tag_set); 51 + if (rc) 52 + return rc; 53 + 54 + gdp = blk_mq_alloc_disk(&block->tag_set, block); 55 + if (IS_ERR(gdp)) { 56 + blk_mq_free_tag_set(&block->tag_set); 57 + return PTR_ERR(gdp); 58 + } 55 59 56 60 /* Initialize gendisk structure. */ 57 61 gdp->major = DASD_MAJOR; ··· 118 100 block->gdp->private_data = NULL; 119 101 put_disk(block->gdp); 120 102 block->gdp = NULL; 103 + blk_mq_free_tag_set(&block->tag_set); 121 104 } 122 105 } 123 106
+74 -1
drivers/s390/block/dasd_int.h
··· 260 260 }; 261 261 262 262 /* 263 + * PPRC Status data 264 + */ 265 + struct dasd_pprc_header { 266 + __u8 entries; /* 0 Number of device entries */ 267 + __u8 unused; /* 1 unused */ 268 + __u16 entry_length; /* 2-3 Length of device entry */ 269 + __u32 unused2; /* 4-7 unused */ 270 + } __packed; 271 + 272 + struct dasd_pprc_dev_info { 273 + __u8 state; /* 0 Copy State */ 274 + __u8 flags; /* 1 Flags */ 275 + __u8 reserved1[2]; /* 2-3 reserved */ 276 + __u8 prim_lss; /* 4 Primary device LSS */ 277 + __u8 primary; /* 5 Primary device address */ 278 + __u8 sec_lss; /* 6 Secondary device LSS */ 279 + __u8 secondary; /* 7 Secondary device address */ 280 + __u16 pprc_id; /* 8-9 Peer-to-Peer Remote Copy ID */ 281 + __u8 reserved2[12]; /* 10-21 reserved */ 282 + __u16 prim_cu_ssid; /* 22-23 Pimary Control Unit SSID */ 283 + __u8 reserved3[12]; /* 24-35 reserved */ 284 + __u16 sec_cu_ssid; /* 36-37 Secondary Control Unit SSID */ 285 + __u8 reserved4[90]; /* 38-127 reserved */ 286 + } __packed; 287 + 288 + struct dasd_pprc_data_sc4 { 289 + struct dasd_pprc_header header; 290 + struct dasd_pprc_dev_info dev_info[5]; 291 + } __packed; 292 + 293 + #define DASD_BUS_ID_SIZE 20 294 + #define DASD_CP_ENTRIES 5 295 + 296 + struct dasd_copy_entry { 297 + char busid[DASD_BUS_ID_SIZE]; 298 + struct dasd_device *device; 299 + bool primary; 300 + bool configured; 301 + }; 302 + 303 + struct dasd_copy_relation { 304 + struct dasd_copy_entry entry[DASD_CP_ENTRIES]; 305 + struct dasd_copy_entry *active; 306 + }; 307 + 308 + int dasd_devmap_set_device_copy_relation(struct ccw_device *, 309 + bool pprc_enabled); 310 + 311 + /* 263 312 * the struct dasd_discipline is 264 313 * sth like a table of virtual functions, if you think of dasd_eckd 265 314 * inheriting dasd... 
··· 436 387 struct dasd_ccw_req *(*ese_format)(struct dasd_device *, 437 388 struct dasd_ccw_req *, struct irb *); 438 389 int (*ese_read)(struct dasd_ccw_req *, struct irb *); 390 + int (*pprc_status)(struct dasd_device *, struct dasd_pprc_data_sc4 *); 391 + bool (*pprc_enabled)(struct dasd_device *); 392 + int (*copy_pair_swap)(struct dasd_device *, char *, char *); 393 + int (*device_ping)(struct dasd_device *); 439 394 }; 440 395 441 396 extern struct dasd_discipline *dasd_diag_discipline_pointer; ··· 636 583 struct dasd_profile profile; 637 584 struct dasd_format_entry format_entry; 638 585 struct kset *paths_info; 586 + struct dasd_copy_relation *copy; 639 587 }; 640 588 641 589 struct dasd_block { 642 590 /* Block device stuff. */ 643 591 struct gendisk *gdp; 644 - struct request_queue *request_queue; 645 592 spinlock_t request_queue_lock; 646 593 struct blk_mq_tag_set tag_set; 647 594 struct block_device *bdev; ··· 682 629 #define DASD_STOPPED_PENDING 4 /* long busy */ 683 630 #define DASD_STOPPED_DC_WAIT 8 /* disconnected, wait */ 684 631 #define DASD_STOPPED_SU 16 /* summary unit check handling */ 632 + #define DASD_STOPPED_PPRC 32 /* PPRC swap */ 685 633 #define DASD_STOPPED_NOSPC 128 /* no space left */ 686 634 687 635 /* per device flags */ ··· 706 652 #define DASD_SLEEPON_END_TAG ((void *) 2) 707 653 708 654 void dasd_put_device_wake(struct dasd_device *); 655 + 656 + /* 657 + * return values to be returned from the copy pair swap function 658 + * 0x00: swap successful 659 + * 0x01: swap data invalid 660 + * 0x02: no active device found 661 + * 0x03: wrong primary specified 662 + * 0x04: secondary device not found 663 + * 0x05: swap already running 664 + */ 665 + #define DASD_COPYPAIRSWAP_SUCCESS 0 666 + #define DASD_COPYPAIRSWAP_INVALID 1 667 + #define DASD_COPYPAIRSWAP_NOACTIVE 2 668 + #define DASD_COPYPAIRSWAP_PRIMARY 3 669 + #define DASD_COPYPAIRSWAP_SECONDARY 4 670 + #define DASD_COPYPAIRSWAP_MULTIPLE 5 709 671 710 672 /* 711 673 * Reference 
count inliners ··· 849 779 extern struct dasd_profile dasd_global_profile; 850 780 extern unsigned int dasd_global_profile_level; 851 781 extern const struct block_device_operations dasd_device_operations; 782 + extern struct blk_mq_ops dasd_mq_ops; 852 783 853 784 extern struct kmem_cache *dasd_page_cache; 854 785 ··· 907 836 int dasd_generic_verify_path(struct dasd_device *, __u8); 908 837 void dasd_generic_space_exhaust(struct dasd_device *, struct dasd_ccw_req *); 909 838 void dasd_generic_space_avail(struct dasd_device *); 839 + 840 + int dasd_generic_requeue_all_requests(struct dasd_device *); 910 841 911 842 int dasd_generic_read_dev_chars(struct dasd_device *, int, void *, int); 912 843 char *dasd_get_sense(struct irb *);
+53
drivers/s390/block/dasd_ioctl.c
··· 379 379 return rc; 380 380 } 381 381 382 + /* 383 + * Swap driver iternal copy relation. 384 + */ 385 + static int 386 + dasd_ioctl_copy_pair_swap(struct block_device *bdev, void __user *argp) 387 + { 388 + struct dasd_copypair_swap_data_t data; 389 + struct dasd_device *device; 390 + int rc; 391 + 392 + if (!capable(CAP_SYS_ADMIN)) 393 + return -EACCES; 394 + 395 + device = dasd_device_from_gendisk(bdev->bd_disk); 396 + if (!device) 397 + return -ENODEV; 398 + 399 + if (copy_from_user(&data, argp, sizeof(struct dasd_copypair_swap_data_t))) { 400 + dasd_put_device(device); 401 + return -EFAULT; 402 + } 403 + if (memchr_inv(data.reserved, 0, sizeof(data.reserved))) { 404 + pr_warn("%s: Ivalid swap data specified.\n", 405 + dev_name(&device->cdev->dev)); 406 + dasd_put_device(device); 407 + return DASD_COPYPAIRSWAP_INVALID; 408 + } 409 + if (bdev_is_partition(bdev)) { 410 + pr_warn("%s: The specified DASD is a partition and cannot be swapped\n", 411 + dev_name(&device->cdev->dev)); 412 + dasd_put_device(device); 413 + return DASD_COPYPAIRSWAP_INVALID; 414 + } 415 + if (!device->copy) { 416 + pr_warn("%s: The specified DASD has no copy pair set up\n", 417 + dev_name(&device->cdev->dev)); 418 + dasd_put_device(device); 419 + return -ENODEV; 420 + } 421 + if (!device->discipline->copy_pair_swap) { 422 + dasd_put_device(device); 423 + return -EOPNOTSUPP; 424 + } 425 + rc = device->discipline->copy_pair_swap(device, data.primary, 426 + data.secondary); 427 + dasd_put_device(device); 428 + 429 + return rc; 430 + } 431 + 382 432 #ifdef CONFIG_DASD_PROFILE 383 433 /* 384 434 * Reset device profile information ··· 686 636 break; 687 637 case BIODASDRAS: 688 638 rc = dasd_ioctl_release_space(bdev, argp); 639 + break; 640 + case BIODASDCOPYPAIRSWAP: 641 + rc = dasd_ioctl_copy_pair_swap(bdev, argp); 689 642 break; 690 643 default: 691 644 /* if the discipline has an ioctl method try it. */
+1 -4
drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
··· 3537 3537 3538 3538 ATTRIBUTE_GROUPS(host_v2_hw); 3539 3539 3540 - static int map_queues_v2_hw(struct Scsi_Host *shost) 3540 + static void map_queues_v2_hw(struct Scsi_Host *shost) 3541 3541 { 3542 3542 struct hisi_hba *hisi_hba = shost_priv(shost); 3543 3543 struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; ··· 3552 3552 for_each_cpu(cpu, mask) 3553 3553 qmap->mq_map[cpu] = qmap->queue_offset + queue; 3554 3554 } 3555 - 3556 - return 0; 3557 - 3558 3555 } 3559 3556 3560 3557 static struct scsi_host_template sht_v2_hw = {
+2 -3
drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
··· 3171 3171 return 0; 3172 3172 } 3173 3173 3174 - static int hisi_sas_map_queues(struct Scsi_Host *shost) 3174 + static void hisi_sas_map_queues(struct Scsi_Host *shost) 3175 3175 { 3176 3176 struct hisi_hba *hisi_hba = shost_priv(shost); 3177 3177 struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; 3178 3178 3179 - return blk_mq_pci_map_queues(qmap, hisi_hba->pci_dev, 3180 - BASE_VECTORS_V3_HW); 3179 + blk_mq_pci_map_queues(qmap, hisi_hba->pci_dev, BASE_VECTORS_V3_HW); 3181 3180 } 3182 3181 3183 3182 static struct scsi_host_template sht_v3_hw = {
+2 -4
drivers/scsi/megaraid/megaraid_sas_base.c
··· 3174 3174 return 0; 3175 3175 } 3176 3176 3177 - static int megasas_map_queues(struct Scsi_Host *shost) 3177 + static void megasas_map_queues(struct Scsi_Host *shost) 3178 3178 { 3179 3179 struct megasas_instance *instance; 3180 3180 int qoff = 0, offset; ··· 3183 3183 instance = (struct megasas_instance *)shost->hostdata; 3184 3184 3185 3185 if (shost->nr_hw_queues == 1) 3186 - return 0; 3186 + return; 3187 3187 3188 3188 offset = instance->low_latency_index_start; 3189 3189 ··· 3209 3209 map->queue_offset = qoff; 3210 3210 blk_mq_map_queues(map); 3211 3211 } 3212 - 3213 - return 0; 3214 3212 } 3215 3213 3216 3214 static void megasas_aen_polling(struct work_struct *work);
+1 -4
drivers/scsi/mpi3mr/mpi3mr_os.c
··· 3464 3464 * 3465 3465 * Return: return zero. 3466 3466 */ 3467 - static int mpi3mr_map_queues(struct Scsi_Host *shost) 3467 + static void mpi3mr_map_queues(struct Scsi_Host *shost) 3468 3468 { 3469 3469 struct mpi3mr_ioc *mrioc = shost_priv(shost); 3470 3470 int i, qoff, offset; ··· 3500 3500 qoff += map->nr_queues; 3501 3501 offset += map->nr_queues; 3502 3502 } 3503 - 3504 - return 0; 3505 - 3506 3503 } 3507 3504 3508 3505 /**
+2 -3
drivers/scsi/mpt3sas/mpt3sas_scsih.c
··· 11872 11872 * scsih_map_queues - map reply queues with request queues 11873 11873 * @shost: SCSI host pointer 11874 11874 */ 11875 - static int scsih_map_queues(struct Scsi_Host *shost) 11875 + static void scsih_map_queues(struct Scsi_Host *shost) 11876 11876 { 11877 11877 struct MPT3SAS_ADAPTER *ioc = 11878 11878 (struct MPT3SAS_ADAPTER *)shost->hostdata; ··· 11882 11882 int iopoll_q_count = ioc->reply_queue_count - nr_msix_vectors; 11883 11883 11884 11884 if (shost->nr_hw_queues == 1) 11885 - return 0; 11885 + return; 11886 11886 11887 11887 for (i = 0, qoff = 0; i < shost->nr_maps; i++) { 11888 11888 map = &shost->tag_set.map[i]; ··· 11910 11910 11911 11911 qoff += map->nr_queues; 11912 11912 } 11913 - return 0; 11914 11913 } 11915 11914 11916 11915 /* shost template for SAS 2.0 HBA devices */
+1 -1
drivers/scsi/pm8001/pm8001_init.c
··· 81 81 82 82 struct workqueue_struct *pm8001_wq; 83 83 84 - static int pm8001_map_queues(struct Scsi_Host *shost) 84 + static void pm8001_map_queues(struct Scsi_Host *shost) 85 85 { 86 86 struct sas_ha_struct *sha = SHOST_TO_SAS_HA(shost); 87 87 struct pm8001_hba_info *pm8001_ha = sha->lldd_ha;
+1 -5
drivers/scsi/qla2xxx/qla_nvme.c
··· 684 684 struct blk_mq_queue_map *map) 685 685 { 686 686 struct scsi_qla_host *vha = lport->private; 687 - int rc; 688 687 689 - rc = blk_mq_pci_map_queues(map, vha->hw->pdev, vha->irq_offset); 690 - if (rc) 691 - ql_log(ql_log_warn, vha, 0x21de, 692 - "pci map queue failed 0x%x", rc); 688 + blk_mq_pci_map_queues(map, vha->hw->pdev, vha->irq_offset); 693 689 } 694 690 695 691 static void qla_nvme_localport_delete(struct nvme_fc_local_port *lport)
+4 -6
drivers/scsi/qla2xxx/qla_os.c
··· 350 350 351 351 static void qla2x00_clear_drv_active(struct qla_hw_data *); 352 352 static void qla2x00_free_device(scsi_qla_host_t *); 353 - static int qla2xxx_map_queues(struct Scsi_Host *shost); 353 + static void qla2xxx_map_queues(struct Scsi_Host *shost); 354 354 static void qla2x00_destroy_deferred_work(struct qla_hw_data *); 355 355 356 356 u32 ql2xnvme_queues = DEF_NVME_HW_QUEUES; ··· 7994 7994 clear_bit(ABORT_ISP_ACTIVE, &base_vha->dpc_flags); 7995 7995 } 7996 7996 7997 - static int qla2xxx_map_queues(struct Scsi_Host *shost) 7997 + static void qla2xxx_map_queues(struct Scsi_Host *shost) 7998 7998 { 7999 - int rc; 8000 7999 scsi_qla_host_t *vha = (scsi_qla_host_t *)shost->hostdata; 8001 8000 struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; 8002 8001 8003 8002 if (USER_CTRL_IRQ(vha->hw) || !vha->hw->mqiobase) 8004 - rc = blk_mq_map_queues(qmap); 8003 + blk_mq_map_queues(qmap); 8005 8004 else 8006 - rc = blk_mq_pci_map_queues(qmap, vha->hw->pdev, vha->irq_offset); 8007 - return rc; 8005 + blk_mq_pci_map_queues(qmap, vha->hw->pdev, vha->irq_offset); 8008 8006 } 8009 8007 8010 8008 struct scsi_host_template qla2xxx_driver_template = {
+2 -5
drivers/scsi/scsi_debug.c
··· 7474 7474 return check_condition_result; 7475 7475 } 7476 7476 7477 - static int sdebug_map_queues(struct Scsi_Host *shost) 7477 + static void sdebug_map_queues(struct Scsi_Host *shost) 7478 7478 { 7479 7479 int i, qoff; 7480 7480 7481 7481 if (shost->nr_hw_queues == 1) 7482 - return 0; 7482 + return; 7483 7483 7484 7484 for (i = 0, qoff = 0; i < HCTX_MAX_TYPES; i++) { 7485 7485 struct blk_mq_queue_map *map = &shost->tag_set.map[i]; ··· 7501 7501 7502 7502 qoff += map->nr_queues; 7503 7503 } 7504 - 7505 - return 0; 7506 - 7507 7504 } 7508 7505 7509 7506 static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
+2 -2
drivers/scsi/scsi_lib.c
··· 1856 1856 return 0; 1857 1857 } 1858 1858 1859 - static int scsi_map_queues(struct blk_mq_tag_set *set) 1859 + static void scsi_map_queues(struct blk_mq_tag_set *set) 1860 1860 { 1861 1861 struct Scsi_Host *shost = container_of(set, struct Scsi_Host, tag_set); 1862 1862 1863 1863 if (shost->hostt->map_queues) 1864 1864 return shost->hostt->map_queues(shost); 1865 - return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 1865 + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 1866 1866 } 1867 1867 1868 1868 void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
+3 -3
drivers/scsi/smartpqi/smartpqi_init.c
··· 6436 6436 return 0; 6437 6437 } 6438 6438 6439 - static int pqi_map_queues(struct Scsi_Host *shost) 6439 + static void pqi_map_queues(struct Scsi_Host *shost) 6440 6440 { 6441 6441 struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); 6442 6442 6443 - return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], 6444 - ctrl_info->pci_dev, 0); 6443 + blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], 6444 + ctrl_info->pci_dev, 0); 6445 6445 } 6446 6446 6447 6447 static inline bool pqi_is_tape_changer_device(struct pqi_scsi_dev *device)
+2 -2
drivers/scsi/virtio_scsi.c
··· 711 711 return virtscsi_tmf(vscsi, cmd); 712 712 } 713 713 714 - static int virtscsi_map_queues(struct Scsi_Host *shost) 714 + static void virtscsi_map_queues(struct Scsi_Host *shost) 715 715 { 716 716 struct virtio_scsi *vscsi = shost_priv(shost); 717 717 struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; 718 718 719 - return blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2); 719 + blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2); 720 720 } 721 721 722 722 static void virtscsi_commit_rqs(struct Scsi_Host *shost, u16 hwq)
+3 -6
drivers/ufs/core/ufshcd.c
··· 2701 2701 * Associate the UFS controller queue with the default and poll HCTX types. 2702 2702 * Initialize the mq_map[] arrays. 2703 2703 */ 2704 - static int ufshcd_map_queues(struct Scsi_Host *shost) 2704 + static void ufshcd_map_queues(struct Scsi_Host *shost) 2705 2705 { 2706 - int i, ret; 2706 + int i; 2707 2707 2708 2708 for (i = 0; i < shost->nr_maps; i++) { 2709 2709 struct blk_mq_queue_map *map = &shost->tag_set.map[i]; ··· 2720 2720 WARN_ON_ONCE(true); 2721 2721 } 2722 2722 map->queue_offset = 0; 2723 - ret = blk_mq_map_queues(map); 2724 - WARN_ON_ONCE(ret); 2723 + blk_mq_map_queues(map); 2725 2724 } 2726 - 2727 - return 0; 2728 2725 } 2729 2726 2730 2727 static void ufshcd_init_lrb(struct ufs_hba *hba, struct ufshcd_lrb *lrb, int i)
+12 -2
fs/btrfs/compression.c
··· 15 15 #include <linux/string.h> 16 16 #include <linux/backing-dev.h> 17 17 #include <linux/writeback.h> 18 + #include <linux/psi.h> 18 19 #include <linux/slab.h> 19 20 #include <linux/sched/mm.h> 20 21 #include <linux/log2.h> ··· 512 511 */ 513 512 static noinline int add_ra_bio_pages(struct inode *inode, 514 513 u64 compressed_end, 515 - struct compressed_bio *cb) 514 + struct compressed_bio *cb, 515 + unsigned long *pflags) 516 516 { 517 517 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 518 518 unsigned long end_index; ··· 581 579 cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; 582 580 continue; 583 581 } 582 + 583 + if (PageWorkingset(page)) 584 + psi_memstall_enter(pflags); 584 585 585 586 ret = set_page_extent_mapped(page); 586 587 if (ret < 0) { ··· 671 666 u64 em_len; 672 667 u64 em_start; 673 668 struct extent_map *em; 669 + /* Initialize to 1 to make skip psi_memstall_leave unless needed */ 670 + unsigned long pflags = 1; 674 671 blk_status_t ret; 675 672 int ret2; 676 673 int i; ··· 728 721 goto fail; 729 722 } 730 723 731 - add_ra_bio_pages(inode, em_start + em_len, cb); 724 + add_ra_bio_pages(inode, em_start + em_len, cb, &pflags); 732 725 733 726 /* include any pages we added in add_ra-bio_pages */ 734 727 cb->len = bio->bi_iter.bi_size; ··· 807 800 comp_bio = NULL; 808 801 } 809 802 } 803 + 804 + if (!pflags) 805 + psi_memstall_leave(&pflags); 810 806 811 807 if (refcount_dec_and_test(&cb->pending_ios)) 812 808 finish_compressed_bio_read(cb);
-2
fs/direct-io.c
··· 421 421 unsigned long flags; 422 422 423 423 bio->bi_private = dio; 424 - /* don't account direct I/O as memory stall */ 425 - bio_clear_flag(bio, BIO_WORKINGSET); 426 424 427 425 spin_lock_irqsave(&dio->bio_lock, flags); 428 426 dio->refcount++;
+12 -1
fs/erofs/zdata.c
··· 7 7 #include "zdata.h" 8 8 #include "compress.h" 9 9 #include <linux/prefetch.h> 10 + #include <linux/psi.h> 10 11 11 12 #include <trace/events/erofs.h> 12 13 ··· 1415 1414 struct block_device *last_bdev; 1416 1415 unsigned int nr_bios = 0; 1417 1416 struct bio *bio = NULL; 1417 + /* initialize to 1 to make skip psi_memstall_leave unless needed */ 1418 + unsigned long pflags = 1; 1418 1419 1419 1420 bi_private = jobqueueset_init(sb, q, fgq, force_fg); 1420 1421 qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; ··· 1466 1463 if (bio && (cur != last_index + 1 || 1467 1464 last_bdev != mdev.m_bdev)) { 1468 1465 submit_bio_retry: 1466 + if (!pflags) 1467 + psi_memstall_leave(&pflags); 1469 1468 submit_bio(bio); 1470 1469 bio = NULL; 1471 1470 } 1471 + 1472 + if (unlikely(PageWorkingset(page))) 1473 + psi_memstall_enter(&pflags); 1472 1474 1473 1475 if (!bio) { 1474 1476 bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, ··· 1502 1494 move_to_bypass_jobqueue(pcl, qtail, owned_head); 1503 1495 } while (owned_head != Z_EROFS_PCLUSTER_TAIL); 1504 1496 1505 - if (bio) 1497 + if (bio) { 1498 + if (!pflags) 1499 + psi_memstall_leave(&pflags); 1506 1500 submit_bio(bio); 1501 + } 1507 1502 1508 1503 /* 1509 1504 * although background is preferred, no one is pending for submission.
+1 -1
include/linux/bio.h
··· 509 509 { 510 510 bio_clear_flag(bio, BIO_REMAPPED); 511 511 if (bio->bi_bdev != bdev) 512 - bio_clear_flag(bio, BIO_THROTTLED); 512 + bio_clear_flag(bio, BIO_BPS_THROTTLED); 513 513 bio->bi_bdev = bdev; 514 514 bio_associate_blkg(bio); 515 515 }
+2 -3
include/linux/blk-cgroup.h
··· 18 18 19 19 struct bio; 20 20 struct cgroup_subsys_state; 21 - struct request_queue; 21 + struct gendisk; 22 22 23 23 #define FC_APPID_LEN 129 24 24 25 25 #ifdef CONFIG_BLK_CGROUP 26 26 extern struct cgroup_subsys_state * const blkcg_root_css; 27 27 28 - void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); 28 + void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay); 29 29 void blkcg_maybe_throttle_current(void); 30 30 bool blk_cgroup_congested(void); 31 31 void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); ··· 39 39 40 40 static inline void blkcg_maybe_throttle_current(void) { } 41 41 static inline bool blk_cgroup_congested(void) { return false; } 42 - static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } 43 42 static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) 44 43 { 45 44 return NULL;
+2 -2
include/linux/blk-mq-pci.h
··· 5 5 struct blk_mq_queue_map; 6 6 struct pci_dev; 7 7 8 - int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, 9 - int offset); 8 + void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, 9 + int offset); 10 10 11 11 #endif /* _LINUX_BLK_MQ_PCI_H */
+1 -1
include/linux/blk-mq-rdma.h
··· 5 5 struct blk_mq_tag_set; 6 6 struct ib_device; 7 7 8 - int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, 8 + void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, 9 9 struct ib_device *dev, int first_vec); 10 10 11 11 #endif /* _LINUX_BLK_MQ_RDMA_H */
+1 -1
include/linux/blk-mq-virtio.h
··· 5 5 struct blk_mq_queue_map; 6 6 struct virtio_device; 7 7 8 - int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, 8 + void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, 9 9 struct virtio_device *vdev, int first_vec); 10 10 11 11 #endif /* _LINUX_BLK_MQ_VIRTIO_H */
+15 -8
include/linux/blk-mq.h
··· 268 268 rq_list_add(dst, rq); 269 269 } 270 270 271 + /** 272 + * enum blk_eh_timer_return - How the timeout handler should proceed 273 + * @BLK_EH_DONE: The block driver completed the command or will complete it at 274 + * a later time. 275 + * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the 276 + * request to complete. 277 + */ 271 278 enum blk_eh_timer_return { 272 - BLK_EH_DONE, /* drivers has completed the command */ 273 - BLK_EH_RESET_TIMER, /* reset timer and try again */ 279 + BLK_EH_DONE, 280 + BLK_EH_RESET_TIMER, 274 281 }; 275 282 276 283 #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ ··· 637 630 * @map_queues: This allows drivers specify their own queue mapping by 638 631 * overriding the setup-time function that builds the mq_map. 639 632 */ 640 - int (*map_queues)(struct blk_mq_tag_set *set); 633 + void (*map_queues)(struct blk_mq_tag_set *set); 641 634 642 635 #ifdef CONFIG_BLK_DEBUG_FS 643 636 /** ··· 887 880 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 888 881 unsigned long timeout); 889 882 890 - int blk_mq_map_queues(struct blk_mq_queue_map *qmap); 883 + void blk_mq_map_queues(struct blk_mq_queue_map *qmap); 891 884 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); 892 885 893 886 void blk_mq_quiesce_queue_nowait(struct request_queue *q); ··· 970 963 971 964 struct rq_map_data { 972 965 struct page **pages; 973 - int page_order; 974 - int nr_entries; 975 966 unsigned long offset; 976 - int null_mapped; 977 - int from_user; 967 + unsigned short page_order; 968 + unsigned short nr_entries; 969 + bool null_mapped; 970 + bool from_user; 978 971 }; 979 972 980 973 int blk_rq_map_user(struct request_queue *, struct request *,
+1 -2
include/linux/blk_types.h
··· 321 321 BIO_NO_PAGE_REF, /* don't put release vec pages */ 322 322 BIO_CLONED, /* doesn't own data */ 323 323 BIO_BOUNCED, /* bio is a bounce bio */ 324 - BIO_WORKINGSET, /* contains userspace workingset pages */ 325 324 BIO_QUIET, /* Make BIO Quiet */ 326 325 BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ 327 326 BIO_REFFED, /* bio has elevated ->bi_cnt */ 328 - BIO_THROTTLED, /* This bio has already been subjected to 327 + BIO_BPS_THROTTLED, /* This bio has already been subjected to 329 328 * throttling rules. Don't do it again. */ 330 329 BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion 331 330 * of this bio. */
+14 -1
include/linux/blkdev.h
··· 618 618 #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) 619 619 #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) 620 620 #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) 621 - #define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) 622 621 #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) 623 622 624 623 extern void blk_set_pm_only(struct request_queue *q); ··· 1279 1280 return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); 1280 1281 } 1281 1282 1283 + static inline bool bdev_nowait(struct block_device *bdev) 1284 + { 1285 + return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); 1286 + } 1287 + 1282 1288 static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) 1283 1289 { 1284 1290 struct request_queue *q = bdev_get_queue(bdev); ··· 1302 1298 return blk_queue_is_zoned(q); 1303 1299 1304 1300 return false; 1301 + } 1302 + 1303 + static inline bool bdev_op_is_zoned_write(struct block_device *bdev, 1304 + blk_opf_t op) 1305 + { 1306 + if (!bdev_is_zoned(bdev)) 1307 + return false; 1308 + 1309 + return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; 1305 1310 } 1306 1311 1307 1312 static inline sector_t bdev_zone_sectors(struct block_device *bdev)
+2 -2
include/linux/nvme.h
··· 1482 1482 }; 1483 1483 1484 1484 enum { 1485 - NVME_CONNECT_AUTHREQ_ASCR = (1 << 2), 1486 - NVME_CONNECT_AUTHREQ_ATR = (1 << 1), 1485 + NVME_CONNECT_AUTHREQ_ASCR = (1U << 18), 1486 + NVME_CONNECT_AUTHREQ_ATR = (1U << 17), 1487 1487 }; 1488 1488 1489 1489 struct nvmf_connect_data {
+2
include/linux/pagemap.h
··· 1173 1173 pgoff_t _index; 1174 1174 unsigned int _nr_pages; 1175 1175 unsigned int _batch_count; 1176 + bool _workingset; 1177 + unsigned long _pflags; 1176 1178 }; 1177 1179 1178 1180 #define DEFINE_READAHEAD(ractl, f, r, m, i) \
+2 -1
include/linux/sbitmap.h
··· 575 575 * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue 576 576 * on a &struct sbitmap_queue. 577 577 * @sbq: Bitmap queue to wake up. 578 + * @nr: Number of bits cleared. 578 579 */ 579 - void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); 580 + void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); 580 581 581 582 /** 582 583 * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
+1
include/linux/sed-opal.h
··· 43 43 case IOC_OPAL_MBR_DONE: 44 44 case IOC_OPAL_WRITE_SHADOW_MBR: 45 45 case IOC_OPAL_GENERIC_TABLE_RW: 46 + case IOC_OPAL_GET_STATUS: 46 47 return true; 47 48 } 48 49 return false;
+1 -1
include/scsi/scsi_host.h
··· 276 276 * 277 277 * Status: OPTIONAL 278 278 */ 279 - int (* map_queues)(struct Scsi_Host *shost); 279 + void (* map_queues)(struct Scsi_Host *shost); 280 280 281 281 /* 282 282 * SCSI interface of blk_poll - poll for IO completions.
+13
include/uapi/linux/sed-opal.h
··· 132 132 __u64 priv; 133 133 }; 134 134 135 + #define OPAL_FL_SUPPORTED 0x00000001 136 + #define OPAL_FL_LOCKING_SUPPORTED 0x00000002 137 + #define OPAL_FL_LOCKING_ENABLED 0x00000004 138 + #define OPAL_FL_LOCKED 0x00000008 139 + #define OPAL_FL_MBR_ENABLED 0x00000010 140 + #define OPAL_FL_MBR_DONE 0x00000020 141 + 142 + struct opal_status { 143 + __u32 flags; 144 + __u32 reserved; 145 + }; 146 + 135 147 #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) 136 148 #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) 137 149 #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) ··· 160 148 #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) 161 149 #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) 162 150 #define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table) 151 + #define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) 163 152 164 153 #endif /* _UAPI_SED_OPAL_H */
+7 -1
include/uapi/linux/ublk_cmd.h
··· 17 17 #define UBLK_CMD_STOP_DEV 0x07 18 18 #define UBLK_CMD_SET_PARAMS 0x08 19 19 #define UBLK_CMD_GET_PARAMS 0x09 20 - 20 + #define UBLK_CMD_START_USER_RECOVERY 0x10 21 + #define UBLK_CMD_END_USER_RECOVERY 0x11 21 22 /* 22 23 * IO commands, issued by ublk server, and handled by ublk driver. 23 24 * ··· 75 74 */ 76 75 #define UBLK_F_NEED_GET_DATA (1UL << 2) 77 76 77 + #define UBLK_F_USER_RECOVERY (1UL << 3) 78 + 79 + #define UBLK_F_USER_RECOVERY_REISSUE (1UL << 4) 80 + 78 81 /* device state */ 79 82 #define UBLK_S_DEV_DEAD 0 80 83 #define UBLK_S_DEV_LIVE 1 84 + #define UBLK_S_DEV_QUIESCED 2 81 85 82 86 /* shipped via sqe->cmd of io_uring command */ 83 87 struct ublksrv_ctrl_cmd {
+1 -1
io_uring/io_uring.c
··· 1540 1540 1541 1541 static bool io_bdev_nowait(struct block_device *bdev) 1542 1542 { 1543 - return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); 1543 + return !bdev || bdev_nowait(bdev); 1544 1544 } 1545 1545 1546 1546 /*
+2
kernel/sched/psi.c
··· 917 917 918 918 rq_unlock_irq(rq, &rf); 919 919 } 920 + EXPORT_SYMBOL_GPL(psi_memstall_enter); 920 921 921 922 /** 922 923 * psi_memstall_leave - mark the end of an memory stall section ··· 947 946 948 947 rq_unlock_irq(rq, &rf); 949 948 } 949 + EXPORT_SYMBOL_GPL(psi_memstall_leave); 950 950 951 951 #ifdef CONFIG_CGROUPS 952 952 int psi_cgroup_alloc(struct cgroup *cgroup)
+70 -39
lib/sbitmap.c
··· 533 533 nr = find_first_zero_bit(&map->word, map_depth); 534 534 if (nr + nr_tags <= map_depth) { 535 535 atomic_long_t *ptr = (atomic_long_t *) &map->word; 536 - int map_tags = min_t(int, nr_tags, map_depth); 537 - unsigned long val, ret; 536 + unsigned long val; 538 537 539 - get_mask = ((1UL << map_tags) - 1) << nr; 538 + get_mask = ((1UL << nr_tags) - 1) << nr; 539 + val = READ_ONCE(map->word); 540 540 do { 541 - val = READ_ONCE(map->word); 542 541 if ((val & ~get_mask) != val) 543 542 goto next; 544 - ret = atomic_long_cmpxchg(ptr, val, get_mask | val); 545 - } while (ret != val); 546 - get_mask = (get_mask & ~ret) >> nr; 543 + } while (!atomic_long_try_cmpxchg(ptr, &val, 544 + get_mask | val)); 545 + get_mask = (get_mask & ~val) >> nr; 547 546 if (get_mask) { 548 547 *offset = nr + (index << sb->shift); 549 548 update_alloc_hint_after_get(sb, depth, hint, 550 - *offset + map_tags - 1); 549 + *offset + nr_tags - 1); 551 550 return get_mask; 552 551 } 553 552 } ··· 587 588 for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 588 589 struct sbq_wait_state *ws = &sbq->ws[wake_index]; 589 590 590 - if (waitqueue_active(&ws->wait)) { 591 + if (waitqueue_active(&ws->wait) && atomic_read(&ws->wait_cnt)) { 591 592 if (wake_index != atomic_read(&sbq->wake_index)) 592 593 atomic_set(&sbq->wake_index, wake_index); 593 594 return ws; ··· 599 600 return NULL; 600 601 } 601 602 602 - static bool __sbq_wake_up(struct sbitmap_queue *sbq) 603 + static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr) 603 604 { 604 605 struct sbq_wait_state *ws; 605 606 unsigned int wake_batch; 606 - int wait_cnt; 607 + int wait_cnt, cur, sub; 608 + bool ret; 609 + 610 + if (*nr <= 0) 611 + return false; 607 612 608 613 ws = sbq_wake_ptr(sbq); 609 614 if (!ws) 610 615 return false; 611 616 612 - wait_cnt = atomic_dec_return(&ws->wait_cnt); 613 - if (wait_cnt <= 0) { 614 - int ret; 615 - 616 - wake_batch = READ_ONCE(sbq->wake_batch); 617 - 617 + cur = atomic_read(&ws->wait_cnt); 618 + do { 618 619 
/* 619 - * Pairs with the memory barrier in sbitmap_queue_resize() to 620 - * ensure that we see the batch size update before the wait 621 - * count is reset. 620 + * For concurrent callers of this, callers should call this 621 + * function again to wakeup a new batch on a different 'ws'. 622 622 */ 623 - smp_mb__before_atomic(); 623 + if (cur == 0) 624 + return true; 625 + sub = min(*nr, cur); 626 + wait_cnt = cur - sub; 627 + } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt)); 624 628 625 - /* 626 - * For concurrent callers of this, the one that failed the 627 - * atomic_cmpxhcg() race should call this function again 628 - * to wakeup a new batch on a different 'ws'. 629 - */ 630 - ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch); 631 - if (ret == wait_cnt) { 632 - sbq_index_atomic_inc(&sbq->wake_index); 633 - wake_up_nr(&ws->wait, wake_batch); 634 - return false; 635 - } 629 + /* 630 + * If we decremented queue without waiters, retry to avoid lost 631 + * wakeups. 632 + */ 633 + if (wait_cnt > 0) 634 + return !waitqueue_active(&ws->wait); 636 635 637 - return true; 638 - } 636 + *nr -= sub; 639 637 640 - return false; 638 + /* 639 + * When wait_cnt == 0, we have to be particularly careful as we are 640 + * responsible to reset wait_cnt regardless whether we've actually 641 + * woken up anybody. But in case we didn't wakeup anybody, we still 642 + * need to retry. 643 + */ 644 + ret = !waitqueue_active(&ws->wait); 645 + wake_batch = READ_ONCE(sbq->wake_batch); 646 + 647 + /* 648 + * Wake up first in case that concurrent callers decrease wait_cnt 649 + * while waitqueue is empty. 650 + */ 651 + wake_up_nr(&ws->wait, wake_batch); 652 + 653 + /* 654 + * Pairs with the memory barrier in sbitmap_queue_resize() to 655 + * ensure that we see the batch size update before the wait 656 + * count is reset. 
657 + * 658 + * Also pairs with the implicit barrier between decrementing wait_cnt 659 + * and checking for waitqueue_active() to make sure waitqueue_active() 660 + * sees result of the wakeup if atomic_dec_return() has seen the result 661 + * of atomic_set(). 662 + */ 663 + smp_mb__before_atomic(); 664 + 665 + /* 666 + * Increase wake_index before updating wait_cnt, otherwise concurrent 667 + * callers can see valid wait_cnt in old waitqueue, which can cause 668 + * invalid wakeup on the old waitqueue. 669 + */ 670 + sbq_index_atomic_inc(&sbq->wake_index); 671 + atomic_set(&ws->wait_cnt, wake_batch); 672 + 673 + return ret || *nr; 641 674 } 642 675 643 - void sbitmap_queue_wake_up(struct sbitmap_queue *sbq) 676 + void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) 644 677 { 645 - while (__sbq_wake_up(sbq)) 678 + while (__sbq_wake_up(sbq, &nr)) 646 679 ; 647 680 } 648 681 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); ··· 714 683 atomic_long_andnot(mask, (atomic_long_t *) addr); 715 684 716 685 smp_mb__after_atomic(); 717 - sbitmap_queue_wake_up(sbq); 686 + sbitmap_queue_wake_up(sbq, nr_tags); 718 687 sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(), 719 688 tags[nr_tags - 1] - offset); 720 689 } ··· 742 711 * waiter. See the comment on waitqueue_active(). 743 712 */ 744 713 smp_mb__after_atomic(); 745 - sbitmap_queue_wake_up(sbq); 714 + sbitmap_queue_wake_up(sbq, 1); 746 715 sbitmap_update_cpu_hint(&sbq->sb, cpu, nr); 747 716 } 748 717 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
+7
mm/filemap.c
··· 2382 2382 static int filemap_read_folio(struct file *file, filler_t filler, 2383 2383 struct folio *folio) 2384 2384 { 2385 + bool workingset = folio_test_workingset(folio); 2386 + unsigned long pflags; 2385 2387 int error; 2386 2388 2387 2389 /* ··· 2392 2390 * fails. 2393 2391 */ 2394 2392 folio_clear_error(folio); 2393 + 2395 2394 /* Start the actual read. The read will unlock the page. */ 2395 + if (unlikely(workingset)) 2396 + psi_memstall_enter(&pflags); 2396 2397 error = filler(file, folio); 2398 + if (unlikely(workingset)) 2399 + psi_memstall_leave(&pflags); 2397 2400 if (error) 2398 2401 return error; 2399 2402
+18 -4
mm/readahead.c
··· 122 122 #include <linux/task_io_accounting_ops.h> 123 123 #include <linux/pagevec.h> 124 124 #include <linux/pagemap.h> 125 + #include <linux/psi.h> 125 126 #include <linux/syscalls.h> 126 127 #include <linux/file.h> 127 128 #include <linux/mm_inline.h> ··· 153 152 if (!readahead_count(rac)) 154 153 return; 155 154 155 + if (unlikely(rac->_workingset)) 156 + psi_memstall_enter(&rac->_pflags); 156 157 blk_start_plug(&plug); 157 158 158 159 if (aops->readahead) { ··· 182 179 } 183 180 184 181 blk_finish_plug(&plug); 182 + if (unlikely(rac->_workingset)) 183 + psi_memstall_leave(&rac->_pflags); 184 + rac->_workingset = false; 185 185 186 186 BUG_ON(readahead_count(rac)); 187 187 } ··· 258 252 } 259 253 if (i == nr_to_read - lookahead_size) 260 254 folio_set_readahead(folio); 255 + ractl->_workingset |= folio_test_workingset(folio); 261 256 ractl->_nr_pages++; 262 257 } 263 258 ··· 487 480 if (index == mark) 488 481 folio_set_readahead(folio); 489 482 err = filemap_add_folio(ractl->mapping, folio, index, gfp); 490 - if (err) 483 + if (err) { 491 484 folio_put(folio); 492 - else 493 - ractl->_nr_pages += 1UL << order; 494 - return err; 485 + return err; 486 + } 487 + 488 + ractl->_nr_pages += 1UL << order; 489 + ractl->_workingset |= folio_test_workingset(folio); 490 + return 0; 495 491 } 496 492 497 493 void page_cache_ra_order(struct readahead_control *ractl, ··· 835 825 if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { 836 826 put_page(page); 837 827 return; 828 + } 829 + if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { 830 + ractl->_workingset = true; 831 + psi_memstall_enter(&ractl->_pflags); 838 832 } 839 833 ractl->_nr_pages++; 840 834 if (ra) {
+1 -1
mm/swapfile.c
··· 3655 3655 plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], 3656 3656 avail_lists[nid]) { 3657 3657 if (si->bdev) { 3658 - blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); 3658 + blkcg_schedule_throttle(si->bdev->bd_disk, true); 3659 3659 break; 3660 3660 } 3661 3661 }