Merge branch 'for-5.10/block' into for-5.10/drivers

-3

Documentation/filesystems/locking.rst

··· 488 488 swap_slot_free_notify: no (see below) 489 489 ======================= =================== 490 490 491 - unlock_native_capacity and revalidate_disk are called only from 492 - check_disk_change(). 493 - 494 491 swap_slot_free_notify is called with swap_lock and sometimes the page lock 495 492 held. 496 493

-2

block/Kconfig

··· 161 161 depends on BLK_WBT 162 162 help 163 163 Enable writeback throttling by default on multiqueue devices. 164 - Multiqueue currently doesn't have support for IO scheduling, 165 - enabling this option is recommended. 166 164 167 165 config BLK_DEBUG_FS 168 166 bool "Block layer debugging information in debugfs"

+7 -2

block/bfq-iosched.c

··· 4640 4640 { 4641 4641 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; 4642 4642 4643 + if (!atomic_read(&hctx->elevator_queued)) 4644 + return false; 4645 + 4643 4646 /* 4644 4647 * Avoiding lock: a race on bfqd->busy_queues should cause at 4645 4648 * most a call to dispatch for nothing ··· 5557 5554 rq = list_first_entry(list, struct request, queuelist); 5558 5555 list_del_init(&rq->queuelist); 5559 5556 bfq_insert_request(hctx, rq, at_head); 5557 + atomic_inc(&hctx->elevator_queued); 5560 5558 } 5561 5559 } 5562 5560 ··· 5925 5921 5926 5922 bfq_completed_request(bfqq, bfqd); 5927 5923 bfq_finish_requeue_request_body(bfqq); 5924 + atomic_dec(&rq->mq_hctx->elevator_queued); 5928 5925 5929 5926 spin_unlock_irqrestore(&bfqd->lock, flags); 5930 5927 } else { ··· 6365 6360 struct blk_mq_tags *tags = hctx->sched_tags; 6366 6361 unsigned int min_shallow; 6367 6362 6368 - min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); 6369 - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); 6363 + min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); 6364 + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); 6370 6365 } 6371 6366 6372 6367 static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)

+26 -6

block/blk-cgroup.c

··· 119 119 async_bio_work); 120 120 struct bio_list bios = BIO_EMPTY_LIST; 121 121 struct bio *bio; 122 + struct blk_plug plug; 123 + bool need_plug = false; 122 124 123 125 /* as long as there are pending bios, @blkg can't go away */ 124 126 spin_lock_bh(&blkg->async_bio_lock); ··· 128 126 bio_list_init(&blkg->async_bios); 129 127 spin_unlock_bh(&blkg->async_bio_lock); 130 128 129 + /* start plug only when bio_list contains at least 2 bios */ 130 + if (bios.head && bios.head->bi_next) { 131 + need_plug = true; 132 + blk_start_plug(&plug); 133 + } 131 134 while ((bio = bio_list_pop(&bios))) 132 135 submit_bio(bio); 136 + if (need_plug) 137 + blk_finish_plug(&plug); 133 138 } 134 139 135 140 /** ··· 1622 1613 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) 1623 1614 { 1624 1615 unsigned long pflags; 1616 + bool clamp; 1625 1617 u64 now = ktime_to_ns(ktime_get()); 1626 1618 u64 exp; 1627 1619 u64 delay_nsec = 0; 1628 1620 int tok; 1629 1621 1630 1622 while (blkg->parent) { 1631 - if (atomic_read(&blkg->use_delay)) { 1623 + int use_delay = atomic_read(&blkg->use_delay); 1624 + 1625 + if (use_delay) { 1626 + u64 this_delay; 1627 + 1632 1628 blkcg_scale_delay(blkg, now); 1633 - delay_nsec = max_t(u64, delay_nsec, 1634 - atomic64_read(&blkg->delay_nsec)); 1629 + this_delay = atomic64_read(&blkg->delay_nsec); 1630 + if (this_delay > delay_nsec) { 1631 + delay_nsec = this_delay; 1632 + clamp = use_delay > 0; 1633 + } 1635 1634 } 1636 1635 blkg = blkg->parent; 1637 1636 } ··· 1651 1634 * Let's not sleep for all eternity if we've amassed a huge delay. 1652 1635 * Swapping or metadata IO can accumulate 10's of seconds worth of 1653 1636 * delay, and we want userspace to be able to do _something_ so cap the 1654 - * delays at 1 second. If there's 10's of seconds worth of delay then 1655 - * the tasks will be delayed for 1 second for every syscall. 1637 + * delays at 0.25s. 
If there's 10's of seconds worth of delay then the 1638 + * tasks will be delayed for 0.25 second for every syscall. If 1639 + * blkcg_set_delay() was used as indicated by negative use_delay, the 1640 + * caller is responsible for regulating the range. 1656 1641 */ 1657 - delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); 1642 + if (clamp) 1643 + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); 1658 1644 1659 1645 if (use_memdelay) 1660 1646 psi_memstall_enter(&pflags);

+63 -176

block/blk-core.c

··· 116 116 rq->__sector = (sector_t) -1; 117 117 INIT_HLIST_NODE(&rq->hash); 118 118 RB_CLEAR_NODE(&rq->rb_node); 119 - rq->tag = -1; 120 - rq->internal_tag = -1; 119 + rq->tag = BLK_MQ_NO_TAG; 120 + rq->internal_tag = BLK_MQ_NO_TAG; 121 121 rq->start_time_ns = ktime_get_ns(); 122 122 rq->part = NULL; 123 123 refcount_set(&rq->ref, 1); ··· 538 538 if (!q->stats) 539 539 goto fail_stats; 540 540 541 - q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES; 542 - q->backing_dev_info->io_pages = VM_READAHEAD_PAGES; 543 - q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; 544 541 q->node = node_id; 542 + 543 + atomic_set(&q->nr_active_requests_shared_sbitmap, 0); 545 544 546 545 timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, 547 546 laptop_mode_timer_fn, 0); ··· 641 642 blk_mq_free_request(req); 642 643 } 643 644 EXPORT_SYMBOL(blk_put_request); 644 - 645 - static void blk_account_io_merge_bio(struct request *req) 646 - { 647 - if (!blk_do_io_stat(req)) 648 - return; 649 - 650 - part_stat_lock(); 651 - part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); 652 - part_stat_unlock(); 653 - } 654 - 655 - bool bio_attempt_back_merge(struct request *req, struct bio *bio, 656 - unsigned int nr_segs) 657 - { 658 - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; 659 - 660 - if (!ll_back_merge_fn(req, bio, nr_segs)) 661 - return false; 662 - 663 - trace_block_bio_backmerge(req->q, req, bio); 664 - rq_qos_merge(req->q, req, bio); 665 - 666 - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) 667 - blk_rq_set_mixed_merge(req); 668 - 669 - req->biotail->bi_next = bio; 670 - req->biotail = bio; 671 - req->__data_len += bio->bi_iter.bi_size; 672 - 673 - bio_crypt_free_ctx(bio); 674 - 675 - blk_account_io_merge_bio(req); 676 - return true; 677 - } 678 - 679 - bool bio_attempt_front_merge(struct request *req, struct bio *bio, 680 - unsigned int nr_segs) 681 - { 682 - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; 683 - 684 - if (!ll_front_merge_fn(req, bio, 
nr_segs)) 685 - return false; 686 - 687 - trace_block_bio_frontmerge(req->q, req, bio); 688 - rq_qos_merge(req->q, req, bio); 689 - 690 - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) 691 - blk_rq_set_mixed_merge(req); 692 - 693 - bio->bi_next = req->bio; 694 - req->bio = bio; 695 - 696 - req->__sector = bio->bi_iter.bi_sector; 697 - req->__data_len += bio->bi_iter.bi_size; 698 - 699 - bio_crypt_do_front_merge(req, bio); 700 - 701 - blk_account_io_merge_bio(req); 702 - return true; 703 - } 704 - 705 - bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, 706 - struct bio *bio) 707 - { 708 - unsigned short segments = blk_rq_nr_discard_segments(req); 709 - 710 - if (segments >= queue_max_discard_segments(q)) 711 - goto no_merge; 712 - if (blk_rq_sectors(req) + bio_sectors(bio) > 713 - blk_rq_get_max_sectors(req, blk_rq_pos(req))) 714 - goto no_merge; 715 - 716 - rq_qos_merge(q, req, bio); 717 - 718 - req->biotail->bi_next = bio; 719 - req->biotail = bio; 720 - req->__data_len += bio->bi_iter.bi_size; 721 - req->nr_phys_segments = segments + 1; 722 - 723 - blk_account_io_merge_bio(req); 724 - return true; 725 - no_merge: 726 - req_set_nomerge(q, req); 727 - return false; 728 - } 729 - 730 - /** 731 - * blk_attempt_plug_merge - try to merge with %current's plugged list 732 - * @q: request_queue new bio is being queued at 733 - * @bio: new bio being queued 734 - * @nr_segs: number of segments in @bio 735 - * @same_queue_rq: pointer to &struct request that gets filled in when 736 - * another request associated with @q is found on the plug list 737 - * (optional, may be %NULL) 738 - * 739 - * Determine whether @bio being queued on @q can be merged with a request 740 - * on %current's plugged list. Returns %true if merge was successful, 741 - * otherwise %false. 742 - * 743 - * Plugging coalesces IOs from the same issuer for the same purpose without 744 - * going through @q->queue_lock. 
As such it's more of an issuing mechanism 745 - * than scheduling, and the request, while may have elvpriv data, is not 746 - * added on the elevator at this point. In addition, we don't have 747 - * reliable access to the elevator outside queue lock. Only check basic 748 - * merging parameters without querying the elevator. 749 - * 750 - * Caller must ensure !blk_queue_nomerges(q) beforehand. 751 - */ 752 - bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 753 - unsigned int nr_segs, struct request **same_queue_rq) 754 - { 755 - struct blk_plug *plug; 756 - struct request *rq; 757 - struct list_head *plug_list; 758 - 759 - plug = blk_mq_plug(q, bio); 760 - if (!plug) 761 - return false; 762 - 763 - plug_list = &plug->mq_list; 764 - 765 - list_for_each_entry_reverse(rq, plug_list, queuelist) { 766 - bool merged = false; 767 - 768 - if (rq->q == q && same_queue_rq) { 769 - /* 770 - * Only blk-mq multiple hardware queues case checks the 771 - * rq in the same queue, there should be only one such 772 - * rq in a queue 773 - **/ 774 - *same_queue_rq = rq; 775 - } 776 - 777 - if (rq->q != q || !blk_rq_merge_ok(rq, bio)) 778 - continue; 779 - 780 - switch (blk_try_merge(rq, bio)) { 781 - case ELEVATOR_BACK_MERGE: 782 - merged = bio_attempt_back_merge(rq, bio, nr_segs); 783 - break; 784 - case ELEVATOR_FRONT_MERGE: 785 - merged = bio_attempt_front_merge(rq, bio, nr_segs); 786 - break; 787 - case ELEVATOR_DISCARD_MERGE: 788 - merged = bio_attempt_discard_merge(q, rq, bio); 789 - break; 790 - default: 791 - break; 792 - } 793 - 794 - if (merged) 795 - return true; 796 - } 797 - 798 - return false; 799 - } 800 645 801 646 static void handle_bad_sector(struct bio *bio, sector_t maxsector) 802 647 { ··· 1144 1301 * limits when retrying requests on other queues. Those requests need 1145 1302 * to be checked against the new queue limits again during dispatch. 
1146 1303 */ 1147 - static int blk_cloned_rq_check_limits(struct request_queue *q, 1304 + static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, 1148 1305 struct request *rq) 1149 1306 { 1150 - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) { 1307 + unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); 1308 + 1309 + if (blk_rq_sectors(rq) > max_sectors) { 1310 + /* 1311 + * SCSI device does not have a good way to return if 1312 + * Write Same/Zero is actually supported. If a device rejects 1313 + * a non-read/write command (discard, write same,etc.) the 1314 + * low-level device driver will set the relevant queue limit to 1315 + * 0 to prevent blk-lib from issuing more of the offending 1316 + * operations. Commands queued prior to the queue limit being 1317 + * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O 1318 + * errors being propagated to upper layers. 1319 + */ 1320 + if (max_sectors == 0) 1321 + return BLK_STS_NOTSUPP; 1322 + 1151 1323 printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", 1152 - __func__, blk_rq_sectors(rq), 1153 - blk_queue_get_max_sectors(q, req_op(rq))); 1154 - return -EIO; 1324 + __func__, blk_rq_sectors(rq), max_sectors); 1325 + return BLK_STS_IOERR; 1155 1326 } 1156 1327 1157 1328 /* ··· 1178 1321 if (rq->nr_phys_segments > queue_max_segments(q)) { 1179 1322 printk(KERN_ERR "%s: over max segments limit. 
(%hu > %hu)\n", 1180 1323 __func__, rq->nr_phys_segments, queue_max_segments(q)); 1181 - return -EIO; 1324 + return BLK_STS_IOERR; 1182 1325 } 1183 1326 1184 - return 0; 1327 + return BLK_STS_OK; 1185 1328 } 1186 1329 1187 1330 /** ··· 1191 1334 */ 1192 1335 blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) 1193 1336 { 1194 - if (blk_cloned_rq_check_limits(q, rq)) 1195 - return BLK_STS_IOERR; 1337 + blk_status_t ret; 1338 + 1339 + ret = blk_cloned_rq_check_limits(q, rq); 1340 + if (ret != BLK_STS_OK) 1341 + return ret; 1196 1342 1197 1343 if (rq->rq_disk && 1198 1344 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) ··· 1321 1461 part_stat_unlock(); 1322 1462 } 1323 1463 1324 - unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, 1325 - unsigned int op) 1464 + static unsigned long __part_start_io_acct(struct hd_struct *part, 1465 + unsigned int sectors, unsigned int op) 1326 1466 { 1327 - struct hd_struct *part = &disk->part0; 1328 1467 const int sgrp = op_stat_group(op); 1329 1468 unsigned long now = READ_ONCE(jiffies); 1330 1469 ··· 1336 1477 1337 1478 return now; 1338 1479 } 1480 + 1481 + unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, 1482 + struct bio *bio) 1483 + { 1484 + *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); 1485 + 1486 + return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); 1487 + } 1488 + EXPORT_SYMBOL_GPL(part_start_io_acct); 1489 + 1490 + unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, 1491 + unsigned int op) 1492 + { 1493 + return __part_start_io_acct(&disk->part0, sectors, op); 1494 + } 1339 1495 EXPORT_SYMBOL(disk_start_io_acct); 1340 1496 1341 - void disk_end_io_acct(struct gendisk *disk, unsigned int op, 1342 - unsigned long start_time) 1497 + static void __part_end_io_acct(struct hd_struct *part, unsigned int op, 1498 + unsigned long start_time) 1343 1499 { 1344 - struct hd_struct *part = 
&disk->part0; 1345 1500 const int sgrp = op_stat_group(op); 1346 1501 unsigned long now = READ_ONCE(jiffies); 1347 1502 unsigned long duration = now - start_time; ··· 1365 1492 part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); 1366 1493 part_stat_local_dec(part, in_flight[op_is_write(op)]); 1367 1494 part_stat_unlock(); 1495 + } 1496 + 1497 + void part_end_io_acct(struct hd_struct *part, struct bio *bio, 1498 + unsigned long start_time) 1499 + { 1500 + __part_end_io_acct(part, bio_op(bio), start_time); 1501 + hd_struct_put(part); 1502 + } 1503 + EXPORT_SYMBOL_GPL(part_end_io_acct); 1504 + 1505 + void disk_end_io_acct(struct gendisk *disk, unsigned int op, 1506 + unsigned long start_time) 1507 + { 1508 + __part_end_io_acct(&disk->part0, op, start_time); 1368 1509 } 1369 1510 EXPORT_SYMBOL(disk_end_io_acct); 1370 1511

+2 -2

block/blk-integrity.c

··· 408 408 bi->tuple_size = template->tuple_size; 409 409 bi->tag_size = template->tag_size; 410 410 411 - disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; 411 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); 412 412 413 413 #ifdef CONFIG_BLK_INLINE_ENCRYPTION 414 414 if (disk->queue->ksm) { ··· 428 428 */ 429 429 void blk_integrity_unregister(struct gendisk *disk) 430 430 { 431 - disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; 431 + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); 432 432 memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); 433 433 } 434 434 EXPORT_SYMBOL(blk_integrity_unregister);

+1179 -366

block/blk-iocost.c

··· 68 68 * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, 69 69 * 12.5% each. The distribution mechanism only cares about these flattened 70 70 * shares. They're called hweights (hierarchical weights) and always add 71 - * upto 1 (HWEIGHT_WHOLE). 71 + * upto 1 (WEIGHT_ONE). 72 72 * 73 73 * A given cgroup's vtime runs slower in inverse proportion to its hweight. 74 74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) ··· 179 179 #include <linux/parser.h> 180 180 #include <linux/sched/signal.h> 181 181 #include <linux/blk-cgroup.h> 182 + #include <asm/local.h> 183 + #include <asm/local64.h> 182 184 #include "blk-rq-qos.h" 183 185 #include "blk-stat.h" 184 186 #include "blk-wbt.h" ··· 217 215 MAX_PERIOD = USEC_PER_SEC, 218 216 219 217 /* 220 - * A cgroup's vtime can run 50% behind the device vtime, which 218 + * iocg->vtime is targeted at 50% behind the device vtime, which 221 219 * serves as its IO credit buffer. Surplus weight adjustment is 222 220 * immediately canceled if the vtime margin runs below 10%. 223 221 */ 224 - MARGIN_PCT = 50, 225 - INUSE_MARGIN_PCT = 10, 222 + MARGIN_MIN_PCT = 10, 223 + MARGIN_LOW_PCT = 20, 224 + MARGIN_TARGET_PCT = 50, 226 225 227 - /* Have some play in waitq timer operations */ 228 - WAITQ_TIMER_MARGIN_PCT = 5, 226 + INUSE_ADJ_STEP_PCT = 25, 229 227 230 - /* 231 - * vtime can wrap well within a reasonable uptime when vrate is 232 - * consistently raised. Don't trust recorded cgroup vtime if the 233 - * period counter indicates that it's older than 5mins. 234 - */ 235 - VTIME_VALID_DUR = 300 * USEC_PER_SEC, 236 - 237 - /* 238 - * Remember the past three non-zero usages and use the max for 239 - * surplus calculation. Three slots guarantee that we remember one 240 - * full period usage from the last active stretch even after 241 - * partial deactivation and re-activation periods. 
Don't start 242 - * giving away weight before collecting two data points to prevent 243 - * hweight adjustments based on one partial activation period. 244 - */ 245 - NR_USAGE_SLOTS = 3, 246 - MIN_VALID_USAGES = 2, 228 + /* Have some play in timer operations */ 229 + TIMER_SLACK_PCT = 1, 247 230 248 231 /* 1/64k is granular enough and can easily be handled w/ u32 */ 249 - HWEIGHT_WHOLE = 1 << 16, 232 + WEIGHT_ONE = 1 << 16, 250 233 251 234 /* 252 235 * As vtime is used to calculate the cost of each IO, it needs to ··· 262 275 /* unbusy hysterisis */ 263 276 UNBUSY_THR_PCT = 75, 264 277 265 - /* don't let cmds which take a very long time pin lagging for too long */ 266 - MAX_LAGGING_PERIODS = 10, 278 + /* 279 + * The effect of delay is indirect and non-linear and a huge amount of 280 + * future debt can accumulate abruptly while unthrottled. Linearly scale 281 + * up delay as debt is going up and then let it decay exponentially. 282 + * This gives us quick ramp ups while delay is accumulating and long 283 + * tails which can help reducing the frequency of debt explosions on 284 + * unthrottle. The parameters are experimentally determined. 285 + * 286 + * The delay mechanism provides adequate protection and behavior in many 287 + * cases. However, this is far from ideal and falls shorts on both 288 + * fronts. The debtors are often throttled too harshly costing a 289 + * significant level of fairness and possibly total work while the 290 + * protection against their impacts on the system can be choppy and 291 + * unreliable. 292 + * 293 + * The shortcoming primarily stems from the fact that, unlike for page 294 + * cache, the kernel doesn't have well-defined back-pressure propagation 295 + * mechanism and policies for anonymous memory. Fully addressing this 296 + * issue will likely require substantial improvements in the area. 
297 + */ 298 + MIN_DELAY_THR_PCT = 500, 299 + MAX_DELAY_THR_PCT = 25000, 300 + MIN_DELAY = 250, 301 + MAX_DELAY = 250 * USEC_PER_MSEC, 267 302 268 303 /* 269 - * If usage% * 1.25 + 2% is lower than hweight% by more than 3%, 270 - * donate the surplus. 304 + * Halve debts if total usage keeps staying under 25% w/o any shortages 305 + * for over 100ms. 271 306 */ 272 - SURPLUS_SCALE_PCT = 125, /* * 125% */ 273 - SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */ 274 - SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */ 307 + DEBT_BUSY_USAGE_PCT = 25, 308 + DEBT_REDUCTION_IDLE_DUR = 100 * USEC_PER_MSEC, 309 + 310 + /* don't let cmds which take a very long time pin lagging for too long */ 311 + MAX_LAGGING_PERIODS = 10, 275 312 276 313 /* switch iff the conditions are met for longer than this */ 277 314 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, ··· 383 372 u32 too_slow_vrate_pct; 384 373 }; 385 374 375 + struct ioc_margins { 376 + s64 min; 377 + s64 low; 378 + s64 target; 379 + }; 380 + 386 381 struct ioc_missed { 387 - u32 nr_met; 388 - u32 nr_missed; 382 + local_t nr_met; 383 + local_t nr_missed; 389 384 u32 last_met; 390 385 u32 last_missed; 391 386 }; ··· 399 382 struct ioc_pcpu_stat { 400 383 struct ioc_missed missed[2]; 401 384 402 - u64 rq_wait_ns; 385 + local64_t rq_wait_ns; 403 386 u64 last_rq_wait_ns; 404 387 }; 405 388 ··· 410 393 bool enabled; 411 394 412 395 struct ioc_params params; 396 + struct ioc_margins margins; 413 397 u32 period_us; 414 - u32 margin_us; 398 + u32 timer_slack_ns; 415 399 u64 vrate_min; 416 400 u64 vrate_max; 417 401 ··· 423 405 424 406 enum ioc_running running; 425 407 atomic64_t vtime_rate; 408 + u64 vtime_base_rate; 409 + s64 vtime_err; 426 410 427 411 seqcount_spinlock_t period_seqcount; 428 - u32 period_at; /* wallclock starttime */ 412 + u64 period_at; /* wallclock starttime */ 429 413 u64 period_at_vtime; /* vtime starttime */ 430 414 431 415 atomic64_t cur_period; /* inc'd each period */ 432 416 int busy_level; /* saturation 
history */ 433 417 434 - u64 inuse_margin_vtime; 435 418 bool weights_updated; 436 419 atomic_t hweight_gen; /* for lazy hweights */ 420 + 421 + /* the last time debt cancel condition wasn't met */ 422 + u64 debt_busy_at; 437 423 438 424 u64 autop_too_fast_at; 439 425 u64 autop_too_slow_at; 440 426 int autop_idx; 441 427 bool user_qos_params:1; 442 428 bool user_cost_model:1; 429 + }; 430 + 431 + struct iocg_pcpu_stat { 432 + local64_t abs_vusage; 433 + }; 434 + 435 + struct iocg_stat { 436 + u64 usage_us; 437 + u64 wait_us; 438 + u64 indebt_us; 439 + u64 indelay_us; 443 440 }; 444 441 445 442 /* per device-cgroup pair */ ··· 476 443 * 477 444 * `last_inuse` remembers `inuse` while an iocg is idle to persist 478 445 * surplus adjustments. 446 + * 447 + * `inuse` may be adjusted dynamically during period. `saved_*` are used 448 + * to determine and track adjustments. 479 449 */ 480 450 u32 cfg_weight; 481 451 u32 weight; 482 452 u32 active; 483 453 u32 inuse; 454 + 484 455 u32 last_inuse; 456 + s64 saved_margin; 485 457 486 458 sector_t cursor; /* to detect randio */ 487 459 ··· 499 461 * `vtime_done` is the same but progressed on completion rather 500 462 * than issue. The delta behind `vtime` represents the cost of 501 463 * currently in-flight IOs. 502 - * 503 - * `last_vtime` is used to remember `vtime` at the end of the last 504 - * period to calculate utilization. 505 464 */ 506 465 atomic64_t vtime; 507 466 atomic64_t done_vtime; 508 467 u64 abs_vdebt; 509 - u64 last_vtime; 468 + 469 + /* current delay in effect and when it started */ 470 + u64 delay; 471 + u64 delay_at; 510 472 511 473 /* 512 474 * The period this iocg was last active in. 
Used for deactivation ··· 515 477 atomic64_t active_period; 516 478 struct list_head active_list; 517 479 518 - /* see __propagate_active_weight() and current_hweight() for details */ 480 + /* see __propagate_weights() and current_hweight() for details */ 519 481 u64 child_active_sum; 520 482 u64 child_inuse_sum; 483 + u64 child_adjusted_sum; 521 484 int hweight_gen; 522 485 u32 hweight_active; 523 486 u32 hweight_inuse; 524 - bool has_surplus; 487 + u32 hweight_donating; 488 + u32 hweight_after_donation; 489 + 490 + struct list_head walk_list; 491 + struct list_head surplus_list; 525 492 526 493 struct wait_queue_head waitq; 527 494 struct hrtimer waitq_timer; 528 - struct hrtimer delay_timer; 529 495 530 - /* usage is recorded as fractions of HWEIGHT_WHOLE */ 531 - int usage_idx; 532 - u32 usages[NR_USAGE_SLOTS]; 496 + /* timestamp at the latest activation */ 497 + u64 activated_at; 498 + 499 + /* statistics */ 500 + struct iocg_pcpu_stat __percpu *pcpu_stat; 501 + struct iocg_stat local_stat; 502 + struct iocg_stat desc_stat; 503 + struct iocg_stat last_stat; 504 + u64 last_stat_abs_vusage; 505 + u64 usage_delta_us; 506 + u64 wait_since; 507 + u64 indebt_since; 508 + u64 indelay_since; 533 509 534 510 /* this iocg's depth in the hierarchy and ancestors including self */ 535 511 int level; ··· 558 506 559 507 struct ioc_now { 560 508 u64 now_ns; 561 - u32 now; 509 + u64 now; 562 510 u64 vnow; 563 511 u64 vrate; 564 512 }; ··· 708 656 */ 709 657 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) 710 658 { 711 - return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); 659 + return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); 712 660 } 713 661 714 662 /* ··· 716 664 */ 717 665 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) 718 666 { 719 - return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE); 667 + return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); 720 668 } 721 669 722 - static void iocg_commit_bio(struct ioc_gq *iocg, struct bio 
*bio, u64 cost) 670 + static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, 671 + u64 abs_cost, u64 cost) 723 672 { 673 + struct iocg_pcpu_stat *gcs; 674 + 724 675 bio->bi_iocost_cost = cost; 725 676 atomic64_add(cost, &iocg->vtime); 677 + 678 + gcs = get_cpu_ptr(iocg->pcpu_stat); 679 + local64_add(abs_cost, &gcs->abs_vusage); 680 + put_cpu_ptr(gcs); 681 + } 682 + 683 + static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) 684 + { 685 + if (lock_ioc) { 686 + spin_lock_irqsave(&iocg->ioc->lock, *flags); 687 + spin_lock(&iocg->waitq.lock); 688 + } else { 689 + spin_lock_irqsave(&iocg->waitq.lock, *flags); 690 + } 691 + } 692 + 693 + static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) 694 + { 695 + if (unlock_ioc) { 696 + spin_unlock(&iocg->waitq.lock); 697 + spin_unlock_irqrestore(&iocg->ioc->lock, *flags); 698 + } else { 699 + spin_unlock_irqrestore(&iocg->waitq.lock, *flags); 700 + } 726 701 } 727 702 728 703 #define CREATE_TRACE_POINTS 729 704 #include <trace/events/iocost.h> 705 + 706 + static void ioc_refresh_margins(struct ioc *ioc) 707 + { 708 + struct ioc_margins *margins = &ioc->margins; 709 + u32 period_us = ioc->period_us; 710 + u64 vrate = ioc->vtime_base_rate; 711 + 712 + margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; 713 + margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; 714 + margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; 715 + } 730 716 731 717 /* latency Qos params changed, update period_us and all the dependent params */ 732 718 static void ioc_refresh_period_us(struct ioc *ioc) ··· 799 709 800 710 /* calculate dependent params */ 801 711 ioc->period_us = period_us; 802 - ioc->margin_us = period_us * MARGIN_PCT / 100; 803 - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( 804 - period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100); 712 + ioc->timer_slack_ns = div64_u64( 713 + (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, 714 + 100); 715 + 
ioc_refresh_margins(ioc); 805 716 } 806 717 807 718 static int ioc_autop_idx(struct ioc *ioc) ··· 829 738 return idx; 830 739 831 740 /* step up/down based on the vrate */ 832 - vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100, 833 - VTIME_PER_USEC); 741 + vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); 834 742 now_ns = ktime_get_ns(); 835 743 836 744 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { ··· 937 847 return true; 938 848 } 939 849 850 + /* 851 + * When an iocg accumulates too much vtime or gets deactivated, we throw away 852 + * some vtime, which lowers the overall device utilization. As the exact amount 853 + * which is being thrown away is known, we can compensate by accelerating the 854 + * vrate accordingly so that the extra vtime generated in the current period 855 + * matches what got lost. 856 + */ 857 + static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) 858 + { 859 + s64 pleft = ioc->period_at + ioc->period_us - now->now; 860 + s64 vperiod = ioc->period_us * ioc->vtime_base_rate; 861 + s64 vcomp, vcomp_min, vcomp_max; 862 + 863 + lockdep_assert_held(&ioc->lock); 864 + 865 + /* we need some time left in this period */ 866 + if (pleft <= 0) 867 + goto done; 868 + 869 + /* 870 + * Calculate how much vrate should be adjusted to offset the error. 871 + * Limit the amount of adjustment and deduct the adjusted amount from 872 + * the error. 
873 + */ 874 + vcomp = -div64_s64(ioc->vtime_err, pleft); 875 + vcomp_min = -(ioc->vtime_base_rate >> 1); 876 + vcomp_max = ioc->vtime_base_rate; 877 + vcomp = clamp(vcomp, vcomp_min, vcomp_max); 878 + 879 + ioc->vtime_err += vcomp * pleft; 880 + 881 + atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); 882 + done: 883 + /* bound how much error can accumulate */ 884 + ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); 885 + } 886 + 940 887 /* take a snapshot of the current [v]time and vrate */ 941 888 static void ioc_now(struct ioc *ioc, struct ioc_now *now) 942 889 { ··· 1013 886 1014 887 /* 1015 888 * Update @iocg's `active` and `inuse` to @active and @inuse, update level 1016 - * weight sums and propagate upwards accordingly. 889 + * weight sums and propagate upwards accordingly. If @save, the current margin 890 + * is saved to be used as reference for later inuse in-period adjustments. 1017 891 */ 1018 - static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) 892 + static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, 893 + bool save, struct ioc_now *now) 1019 894 { 1020 895 struct ioc *ioc = iocg->ioc; 1021 896 int lvl; 1022 897 1023 898 lockdep_assert_held(&ioc->lock); 1024 899 1025 - inuse = min(active, inuse); 900 + inuse = clamp_t(u32, inuse, 1, active); 901 + 902 + iocg->last_inuse = iocg->inuse; 903 + if (save) 904 + iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); 905 + 906 + if (active == iocg->active && inuse == iocg->inuse) 907 + return; 1026 908 1027 909 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { 1028 910 struct ioc_gq *parent = iocg->ancestors[lvl]; ··· 1069 933 ioc->weights_updated = true; 1070 934 } 1071 935 1072 - static void commit_active_weights(struct ioc *ioc) 936 + static void commit_weights(struct ioc *ioc) 1073 937 { 1074 938 lockdep_assert_held(&ioc->lock); 1075 939 ··· 1081 945 } 1082 946 } 1083 947 1084 - static void propagate_active_weight(struct ioc_gq 
*iocg, u32 active, u32 inuse) 948 + static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, 949 + bool save, struct ioc_now *now) 1085 950 { 1086 - __propagate_active_weight(iocg, active, inuse); 1087 - commit_active_weights(iocg->ioc); 951 + __propagate_weights(iocg, active, inuse, save, now); 952 + commit_weights(iocg->ioc); 1088 953 } 1089 954 1090 955 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) ··· 1101 964 goto out; 1102 965 1103 966 /* 1104 - * Paired with wmb in commit_active_weights(). If we saw the 1105 - * updated hweight_gen, all the weight updates from 1106 - * __propagate_active_weight() are visible too. 967 + * Paired with wmb in commit_weights(). If we saw the updated 968 + * hweight_gen, all the weight updates from __propagate_weights() are 969 + * visible too. 1107 970 * 1108 971 * We can race with weight updates during calculation and get it 1109 972 * wrong. However, hweight_gen would have changed and a future ··· 1112 975 */ 1113 976 smp_rmb(); 1114 977 1115 - hwa = hwi = HWEIGHT_WHOLE; 978 + hwa = hwi = WEIGHT_ONE; 1116 979 for (lvl = 0; lvl <= iocg->level - 1; lvl++) { 1117 980 struct ioc_gq *parent = iocg->ancestors[lvl]; 1118 981 struct ioc_gq *child = iocg->ancestors[lvl + 1]; 1119 - u32 active_sum = READ_ONCE(parent->child_active_sum); 1120 - u32 inuse_sum = READ_ONCE(parent->child_inuse_sum); 982 + u64 active_sum = READ_ONCE(parent->child_active_sum); 983 + u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); 1121 984 u32 active = READ_ONCE(child->active); 1122 985 u32 inuse = READ_ONCE(child->inuse); 1123 986 ··· 1125 988 if (!active_sum || !inuse_sum) 1126 989 continue; 1127 990 1128 - active_sum = max(active, active_sum); 1129 - hwa = hwa * active / active_sum; /* max 16bits * 10000 */ 991 + active_sum = max_t(u64, active, active_sum); 992 + hwa = div64_u64((u64)hwa * active, active_sum); 1130 993 1131 - inuse_sum = max(inuse, inuse_sum); 1132 - hwi = hwi * inuse / inuse_sum; /* 
max 16bits * 10000 */ 994 + inuse_sum = max_t(u64, inuse, inuse_sum); 995 + hwi = div64_u64((u64)hwi * inuse, inuse_sum); 1133 996 } 1134 997 1135 998 iocg->hweight_active = max_t(u32, hwa, 1); ··· 1142 1005 *hw_inusep = iocg->hweight_inuse; 1143 1006 } 1144 1007 1145 - static void weight_updated(struct ioc_gq *iocg) 1008 + /* 1009 + * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the 1010 + * other weights stay unchanged. 1011 + */ 1012 + static u32 current_hweight_max(struct ioc_gq *iocg) 1013 + { 1014 + u32 hwm = WEIGHT_ONE; 1015 + u32 inuse = iocg->active; 1016 + u64 child_inuse_sum; 1017 + int lvl; 1018 + 1019 + lockdep_assert_held(&iocg->ioc->lock); 1020 + 1021 + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { 1022 + struct ioc_gq *parent = iocg->ancestors[lvl]; 1023 + struct ioc_gq *child = iocg->ancestors[lvl + 1]; 1024 + 1025 + child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; 1026 + hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); 1027 + inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, 1028 + parent->child_active_sum); 1029 + } 1030 + 1031 + return max_t(u32, hwm, 1); 1032 + } 1033 + 1034 + static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) 1146 1035 { 1147 1036 struct ioc *ioc = iocg->ioc; 1148 1037 struct blkcg_gq *blkg = iocg_to_blkg(iocg); ··· 1179 1016 1180 1017 weight = iocg->cfg_weight ?: iocc->dfl_weight; 1181 1018 if (weight != iocg->weight && iocg->active) 1182 - propagate_active_weight(iocg, weight, 1183 - DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); 1019 + propagate_weights(iocg, weight, iocg->inuse, true, now); 1184 1020 iocg->weight = weight; 1185 1021 } 1186 1022 1187 1023 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) 1188 1024 { 1189 1025 struct ioc *ioc = iocg->ioc; 1190 - u64 last_period, cur_period, max_period_delta; 1191 - u64 vtime, vmargin, vmin; 1026 + u64 last_period, cur_period; 1027 + u64 vtime, vtarget; 1192 1028 int i; 
1193 1029 1194 1030 /* ··· 1226 1064 goto fail_unlock; 1227 1065 1228 1066 /* 1229 - * vtime may wrap when vrate is raised substantially due to 1230 - * underestimated IO costs. Look at the period and ignore its 1231 - * vtime if the iocg has been idle for too long. Also, cap the 1232 - * budget it can start with to the margin. 1067 + * Always start with the target budget. On deactivation, we throw away 1068 + * anything above it. 1233 1069 */ 1234 - max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); 1070 + vtarget = now->vnow - ioc->margins.target; 1235 1071 vtime = atomic64_read(&iocg->vtime); 1236 - vmargin = ioc->margin_us * now->vrate; 1237 - vmin = now->vnow - vmargin; 1238 1072 1239 - if (last_period + max_period_delta < cur_period || 1240 - time_before64(vtime, vmin)) { 1241 - atomic64_add(vmin - vtime, &iocg->vtime); 1242 - atomic64_add(vmin - vtime, &iocg->done_vtime); 1243 - vtime = vmin; 1244 - } 1073 + atomic64_add(vtarget - vtime, &iocg->vtime); 1074 + atomic64_add(vtarget - vtime, &iocg->done_vtime); 1075 + vtime = vtarget; 1245 1076 1246 1077 /* 1247 1078 * Activate, propagate weight and start period timer if not ··· 1243 1088 */ 1244 1089 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; 1245 1090 list_add(&iocg->active_list, &ioc->active_iocgs); 1246 - propagate_active_weight(iocg, iocg->weight, 1247 - iocg->last_inuse ?: iocg->weight); 1091 + 1092 + propagate_weights(iocg, iocg->weight, 1093 + iocg->last_inuse ?: iocg->weight, true, now); 1248 1094 1249 1095 TRACE_IOCG_PATH(iocg_activate, iocg, now, 1250 1096 last_period, cur_period, vtime); 1251 1097 1252 - iocg->last_vtime = vtime; 1098 + iocg->activated_at = now->now; 1253 1099 1254 1100 if (ioc->running == IOC_IDLE) { 1255 1101 ioc->running = IOC_RUNNING; 1102 + ioc->debt_busy_at = now->now; 1256 1103 ioc_start_period(ioc, now); 1257 1104 } 1258 1105 ··· 1265 1108 fail_unlock: 1266 1109 spin_unlock_irq(&ioc->lock); 1267 1110 return false; 1111 + } 1112 + 1113 + 
static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) 1114 + { 1115 + struct ioc *ioc = iocg->ioc; 1116 + struct blkcg_gq *blkg = iocg_to_blkg(iocg); 1117 + u64 tdelta, delay, new_delay; 1118 + s64 vover, vover_pct; 1119 + u32 hwa; 1120 + 1121 + lockdep_assert_held(&iocg->waitq.lock); 1122 + 1123 + /* calculate the current delay in effect - 1/2 every second */ 1124 + tdelta = now->now - iocg->delay_at; 1125 + if (iocg->delay) 1126 + delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); 1127 + else 1128 + delay = 0; 1129 + 1130 + /* calculate the new delay from the debt amount */ 1131 + current_hweight(iocg, &hwa, NULL); 1132 + vover = atomic64_read(&iocg->vtime) + 1133 + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; 1134 + vover_pct = div64_s64(100 * vover, 1135 + ioc->period_us * ioc->vtime_base_rate); 1136 + 1137 + if (vover_pct <= MIN_DELAY_THR_PCT) 1138 + new_delay = 0; 1139 + else if (vover_pct >= MAX_DELAY_THR_PCT) 1140 + new_delay = MAX_DELAY; 1141 + else 1142 + new_delay = MIN_DELAY + 1143 + div_u64((MAX_DELAY - MIN_DELAY) * 1144 + (vover_pct - MIN_DELAY_THR_PCT), 1145 + MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); 1146 + 1147 + /* pick the higher one and apply */ 1148 + if (new_delay > delay) { 1149 + iocg->delay = new_delay; 1150 + iocg->delay_at = now->now; 1151 + delay = new_delay; 1152 + } 1153 + 1154 + if (delay >= MIN_DELAY) { 1155 + if (!iocg->indelay_since) 1156 + iocg->indelay_since = now->now; 1157 + blkcg_set_delay(blkg, delay * NSEC_PER_USEC); 1158 + return true; 1159 + } else { 1160 + if (iocg->indelay_since) { 1161 + iocg->local_stat.indelay_us += now->now - iocg->indelay_since; 1162 + iocg->indelay_since = 0; 1163 + } 1164 + iocg->delay = 0; 1165 + blkcg_clear_delay(blkg); 1166 + return false; 1167 + } 1168 + } 1169 + 1170 + static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, 1171 + struct ioc_now *now) 1172 + { 1173 + struct iocg_pcpu_stat *gcs; 1174 + 1175 + lockdep_assert_held(&iocg->ioc->lock); 1176 + 
lockdep_assert_held(&iocg->waitq.lock); 1177 + WARN_ON_ONCE(list_empty(&iocg->active_list)); 1178 + 1179 + /* 1180 + * Once in debt, debt handling owns inuse. @iocg stays at the minimum 1181 + * inuse donating all of its share to others until its debt is paid off. 1182 + */ 1183 + if (!iocg->abs_vdebt && abs_cost) { 1184 + iocg->indebt_since = now->now; 1185 + propagate_weights(iocg, iocg->active, 0, false, now); 1186 + } 1187 + 1188 + iocg->abs_vdebt += abs_cost; 1189 + 1190 + gcs = get_cpu_ptr(iocg->pcpu_stat); 1191 + local64_add(abs_cost, &gcs->abs_vusage); 1192 + put_cpu_ptr(gcs); 1193 + } 1194 + 1195 + static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, 1196 + struct ioc_now *now) 1197 + { 1198 + lockdep_assert_held(&iocg->ioc->lock); 1199 + lockdep_assert_held(&iocg->waitq.lock); 1200 + 1201 + /* make sure that nobody messed with @iocg */ 1202 + WARN_ON_ONCE(list_empty(&iocg->active_list)); 1203 + WARN_ON_ONCE(iocg->inuse > 1); 1204 + 1205 + iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); 1206 + 1207 + /* if debt is paid in full, restore inuse */ 1208 + if (!iocg->abs_vdebt) { 1209 + iocg->local_stat.indebt_us += now->now - iocg->indebt_since; 1210 + iocg->indebt_since = 0; 1211 + 1212 + propagate_weights(iocg, iocg->active, iocg->last_inuse, 1213 + false, now); 1214 + } 1268 1215 } 1269 1216 1270 1217 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, ··· 1383 1122 if (ctx->vbudget < 0) 1384 1123 return -1; 1385 1124 1386 - iocg_commit_bio(ctx->iocg, wait->bio, cost); 1125 + iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); 1387 1126 1388 1127 /* 1389 1128 * autoremove_wake_function() removes the wait entry only when it ··· 1397 1136 return 0; 1398 1137 } 1399 1138 1400 - static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) 1139 + /* 1140 + * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters 1141 + * accordingly.
When @pay_debt is %true, the caller must be holding ioc->lock in 1142 + * addition to iocg->waitq.lock. 1143 + */ 1144 + static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, 1145 + struct ioc_now *now) 1401 1146 { 1402 1147 struct ioc *ioc = iocg->ioc; 1403 1148 struct iocg_wake_ctx ctx = { .iocg = iocg }; 1404 - u64 margin_ns = (u64)(ioc->period_us * 1405 - WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; 1406 - u64 vdebt, vshortage, expires, oexpires; 1149 + u64 vshortage, expires, oexpires; 1407 1150 s64 vbudget; 1408 - u32 hw_inuse; 1151 + u32 hwa; 1409 1152 1410 1153 lockdep_assert_held(&iocg->waitq.lock); 1411 1154 1412 - current_hweight(iocg, NULL, &hw_inuse); 1155 + current_hweight(iocg, &hwa, NULL); 1413 1156 vbudget = now->vnow - atomic64_read(&iocg->vtime); 1414 1157 1415 1158 /* pay off debt */ 1416 - vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); 1417 - if (vdebt && vbudget > 0) { 1418 - u64 delta = min_t(u64, vbudget, vdebt); 1419 - u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), 1420 - iocg->abs_vdebt); 1159 + if (pay_debt && iocg->abs_vdebt && vbudget > 0) { 1160 + u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); 1161 + u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); 1162 + u64 vpay = abs_cost_to_cost(abs_vpay, hwa); 1421 1163 1422 - atomic64_add(delta, &iocg->vtime); 1423 - atomic64_add(delta, &iocg->done_vtime); 1424 - iocg->abs_vdebt -= abs_delta; 1164 + lockdep_assert_held(&ioc->lock); 1165 + 1166 + atomic64_add(vpay, &iocg->vtime); 1167 + atomic64_add(vpay, &iocg->done_vtime); 1168 + iocg_pay_debt(iocg, abs_vpay, now); 1169 + vbudget -= vpay; 1170 + } 1171 + 1172 + if (iocg->abs_vdebt || iocg->delay) 1173 + iocg_kick_delay(iocg, now); 1174 + 1175 + /* 1176 + * Debt can still be outstanding if we haven't paid all yet or the 1177 + * caller raced and called without @pay_debt. Shouldn't wake up waiters 1178 + * under debt. Make sure @vbudget reflects the outstanding amount and is 1179 + * not positive. 
1180 + */ 1181 + if (iocg->abs_vdebt) { 1182 + s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); 1183 + vbudget = min_t(s64, 0, vbudget - vdebt); 1425 1184 } 1426 1185 1427 1186 /* 1428 - * Wake up the ones which are due and see how much vtime we'll need 1429 - * for the next one. 1187 + * Wake up the ones which are due and see how much vtime we'll need for 1188 + * the next one. As paying off debt restores hw_inuse, it must be read 1189 + * after the above debt payment. 1430 1190 */ 1431 - ctx.hw_inuse = hw_inuse; 1432 - ctx.vbudget = vbudget - vdebt; 1191 + ctx.vbudget = vbudget; 1192 + current_hweight(iocg, NULL, &ctx.hw_inuse); 1193 + 1433 1194 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); 1434 - if (!waitqueue_active(&iocg->waitq)) 1195 + 1196 + if (!waitqueue_active(&iocg->waitq)) { 1197 + if (iocg->wait_since) { 1198 + iocg->local_stat.wait_us += now->now - iocg->wait_since; 1199 + iocg->wait_since = 0; 1200 + } 1435 1201 return; 1202 + } 1203 + 1204 + if (!iocg->wait_since) 1205 + iocg->wait_since = now->now; 1206 + 1436 1207 if (WARN_ON_ONCE(ctx.vbudget >= 0)) 1437 1208 return; 1438 1209 1439 - /* determine next wakeup, add a quarter margin to guarantee chunking */ 1210 + /* determine next wakeup, add a timer margin to guarantee chunking */ 1440 1211 vshortage = -ctx.vbudget; 1441 1212 expires = now->now_ns + 1442 - DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; 1443 - expires += margin_ns / 4; 1213 + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * 1214 + NSEC_PER_USEC; 1215 + expires += ioc->timer_slack_ns; 1444 1216 1445 1217 /* if already active and close enough, don't bother */ 1446 1218 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); 1447 1219 if (hrtimer_is_queued(&iocg->waitq_timer) && 1448 - abs(oexpires - expires) <= margin_ns / 4) 1220 + abs(oexpires - expires) <= ioc->timer_slack_ns) 1449 1221 return; 1450 1222 1451 1223 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), 1452 - 
margin_ns / 4, HRTIMER_MODE_ABS); 1224 + ioc->timer_slack_ns, HRTIMER_MODE_ABS); 1453 1225 } 1454 1226 1455 1227 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) 1456 1228 { 1457 1229 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); 1230 + bool pay_debt = READ_ONCE(iocg->abs_vdebt); 1458 1231 struct ioc_now now; 1459 1232 unsigned long flags; 1460 1233 1461 1234 ioc_now(iocg->ioc, &now); 1462 1235 1463 - spin_lock_irqsave(&iocg->waitq.lock, flags); 1464 - iocg_kick_waitq(iocg, &now); 1465 - spin_unlock_irqrestore(&iocg->waitq.lock, flags); 1466 - 1467 - return HRTIMER_NORESTART; 1468 - } 1469 - 1470 - static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) 1471 - { 1472 - struct ioc *ioc = iocg->ioc; 1473 - struct blkcg_gq *blkg = iocg_to_blkg(iocg); 1474 - u64 vtime = atomic64_read(&iocg->vtime); 1475 - u64 vmargin = ioc->margin_us * now->vrate; 1476 - u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; 1477 - u64 delta_ns, expires, oexpires; 1478 - u32 hw_inuse; 1479 - 1480 - lockdep_assert_held(&iocg->waitq.lock); 1481 - 1482 - /* debt-adjust vtime */ 1483 - current_hweight(iocg, NULL, &hw_inuse); 1484 - vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); 1485 - 1486 - /* 1487 - * Clear or maintain depending on the overage. Non-zero vdebt is what 1488 - * guarantees that @iocg is online and future iocg_kick_delay() will 1489 - * clear use_delay. Don't leave it on when there's no vdebt. 
1490 - */ 1491 - if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { 1492 - blkcg_clear_delay(blkg); 1493 - return false; 1494 - } 1495 - if (!atomic_read(&blkg->use_delay) && 1496 - time_before_eq64(vtime, now->vnow + vmargin)) 1497 - return false; 1498 - 1499 - /* use delay */ 1500 - delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, 1501 - now->vrate) * NSEC_PER_USEC; 1502 - blkcg_set_delay(blkg, delta_ns); 1503 - expires = now->now_ns + delta_ns; 1504 - 1505 - /* if already active and close enough, don't bother */ 1506 - oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); 1507 - if (hrtimer_is_queued(&iocg->delay_timer) && 1508 - abs(oexpires - expires) <= margin_ns / 4) 1509 - return true; 1510 - 1511 - hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), 1512 - margin_ns / 4, HRTIMER_MODE_ABS); 1513 - return true; 1514 - } 1515 - 1516 - static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) 1517 - { 1518 - struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); 1519 - struct ioc_now now; 1520 - unsigned long flags; 1521 - 1522 - spin_lock_irqsave(&iocg->waitq.lock, flags); 1523 - ioc_now(iocg->ioc, &now); 1524 - iocg_kick_delay(iocg, &now); 1525 - spin_unlock_irqrestore(&iocg->waitq.lock, flags); 1236 + iocg_lock(iocg, pay_debt, &flags); 1237 + iocg_kick_waitq(iocg, pay_debt, &now); 1238 + iocg_unlock(iocg, pay_debt, &flags); 1526 1239 1527 1240 return HRTIMER_NORESTART; 1528 1241 } ··· 1513 1278 u64 this_rq_wait_ns; 1514 1279 1515 1280 for (rw = READ; rw <= WRITE; rw++) { 1516 - u32 this_met = READ_ONCE(stat->missed[rw].nr_met); 1517 - u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed); 1281 + u32 this_met = local_read(&stat->missed[rw].nr_met); 1282 + u32 this_missed = local_read(&stat->missed[rw].nr_missed); 1518 1283 1519 1284 nr_met[rw] += this_met - stat->missed[rw].last_met; 1520 1285 nr_missed[rw] += this_missed - stat->missed[rw].last_missed; ··· 1522 1287 
stat->missed[rw].last_missed = this_missed; 1523 1288 } 1524 1289 1525 - this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns); 1290 + this_rq_wait_ns = local64_read(&stat->rq_wait_ns); 1526 1291 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; 1527 1292 stat->last_rq_wait_ns = this_rq_wait_ns; 1528 1293 } ··· 1557 1322 return true; 1558 1323 } 1559 1324 1560 - /* returns usage with margin added if surplus is large enough */ 1561 - static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) 1325 + /* 1326 + * Call this function on the target leaf @iocg's to build pre-order traversal 1327 + * list of all the ancestors in @inner_walk. The inner nodes are linked through 1328 + * ->walk_list and the caller is responsible for dissolving the list after use. 1329 + */ 1330 + static void iocg_build_inner_walk(struct ioc_gq *iocg, 1331 + struct list_head *inner_walk) 1562 1332 { 1563 - /* add margin */ 1564 - usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); 1565 - usage += SURPLUS_SCALE_ABS; 1333 + int lvl; 1566 1334 1567 - /* don't bother if the surplus is too small */ 1568 - if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse) 1569 - return 0; 1335 + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); 1570 1336 1571 - return usage; 1337 + /* find the first ancestor which hasn't been visited yet */ 1338 + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { 1339 + if (!list_empty(&iocg->ancestors[lvl]->walk_list)) 1340 + break; 1341 + } 1342 + 1343 + /* walk down and visit the inner nodes to get pre-order traversal */ 1344 + while (++lvl <= iocg->level - 1) { 1345 + struct ioc_gq *inner = iocg->ancestors[lvl]; 1346 + 1347 + /* record traversal order */ 1348 + list_add_tail(&inner->walk_list, inner_walk); 1349 + } 1350 + } 1351 + 1352 + /* collect per-cpu counters and propagate the deltas to the parent */ 1353 + static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) 1354 + { 1355 + struct ioc *ioc = iocg->ioc; 1356 + struct iocg_stat new_stat; 1357 + u64 abs_vusage 
= 0; 1358 + u64 vusage_delta; 1359 + int cpu; 1360 + 1361 + lockdep_assert_held(&iocg->ioc->lock); 1362 + 1363 + /* collect per-cpu counters */ 1364 + for_each_possible_cpu(cpu) { 1365 + abs_vusage += local64_read( 1366 + per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); 1367 + } 1368 + vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; 1369 + iocg->last_stat_abs_vusage = abs_vusage; 1370 + 1371 + iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); 1372 + iocg->local_stat.usage_us += iocg->usage_delta_us; 1373 + 1374 + /* propagate upwards */ 1375 + new_stat.usage_us = 1376 + iocg->local_stat.usage_us + iocg->desc_stat.usage_us; 1377 + new_stat.wait_us = 1378 + iocg->local_stat.wait_us + iocg->desc_stat.wait_us; 1379 + new_stat.indebt_us = 1380 + iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us; 1381 + new_stat.indelay_us = 1382 + iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us; 1383 + 1384 + /* propagate the deltas to the parent */ 1385 + if (iocg->level > 0) { 1386 + struct iocg_stat *parent_stat = 1387 + &iocg->ancestors[iocg->level - 1]->desc_stat; 1388 + 1389 + parent_stat->usage_us += 1390 + new_stat.usage_us - iocg->last_stat.usage_us; 1391 + parent_stat->wait_us += 1392 + new_stat.wait_us - iocg->last_stat.wait_us; 1393 + parent_stat->indebt_us += 1394 + new_stat.indebt_us - iocg->last_stat.indebt_us; 1395 + parent_stat->indelay_us += 1396 + new_stat.indelay_us - iocg->last_stat.indelay_us; 1397 + } 1398 + 1399 + iocg->last_stat = new_stat; 1400 + } 1401 + 1402 + /* get stat counters ready for reading on all active iocgs */ 1403 + static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) 1404 + { 1405 + LIST_HEAD(inner_walk); 1406 + struct ioc_gq *iocg, *tiocg; 1407 + 1408 + /* flush leaves and build inner node walk list */ 1409 + list_for_each_entry(iocg, target_iocgs, active_list) { 1410 + iocg_flush_stat_one(iocg, now); 1411 + iocg_build_inner_walk(iocg, &inner_walk); 1412 + } 1413 + 1414 + /* 
keep flushing upwards by walking the inner list backwards */ 1415 + list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { 1416 + iocg_flush_stat_one(iocg, now); 1417 + list_del_init(&iocg->walk_list); 1418 + } 1419 + } 1420 + 1421 + /* 1422 + * Determine what @iocg's hweight_inuse should be after donating unused 1423 + * capacity. @hwm is the upper bound and used to signal no donation. This 1424 + * function also throws away @iocg's excess budget. 1425 + */ 1426 + static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, 1427 + u32 usage, struct ioc_now *now) 1428 + { 1429 + struct ioc *ioc = iocg->ioc; 1430 + u64 vtime = atomic64_read(&iocg->vtime); 1431 + s64 excess, delta, target, new_hwi; 1432 + 1433 + /* debt handling owns inuse for debtors */ 1434 + if (iocg->abs_vdebt) 1435 + return 1; 1436 + 1437 + /* see whether minimum margin requirement is met */ 1438 + if (waitqueue_active(&iocg->waitq) || 1439 + time_after64(vtime, now->vnow - ioc->margins.min)) 1440 + return hwm; 1441 + 1442 + /* throw away excess above target */ 1443 + excess = now->vnow - vtime - ioc->margins.target; 1444 + if (excess > 0) { 1445 + atomic64_add(excess, &iocg->vtime); 1446 + atomic64_add(excess, &iocg->done_vtime); 1447 + vtime += excess; 1448 + ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); 1449 + } 1450 + 1451 + /* 1452 + * Let's say the distance between iocg's and device's vtimes as a 1453 + * fraction of period duration is delta. Assuming that the iocg will 1454 + * consume the usage determined above, we want to determine new_hwi so 1455 + * that delta equals MARGIN_TARGET at the end of the next period. 
1456 + * 1457 + * We need to execute usage worth of IOs while spending the sum of the 1458 + * new budget (1 - MARGIN_TARGET) and the leftover from the last period 1459 + * (delta): 1460 + * 1461 + * usage = (1 - MARGIN_TARGET + delta) * new_hwi 1462 + * 1463 + * Therefore, the new_hwi is: 1464 + * 1465 + * new_hwi = usage / (1 - MARGIN_TARGET + delta) 1466 + */ 1467 + delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), 1468 + now->vnow - ioc->period_at_vtime); 1469 + target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; 1470 + new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); 1471 + 1472 + return clamp_t(s64, new_hwi, 1, hwm); 1473 + } 1474 + 1475 + /* 1476 + * For work-conservation, an iocg which isn't using all of its share should 1477 + * donate the leftover to other iocgs. There are two ways to achieve this - 1. 1478 + * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. 1479 + * 1480 + * #1 is mathematically simpler but has the drawback of requiring synchronous 1481 + * global hweight_inuse updates when idle iocg's get activated or inuse weights 1482 + * change due to donation snapbacks as it has the possibility of grossly 1483 + * overshooting what's allowed by the model and vrate. 1484 + * 1485 + * #2 is inherently safe with local operations. The donating iocg can easily 1486 + * snap back to higher weights when needed without worrying about impacts on 1487 + * other nodes as the impacts will be inherently correct. This also makes idle 1488 + * iocg activations safe. The only effect activations have is decreasing 1489 + * hweight_inuse of others, the right solution to which is for those iocgs to 1490 + * snap back to higher weights. 1491 + * 1492 + * So, we go with #2. The challenge is calculating how each donating iocg's 1493 + * inuse should be adjusted to achieve the target donation amounts. This is done 1494 + * using Andy's method described in the following pdf. 
1495 + * 1496 + * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo 1497 + * 1498 + * Given the weights and target after-donation hweight_inuse values, Andy's 1499 + * method determines how the proportional distribution should look at each 1500 + * sibling level to maintain the relative relationship between all non-donating 1501 + * pairs. To roughly summarize, it divides the tree into donating and 1502 + * non-donating parts, calculates global donation rate which is used to 1503 + * determine the target hweight_inuse for each node, and then derives per-level 1504 + * proportions. 1505 + * 1506 + * The following pdf shows that global distribution calculated this way can be 1507 + * achieved by scaling inuse weights of donating leaves and propagating the 1508 + * adjustments upwards proportionally. 1509 + * 1510 + * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE 1511 + * 1512 + * Combining the above two, we can determine how each leaf iocg's inuse should 1513 + * be adjusted to achieve the target donation. 1514 + * 1515 + * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN 1516 + * 1517 + * The inline comments use symbols from the last pdf. 1518 + * 1519 + * b is the sum of the absolute budgets in the subtree. 1 for the root node. 1520 + * f is the sum of the absolute budgets of non-donating nodes in the subtree. 1521 + * t is the sum of the absolute budgets of donating nodes in the subtree. 1522 + * w is the weight of the node. w = w_f + w_t 1523 + * w_f is the non-donating portion of w. w_f = w * f / b 1524 + * w_t is the donating portion of w. w_t = w * t / b 1525 + * s is the sum of all sibling weights. s = Sum(w) for siblings 1526 + * s_f and s_t are the non-donating and donating portions of s. 1527 + * 1528 + * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. 1529 + * w_pt is the donating portion of the parent's weight and w'_pt the same value 1530 + * after adjustments.
Subscript r denotes the root node's values. 1531 + */ 1532 + static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) 1533 + { 1534 + LIST_HEAD(over_hwa); 1535 + LIST_HEAD(inner_walk); 1536 + struct ioc_gq *iocg, *tiocg, *root_iocg; 1537 + u32 after_sum, over_sum, over_target, gamma; 1538 + 1539 + /* 1540 + * It's pretty unlikely but possible for the total sum of 1541 + * hweight_after_donation's to be higher than WEIGHT_ONE, which will 1542 + * confuse the following calculations. If such condition is detected, 1543 + * scale down everyone over its full share equally to keep the sum below 1544 + * WEIGHT_ONE. 1545 + */ 1546 + after_sum = 0; 1547 + over_sum = 0; 1548 + list_for_each_entry(iocg, surpluses, surplus_list) { 1549 + u32 hwa; 1550 + 1551 + current_hweight(iocg, &hwa, NULL); 1552 + after_sum += iocg->hweight_after_donation; 1553 + 1554 + if (iocg->hweight_after_donation > hwa) { 1555 + over_sum += iocg->hweight_after_donation; 1556 + list_add(&iocg->walk_list, &over_hwa); 1557 + } 1558 + } 1559 + 1560 + if (after_sum >= WEIGHT_ONE) { 1561 + /* 1562 + * The delta should be deducted from the over_sum, calculate 1563 + * target over_sum value. 1564 + */ 1565 + u32 over_delta = after_sum - (WEIGHT_ONE - 1); 1566 + WARN_ON_ONCE(over_sum <= over_delta); 1567 + over_target = over_sum - over_delta; 1568 + } else { 1569 + over_target = 0; 1570 + } 1571 + 1572 + list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { 1573 + if (over_target) 1574 + iocg->hweight_after_donation = 1575 + div_u64((u64)iocg->hweight_after_donation * 1576 + over_target, over_sum); 1577 + list_del_init(&iocg->walk_list); 1578 + } 1579 + 1580 + /* 1581 + * Build pre-order inner node walk list and prepare for donation 1582 + * adjustment calculations. 
1583 + */ 1584 + list_for_each_entry(iocg, surpluses, surplus_list) { 1585 + iocg_build_inner_walk(iocg, &inner_walk); 1586 + } 1587 + 1588 + root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); 1589 + WARN_ON_ONCE(root_iocg->level > 0); 1590 + 1591 + list_for_each_entry(iocg, &inner_walk, walk_list) { 1592 + iocg->child_adjusted_sum = 0; 1593 + iocg->hweight_donating = 0; 1594 + iocg->hweight_after_donation = 0; 1595 + } 1596 + 1597 + /* 1598 + * Propagate the donating budget (b_t) and after donation budget (b'_t) 1599 + * up the hierarchy. 1600 + */ 1601 + list_for_each_entry(iocg, surpluses, surplus_list) { 1602 + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; 1603 + 1604 + parent->hweight_donating += iocg->hweight_donating; 1605 + parent->hweight_after_donation += iocg->hweight_after_donation; 1606 + } 1607 + 1608 + list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { 1609 + if (iocg->level > 0) { 1610 + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; 1611 + 1612 + parent->hweight_donating += iocg->hweight_donating; 1613 + parent->hweight_after_donation += iocg->hweight_after_donation; 1614 + } 1615 + } 1616 + 1617 + /* 1618 + * Calculate inner hwa's (b) and make sure the donation values are 1619 + * within the accepted ranges as we're doing low res calculations with 1620 + * roundups. 
1621 + */ 1622 + list_for_each_entry(iocg, &inner_walk, walk_list) { 1623 + if (iocg->level) { 1624 + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; 1625 + 1626 + iocg->hweight_active = DIV64_U64_ROUND_UP( 1627 + (u64)parent->hweight_active * iocg->active, 1628 + parent->child_active_sum); 1629 + 1630 + } 1631 + 1632 + iocg->hweight_donating = min(iocg->hweight_donating, 1633 + iocg->hweight_active); 1634 + iocg->hweight_after_donation = min(iocg->hweight_after_donation, 1635 + iocg->hweight_donating - 1); 1636 + if (WARN_ON_ONCE(iocg->hweight_active <= 1 || 1637 + iocg->hweight_donating <= 1 || 1638 + iocg->hweight_after_donation == 0)) { 1639 + pr_warn("iocg: invalid donation weights in "); 1640 + pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); 1641 + pr_cont(": active=%u donating=%u after=%u\n", 1642 + iocg->hweight_active, iocg->hweight_donating, 1643 + iocg->hweight_after_donation); 1644 + } 1645 + } 1646 + 1647 + /* 1648 + * Calculate the global donation rate (gamma) - the rate to adjust 1649 + * non-donating budgets by. 1650 + * 1651 + * No need to use 64bit multiplication here as the first operand is 1652 + * guaranteed to be smaller than WEIGHT_ONE (1<<16). 1653 + * 1654 + * We know that there are beneficiary nodes and the sum of the donating 1655 + * hweights can't be whole; however, due to the round-ups during hweight 1656 + * calculations, root_iocg->hweight_donating might still end up equal to 1657 + * or greater than whole. Limit the range when calculating the divider. 1658 + * 1659 + * gamma = (1 - t_r') / (1 - t_r) 1660 + */ 1661 + gamma = DIV_ROUND_UP( 1662 + (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, 1663 + WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); 1664 + 1665 + /* 1666 + * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner 1667 + * nodes. 
1668 + */ 1669 + list_for_each_entry(iocg, &inner_walk, walk_list) { 1670 + struct ioc_gq *parent; 1671 + u32 inuse, wpt, wptp; 1672 + u64 st, sf; 1673 + 1674 + if (iocg->level == 0) { 1675 + /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ 1676 + iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( 1677 + iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), 1678 + WEIGHT_ONE - iocg->hweight_after_donation); 1679 + continue; 1680 + } 1681 + 1682 + parent = iocg->ancestors[iocg->level - 1]; 1683 + 1684 + /* b' = gamma * b_f + b_t' */ 1685 + iocg->hweight_inuse = DIV64_U64_ROUND_UP( 1686 + (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), 1687 + WEIGHT_ONE) + iocg->hweight_after_donation; 1688 + 1689 + /* w' = s' * b' / b'_p */ 1690 + inuse = DIV64_U64_ROUND_UP( 1691 + (u64)parent->child_adjusted_sum * iocg->hweight_inuse, 1692 + parent->hweight_inuse); 1693 + 1694 + /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ 1695 + st = DIV64_U64_ROUND_UP( 1696 + iocg->child_active_sum * iocg->hweight_donating, 1697 + iocg->hweight_active); 1698 + sf = iocg->child_active_sum - st; 1699 + wpt = DIV64_U64_ROUND_UP( 1700 + (u64)iocg->active * iocg->hweight_donating, 1701 + iocg->hweight_active); 1702 + wptp = DIV64_U64_ROUND_UP( 1703 + (u64)inuse * iocg->hweight_after_donation, 1704 + iocg->hweight_inuse); 1705 + 1706 + iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); 1707 + } 1708 + 1709 + /* 1710 + * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and 1711 + * we can finally determine leaf adjustments. 1712 + */ 1713 + list_for_each_entry(iocg, surpluses, surplus_list) { 1714 + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; 1715 + u32 inuse; 1716 + 1717 + /* 1718 + * In-debt iocgs participated in the donation calculation with 1719 + * the minimum target hweight_inuse. 
Configuring inuse 1720 + * accordingly would work fine but debt handling expects 1721 + * @iocg->inuse stay at the minimum and we don't wanna 1722 + * interfere. 1723 + */ 1724 + if (iocg->abs_vdebt) { 1725 + WARN_ON_ONCE(iocg->inuse > 1); 1726 + continue; 1727 + } 1728 + 1729 + /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ 1730 + inuse = DIV64_U64_ROUND_UP( 1731 + parent->child_adjusted_sum * iocg->hweight_after_donation, 1732 + parent->hweight_inuse); 1733 + 1734 + TRACE_IOCG_PATH(inuse_transfer, iocg, now, 1735 + iocg->inuse, inuse, 1736 + iocg->hweight_inuse, 1737 + iocg->hweight_after_donation); 1738 + 1739 + __propagate_weights(iocg, iocg->active, inuse, true, now); 1740 + } 1741 + 1742 + /* walk list should be dissolved after use */ 1743 + list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) 1744 + list_del_init(&iocg->walk_list); 1572 1745 } 1573 1746 1574 1747 static void ioc_timer_fn(struct timer_list *timer) ··· 1984 1341 struct ioc *ioc = container_of(timer, struct ioc, timer); 1985 1342 struct ioc_gq *iocg, *tiocg; 1986 1343 struct ioc_now now; 1987 - int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; 1344 + LIST_HEAD(surpluses); 1345 + int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; 1346 + u64 usage_us_sum = 0; 1988 1347 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; 1989 1348 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; 1990 1349 u32 missed_ppm[2], rq_wait_pct; 1991 1350 u64 period_vtime; 1992 - int prev_busy_level, i; 1351 + int prev_busy_level; 1993 1352 1994 1353 /* how were the latencies during the period? 
*/ 1995 1354 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); ··· 2015 1370 */ 2016 1371 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { 2017 1372 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && 2018 - !iocg_is_idle(iocg)) 1373 + !iocg->delay && !iocg_is_idle(iocg)) 2019 1374 continue; 2020 1375 2021 1376 spin_lock(&iocg->waitq.lock); 2022 1377 2023 - if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { 1378 + /* flush wait and indebt stat deltas */ 1379 + if (iocg->wait_since) { 1380 + iocg->local_stat.wait_us += now.now - iocg->wait_since; 1381 + iocg->wait_since = now.now; 1382 + } 1383 + if (iocg->indebt_since) { 1384 + iocg->local_stat.indebt_us += 1385 + now.now - iocg->indebt_since; 1386 + iocg->indebt_since = now.now; 1387 + } 1388 + if (iocg->indelay_since) { 1389 + iocg->local_stat.indelay_us += 1390 + now.now - iocg->indelay_since; 1391 + iocg->indelay_since = now.now; 1392 + } 1393 + 1394 + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || 1395 + iocg->delay) { 2024 1396 /* might be oversleeping vtime / hweight changes, kick */ 2025 - iocg_kick_waitq(iocg, &now); 2026 - iocg_kick_delay(iocg, &now); 1397 + iocg_kick_waitq(iocg, true, &now); 1398 + if (iocg->abs_vdebt) 1399 + nr_debtors++; 2027 1400 } else if (iocg_is_idle(iocg)) { 2028 1401 /* no waiter and idle, deactivate */ 2029 - iocg->last_inuse = iocg->inuse; 2030 - __propagate_active_weight(iocg, 0, 0); 1402 + u64 vtime = atomic64_read(&iocg->vtime); 1403 + s64 excess; 1404 + 1405 + /* 1406 + * @iocg has been inactive for a full duration and will 1407 + * have a high budget. Account anything above target as 1408 + * error and throw away. On reactivation, it'll start 1409 + * with the target budget. 
1410 + */ 1411 + excess = now.vnow - vtime - ioc->margins.target; 1412 + if (excess > 0) { 1413 + u32 old_hwi; 1414 + 1415 + current_hweight(iocg, NULL, &old_hwi); 1416 + ioc->vtime_err -= div64_u64(excess * old_hwi, 1417 + WEIGHT_ONE); 1418 + } 1419 + 1420 + __propagate_weights(iocg, 0, 0, false, &now); 2031 1421 list_del_init(&iocg->active_list); 2032 1422 } 2033 1423 2034 1424 spin_unlock(&iocg->waitq.lock); 2035 1425 } 2036 - commit_active_weights(ioc); 1426 + commit_weights(ioc); 2037 1427 2038 - /* calc usages and see whether some weights need to be moved around */ 1428 + /* 1429 + * Wait and indebt stat are flushed above and the donation calculation 1430 + * below needs updated usage stat. Let's bring stat up-to-date. 1431 + */ 1432 + iocg_flush_stat(&ioc->active_iocgs, &now); 1433 + 1434 + /* calc usage and see whether some weights need to be moved around */ 2039 1435 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { 2040 - u64 vdone, vtime, vusage, vmargin, vmin; 2041 - u32 hw_active, hw_inuse, usage; 1436 + u64 vdone, vtime, usage_us, usage_dur; 1437 + u32 usage, hw_active, hw_inuse; 2042 1438 2043 1439 /* 2044 1440 * Collect unused and wind vtime closer to vnow to prevent ··· 2103 1417 time_before64(vdone, now.vnow - period_vtime)) 2104 1418 nr_lagging++; 2105 1419 2106 - if (waitqueue_active(&iocg->waitq)) 2107 - vusage = now.vnow - iocg->last_vtime; 2108 - else if (time_before64(iocg->last_vtime, vtime)) 2109 - vusage = vtime - iocg->last_vtime; 2110 - else 2111 - vusage = 0; 2112 - 2113 - iocg->last_vtime += vusage; 2114 1420 /* 2115 - * Factor in in-flight vtime into vusage to avoid 2116 - * high-latency completions appearing as idle. This should 2117 - * be done after the above ->last_time adjustment. 1421 + * Determine absolute usage factoring in in-flight IOs to avoid 1422 + * high-latency completions appearing as idle. 
2118 1423 */ 2119 - vusage = max(vusage, vtime - vdone); 1424 + usage_us = iocg->usage_delta_us; 1425 + usage_us_sum += usage_us; 2120 1426 2121 - /* calculate hweight based usage ratio and record */ 2122 - if (vusage) { 2123 - usage = DIV64_U64_ROUND_UP(vusage * hw_inuse, 2124 - period_vtime); 2125 - iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; 2126 - iocg->usages[iocg->usage_idx] = usage; 2127 - } else { 2128 - usage = 0; 1427 + if (vdone != vtime) { 1428 + u64 inflight_us = DIV64_U64_ROUND_UP( 1429 + cost_to_abs_cost(vtime - vdone, hw_inuse), 1430 + ioc->vtime_base_rate); 1431 + usage_us = max(usage_us, inflight_us); 2129 1432 } 2130 1433 1434 + /* convert to hweight based usage ratio */ 1435 + if (time_after64(iocg->activated_at, ioc->period_at)) 1436 + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); 1437 + else 1438 + usage_dur = max_t(u64, now.now - ioc->period_at, 1); 1439 + 1440 + usage = clamp_t(u32, 1441 + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, 1442 + usage_dur), 1443 + 1, WEIGHT_ONE); 1444 + 2131 1445 /* see whether there's surplus vtime */ 2132 - vmargin = ioc->margin_us * now.vrate; 2133 - vmin = now.vnow - vmargin; 1446 + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); 1447 + if (hw_inuse < hw_active || 1448 + (!waitqueue_active(&iocg->waitq) && 1449 + time_before64(vtime, now.vnow - ioc->margins.low))) { 1450 + u32 hwa, old_hwi, hwm, new_hwi; 2134 1451 2135 - iocg->has_surplus = false; 2136 - 2137 - if (!waitqueue_active(&iocg->waitq) && 2138 - time_before64(vtime, vmin)) { 2139 - u64 delta = vmin - vtime; 2140 - 2141 - /* throw away surplus vtime */ 2142 - atomic64_add(delta, &iocg->vtime); 2143 - atomic64_add(delta, &iocg->done_vtime); 2144 - iocg->last_vtime += delta; 2145 - /* if usage is sufficiently low, maybe it can donate */ 2146 - if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { 2147 - iocg->has_surplus = true; 2148 - nr_surpluses++; 2149 - } 2150 - } else if (hw_inuse < hw_active) { 2151 - u32 new_hwi, 
new_inuse; 2152 - 2153 - /* was donating but might need to take back some */ 2154 - if (waitqueue_active(&iocg->waitq)) { 2155 - new_hwi = hw_active; 1452 + /* 1453 + * Already donating or accumulated enough to start. 1454 + * Determine the donation amount. 1455 + */ 1456 + current_hweight(iocg, &hwa, &old_hwi); 1457 + hwm = current_hweight_max(iocg); 1458 + new_hwi = hweight_after_donation(iocg, old_hwi, hwm, 1459 + usage, &now); 1460 + if (new_hwi < hwm) { 1461 + iocg->hweight_donating = hwa; 1462 + iocg->hweight_after_donation = new_hwi; 1463 + list_add(&iocg->surplus_list, &surpluses); 2156 1464 } else { 2157 - new_hwi = max(hw_inuse, 2158 - usage * SURPLUS_SCALE_PCT / 100 + 2159 - SURPLUS_SCALE_ABS); 2160 - } 1465 + TRACE_IOCG_PATH(inuse_shortage, iocg, &now, 1466 + iocg->inuse, iocg->active, 1467 + iocg->hweight_inuse, new_hwi); 2161 1468 2162 - new_inuse = div64_u64((u64)iocg->inuse * new_hwi, 2163 - hw_inuse); 2164 - new_inuse = clamp_t(u32, new_inuse, 1, iocg->active); 2165 - 2166 - if (new_inuse > iocg->inuse) { 2167 - TRACE_IOCG_PATH(inuse_takeback, iocg, &now, 2168 - iocg->inuse, new_inuse, 2169 - hw_inuse, new_hwi); 2170 - __propagate_active_weight(iocg, iocg->weight, 2171 - new_inuse); 1469 + __propagate_weights(iocg, iocg->active, 1470 + iocg->active, true, &now); 1471 + nr_shortages++; 2172 1472 } 2173 1473 } else { 2174 - /* genuninely out of vtime */ 1474 + /* genuinely short on vtime */ 2175 1475 nr_shortages++; 2176 1476 } 2177 1477 } 2178 1478 2179 - if (!nr_shortages || !nr_surpluses) 2180 - goto skip_surplus_transfers; 1479 + if (!list_empty(&surpluses) && nr_shortages) 1480 + transfer_surpluses(&surpluses, &now); 2181 1481 2182 - /* there are both shortages and surpluses, transfer surpluses */ 2183 - list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { 2184 - u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; 2185 - int nr_valid = 0; 1482 + commit_weights(ioc); 2186 1483 2187 - if (!iocg->has_surplus) 2188 - continue; 1484 + /* 
surplus list should be dissolved after use */ 1485 + list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) 1486 + list_del_init(&iocg->surplus_list); 2189 1487 2190 - /* base the decision on max historical usage */ 2191 - for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) { 2192 - if (iocg->usages[i]) { 2193 - usage = max(usage, iocg->usages[i]); 2194 - nr_valid++; 1488 + /* 1489 + * A low weight iocg can amass a large amount of debt, for example, when 1490 + * anonymous memory gets reclaimed aggressively. If the system has a lot 1491 + * of memory paired with a slow IO device, the debt can span multiple 1492 + * seconds or more. If there are no other subsequent IO issuers, the 1493 + * in-debt iocg may end up blocked paying its debt while the IO device 1494 + * is idle. 1495 + * 1496 + * The following protects against such pathological cases. If the device 1497 + * has been sufficiently idle for a substantial amount of time, the 1498 + * debts are halved. The criteria are on the conservative side as we 1499 + * want to resolve the rare extreme cases without impacting regular 1500 + * operation by forgiving debts too readily. 
1501 + */ 1502 + if (nr_shortages || 1503 + div64_u64(100 * usage_us_sum, now.now - ioc->period_at) >= 1504 + DEBT_BUSY_USAGE_PCT) 1505 + ioc->debt_busy_at = now.now; 1506 + 1507 + if (nr_debtors && 1508 + now.now - ioc->debt_busy_at >= DEBT_REDUCTION_IDLE_DUR) { 1509 + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { 1510 + if (iocg->abs_vdebt) { 1511 + spin_lock(&iocg->waitq.lock); 1512 + iocg->abs_vdebt /= 2; 1513 + iocg_kick_waitq(iocg, true, &now); 1514 + spin_unlock(&iocg->waitq.lock); 2195 1515 } 2196 1516 } 2197 - if (nr_valid < MIN_VALID_USAGES) 2198 - continue; 2199 - 2200 - current_hweight(iocg, &hw_active, &hw_inuse); 2201 - new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse); 2202 - if (!new_hwi) 2203 - continue; 2204 - 2205 - new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, 2206 - hw_inuse); 2207 - if (new_inuse < iocg->inuse) { 2208 - TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, 2209 - iocg->inuse, new_inuse, 2210 - hw_inuse, new_hwi); 2211 - __propagate_active_weight(iocg, iocg->weight, new_inuse); 2212 - } 1517 + ioc->debt_busy_at = now.now; 2213 1518 } 2214 - skip_surplus_transfers: 2215 - commit_active_weights(ioc); 2216 1519 2217 1520 /* 2218 1521 * If q is getting clogged or we're missing too much, we're issuing ··· 2229 1554 2230 1555 /* 2231 1556 * If there are IOs spanning multiple periods, wait 2232 - * them out before pushing the device harder. If 2233 - * there are surpluses, let redistribution work it 2234 - * out first. 1557 + * them out before pushing the device harder. 
2235 1558 */ 2236 - if (!nr_lagging && !nr_surpluses) 1559 + if (!nr_lagging) 2237 1560 ioc->busy_level--; 2238 1561 } else { 2239 1562 /* ··· 2250 1577 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); 2251 1578 2252 1579 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { 2253 - u64 vrate = atomic64_read(&ioc->vtime_rate); 1580 + u64 vrate = ioc->vtime_base_rate; 2254 1581 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; 2255 1582 2256 1583 /* rq_wait signal is always reliable, ignore user vrate_min */ ··· 2285 1612 } 2286 1613 2287 1614 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, 2288 - nr_lagging, nr_shortages, 2289 - nr_surpluses); 1615 + nr_lagging, nr_shortages); 2290 1616 2291 - atomic64_set(&ioc->vtime_rate, vrate); 2292 - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( 2293 - ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); 1617 + ioc->vtime_base_rate = vrate; 1618 + ioc_refresh_margins(ioc); 2294 1619 } else if (ioc->busy_level != prev_busy_level || nr_lagging) { 2295 1620 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), 2296 1621 missed_ppm, rq_wait_pct, nr_lagging, 2297 - nr_shortages, nr_surpluses); 1622 + nr_shortages); 2298 1623 } 2299 1624 2300 1625 ioc_refresh_params(ioc, false); ··· 2308 1637 ioc_start_period(ioc, &now); 2309 1638 } else { 2310 1639 ioc->busy_level = 0; 1640 + ioc->vtime_err = 0; 2311 1641 ioc->running = IOC_IDLE; 2312 1642 } 1643 + 1644 + ioc_refresh_vrate(ioc, &now); 2313 1645 } 2314 1646 2315 1647 spin_unlock_irq(&ioc->lock); 1648 + } 1649 + 1650 + static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, 1651 + u64 abs_cost, struct ioc_now *now) 1652 + { 1653 + struct ioc *ioc = iocg->ioc; 1654 + struct ioc_margins *margins = &ioc->margins; 1655 + u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; 1656 + u32 hwi, adj_step; 1657 + s64 margin; 1658 + u64 cost, new_inuse; 1659 + 1660 + current_hweight(iocg, NULL, &hwi); 1661 + old_hwi = 
hwi; 1662 + cost = abs_cost_to_cost(abs_cost, hwi); 1663 + margin = now->vnow - vtime - cost; 1664 + 1665 + /* debt handling owns inuse for debtors */ 1666 + if (iocg->abs_vdebt) 1667 + return cost; 1668 + 1669 + /* 1670 + * We only increase inuse during period and do so iff the margin has 1671 + * deteriorated since the previous adjustment. 1672 + */ 1673 + if (margin >= iocg->saved_margin || margin >= margins->low || 1674 + iocg->inuse == iocg->active) 1675 + return cost; 1676 + 1677 + spin_lock_irq(&ioc->lock); 1678 + 1679 + /* we own inuse only when @iocg is in the normal active state */ 1680 + if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { 1681 + spin_unlock_irq(&ioc->lock); 1682 + return cost; 1683 + } 1684 + 1685 + /* 1686 + * Bump up inuse till @abs_cost fits in the existing budget. 1687 + * adj_step must be determined after acquiring ioc->lock - we might 1688 + * have raced and lost to another thread for activation and could 1689 + * be reading 0 iocg->active before ioc->lock which will lead to 1690 + * infinite loop. 
1691 + */ 1692 + new_inuse = iocg->inuse; 1693 + adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); 1694 + do { 1695 + new_inuse = new_inuse + adj_step; 1696 + propagate_weights(iocg, iocg->active, new_inuse, true, now); 1697 + current_hweight(iocg, NULL, &hwi); 1698 + cost = abs_cost_to_cost(abs_cost, hwi); 1699 + } while (time_after64(vtime + cost, now->vnow) && 1700 + iocg->inuse != iocg->active); 1701 + 1702 + spin_unlock_irq(&ioc->lock); 1703 + 1704 + TRACE_IOCG_PATH(inuse_adjust, iocg, now, 1705 + old_inuse, iocg->inuse, old_hwi, hwi); 1706 + 1707 + return cost; 2316 1708 } 2317 1709 2318 1710 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, ··· 2459 1725 struct ioc_gq *iocg = blkg_to_iocg(blkg); 2460 1726 struct ioc_now now; 2461 1727 struct iocg_wait wait; 2462 - u32 hw_active, hw_inuse; 2463 1728 u64 abs_cost, cost, vtime; 1729 + bool use_debt, ioc_locked; 1730 + unsigned long flags; 2464 1731 2465 1732 /* bypass IOs if disabled or for root cgroup */ 2466 1733 if (!ioc->enabled || !iocg->level) 2467 - return; 2468 - 2469 - /* always activate so that even 0 cost IOs get protected to some level */ 2470 - if (!iocg_activate(iocg, &now)) 2471 1734 return; 2472 1735 2473 1736 /* calculate the absolute vtime cost */ ··· 2472 1741 if (!abs_cost) 2473 1742 return; 2474 1743 1744 + if (!iocg_activate(iocg, &now)) 1745 + return; 1746 + 2475 1747 iocg->cursor = bio_end_sector(bio); 2476 - 2477 1748 vtime = atomic64_read(&iocg->vtime); 2478 - current_hweight(iocg, &hw_active, &hw_inuse); 2479 - 2480 - if (hw_inuse < hw_active && 2481 - time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) { 2482 - TRACE_IOCG_PATH(inuse_reset, iocg, &now, 2483 - iocg->inuse, iocg->weight, hw_inuse, hw_active); 2484 - spin_lock_irq(&ioc->lock); 2485 - propagate_active_weight(iocg, iocg->weight, iocg->weight); 2486 - spin_unlock_irq(&ioc->lock); 2487 - current_hweight(iocg, &hw_active, &hw_inuse); 2488 - } 2489 - 2490 - cost = 
abs_cost_to_cost(abs_cost, hw_inuse); 1749 + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); 2491 1750 2492 1751 /* 2493 1752 * If no one's waiting and within budget, issue right away. The ··· 2486 1765 */ 2487 1766 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && 2488 1767 time_before_eq64(vtime + cost, now.vnow)) { 2489 - iocg_commit_bio(iocg, bio, cost); 1768 + iocg_commit_bio(iocg, bio, abs_cost, cost); 2490 1769 return; 2491 1770 } 2492 1771 2493 1772 /* 2494 - * We activated above but w/o any synchronization. Deactivation is 2495 - * synchronized with waitq.lock and we won't get deactivated as long 2496 - * as we're waiting or has debt, so we're good if we're activated 2497 - * here. In the unlikely case that we aren't, just issue the IO. 1773 + * We're over budget. This can be handled in two ways. IOs which may 1774 + * cause priority inversions are punted to @ioc->aux_iocg and charged as 1775 + * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling 1776 + * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine 1777 + * whether debt handling is needed and acquire locks accordingly. 2498 1778 */ 2499 - spin_lock_irq(&iocg->waitq.lock); 1779 + use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); 1780 + ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); 1781 + retry_lock: 1782 + iocg_lock(iocg, ioc_locked, &flags); 2500 1783 1784 + /* 1785 + * @iocg must stay activated for debt and waitq handling. Deactivation 1786 + * is synchronized against both ioc->lock and waitq.lock and we won't 1787 + * get deactivated as long as we're waiting or has debt, so we're good 1788 + * if we're activated here. In the unlikely cases that we aren't, just 1789 + * issue the IO. 
1790 + */ 2501 1791 if (unlikely(list_empty(&iocg->active_list))) { 2502 - spin_unlock_irq(&iocg->waitq.lock); 2503 - iocg_commit_bio(iocg, bio, cost); 1792 + iocg_unlock(iocg, ioc_locked, &flags); 1793 + iocg_commit_bio(iocg, bio, abs_cost, cost); 2504 1794 return; 2505 1795 } 2506 1796 ··· 2532 1800 * clear them and leave @iocg inactive w/ dangling use_delay heavily 2533 1801 * penalizing the cgroup and its descendants. 2534 1802 */ 2535 - if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { 2536 - iocg->abs_vdebt += abs_cost; 1803 + if (use_debt) { 1804 + iocg_incur_debt(iocg, abs_cost, &now); 2537 1805 if (iocg_kick_delay(iocg, &now)) 2538 1806 blkcg_schedule_throttle(rqos->q, 2539 1807 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); 2540 - spin_unlock_irq(&iocg->waitq.lock); 1808 + iocg_unlock(iocg, ioc_locked, &flags); 2541 1809 return; 1810 + } 1811 + 1812 + /* guarantee that iocgs w/ waiters have maximum inuse */ 1813 + if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { 1814 + if (!ioc_locked) { 1815 + iocg_unlock(iocg, false, &flags); 1816 + ioc_locked = true; 1817 + goto retry_lock; 1818 + } 1819 + propagate_weights(iocg, iocg->active, iocg->active, true, 1820 + &now); 2542 1821 } 2543 1822 2544 1823 /* ··· 2572 1829 wait.committed = false; /* will be set true by waker */ 2573 1830 2574 1831 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); 2575 - iocg_kick_waitq(iocg, &now); 1832 + iocg_kick_waitq(iocg, ioc_locked, &now); 2576 1833 2577 - spin_unlock_irq(&iocg->waitq.lock); 1834 + iocg_unlock(iocg, ioc_locked, &flags); 2578 1835 2579 1836 while (true) { 2580 1837 set_current_state(TASK_UNINTERRUPTIBLE); ··· 2594 1851 struct ioc *ioc = iocg->ioc; 2595 1852 sector_t bio_end = bio_end_sector(bio); 2596 1853 struct ioc_now now; 2597 - u32 hw_inuse; 2598 - u64 abs_cost, cost; 1854 + u64 vtime, abs_cost, cost; 2599 1855 unsigned long flags; 2600 1856 2601 1857 /* bypass if disabled or for root cgroup */ ··· 2606 1864 return; 2607 1865 2608 1866 
ioc_now(ioc, &now); 2609 - current_hweight(iocg, NULL, &hw_inuse); 2610 - cost = abs_cost_to_cost(abs_cost, hw_inuse); 1867 + 1868 + vtime = atomic64_read(&iocg->vtime); 1869 + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); 2611 1870 2612 1871 /* update cursor if backmerging into the request at the cursor */ 2613 1872 if (blk_rq_pos(rq) < bio_end && ··· 2621 1878 */ 2622 1879 if (rq->bio && rq->bio->bi_iocost_cost && 2623 1880 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { 2624 - iocg_commit_bio(iocg, bio, cost); 1881 + iocg_commit_bio(iocg, bio, abs_cost, cost); 2625 1882 return; 2626 1883 } 2627 1884 ··· 2630 1887 * be for the vast majority of cases. See debt handling in 2631 1888 * ioc_rqos_throttle() for details. 2632 1889 */ 2633 - spin_lock_irqsave(&iocg->waitq.lock, flags); 1890 + spin_lock_irqsave(&ioc->lock, flags); 1891 + spin_lock(&iocg->waitq.lock); 1892 + 2634 1893 if (likely(!list_empty(&iocg->active_list))) { 2635 - iocg->abs_vdebt += abs_cost; 2636 - iocg_kick_delay(iocg, &now); 1894 + iocg_incur_debt(iocg, abs_cost, &now); 1895 + if (iocg_kick_delay(iocg, &now)) 1896 + blkcg_schedule_throttle(rqos->q, 1897 + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); 2637 1898 } else { 2638 - iocg_commit_bio(iocg, bio, cost); 1899 + iocg_commit_bio(iocg, bio, abs_cost, cost); 2639 1900 } 2640 - spin_unlock_irqrestore(&iocg->waitq.lock, flags); 1901 + 1902 + spin_unlock(&iocg->waitq.lock); 1903 + spin_unlock_irqrestore(&ioc->lock, flags); 2641 1904 } 2642 1905 2643 1906 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) ··· 2657 1908 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) 2658 1909 { 2659 1910 struct ioc *ioc = rqos_to_ioc(rqos); 1911 + struct ioc_pcpu_stat *ccs; 2660 1912 u64 on_q_ns, rq_wait_ns, size_nsec; 2661 1913 int pidx, rw; 2662 1914 ··· 2681 1931 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; 2682 1932 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); 2683 
1933 1934 + ccs = get_cpu_ptr(ioc->pcpu_stat); 1935 + 2684 1936 if (on_q_ns <= size_nsec || 2685 1937 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) 2686 - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); 1938 + local_inc(&ccs->missed[rw].nr_met); 2687 1939 else 2688 - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); 1940 + local_inc(&ccs->missed[rw].nr_missed); 2689 1941 2690 - this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns); 1942 + local64_add(rq_wait_ns, &ccs->rq_wait_ns); 1943 + 1944 + put_cpu_ptr(ccs); 2691 1945 } 2692 1946 2693 1947 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) ··· 2731 1977 { 2732 1978 struct ioc *ioc; 2733 1979 struct rq_qos *rqos; 2734 - int ret; 1980 + int i, cpu, ret; 2735 1981 2736 1982 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); 2737 1983 if (!ioc) ··· 2741 1987 if (!ioc->pcpu_stat) { 2742 1988 kfree(ioc); 2743 1989 return -ENOMEM; 1990 + } 1991 + 1992 + for_each_possible_cpu(cpu) { 1993 + struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); 1994 + 1995 + for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { 1996 + local_set(&ccs->missed[i].nr_met, 0); 1997 + local_set(&ccs->missed[i].nr_missed, 0); 1998 + } 1999 + local64_set(&ccs->rq_wait_ns, 0); 2744 2000 } 2745 2001 2746 2002 rqos = &ioc->rqos; ··· 2763 1999 INIT_LIST_HEAD(&ioc->active_iocgs); 2764 2000 2765 2001 ioc->running = IOC_IDLE; 2002 + ioc->vtime_base_rate = VTIME_PER_USEC; 2766 2003 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); 2767 2004 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); 2768 2005 ioc->period_at = ktime_to_us(ktime_get()); ··· 2794 2029 if (!iocc) 2795 2030 return NULL; 2796 2031 2797 - iocc->dfl_weight = CGROUP_WEIGHT_DFL; 2032 + iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; 2798 2033 return &iocc->cpd; 2799 2034 } 2800 2035 ··· 2812 2047 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); 2813 2048 if (!iocg) 2814 2049 return NULL; 2050 + 2051 + iocg->pcpu_stat = 
alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); 2052 + if (!iocg->pcpu_stat) { 2053 + kfree(iocg); 2054 + return NULL; 2055 + } 2815 2056 2816 2057 return &iocg->pd; 2817 2058 } ··· 2838 2067 atomic64_set(&iocg->done_vtime, now.vnow); 2839 2068 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); 2840 2069 INIT_LIST_HEAD(&iocg->active_list); 2841 - iocg->hweight_active = HWEIGHT_WHOLE; 2842 - iocg->hweight_inuse = HWEIGHT_WHOLE; 2070 + INIT_LIST_HEAD(&iocg->walk_list); 2071 + INIT_LIST_HEAD(&iocg->surplus_list); 2072 + iocg->hweight_active = WEIGHT_ONE; 2073 + iocg->hweight_inuse = WEIGHT_ONE; 2843 2074 2844 2075 init_waitqueue_head(&iocg->waitq); 2845 2076 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2846 2077 iocg->waitq_timer.function = iocg_waitq_timer_fn; 2847 - hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2848 - iocg->delay_timer.function = iocg_delay_timer_fn; 2849 2078 2850 2079 iocg->level = blkg->blkcg->css.cgroup->level; 2851 2080 ··· 2855 2084 } 2856 2085 2857 2086 spin_lock_irqsave(&ioc->lock, flags); 2858 - weight_updated(iocg); 2087 + weight_updated(iocg, &now); 2859 2088 spin_unlock_irqrestore(&ioc->lock, flags); 2860 2089 } 2861 2090 ··· 2867 2096 2868 2097 if (ioc) { 2869 2098 spin_lock_irqsave(&ioc->lock, flags); 2099 + 2870 2100 if (!list_empty(&iocg->active_list)) { 2871 - propagate_active_weight(iocg, 0, 0); 2101 + struct ioc_now now; 2102 + 2103 + ioc_now(ioc, &now); 2104 + propagate_weights(iocg, 0, 0, false, &now); 2872 2105 list_del_init(&iocg->active_list); 2873 2106 } 2107 + 2108 + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); 2109 + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); 2110 + 2874 2111 spin_unlock_irqrestore(&ioc->lock, flags); 2875 2112 2876 2113 hrtimer_cancel(&iocg->waitq_timer); 2877 - hrtimer_cancel(&iocg->delay_timer); 2878 2114 } 2115 + free_percpu(iocg->pcpu_stat); 2879 2116 kfree(iocg); 2117 + } 2118 + 2119 + static size_t ioc_pd_stat(struct 
blkg_policy_data *pd, char *buf, size_t size) 2120 + { 2121 + struct ioc_gq *iocg = pd_to_iocg(pd); 2122 + struct ioc *ioc = iocg->ioc; 2123 + size_t pos = 0; 2124 + 2125 + if (!ioc->enabled) 2126 + return 0; 2127 + 2128 + if (iocg->level == 0) { 2129 + unsigned vp10k = DIV64_U64_ROUND_CLOSEST( 2130 + ioc->vtime_base_rate * 10000, 2131 + VTIME_PER_USEC); 2132 + pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", 2133 + vp10k / 100, vp10k % 100); 2134 + } 2135 + 2136 + pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", 2137 + iocg->last_stat.usage_us); 2138 + 2139 + if (blkcg_debug_stats) 2140 + pos += scnprintf(buf + pos, size - pos, 2141 + " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", 2142 + iocg->last_stat.wait_us, 2143 + iocg->last_stat.indebt_us, 2144 + iocg->last_stat.indelay_us); 2145 + 2146 + return pos; 2880 2147 } 2881 2148 2882 2149 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, ··· 2924 2115 struct ioc_gq *iocg = pd_to_iocg(pd); 2925 2116 2926 2117 if (dname && iocg->cfg_weight) 2927 - seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight); 2118 + seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); 2928 2119 return 0; 2929 2120 } 2930 2121 ··· 2934 2125 struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); 2935 2126 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); 2936 2127 2937 - seq_printf(sf, "default %u\n", iocc->dfl_weight); 2128 + seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); 2938 2129 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, 2939 2130 &blkcg_policy_iocost, seq_cft(sf)->private, false); 2940 2131 return 0; ··· 2946 2137 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 2947 2138 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); 2948 2139 struct blkg_conf_ctx ctx; 2140 + struct ioc_now now; 2949 2141 struct ioc_gq *iocg; 2950 2142 u32 v; 2951 2143 int ret; ··· 2961 2151 return -EINVAL; 2962 2152 2963 2153 spin_lock(&blkcg->lock); 2964 - iocc->dfl_weight = v; 2154 + 
iocc->dfl_weight = v * WEIGHT_ONE; 2965 2155 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { 2966 2156 struct ioc_gq *iocg = blkg_to_iocg(blkg); 2967 2157 2968 2158 if (iocg) { 2969 2159 spin_lock_irq(&iocg->ioc->lock); 2970 - weight_updated(iocg); 2160 + ioc_now(iocg->ioc, &now); 2161 + weight_updated(iocg, &now); 2971 2162 spin_unlock_irq(&iocg->ioc->lock); 2972 2163 } 2973 2164 } ··· 2993 2182 } 2994 2183 2995 2184 spin_lock(&iocg->ioc->lock); 2996 - iocg->cfg_weight = v; 2997 - weight_updated(iocg); 2185 + iocg->cfg_weight = v * WEIGHT_ONE; 2186 + ioc_now(iocg->ioc, &now); 2187 + weight_updated(iocg, &now); 2998 2188 spin_unlock(&iocg->ioc->lock); 2999 2189 3000 2190 blkg_conf_finish(&ctx); ··· 3333 2521 .pd_alloc_fn = ioc_pd_alloc, 3334 2522 .pd_init_fn = ioc_pd_init, 3335 2523 .pd_free_fn = ioc_pd_free, 2524 + .pd_stat_fn = ioc_pd_stat, 3336 2525 }; 3337 2526 3338 2527 static int __init ioc_init(void)

+66 -111

block/blk-map.c

··· 12 12 #include "blk.h" 13 13 14 14 struct bio_map_data { 15 - int is_our_pages; 15 + bool is_our_pages : 1; 16 + bool is_null_mapped : 1; 16 17 struct iov_iter iter; 17 18 struct iovec iov[]; 18 19 }; ··· 109 108 struct bio_map_data *bmd = bio->bi_private; 110 109 int ret = 0; 111 110 112 - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { 111 + if (!bmd->is_null_mapped) { 113 112 /* 114 113 * if we're in a workqueue, the request is orphaned, so 115 114 * don't copy into a random user address space, just free ··· 127 126 return ret; 128 127 } 129 128 130 - /** 131 - * bio_copy_user_iov - copy user data to bio 132 - * @q: destination block queue 133 - * @map_data: pointer to the rq_map_data holding pages (if necessary) 134 - * @iter: iovec iterator 135 - * @gfp_mask: memory allocation flags 136 - * 137 - * Prepares and returns a bio for indirect user io, bouncing data 138 - * to/from kernel pages as necessary. Must be paired with 139 - * call bio_uncopy_user() on io completion. 140 - */ 141 - static struct bio *bio_copy_user_iov(struct request_queue *q, 142 - struct rq_map_data *map_data, struct iov_iter *iter, 143 - gfp_t gfp_mask) 129 + static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, 130 + struct iov_iter *iter, gfp_t gfp_mask) 144 131 { 145 132 struct bio_map_data *bmd; 146 133 struct page *page; 147 - struct bio *bio; 134 + struct bio *bio, *bounce_bio; 148 135 int i = 0, ret; 149 136 int nr_pages; 150 137 unsigned int len = iter->count; ··· 140 151 141 152 bmd = bio_alloc_map_data(iter, gfp_mask); 142 153 if (!bmd) 143 - return ERR_PTR(-ENOMEM); 154 + return -ENOMEM; 144 155 145 156 /* 146 157 * We need to do a deep copy of the iov_iter including the iovecs. 147 158 * The caller provided iov might point to an on-stack or otherwise 148 159 * shortlived one. 149 160 */ 150 - bmd->is_our_pages = map_data ? 
0 : 1; 161 + bmd->is_our_pages = !map_data; 162 + bmd->is_null_mapped = (map_data && map_data->null_mapped); 151 163 152 164 nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); 153 165 if (nr_pages > BIO_MAX_PAGES) ··· 158 168 bio = bio_kmalloc(gfp_mask, nr_pages); 159 169 if (!bio) 160 170 goto out_bmd; 161 - 162 - ret = 0; 171 + bio->bi_opf |= req_op(rq); 163 172 164 173 if (map_data) { 165 174 nr_pages = 1 << map_data->page_order; ··· 175 186 if (map_data) { 176 187 if (i == map_data->nr_entries * nr_pages) { 177 188 ret = -ENOMEM; 178 - break; 189 + goto cleanup; 179 190 } 180 191 181 192 page = map_data->pages[i / nr_pages]; ··· 183 194 184 195 i++; 185 196 } else { 186 - page = alloc_page(q->bounce_gfp | gfp_mask); 197 + page = alloc_page(rq->q->bounce_gfp | gfp_mask); 187 198 if (!page) { 188 199 ret = -ENOMEM; 189 - break; 200 + goto cleanup; 190 201 } 191 202 } 192 203 193 - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { 204 + if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) { 194 205 if (!map_data) 195 206 __free_page(page); 196 207 break; ··· 199 210 len -= bytes; 200 211 offset = 0; 201 212 } 202 - 203 - if (ret) 204 - goto cleanup; 205 213 206 214 if (map_data) 207 215 map_data->offset += bio->bi_iter.bi_size; ··· 219 233 } 220 234 221 235 bio->bi_private = bmd; 222 - if (map_data && map_data->null_mapped) 223 - bio_set_flag(bio, BIO_NULL_MAPPED); 224 - return bio; 236 + 237 + bounce_bio = bio; 238 + ret = blk_rq_append_bio(rq, &bounce_bio); 239 + if (ret) 240 + goto cleanup; 241 + 242 + /* 243 + * We link the bounce buffer in and could have to traverse it later, so 244 + * we have to get a ref to prevent it from being freed 245 + */ 246 + bio_get(bounce_bio); 247 + return 0; 225 248 cleanup: 226 249 if (!map_data) 227 250 bio_free_pages(bio); 228 251 bio_put(bio); 229 252 out_bmd: 230 253 kfree(bmd); 231 - return ERR_PTR(ret); 254 + return ret; 232 255 } 233 256 234 - /** 235 - * bio_map_user_iov - map user iovec into bio 236 - 
* @q: the struct request_queue for the bio 237 - * @iter: iovec iterator 238 - * @gfp_mask: memory allocation flags 239 - * 240 - * Map the user space address into a bio suitable for io to a block 241 - * device. Returns an error pointer in case of error. 242 - */ 243 - static struct bio *bio_map_user_iov(struct request_queue *q, 244 - struct iov_iter *iter, gfp_t gfp_mask) 257 + static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, 258 + gfp_t gfp_mask) 245 259 { 246 - unsigned int max_sectors = queue_max_hw_sectors(q); 247 - int j; 248 - struct bio *bio; 260 + unsigned int max_sectors = queue_max_hw_sectors(rq->q); 261 + struct bio *bio, *bounce_bio; 249 262 int ret; 263 + int j; 250 264 251 265 if (!iov_iter_count(iter)) 252 - return ERR_PTR(-EINVAL); 266 + return -EINVAL; 253 267 254 268 bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); 255 269 if (!bio) 256 - return ERR_PTR(-ENOMEM); 270 + return -ENOMEM; 271 + bio->bi_opf |= req_op(rq); 257 272 258 273 while (iov_iter_count(iter)) { 259 274 struct page **pages; ··· 270 283 271 284 npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); 272 285 273 - if (unlikely(offs & queue_dma_alignment(q))) { 286 + if (unlikely(offs & queue_dma_alignment(rq->q))) { 274 287 ret = -EINVAL; 275 288 j = 0; 276 289 } else { ··· 282 295 if (n > bytes) 283 296 n = bytes; 284 297 285 - if (!bio_add_hw_page(q, bio, page, n, offs, 298 + if (!bio_add_hw_page(rq->q, bio, page, n, offs, 286 299 max_sectors, &same_page)) { 287 300 if (same_page) 288 301 put_page(page); ··· 306 319 break; 307 320 } 308 321 309 - bio_set_flag(bio, BIO_USER_MAPPED); 310 - 311 322 /* 312 - * subtle -- if bio_map_user_iov() ended up bouncing a bio, 313 - * it would normally disappear when its bi_end_io is run. 314 - * however, we need it for the unmap, so grab an extra 315 - * reference to it 323 + * Subtle: if we end up needing to bounce a bio, it would normally 324 + * disappear when its bi_end_io is run. 
However, we need the original 325 + * bio for the unmap, so grab an extra reference to it 316 326 */ 317 327 bio_get(bio); 318 - return bio; 319 328 329 + bounce_bio = bio; 330 + ret = blk_rq_append_bio(rq, &bounce_bio); 331 + if (ret) 332 + goto out_put_orig; 333 + 334 + /* 335 + * We link the bounce buffer in and could have to traverse it 336 + * later, so we have to get a ref to prevent it from being freed 337 + */ 338 + bio_get(bounce_bio); 339 + return 0; 340 + 341 + out_put_orig: 342 + bio_put(bio); 320 343 out_unmap: 321 344 bio_release_pages(bio, false); 322 345 bio_put(bio); 323 - return ERR_PTR(ret); 346 + return ret; 324 347 } 325 348 326 349 /** ··· 554 557 } 555 558 EXPORT_SYMBOL(blk_rq_append_bio); 556 559 557 - static int __blk_rq_unmap_user(struct bio *bio) 558 - { 559 - int ret = 0; 560 - 561 - if (bio) { 562 - if (bio_flagged(bio, BIO_USER_MAPPED)) 563 - bio_unmap_user(bio); 564 - else 565 - ret = bio_uncopy_user(bio); 566 - } 567 - 568 - return ret; 569 - } 570 - 571 - static int __blk_rq_map_user_iov(struct request *rq, 572 - struct rq_map_data *map_data, struct iov_iter *iter, 573 - gfp_t gfp_mask, bool copy) 574 - { 575 - struct request_queue *q = rq->q; 576 - struct bio *bio, *orig_bio; 577 - int ret; 578 - 579 - if (copy) 580 - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); 581 - else 582 - bio = bio_map_user_iov(q, iter, gfp_mask); 583 - 584 - if (IS_ERR(bio)) 585 - return PTR_ERR(bio); 586 - 587 - bio->bi_opf &= ~REQ_OP_MASK; 588 - bio->bi_opf |= req_op(rq); 589 - 590 - orig_bio = bio; 591 - 592 - /* 593 - * We link the bounce buffer in and could have to traverse it 594 - * later so we have to get a ref to prevent it from being freed 595 - */ 596 - ret = blk_rq_append_bio(rq, &bio); 597 - if (ret) { 598 - __blk_rq_unmap_user(orig_bio); 599 - return ret; 600 - } 601 - bio_get(bio); 602 - 603 - return 0; 604 - } 605 - 606 560 /** 607 561 * blk_rq_map_user_iov - map user data to a request, for passthrough requests 608 562 * @q: request 
queue where request should be inserted ··· 597 649 598 650 i = *iter; 599 651 do { 600 - ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); 652 + if (copy) 653 + ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask); 654 + else 655 + ret = bio_map_user_iov(rq, &i, gfp_mask); 601 656 if (ret) 602 657 goto unmap_rq; 603 658 if (!bio) ··· 651 700 if (unlikely(bio_flagged(bio, BIO_BOUNCED))) 652 701 mapped_bio = bio->bi_private; 653 702 654 - ret2 = __blk_rq_unmap_user(mapped_bio); 655 - if (ret2 && !ret) 656 - ret = ret2; 703 + if (bio->bi_private) { 704 + ret2 = bio_uncopy_user(mapped_bio); 705 + if (ret2 && !ret) 706 + ret = ret2; 707 + } else { 708 + bio_unmap_user(mapped_bio); 709 + } 657 710 658 711 mapped_bio = bio; 659 712 bio = bio->bi_next;

+201

block/blk-merge.c

··· 11 11 #include <trace/events/block.h> 12 12 13 13 #include "blk.h" 14 + #include "blk-rq-qos.h" 14 15 15 16 static inline bool bio_will_gap(struct request_queue *q, 16 17 struct request *prev_rq, struct bio *prev, struct bio *next) ··· 896 895 return ELEVATOR_FRONT_MERGE; 897 896 return ELEVATOR_NO_MERGE; 898 897 } 898 + 899 + static void blk_account_io_merge_bio(struct request *req) 900 + { 901 + if (!blk_do_io_stat(req)) 902 + return; 903 + 904 + part_stat_lock(); 905 + part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); 906 + part_stat_unlock(); 907 + } 908 + 909 + enum bio_merge_status bio_attempt_back_merge(struct request *req, 910 + struct bio *bio, 911 + unsigned int nr_segs) 912 + { 913 + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; 914 + 915 + if (!ll_back_merge_fn(req, bio, nr_segs)) 916 + return BIO_MERGE_FAILED; 917 + 918 + trace_block_bio_backmerge(req->q, req, bio); 919 + rq_qos_merge(req->q, req, bio); 920 + 921 + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) 922 + blk_rq_set_mixed_merge(req); 923 + 924 + req->biotail->bi_next = bio; 925 + req->biotail = bio; 926 + req->__data_len += bio->bi_iter.bi_size; 927 + 928 + bio_crypt_free_ctx(bio); 929 + 930 + blk_account_io_merge_bio(req); 931 + return BIO_MERGE_OK; 932 + } 933 + 934 + enum bio_merge_status bio_attempt_front_merge(struct request *req, 935 + struct bio *bio, 936 + unsigned int nr_segs) 937 + { 938 + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; 939 + 940 + if (!ll_front_merge_fn(req, bio, nr_segs)) 941 + return BIO_MERGE_FAILED; 942 + 943 + trace_block_bio_frontmerge(req->q, req, bio); 944 + rq_qos_merge(req->q, req, bio); 945 + 946 + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) 947 + blk_rq_set_mixed_merge(req); 948 + 949 + bio->bi_next = req->bio; 950 + req->bio = bio; 951 + 952 + req->__sector = bio->bi_iter.bi_sector; 953 + req->__data_len += bio->bi_iter.bi_size; 954 + 955 + bio_crypt_do_front_merge(req, bio); 956 + 957 + blk_account_io_merge_bio(req); 958 + 
return BIO_MERGE_OK; 959 + } 960 + 961 + enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, 962 + struct request *req, 963 + struct bio *bio) 964 + { 965 + unsigned short segments = blk_rq_nr_discard_segments(req); 966 + 967 + if (segments >= queue_max_discard_segments(q)) 968 + goto no_merge; 969 + if (blk_rq_sectors(req) + bio_sectors(bio) > 970 + blk_rq_get_max_sectors(req, blk_rq_pos(req))) 971 + goto no_merge; 972 + 973 + rq_qos_merge(q, req, bio); 974 + 975 + req->biotail->bi_next = bio; 976 + req->biotail = bio; 977 + req->__data_len += bio->bi_iter.bi_size; 978 + req->nr_phys_segments = segments + 1; 979 + 980 + blk_account_io_merge_bio(req); 981 + return BIO_MERGE_OK; 982 + no_merge: 983 + req_set_nomerge(q, req); 984 + return BIO_MERGE_FAILED; 985 + } 986 + 987 + static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, 988 + struct request *rq, 989 + struct bio *bio, 990 + unsigned int nr_segs, 991 + bool sched_allow_merge) 992 + { 993 + if (!blk_rq_merge_ok(rq, bio)) 994 + return BIO_MERGE_NONE; 995 + 996 + switch (blk_try_merge(rq, bio)) { 997 + case ELEVATOR_BACK_MERGE: 998 + if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) 999 + return bio_attempt_back_merge(rq, bio, nr_segs); 1000 + break; 1001 + case ELEVATOR_FRONT_MERGE: 1002 + if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) 1003 + return bio_attempt_front_merge(rq, bio, nr_segs); 1004 + break; 1005 + case ELEVATOR_DISCARD_MERGE: 1006 + return bio_attempt_discard_merge(q, rq, bio); 1007 + default: 1008 + return BIO_MERGE_NONE; 1009 + } 1010 + 1011 + return BIO_MERGE_FAILED; 1012 + } 1013 + 1014 + /** 1015 + * blk_attempt_plug_merge - try to merge with %current's plugged list 1016 + * @q: request_queue new bio is being queued at 1017 + * @bio: new bio being queued 1018 + * @nr_segs: number of segments in @bio 1019 + * @same_queue_rq: pointer to &struct request that gets filled in when 1020 + * another request associated with @q 
is found on the plug list 1021 + * (optional, may be %NULL) 1022 + * 1023 + * Determine whether @bio being queued on @q can be merged with a request 1024 + * on %current's plugged list. Returns %true if merge was successful, 1025 + * otherwise %false. 1026 + * 1027 + * Plugging coalesces IOs from the same issuer for the same purpose without 1028 + * going through @q->queue_lock. As such it's more of an issuing mechanism 1029 + * than scheduling, and the request, while may have elvpriv data, is not 1030 + * added on the elevator at this point. In addition, we don't have 1031 + * reliable access to the elevator outside queue lock. Only check basic 1032 + * merging parameters without querying the elevator. 1033 + * 1034 + * Caller must ensure !blk_queue_nomerges(q) beforehand. 1035 + */ 1036 + bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1037 + unsigned int nr_segs, struct request **same_queue_rq) 1038 + { 1039 + struct blk_plug *plug; 1040 + struct request *rq; 1041 + struct list_head *plug_list; 1042 + 1043 + plug = blk_mq_plug(q, bio); 1044 + if (!plug) 1045 + return false; 1046 + 1047 + plug_list = &plug->mq_list; 1048 + 1049 + list_for_each_entry_reverse(rq, plug_list, queuelist) { 1050 + if (rq->q == q && same_queue_rq) { 1051 + /* 1052 + * Only blk-mq multiple hardware queues case checks the 1053 + * rq in the same queue, there should be only one such 1054 + * rq in a queue 1055 + **/ 1056 + *same_queue_rq = rq; 1057 + } 1058 + 1059 + if (rq->q != q) 1060 + continue; 1061 + 1062 + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == 1063 + BIO_MERGE_OK) 1064 + return true; 1065 + } 1066 + 1067 + return false; 1068 + } 1069 + 1070 + /* 1071 + * Iterate list of requests and see if we can merge this bio with any 1072 + * of them. 
1073 + */ 1074 + bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, 1075 + struct bio *bio, unsigned int nr_segs) 1076 + { 1077 + struct request *rq; 1078 + int checked = 8; 1079 + 1080 + list_for_each_entry_reverse(rq, list, queuelist) { 1081 + if (!checked--) 1082 + break; 1083 + 1084 + switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) { 1085 + case BIO_MERGE_NONE: 1086 + continue; 1087 + case BIO_MERGE_OK: 1088 + return true; 1089 + case BIO_MERGE_FAILED: 1090 + return false; 1091 + } 1092 + 1093 + } 1094 + 1095 + return false; 1096 + } 1097 + EXPORT_SYMBOL_GPL(blk_bio_list_merge);

+6 -5

block/blk-mq-debugfs.c

··· 116 116 QUEUE_FLAG_NAME(SAME_FORCE), 117 117 QUEUE_FLAG_NAME(DEAD), 118 118 QUEUE_FLAG_NAME(INIT_DONE), 119 + QUEUE_FLAG_NAME(STABLE_WRITES), 119 120 QUEUE_FLAG_NAME(POLL), 120 121 QUEUE_FLAG_NAME(WC), 121 122 QUEUE_FLAG_NAME(FUA), ··· 241 240 #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name 242 241 static const char *const hctx_flag_name[] = { 243 242 HCTX_FLAG_NAME(SHOULD_MERGE), 244 - HCTX_FLAG_NAME(TAG_SHARED), 243 + HCTX_FLAG_NAME(TAG_QUEUE_SHARED), 245 244 HCTX_FLAG_NAME(BLOCKING), 246 245 HCTX_FLAG_NAME(NO_SCHED), 247 246 HCTX_FLAG_NAME(STACKING), ··· 453 452 atomic_read(&tags->active_queues)); 454 453 455 454 seq_puts(m, "\nbitmap_tags:\n"); 456 - sbitmap_queue_show(&tags->bitmap_tags, m); 455 + sbitmap_queue_show(tags->bitmap_tags, m); 457 456 458 457 if (tags->nr_reserved_tags) { 459 458 seq_puts(m, "\nbreserved_tags:\n"); 460 - sbitmap_queue_show(&tags->breserved_tags, m); 459 + sbitmap_queue_show(tags->breserved_tags, m); 461 460 } 462 461 } 463 462 ··· 488 487 if (res) 489 488 goto out; 490 489 if (hctx->tags) 491 - sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); 490 + sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m); 492 491 mutex_unlock(&q->sysfs_lock); 493 492 494 493 out: ··· 522 521 if (res) 523 522 goto out; 524 523 if (hctx->sched_tags) 525 - sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); 524 + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m); 526 525 mutex_unlock(&q->sysfs_lock); 527 526 528 527 out:

+30 -94

block/blk-mq-sched.c

··· 18 18 #include "blk-mq-tag.h" 19 19 #include "blk-wbt.h" 20 20 21 - void blk_mq_sched_free_hctx_data(struct request_queue *q, 22 - void (*exit)(struct blk_mq_hw_ctx *)) 23 - { 24 - struct blk_mq_hw_ctx *hctx; 25 - int i; 26 - 27 - queue_for_each_hw_ctx(q, hctx, i) { 28 - if (exit && hctx->sched_data) 29 - exit(hctx); 30 - kfree(hctx->sched_data); 31 - hctx->sched_data = NULL; 32 - } 33 - } 34 - EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); 35 - 36 21 void blk_mq_sched_assign_ioc(struct request *rq) 37 22 { 38 23 struct request_queue *q = rq->q; ··· 353 368 case ELEVATOR_BACK_MERGE: 354 369 if (!blk_mq_sched_allow_merge(q, rq, bio)) 355 370 return false; 356 - if (!bio_attempt_back_merge(rq, bio, nr_segs)) 371 + if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK) 357 372 return false; 358 373 *merged_request = attempt_back_merge(q, rq); 359 374 if (!*merged_request) ··· 362 377 case ELEVATOR_FRONT_MERGE: 363 378 if (!blk_mq_sched_allow_merge(q, rq, bio)) 364 379 return false; 365 - if (!bio_attempt_front_merge(rq, bio, nr_segs)) 380 + if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK) 366 381 return false; 367 382 *merged_request = attempt_front_merge(q, rq); 368 383 if (!*merged_request) 369 384 elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); 370 385 return true; 371 386 case ELEVATOR_DISCARD_MERGE: 372 - return bio_attempt_discard_merge(q, rq, bio); 387 + return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK; 373 388 default: 374 389 return false; 375 390 } 376 391 } 377 392 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); 378 - 379 - /* 380 - * Iterate list of requests and see if we can merge this bio with any 381 - * of them. 
382 - */ 383 - bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, 384 - struct bio *bio, unsigned int nr_segs) 385 - { 386 - struct request *rq; 387 - int checked = 8; 388 - 389 - list_for_each_entry_reverse(rq, list, queuelist) { 390 - bool merged = false; 391 - 392 - if (!checked--) 393 - break; 394 - 395 - if (!blk_rq_merge_ok(rq, bio)) 396 - continue; 397 - 398 - switch (blk_try_merge(rq, bio)) { 399 - case ELEVATOR_BACK_MERGE: 400 - if (blk_mq_sched_allow_merge(q, rq, bio)) 401 - merged = bio_attempt_back_merge(rq, bio, 402 - nr_segs); 403 - break; 404 - case ELEVATOR_FRONT_MERGE: 405 - if (blk_mq_sched_allow_merge(q, rq, bio)) 406 - merged = bio_attempt_front_merge(rq, bio, 407 - nr_segs); 408 - break; 409 - case ELEVATOR_DISCARD_MERGE: 410 - merged = bio_attempt_discard_merge(q, rq, bio); 411 - break; 412 - default: 413 - continue; 414 - } 415 - 416 - return merged; 417 - } 418 - 419 - return false; 420 - } 421 - EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); 422 - 423 - /* 424 - * Reverse check our software queue for entries that we could potentially 425 - * merge with. Currently includes a hand-wavy stop count of 8, to not spend 426 - * too much time checking for merges. 
427 - */ 428 - static bool blk_mq_attempt_merge(struct request_queue *q, 429 - struct blk_mq_hw_ctx *hctx, 430 - struct blk_mq_ctx *ctx, struct bio *bio, 431 - unsigned int nr_segs) 432 - { 433 - enum hctx_type type = hctx->type; 434 - 435 - lockdep_assert_held(&ctx->lock); 436 - 437 - if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { 438 - ctx->rq_merged++; 439 - return true; 440 - } 441 - 442 - return false; 443 - } 444 393 445 394 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, 446 395 unsigned int nr_segs) ··· 389 470 return e->type->ops.bio_merge(hctx, bio, nr_segs); 390 471 391 472 type = hctx->type; 392 - if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 393 - !list_empty_careful(&ctx->rq_lists[type])) { 394 - /* default per sw-queue merge */ 395 - spin_lock(&ctx->lock); 396 - ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs); 397 - spin_unlock(&ctx->lock); 473 + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || 474 + list_empty_careful(&ctx->rq_lists[type])) 475 + return false; 476 + 477 + /* default per sw-queue merge */ 478 + spin_lock(&ctx->lock); 479 + /* 480 + * Reverse check our software queue for entries that we could 481 + * potentially merge with. Currently includes a hand-wavy stop 482 + * count of 8, to not spend too much time checking for merges. 
483 + */ 484 + if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { 485 + ctx->rq_merged++; 486 + ret = true; 398 487 } 488 + 489 + spin_unlock(&ctx->lock); 399 490 400 491 return ret; 401 492 } ··· 460 531 goto run; 461 532 } 462 533 463 - WARN_ON(e && (rq->tag != -1)); 534 + WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG)); 464 535 465 536 if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) { 466 537 /* ··· 545 616 struct blk_mq_hw_ctx *hctx, 546 617 unsigned int hctx_idx) 547 618 { 619 + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; 620 + 548 621 if (hctx->sched_tags) { 549 622 blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); 550 - blk_mq_free_rq_map(hctx->sched_tags); 623 + blk_mq_free_rq_map(hctx->sched_tags, flags); 551 624 hctx->sched_tags = NULL; 552 625 } 553 626 } ··· 559 628 unsigned int hctx_idx) 560 629 { 561 630 struct blk_mq_tag_set *set = q->tag_set; 631 + /* Clear HCTX_SHARED so tags are init'ed */ 632 + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; 562 633 int ret; 563 634 564 635 hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, 565 - set->reserved_tags); 636 + set->reserved_tags, flags); 566 637 if (!hctx->sched_tags) 567 638 return -ENOMEM; 568 639 ··· 582 649 int i; 583 650 584 651 queue_for_each_hw_ctx(q, hctx, i) { 652 + /* Clear HCTX_SHARED so tags are freed */ 653 + unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; 654 + 585 655 if (hctx->sched_tags) { 586 - blk_mq_free_rq_map(hctx->sched_tags); 656 + blk_mq_free_rq_map(hctx->sched_tags, flags); 587 657 hctx->sched_tags = NULL; 588 658 } 589 659 }

-3

block/blk-mq-sched.h

··· 5 5 #include "blk-mq.h" 6 6 #include "blk-mq-tag.h" 7 7 8 - void blk_mq_sched_free_hctx_data(struct request_queue *q, 9 - void (*exit)(struct blk_mq_hw_ctx *)); 10 - 11 8 void blk_mq_sched_assign_ioc(struct request *rq); 12 9 13 10 void blk_mq_sched_request_inserted(struct request *rq);

+109 -41

block/blk-mq-tag.c

··· 23 23 */ 24 24 bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) 25 25 { 26 - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && 27 - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 28 - atomic_inc(&hctx->tags->active_queues); 26 + if (blk_mq_is_sbitmap_shared(hctx->flags)) { 27 + struct request_queue *q = hctx->queue; 28 + struct blk_mq_tag_set *set = q->tag_set; 29 + 30 + if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && 31 + !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) 32 + atomic_inc(&set->active_queues_shared_sbitmap); 33 + } else { 34 + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && 35 + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 36 + atomic_inc(&hctx->tags->active_queues); 37 + } 29 38 30 39 return true; 31 40 } ··· 44 35 */ 45 36 void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) 46 37 { 47 - sbitmap_queue_wake_all(&tags->bitmap_tags); 38 + sbitmap_queue_wake_all(tags->bitmap_tags); 48 39 if (include_reserve) 49 - sbitmap_queue_wake_all(&tags->breserved_tags); 40 + sbitmap_queue_wake_all(tags->breserved_tags); 50 41 } 51 42 52 43 /* ··· 56 47 void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) 57 48 { 58 49 struct blk_mq_tags *tags = hctx->tags; 50 + struct request_queue *q = hctx->queue; 51 + struct blk_mq_tag_set *set = q->tag_set; 59 52 60 - if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 61 - return; 62 - 63 - atomic_dec(&tags->active_queues); 53 + if (blk_mq_is_sbitmap_shared(hctx->flags)) { 54 + if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, 55 + &q->queue_flags)) 56 + return; 57 + atomic_dec(&set->active_queues_shared_sbitmap); 58 + } else { 59 + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 60 + return; 61 + atomic_dec(&tags->active_queues); 62 + } 64 63 65 64 blk_mq_tag_wakeup_all(tags, false); 66 65 } ··· 76 59 static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, 77 60 struct sbitmap_queue *bt) 78 61 { 79 - if (!data->q->elevator && 
!hctx_may_queue(data->hctx, bt)) 62 + if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && 63 + !hctx_may_queue(data->hctx, bt)) 80 64 return BLK_MQ_NO_TAG; 81 65 82 66 if (data->shallow_depth) ··· 100 82 WARN_ON_ONCE(1); 101 83 return BLK_MQ_NO_TAG; 102 84 } 103 - bt = &tags->breserved_tags; 85 + bt = tags->breserved_tags; 104 86 tag_offset = 0; 105 87 } else { 106 - bt = &tags->bitmap_tags; 88 + bt = tags->bitmap_tags; 107 89 tag_offset = tags->nr_reserved_tags; 108 90 } 109 91 ··· 149 131 data->ctx); 150 132 tags = blk_mq_tags_from_data(data); 151 133 if (data->flags & BLK_MQ_REQ_RESERVED) 152 - bt = &tags->breserved_tags; 134 + bt = tags->breserved_tags; 153 135 else 154 - bt = &tags->bitmap_tags; 136 + bt = tags->bitmap_tags; 155 137 156 138 /* 157 139 * If destination hw queue is changed, fake wake up on ··· 185 167 const int real_tag = tag - tags->nr_reserved_tags; 186 168 187 169 BUG_ON(real_tag >= tags->nr_tags); 188 - sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); 170 + sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu); 189 171 } else { 190 172 BUG_ON(tag >= tags->nr_reserved_tags); 191 - sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); 173 + sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu); 192 174 } 193 175 } 194 176 ··· 215 197 * We can hit rq == NULL here, because the tagging functions 216 198 * test and set the bit before assigning ->rqs[]. 
217 199 */ 218 - if (rq && rq->q == hctx->queue) 200 + if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx) 219 201 return iter_data->fn(hctx, rq, iter_data->data, reserved); 220 202 return true; 221 203 } ··· 316 298 WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); 317 299 318 300 if (tags->nr_reserved_tags) 319 - bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, 301 + bt_tags_for_each(tags, tags->breserved_tags, fn, priv, 320 302 flags | BT_TAG_ITER_RESERVED); 321 - bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); 303 + bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags); 322 304 } 323 305 324 306 /** ··· 434 416 continue; 435 417 436 418 if (tags->nr_reserved_tags) 437 - bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); 438 - bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); 419 + bt_for_each(hctx, tags->breserved_tags, fn, priv, true); 420 + bt_for_each(hctx, tags->bitmap_tags, fn, priv, false); 439 421 } 440 422 blk_queue_exit(q); 441 423 } ··· 447 429 node); 448 430 } 449 431 450 - static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, 451 - int node, int alloc_policy) 432 + static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, 433 + int node, int alloc_policy) 452 434 { 453 435 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; 454 436 bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; 455 437 456 - if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) 457 - goto free_tags; 458 - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, 459 - node)) 438 + if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node)) 439 + return -ENOMEM; 440 + if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags, 441 + round_robin, node)) 460 442 goto free_bitmap_tags; 461 443 462 - return tags; 444 + tags->bitmap_tags = &tags->__bitmap_tags; 445 + tags->breserved_tags = &tags->__breserved_tags; 446 + 447 + return 0; 463 448 free_bitmap_tags: 464 - 
sbitmap_queue_free(&tags->bitmap_tags); 465 - free_tags: 466 - kfree(tags); 467 - return NULL; 449 + sbitmap_queue_free(&tags->__bitmap_tags); 450 + return -ENOMEM; 451 + } 452 + 453 + int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) 454 + { 455 + unsigned int depth = set->queue_depth - set->reserved_tags; 456 + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); 457 + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; 458 + int i, node = set->numa_node; 459 + 460 + if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node)) 461 + return -ENOMEM; 462 + if (bt_alloc(&set->__breserved_tags, set->reserved_tags, 463 + round_robin, node)) 464 + goto free_bitmap_tags; 465 + 466 + for (i = 0; i < set->nr_hw_queues; i++) { 467 + struct blk_mq_tags *tags = set->tags[i]; 468 + 469 + tags->bitmap_tags = &set->__bitmap_tags; 470 + tags->breserved_tags = &set->__breserved_tags; 471 + } 472 + 473 + return 0; 474 + free_bitmap_tags: 475 + sbitmap_queue_free(&set->__bitmap_tags); 476 + return -ENOMEM; 477 + } 478 + 479 + void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) 480 + { 481 + sbitmap_queue_free(&set->__bitmap_tags); 482 + sbitmap_queue_free(&set->__breserved_tags); 468 483 } 469 484 470 485 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, 471 486 unsigned int reserved_tags, 472 - int node, int alloc_policy) 487 + int node, unsigned int flags) 473 488 { 489 + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags); 474 490 struct blk_mq_tags *tags; 475 491 476 492 if (total_tags > BLK_MQ_TAG_MAX) { ··· 519 467 tags->nr_tags = total_tags; 520 468 tags->nr_reserved_tags = reserved_tags; 521 469 522 - return blk_mq_init_bitmap_tags(tags, node, alloc_policy); 470 + if (flags & BLK_MQ_F_TAG_HCTX_SHARED) 471 + return tags; 472 + 473 + if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { 474 + kfree(tags); 475 + return NULL; 476 + } 477 + return tags; 523 478 } 524 479 525 - void blk_mq_free_tags(struct 
blk_mq_tags *tags) 480 + void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) 526 481 { 527 - sbitmap_queue_free(&tags->bitmap_tags); 528 - sbitmap_queue_free(&tags->breserved_tags); 482 + if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) { 483 + sbitmap_queue_free(tags->bitmap_tags); 484 + sbitmap_queue_free(tags->breserved_tags); 485 + } 529 486 kfree(tags); 530 487 } 531 488 ··· 553 492 */ 554 493 if (tdepth > tags->nr_tags) { 555 494 struct blk_mq_tag_set *set = hctx->queue->tag_set; 495 + /* Only sched tags can grow, so clear HCTX_SHARED flag */ 496 + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; 556 497 struct blk_mq_tags *new; 557 498 bool ret; 558 499 ··· 569 506 return -EINVAL; 570 507 571 508 new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 572 - tags->nr_reserved_tags); 509 + tags->nr_reserved_tags, flags); 573 510 if (!new) 574 511 return -ENOMEM; 575 512 ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); 576 513 if (ret) { 577 - blk_mq_free_rq_map(new); 514 + blk_mq_free_rq_map(new, flags); 578 515 return -ENOMEM; 579 516 } 580 517 581 518 blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); 582 - blk_mq_free_rq_map(*tagsptr); 519 + blk_mq_free_rq_map(*tagsptr, flags); 583 520 *tagsptr = new; 584 521 } else { 585 522 /* 586 523 * Don't need (or can't) update reserved tags here, they 587 524 * remain static and should never need resizing. 588 525 */ 589 - sbitmap_queue_resize(&tags->bitmap_tags, 526 + sbitmap_queue_resize(tags->bitmap_tags, 590 527 tdepth - tags->nr_reserved_tags); 591 528 } 592 529 593 530 return 0; 531 + } 532 + 533 + void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size) 534 + { 535 + sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); 594 536 } 595 537 596 538 /**

+17 -39

block/blk-mq-tag.h

··· 2 2 #ifndef INT_BLK_MQ_TAG_H 3 3 #define INT_BLK_MQ_TAG_H 4 4 5 - #include "blk-mq.h" 6 - 7 5 /* 8 6 * Tag address space map. 9 7 */ ··· 11 13 12 14 atomic_t active_queues; 13 15 14 - struct sbitmap_queue bitmap_tags; 15 - struct sbitmap_queue breserved_tags; 16 + struct sbitmap_queue *bitmap_tags; 17 + struct sbitmap_queue *breserved_tags; 18 + 19 + struct sbitmap_queue __bitmap_tags; 20 + struct sbitmap_queue __breserved_tags; 16 21 17 22 struct request **rqs; 18 23 struct request **static_rqs; 19 24 struct list_head page_list; 20 25 }; 21 26 27 + extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, 28 + unsigned int reserved_tags, 29 + int node, unsigned int flags); 30 + extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); 22 31 23 - extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); 24 - extern void blk_mq_free_tags(struct blk_mq_tags *tags); 32 + extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, 33 + unsigned int flags); 34 + extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); 25 35 26 36 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); 27 37 extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, ··· 37 31 extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, 38 32 struct blk_mq_tags **tags, 39 33 unsigned int depth, bool can_grow); 34 + extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, 35 + unsigned int size); 36 + 40 37 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); 41 38 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, 42 39 void *priv); ··· 65 56 66 57 static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) 67 58 { 68 - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) 59 + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) 69 60 return false; 70 61 71 62 return __blk_mq_tag_busy(hctx); ··· 73 64 74 65 
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) 75 66 { 76 - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) 67 + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) 77 68 return; 78 69 79 70 __blk_mq_tag_idle(hctx); 80 - } 81 - 82 - /* 83 - * For shared tag users, we track the number of currently active users 84 - * and attempt to provide a fair share of the tag depth for each of them. 85 - */ 86 - static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, 87 - struct sbitmap_queue *bt) 88 - { 89 - unsigned int depth, users; 90 - 91 - if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) 92 - return true; 93 - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 94 - return true; 95 - 96 - /* 97 - * Don't try dividing an ant 98 - */ 99 - if (bt->sb.depth == 1) 100 - return true; 101 - 102 - users = atomic_read(&hctx->tags->active_queues); 103 - if (!users) 104 - return true; 105 - 106 - /* 107 - * Allow at least some tags 108 - */ 109 - depth = max((bt->sb.depth + users - 1) / users, 4U); 110 - return atomic_read(&hctx->nr_active) < depth; 111 71 } 112 72 113 73 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,

+54 -32

block/blk-mq.c

··· 519 519 520 520 ctx->rq_completed[rq_is_sync(rq)]++; 521 521 if (rq->rq_flags & RQF_MQ_INFLIGHT) 522 - atomic_dec(&hctx->nr_active); 522 + __blk_mq_dec_active_requests(hctx); 523 523 524 524 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) 525 525 laptop_io_completion(q->backing_dev_info); ··· 1096 1096 1097 1097 static bool __blk_mq_get_driver_tag(struct request *rq) 1098 1098 { 1099 - struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; 1099 + struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags; 1100 1100 unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; 1101 1101 int tag; 1102 1102 1103 1103 blk_mq_tag_busy(rq->mq_hctx); 1104 1104 1105 1105 if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { 1106 - bt = &rq->mq_hctx->tags->breserved_tags; 1106 + bt = rq->mq_hctx->tags->breserved_tags; 1107 1107 tag_offset = 0; 1108 + } else { 1109 + if (!hctx_may_queue(rq->mq_hctx, bt)) 1110 + return false; 1108 1111 } 1109 1112 1110 - if (!hctx_may_queue(rq->mq_hctx, bt)) 1111 - return false; 1112 1113 tag = __sbitmap_queue_get(bt); 1113 1114 if (tag == BLK_MQ_NO_TAG) 1114 1115 return false; ··· 1125 1124 if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) 1126 1125 return false; 1127 1126 1128 - if ((hctx->flags & BLK_MQ_F_TAG_SHARED) && 1127 + if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && 1129 1128 !(rq->rq_flags & RQF_MQ_INFLIGHT)) { 1130 1129 rq->rq_flags |= RQF_MQ_INFLIGHT; 1131 - atomic_inc(&hctx->nr_active); 1130 + __blk_mq_inc_active_requests(hctx); 1132 1131 } 1133 1132 hctx->tags->rqs[rq->tag] = rq; 1134 1133 return true; ··· 1146 1145 struct sbitmap_queue *sbq; 1147 1146 1148 1147 list_del_init(&wait->entry); 1149 - sbq = &hctx->tags->bitmap_tags; 1148 + sbq = hctx->tags->bitmap_tags; 1150 1149 atomic_dec(&sbq->ws_active); 1151 1150 } 1152 1151 spin_unlock(&hctx->dispatch_wait_lock); ··· 1164 1163 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, 1165 1164 struct request *rq) 1166 1165 
{ 1167 - struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; 1166 + struct sbitmap_queue *sbq = hctx->tags->bitmap_tags; 1168 1167 struct wait_queue_head *wq; 1169 1168 wait_queue_entry_t *wait; 1170 1169 bool ret; 1171 1170 1172 - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { 1171 + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { 1173 1172 blk_mq_sched_mark_restart_hctx(hctx); 1174 1173 1175 1174 /* ··· 1421 1420 bool needs_restart; 1422 1421 /* For non-shared tags, the RESTART check will suffice */ 1423 1422 bool no_tag = prep == PREP_DISPATCH_NO_TAG && 1424 - (hctx->flags & BLK_MQ_F_TAG_SHARED); 1423 + (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); 1425 1424 bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; 1426 1425 1427 1426 blk_mq_release_budgets(q, nr_budgets); ··· 2297 2296 } 2298 2297 } 2299 2298 2300 - void blk_mq_free_rq_map(struct blk_mq_tags *tags) 2299 + void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) 2301 2300 { 2302 2301 kfree(tags->rqs); 2303 2302 tags->rqs = NULL; 2304 2303 kfree(tags->static_rqs); 2305 2304 tags->static_rqs = NULL; 2306 2305 2307 - blk_mq_free_tags(tags); 2306 + blk_mq_free_tags(tags, flags); 2308 2307 } 2309 2308 2310 2309 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 2311 2310 unsigned int hctx_idx, 2312 2311 unsigned int nr_tags, 2313 - unsigned int reserved_tags) 2312 + unsigned int reserved_tags, 2313 + unsigned int flags) 2314 2314 { 2315 2315 struct blk_mq_tags *tags; 2316 2316 int node; ··· 2320 2318 if (node == NUMA_NO_NODE) 2321 2319 node = set->numa_node; 2322 2320 2323 - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, 2324 - BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 2321 + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags); 2325 2322 if (!tags) 2326 2323 return NULL; 2327 2324 ··· 2328 2327 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2329 2328 node); 2330 2329 if (!tags->rqs) { 2331 - blk_mq_free_tags(tags); 2330 + blk_mq_free_tags(tags, flags); 2332 2331 
return NULL; 2333 2332 } 2334 2333 ··· 2337 2336 node); 2338 2337 if (!tags->static_rqs) { 2339 2338 kfree(tags->rqs); 2340 - blk_mq_free_tags(tags); 2339 + blk_mq_free_tags(tags, flags); 2341 2340 return NULL; 2342 2341 } 2343 2342 ··· 2661 2660 goto free_hctx; 2662 2661 2663 2662 atomic_set(&hctx->nr_active, 0); 2663 + atomic_set(&hctx->elevator_queued, 0); 2664 2664 if (node == NUMA_NO_NODE) 2665 2665 node = set->numa_node; 2666 2666 hctx->numa_node = node; ··· 2670 2668 spin_lock_init(&hctx->lock); 2671 2669 INIT_LIST_HEAD(&hctx->dispatch); 2672 2670 hctx->queue = q; 2673 - hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; 2671 + hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; 2674 2672 2675 2673 INIT_LIST_HEAD(&hctx->hctx_list); 2676 2674 ··· 2747 2745 static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, 2748 2746 int hctx_idx) 2749 2747 { 2748 + unsigned int flags = set->flags; 2750 2749 int ret = 0; 2751 2750 2752 2751 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, 2753 - set->queue_depth, set->reserved_tags); 2752 + set->queue_depth, set->reserved_tags, flags); 2754 2753 if (!set->tags[hctx_idx]) 2755 2754 return false; 2756 2755 ··· 2760 2757 if (!ret) 2761 2758 return true; 2762 2759 2763 - blk_mq_free_rq_map(set->tags[hctx_idx]); 2760 + blk_mq_free_rq_map(set->tags[hctx_idx], flags); 2764 2761 set->tags[hctx_idx] = NULL; 2765 2762 return false; 2766 2763 } ··· 2768 2765 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, 2769 2766 unsigned int hctx_idx) 2770 2767 { 2768 + unsigned int flags = set->flags; 2769 + 2771 2770 if (set->tags && set->tags[hctx_idx]) { 2772 2771 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); 2773 - blk_mq_free_rq_map(set->tags[hctx_idx]); 2772 + blk_mq_free_rq_map(set->tags[hctx_idx], flags); 2774 2773 set->tags[hctx_idx] = NULL; 2775 2774 } 2776 2775 } ··· 2890 2885 2891 2886 queue_for_each_hw_ctx(q, hctx, i) { 2892 2887 if (shared) 2893 - hctx->flags |= 
BLK_MQ_F_TAG_SHARED; 2888 + hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 2894 2889 else 2895 - hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 2890 + hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 2896 2891 } 2897 2892 } 2898 2893 2899 - static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, 2900 - bool shared) 2894 + static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, 2895 + bool shared) 2901 2896 { 2902 2897 struct request_queue *q; 2903 2898 ··· 2918 2913 list_del(&q->tag_set_list); 2919 2914 if (list_is_singular(&set->tag_list)) { 2920 2915 /* just transitioned to unshared */ 2921 - set->flags &= ~BLK_MQ_F_TAG_SHARED; 2916 + set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 2922 2917 /* update existing queue */ 2923 - blk_mq_update_tag_set_depth(set, false); 2918 + blk_mq_update_tag_set_shared(set, false); 2924 2919 } 2925 2920 mutex_unlock(&set->tag_list_lock); 2926 2921 INIT_LIST_HEAD(&q->tag_set_list); ··· 2935 2930 * Check to see if we're transitioning to shared (from 1 to 2 queues). 
2936 2931 */ 2937 2932 if (!list_empty(&set->tag_list) && 2938 - !(set->flags & BLK_MQ_F_TAG_SHARED)) { 2939 - set->flags |= BLK_MQ_F_TAG_SHARED; 2933 + !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { 2934 + set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 2940 2935 /* update existing queue */ 2941 - blk_mq_update_tag_set_depth(set, true); 2936 + blk_mq_update_tag_set_shared(set, true); 2942 2937 } 2943 - if (set->flags & BLK_MQ_F_TAG_SHARED) 2938 + if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) 2944 2939 queue_set_hctx_shared(q, true); 2945 2940 list_add_tail(&q->tag_set_list, &set->tag_list); 2946 2941 ··· 3443 3438 if (ret) 3444 3439 goto out_free_mq_map; 3445 3440 3441 + if (blk_mq_is_sbitmap_shared(set->flags)) { 3442 + atomic_set(&set->active_queues_shared_sbitmap, 0); 3443 + 3444 + if (blk_mq_init_shared_sbitmap(set, set->flags)) { 3445 + ret = -ENOMEM; 3446 + goto out_free_mq_rq_maps; 3447 + } 3448 + } 3449 + 3446 3450 mutex_init(&set->tag_list_lock); 3447 3451 INIT_LIST_HEAD(&set->tag_list); 3448 3452 3449 3453 return 0; 3450 3454 3455 + out_free_mq_rq_maps: 3456 + for (i = 0; i < set->nr_hw_queues; i++) 3457 + blk_mq_free_map_and_requests(set, i); 3451 3458 out_free_mq_map: 3452 3459 for (i = 0; i < set->nr_maps; i++) { 3453 3460 kfree(set->map[i].mq_map); ··· 3477 3460 3478 3461 for (i = 0; i < set->nr_hw_queues; i++) 3479 3462 blk_mq_free_map_and_requests(set, i); 3463 + 3464 + if (blk_mq_is_sbitmap_shared(set->flags)) 3465 + blk_mq_exit_shared_sbitmap(set); 3480 3466 3481 3467 for (j = 0; j < set->nr_maps; j++) { 3482 3468 kfree(set->map[j].mq_map); ··· 3517 3497 if (!hctx->sched_tags) { 3518 3498 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, 3519 3499 false); 3500 + if (!ret && blk_mq_is_sbitmap_shared(set->flags)) 3501 + blk_mq_tag_resize_shared_sbitmap(set, nr); 3520 3502 } else { 3521 3503 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, 3522 3504 nr, true);

+73 -3

block/blk-mq.h

··· 53 53 */ 54 54 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 55 55 unsigned int hctx_idx); 56 - void blk_mq_free_rq_map(struct blk_mq_tags *tags); 56 + void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); 57 57 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 58 58 unsigned int hctx_idx, 59 59 unsigned int nr_tags, 60 - unsigned int reserved_tags); 60 + unsigned int reserved_tags, 61 + unsigned int flags); 61 62 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 62 63 unsigned int hctx_idx, unsigned int depth); 63 64 ··· 159 158 struct blk_mq_hw_ctx *hctx; 160 159 }; 161 160 161 + static inline bool blk_mq_is_sbitmap_shared(unsigned int flags) 162 + { 163 + return flags & BLK_MQ_F_TAG_HCTX_SHARED; 164 + } 165 + 162 166 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) 163 167 { 164 168 if (data->q->elevator) ··· 199 193 return true; 200 194 } 201 195 196 + static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) 197 + { 198 + if (blk_mq_is_sbitmap_shared(hctx->flags)) 199 + atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap); 200 + else 201 + atomic_inc(&hctx->nr_active); 202 + } 203 + 204 + static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) 205 + { 206 + if (blk_mq_is_sbitmap_shared(hctx->flags)) 207 + atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap); 208 + else 209 + atomic_dec(&hctx->nr_active); 210 + } 211 + 212 + static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) 213 + { 214 + if (blk_mq_is_sbitmap_shared(hctx->flags)) 215 + return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap); 216 + return atomic_read(&hctx->nr_active); 217 + } 202 218 static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, 203 219 struct request *rq) 204 220 { ··· 229 201 230 202 if (rq->rq_flags & RQF_MQ_INFLIGHT) { 231 203 rq->rq_flags &= 
~RQF_MQ_INFLIGHT; 232 - atomic_dec(&hctx->nr_active); 204 + __blk_mq_dec_active_requests(hctx); 233 205 } 234 206 } 235 207 ··· 280 252 /* Zoned block device write operation case: do not plug the BIO */ 281 253 return NULL; 282 254 } 255 + 256 + /* 257 + * For shared tag users, we track the number of currently active users 258 + * and attempt to provide a fair share of the tag depth for each of them. 259 + */ 260 + static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, 261 + struct sbitmap_queue *bt) 262 + { 263 + unsigned int depth, users; 264 + 265 + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) 266 + return true; 267 + 268 + /* 269 + * Don't try dividing an ant 270 + */ 271 + if (bt->sb.depth == 1) 272 + return true; 273 + 274 + if (blk_mq_is_sbitmap_shared(hctx->flags)) { 275 + struct request_queue *q = hctx->queue; 276 + struct blk_mq_tag_set *set = q->tag_set; 277 + 278 + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &q->queue_flags)) 279 + return true; 280 + users = atomic_read(&set->active_queues_shared_sbitmap); 281 + } else { 282 + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 283 + return true; 284 + users = atomic_read(&hctx->tags->active_queues); 285 + } 286 + 287 + if (!users) 288 + return true; 289 + 290 + /* 291 + * Allow at least some tags 292 + */ 293 + depth = max((bt->sb.depth + users - 1) / users, 4U); 294 + return __blk_mq_active_requests(hctx) < depth; 295 + } 296 + 283 297 284 298 #endif

+28 -12

block/blk-settings.c

··· 172 172 * 173 173 * Description: 174 174 * If a driver doesn't want IOs to cross a given chunk size, it can set 175 - * this limit and prevent merging across chunks. Note that the chunk size 176 - * must currently be a power-of-2 in sectors. Also note that the block 177 - * layer must accept a page worth of data at any offset. So if the 178 - * crossing of chunks is a hard limitation in the driver, it must still be 179 - * prepared to split single page bios. 175 + * this limit and prevent merging across chunks. Note that the block layer 176 + * must accept a page worth of data at any offset. So if the crossing of 177 + * chunks is a hard limitation in the driver, it must still be prepared 178 + * to split single page bios. 180 179 **/ 181 180 void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) 182 181 { 183 - BUG_ON(!is_power_of_2(chunk_sectors)); 184 182 q->limits.chunk_sectors = chunk_sectors; 185 183 } 186 184 EXPORT_SYMBOL(blk_queue_chunk_sectors); ··· 372 374 } 373 375 EXPORT_SYMBOL(blk_queue_alignment_offset); 374 376 377 + void blk_queue_update_readahead(struct request_queue *q) 378 + { 379 + /* 380 + * For read-ahead of large files to be effective, we need to read ahead 381 + * at least twice the optimal I/O size. 
382 + */ 383 + q->backing_dev_info->ra_pages = 384 + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); 385 + q->backing_dev_info->io_pages = 386 + queue_max_sectors(q) >> (PAGE_SHIFT - 9); 387 + } 388 + EXPORT_SYMBOL_GPL(blk_queue_update_readahead); 389 + 375 390 /** 376 391 * blk_limits_io_min - set minimum request size for a device 377 392 * @limits: the queue limits ··· 463 452 void blk_queue_io_opt(struct request_queue *q, unsigned int opt) 464 453 { 465 454 blk_limits_io_opt(&q->limits, opt); 455 + q->backing_dev_info->ra_pages = 456 + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); 466 457 } 467 458 EXPORT_SYMBOL(blk_queue_io_opt); 468 459 ··· 547 534 548 535 t->io_min = max(t->io_min, b->io_min); 549 536 t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); 537 + t->chunk_sectors = lcm_not_zero(t->chunk_sectors, b->chunk_sectors); 550 538 551 539 /* Physical block size a multiple of the logical block size? */ 552 540 if (t->physical_block_size & (t->logical_block_size - 1)) { ··· 566 552 /* Optimal I/O a multiple of the physical block size? */ 567 553 if (t->io_opt & (t->physical_block_size - 1)) { 568 554 t->io_opt = 0; 555 + t->misaligned = 1; 556 + ret = -1; 557 + } 558 + 559 + /* chunk_sectors a multiple of the physical block size? */ 560 + if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { 561 + t->chunk_sectors = 0; 569 562 t->misaligned = 1; 570 563 ret = -1; 571 564 } ··· 615 594 t->discard_granularity; 616 595 } 617 596 618 - if (b->chunk_sectors) 619 - t->chunk_sectors = min_not_zero(t->chunk_sectors, 620 - b->chunk_sectors); 621 - 622 597 t->zoned = max(t->zoned, b->zoned); 623 598 return ret; 624 599 } ··· 646 629 top, bottom); 647 630 } 648 631 649 - t->backing_dev_info->io_pages = 650 - t->limits.max_sectors >> (PAGE_SHIFT - 9); 632 + blk_queue_update_readahead(disk->queue); 651 633 } 652 634 EXPORT_SYMBOL(disk_stack_limits); 653 635

+59 -200

block/blk-sysfs.c

··· 260 260 261 261 #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ 262 262 static ssize_t \ 263 - queue_show_##name(struct request_queue *q, char *page) \ 263 + queue_##name##_show(struct request_queue *q, char *page) \ 264 264 { \ 265 265 int bit; \ 266 266 bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \ 267 267 return queue_var_show(neg ? !bit : bit, page); \ 268 268 } \ 269 269 static ssize_t \ 270 - queue_store_##name(struct request_queue *q, const char *page, size_t count) \ 270 + queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ 271 271 { \ 272 272 unsigned long val; \ 273 273 ssize_t ret; \ ··· 287 287 QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); 288 288 QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); 289 289 QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); 290 + QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); 290 291 #undef QUEUE_SYSFS_BIT_FNS 291 292 292 293 static ssize_t queue_zoned_show(struct request_queue *q, char *page) ··· 548 547 return queue_var_show(blk_queue_dax(q), page); 549 548 } 550 549 551 - static struct queue_sysfs_entry queue_requests_entry = { 552 - .attr = {.name = "nr_requests", .mode = 0644 }, 553 - .show = queue_requests_show, 554 - .store = queue_requests_store, 550 + #define QUEUE_RO_ENTRY(_prefix, _name) \ 551 + static struct queue_sysfs_entry _prefix##_entry = { \ 552 + .attr = { .name = _name, .mode = 0444 }, \ 553 + .show = _prefix##_show, \ 555 554 }; 556 555 557 - static struct queue_sysfs_entry queue_ra_entry = { 558 - .attr = {.name = "read_ahead_kb", .mode = 0644 }, 559 - .show = queue_ra_show, 560 - .store = queue_ra_store, 556 + #define QUEUE_RW_ENTRY(_prefix, _name) \ 557 + static struct queue_sysfs_entry _prefix##_entry = { \ 558 + .attr = { .name = _name, .mode = 0644 }, \ 559 + .show = _prefix##_show, \ 560 + .store = _prefix##_store, \ 561 561 }; 562 562 563 - static struct queue_sysfs_entry queue_max_sectors_entry = { 564 - .attr = {.name = "max_sectors_kb", .mode = 0644 }, 565 - .show = 
queue_max_sectors_show, 566 - .store = queue_max_sectors_store, 567 - }; 563 + QUEUE_RW_ENTRY(queue_requests, "nr_requests"); 564 + QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); 565 + QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); 566 + QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); 567 + QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); 568 + QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); 569 + QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); 570 + QUEUE_RW_ENTRY(elv_iosched, "scheduler"); 568 571 569 - static struct queue_sysfs_entry queue_max_hw_sectors_entry = { 570 - .attr = {.name = "max_hw_sectors_kb", .mode = 0444 }, 571 - .show = queue_max_hw_sectors_show, 572 - }; 572 + QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); 573 + QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); 574 + QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); 575 + QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size"); 576 + QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); 573 577 574 - static struct queue_sysfs_entry queue_max_segments_entry = { 575 - .attr = {.name = "max_segments", .mode = 0444 }, 576 - .show = queue_max_segments_show, 577 - }; 578 + QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); 579 + QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); 580 + QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); 581 + QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); 582 + QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); 578 583 579 - static struct queue_sysfs_entry queue_max_discard_segments_entry = { 580 - .attr = {.name = "max_discard_segments", .mode = 0444 }, 581 - .show = queue_max_discard_segments_show, 582 - }; 584 + QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); 585 + QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); 586 + QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); 583 587 584 - 
static struct queue_sysfs_entry queue_max_integrity_segments_entry = { 585 - .attr = {.name = "max_integrity_segments", .mode = 0444 }, 586 - .show = queue_max_integrity_segments_show, 587 - }; 588 + QUEUE_RO_ENTRY(queue_zoned, "zoned"); 589 + QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); 590 + QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); 591 + QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); 588 592 589 - static struct queue_sysfs_entry queue_max_segment_size_entry = { 590 - .attr = {.name = "max_segment_size", .mode = 0444 }, 591 - .show = queue_max_segment_size_show, 592 - }; 593 + QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); 594 + QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); 595 + QUEUE_RW_ENTRY(queue_poll, "io_poll"); 596 + QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); 597 + QUEUE_RW_ENTRY(queue_wc, "write_cache"); 598 + QUEUE_RO_ENTRY(queue_fua, "fua"); 599 + QUEUE_RO_ENTRY(queue_dax, "dax"); 600 + QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); 601 + QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); 593 602 594 - static struct queue_sysfs_entry queue_iosched_entry = { 595 - .attr = {.name = "scheduler", .mode = 0644 }, 596 - .show = elv_iosched_show, 597 - .store = elv_iosched_store, 598 - }; 603 + #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 604 + QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); 605 + #endif 599 606 607 + /* legacy alias for logical_block_size: */ 600 608 static struct queue_sysfs_entry queue_hw_sector_size_entry = { 601 609 .attr = {.name = "hw_sector_size", .mode = 0444 }, 602 610 .show = queue_logical_block_size_show, 603 611 }; 604 612 605 - static struct queue_sysfs_entry queue_logical_block_size_entry = { 606 - .attr = {.name = "logical_block_size", .mode = 0444 }, 607 - .show = queue_logical_block_size_show, 608 - }; 609 - 610 - static struct queue_sysfs_entry queue_physical_block_size_entry = { 611 - .attr = {.name = "physical_block_size", .mode = 0444 }, 612 - .show = queue_physical_block_size_show, 
613 - }; 614 - 615 - static struct queue_sysfs_entry queue_chunk_sectors_entry = { 616 - .attr = {.name = "chunk_sectors", .mode = 0444 }, 617 - .show = queue_chunk_sectors_show, 618 - }; 619 - 620 - static struct queue_sysfs_entry queue_io_min_entry = { 621 - .attr = {.name = "minimum_io_size", .mode = 0444 }, 622 - .show = queue_io_min_show, 623 - }; 624 - 625 - static struct queue_sysfs_entry queue_io_opt_entry = { 626 - .attr = {.name = "optimal_io_size", .mode = 0444 }, 627 - .show = queue_io_opt_show, 628 - }; 629 - 630 - static struct queue_sysfs_entry queue_discard_granularity_entry = { 631 - .attr = {.name = "discard_granularity", .mode = 0444 }, 632 - .show = queue_discard_granularity_show, 633 - }; 634 - 635 - static struct queue_sysfs_entry queue_discard_max_hw_entry = { 636 - .attr = {.name = "discard_max_hw_bytes", .mode = 0444 }, 637 - .show = queue_discard_max_hw_show, 638 - }; 639 - 640 - static struct queue_sysfs_entry queue_discard_max_entry = { 641 - .attr = {.name = "discard_max_bytes", .mode = 0644 }, 642 - .show = queue_discard_max_show, 643 - .store = queue_discard_max_store, 644 - }; 645 - 646 - static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { 647 - .attr = {.name = "discard_zeroes_data", .mode = 0444 }, 648 - .show = queue_discard_zeroes_data_show, 649 - }; 650 - 651 - static struct queue_sysfs_entry queue_write_same_max_entry = { 652 - .attr = {.name = "write_same_max_bytes", .mode = 0444 }, 653 - .show = queue_write_same_max_show, 654 - }; 655 - 656 - static struct queue_sysfs_entry queue_write_zeroes_max_entry = { 657 - .attr = {.name = "write_zeroes_max_bytes", .mode = 0444 }, 658 - .show = queue_write_zeroes_max_show, 659 - }; 660 - 661 - static struct queue_sysfs_entry queue_zone_append_max_entry = { 662 - .attr = {.name = "zone_append_max_bytes", .mode = 0444 }, 663 - .show = queue_zone_append_max_show, 664 - }; 665 - 666 - static struct queue_sysfs_entry queue_nonrot_entry = { 667 - .attr = {.name = "rotational", 
.mode = 0644 }, 668 - .show = queue_show_nonrot, 669 - .store = queue_store_nonrot, 670 - }; 671 - 672 - static struct queue_sysfs_entry queue_zoned_entry = { 673 - .attr = {.name = "zoned", .mode = 0444 }, 674 - .show = queue_zoned_show, 675 - }; 676 - 677 - static struct queue_sysfs_entry queue_nr_zones_entry = { 678 - .attr = {.name = "nr_zones", .mode = 0444 }, 679 - .show = queue_nr_zones_show, 680 - }; 681 - 682 - static struct queue_sysfs_entry queue_max_open_zones_entry = { 683 - .attr = {.name = "max_open_zones", .mode = 0444 }, 684 - .show = queue_max_open_zones_show, 685 - }; 686 - 687 - static struct queue_sysfs_entry queue_max_active_zones_entry = { 688 - .attr = {.name = "max_active_zones", .mode = 0444 }, 689 - .show = queue_max_active_zones_show, 690 - }; 691 - 692 - static struct queue_sysfs_entry queue_nomerges_entry = { 693 - .attr = {.name = "nomerges", .mode = 0644 }, 694 - .show = queue_nomerges_show, 695 - .store = queue_nomerges_store, 696 - }; 697 - 698 - static struct queue_sysfs_entry queue_rq_affinity_entry = { 699 - .attr = {.name = "rq_affinity", .mode = 0644 }, 700 - .show = queue_rq_affinity_show, 701 - .store = queue_rq_affinity_store, 702 - }; 703 - 704 - static struct queue_sysfs_entry queue_iostats_entry = { 705 - .attr = {.name = "iostats", .mode = 0644 }, 706 - .show = queue_show_iostats, 707 - .store = queue_store_iostats, 708 - }; 709 - 710 - static struct queue_sysfs_entry queue_random_entry = { 711 - .attr = {.name = "add_random", .mode = 0644 }, 712 - .show = queue_show_random, 713 - .store = queue_store_random, 714 - }; 715 - 716 - static struct queue_sysfs_entry queue_poll_entry = { 717 - .attr = {.name = "io_poll", .mode = 0644 }, 718 - .show = queue_poll_show, 719 - .store = queue_poll_store, 720 - }; 721 - 722 - static struct queue_sysfs_entry queue_poll_delay_entry = { 723 - .attr = {.name = "io_poll_delay", .mode = 0644 }, 724 - .show = queue_poll_delay_show, 725 - .store = queue_poll_delay_store, 726 - }; 727 - 728 
- static struct queue_sysfs_entry queue_wc_entry = { 729 - .attr = {.name = "write_cache", .mode = 0644 }, 730 - .show = queue_wc_show, 731 - .store = queue_wc_store, 732 - }; 733 - 734 - static struct queue_sysfs_entry queue_fua_entry = { 735 - .attr = {.name = "fua", .mode = 0444 }, 736 - .show = queue_fua_show, 737 - }; 738 - 739 - static struct queue_sysfs_entry queue_dax_entry = { 740 - .attr = {.name = "dax", .mode = 0444 }, 741 - .show = queue_dax_show, 742 - }; 743 - 744 - static struct queue_sysfs_entry queue_io_timeout_entry = { 745 - .attr = {.name = "io_timeout", .mode = 0644 }, 746 - .show = queue_io_timeout_show, 747 - .store = queue_io_timeout_store, 748 - }; 749 - 750 - static struct queue_sysfs_entry queue_wb_lat_entry = { 751 - .attr = {.name = "wbt_lat_usec", .mode = 0644 }, 752 - .show = queue_wb_lat_show, 753 - .store = queue_wb_lat_store, 754 - }; 755 - 756 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 757 - static struct queue_sysfs_entry throtl_sample_time_entry = { 758 - .attr = {.name = "throttle_sample_time", .mode = 0644 }, 759 - .show = blk_throtl_sample_time_show, 760 - .store = blk_throtl_sample_time_store, 761 - }; 762 - #endif 613 + QUEUE_RW_ENTRY(queue_nonrot, "rotational"); 614 + QUEUE_RW_ENTRY(queue_iostats, "iostats"); 615 + QUEUE_RW_ENTRY(queue_random, "add_random"); 616 + QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); 763 617 764 618 static struct attribute *queue_attrs[] = { 765 619 &queue_requests_entry.attr, ··· 625 769 &queue_max_discard_segments_entry.attr, 626 770 &queue_max_integrity_segments_entry.attr, 627 771 &queue_max_segment_size_entry.attr, 628 - &queue_iosched_entry.attr, 772 + &elv_iosched_entry.attr, 629 773 &queue_hw_sector_size_entry.attr, 630 774 &queue_logical_block_size_entry.attr, 631 775 &queue_physical_block_size_entry.attr, ··· 647 791 &queue_nomerges_entry.attr, 648 792 &queue_rq_affinity_entry.attr, 649 793 &queue_iostats_entry.attr, 794 + &queue_stable_writes_entry.attr, 650 795 
&queue_random_entry.attr, 651 796 &queue_poll_entry.attr, 652 797 &queue_wc_entry.attr, ··· 657 800 &queue_poll_delay_entry.attr, 658 801 &queue_io_timeout_entry.attr, 659 802 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 660 - &throtl_sample_time_entry.attr, 803 + &blk_throtl_sample_time_entry.attr, 661 804 #endif 662 805 NULL, 663 806 }; ··· 856 999 blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); 857 1000 percpu_ref_switch_to_percpu(&q->q_usage_counter); 858 1001 } 1002 + 1003 + blk_queue_update_readahead(q); 859 1004 860 1005 ret = blk_trace_init_sysfs(dev); 861 1006 if (ret)

+36 -23

block/blk-throttle.c

··· 15 15 #include "blk-cgroup-rwstat.h" 16 16 17 17 /* Max dispatch from a group in 1 round */ 18 - static int throtl_grp_quantum = 8; 18 + #define THROTL_GRP_QUANTUM 8 19 19 20 20 /* Total max dispatch from all groups in one round */ 21 - static int throtl_quantum = 32; 21 + #define THROTL_QUANTUM 32 22 22 23 23 /* Throttling is performed over a slice and after that slice is renewed */ 24 24 #define DFL_THROTL_SLICE_HD (HZ / 10) ··· 150 150 /* user configured IOPS limits */ 151 151 unsigned int iops_conf[2][LIMIT_CNT]; 152 152 153 - /* Number of bytes disptached in current slice */ 153 + /* Number of bytes dispatched in current slice */ 154 154 uint64_t bytes_disp[2]; 155 155 /* Number of bio's dispatched in current slice */ 156 156 unsigned int io_disp[2]; ··· 852 852 /* 853 853 * A bio has been dispatched. Also adjust slice_end. It might happen 854 854 * that initially cgroup limit was very low resulting in high 855 - * slice_end, but later limit was bumped up and bio was dispached 855 + * slice_end, but later limit was bumped up and bio was dispatched 856 856 * sooner, then we need to reduce slice_end. A high bogus slice_end 857 857 * is bad because it does not allow new slice to start. 858 858 */ ··· 894 894 } 895 895 896 896 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, 897 - unsigned long *wait) 897 + u32 iops_limit, unsigned long *wait) 898 898 { 899 899 bool rw = bio_data_dir(bio); 900 900 unsigned int io_allowed; 901 901 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 902 902 u64 tmp; 903 + 904 + if (iops_limit == UINT_MAX) { 905 + if (wait) 906 + *wait = 0; 907 + return true; 908 + } 903 909 904 910 jiffy_elapsed = jiffies - tg->slice_start[rw]; 905 911 ··· 919 913 * have been trimmed. 
920 914 */ 921 915 922 - tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; 916 + tmp = (u64)iops_limit * jiffy_elapsed_rnd; 923 917 do_div(tmp, HZ); 924 918 925 919 if (tmp > UINT_MAX) ··· 942 936 } 943 937 944 938 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, 945 - unsigned long *wait) 939 + u64 bps_limit, unsigned long *wait) 946 940 { 947 941 bool rw = bio_data_dir(bio); 948 942 u64 bytes_allowed, extra_bytes, tmp; 949 943 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 950 944 unsigned int bio_size = throtl_bio_data_size(bio); 945 + 946 + if (bps_limit == U64_MAX) { 947 + if (wait) 948 + *wait = 0; 949 + return true; 950 + } 951 951 952 952 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 953 953 ··· 963 951 964 952 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); 965 953 966 - tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd; 954 + tmp = bps_limit * jiffy_elapsed_rnd; 967 955 do_div(tmp, HZ); 968 956 bytes_allowed = tmp; 969 957 ··· 975 963 976 964 /* Calc approx time to dispatch */ 977 965 extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; 978 - jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw)); 966 + jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); 979 967 980 968 if (!jiffy_wait) 981 969 jiffy_wait = 1; ··· 999 987 { 1000 988 bool rw = bio_data_dir(bio); 1001 989 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 990 + u64 bps_limit = tg_bps_limit(tg, rw); 991 + u32 iops_limit = tg_iops_limit(tg, rw); 1002 992 1003 993 /* 1004 994 * Currently whole state machine of group depends on first bio ··· 1012 998 bio != throtl_peek_queued(&tg->service_queue.queued[rw])); 1013 999 1014 1000 /* If tg->bps = -1, then BW is unlimited */ 1015 - if (tg_bps_limit(tg, rw) == U64_MAX && 1016 - tg_iops_limit(tg, rw) == UINT_MAX) { 1001 + if (bps_limit == U64_MAX && iops_limit == UINT_MAX) { 1017 1002 if (wait) 1018 1003 *wait = 0; 1019 1004 return true; ··· 1034 
1021 jiffies + tg->td->throtl_slice); 1035 1022 } 1036 1023 1037 - if (tg_with_in_bps_limit(tg, bio, &bps_wait) && 1038 - tg_with_in_iops_limit(tg, bio, &iops_wait)) { 1024 + if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && 1025 + tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { 1039 1026 if (wait) 1040 1027 *wait = 0; 1041 1028 return true; ··· 1095 1082 * If @tg doesn't currently have any bios queued in the same 1096 1083 * direction, queueing @bio can change when @tg should be 1097 1084 * dispatched. Mark that @tg was empty. This is automatically 1098 - * cleaered on the next tg_update_disptime(). 1085 + * cleared on the next tg_update_disptime(). 1099 1086 */ 1100 1087 if (!sq->nr_queued[rw]) 1101 1088 tg->flags |= THROTL_TG_WAS_EMPTY; ··· 1188 1175 { 1189 1176 struct throtl_service_queue *sq = &tg->service_queue; 1190 1177 unsigned int nr_reads = 0, nr_writes = 0; 1191 - unsigned int max_nr_reads = throtl_grp_quantum*3/4; 1192 - unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; 1178 + unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; 1179 + unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; 1193 1180 struct bio *bio; 1194 1181 1195 1182 /* Try to dispatch 75% READS and 25% WRITES */ ··· 1239 1226 if (sq->nr_queued[0] || sq->nr_queued[1]) 1240 1227 tg_update_disptime(tg); 1241 1228 1242 - if (nr_disp >= throtl_quantum) 1229 + if (nr_disp >= THROTL_QUANTUM) 1243 1230 break; 1244 1231 } 1245 1232 ··· 1316 1303 } 1317 1304 } 1318 1305 } else { 1319 - /* reached the top-level, queue issueing */ 1306 + /* reached the top-level, queue issuing */ 1320 1307 queue_work(kthrotld_workqueue, &td->dispatch_work); 1321 1308 } 1322 1309 out_unlock: ··· 1327 1314 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work 1328 1315 * @work: work item being executed 1329 1316 * 1330 - * This function is queued for execution when bio's reach the bio_lists[] 1331 - * of throtl_data->service_queue. 
Those bio's are ready and issued by this 1317 + * This function is queued for execution when bios reach the bio_lists[] 1318 + * of throtl_data->service_queue. Those bios are ready and issued by this 1332 1319 * function. 1333 1320 */ 1334 1321 static void blk_throtl_dispatch_work_fn(struct work_struct *work) ··· 1441 1428 * that a group's limit are dropped suddenly and we don't want to 1442 1429 * account recently dispatched IO with new low rate. 1443 1430 */ 1444 - throtl_start_new_slice(tg, 0); 1445 - throtl_start_new_slice(tg, 1); 1431 + throtl_start_new_slice(tg, READ); 1432 + throtl_start_new_slice(tg, WRITE); 1446 1433 1447 1434 if (tg->flags & THROTL_TG_PENDING) { 1448 1435 tg_update_disptime(tg); ··· 2243 2230 2244 2231 /* 2245 2232 * @bio passed through this layer without being throttled. 2246 - * Climb up the ladder. If we''re already at the top, it 2233 + * Climb up the ladder. If we're already at the top, it 2247 2234 * can be executed directly. 2248 2235 */ 2249 2236 qn = &tg->qnode_on_parent[rw];

+18 -7

block/blk.h

··· 29 29 spinlock_t mq_flush_lock; 30 30 }; 31 31 32 + enum bio_merge_status { 33 + BIO_MERGE_OK, 34 + BIO_MERGE_NONE, 35 + BIO_MERGE_FAILED, 36 + }; 37 + 32 38 extern struct kmem_cache *blk_requestq_cachep; 33 39 extern struct kobj_type blk_queue_ktype; 34 40 extern struct ida blk_queue_ida; ··· 175 169 unsigned long blk_rq_timeout(unsigned long timeout); 176 170 void blk_add_timer(struct request *req); 177 171 178 - bool bio_attempt_front_merge(struct request *req, struct bio *bio, 179 - unsigned int nr_segs); 180 - bool bio_attempt_back_merge(struct request *req, struct bio *bio, 181 - unsigned int nr_segs); 182 - bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, 183 - struct bio *bio); 172 + enum bio_merge_status bio_attempt_front_merge(struct request *req, 173 + struct bio *bio, 174 + unsigned int nr_segs); 175 + enum bio_merge_status bio_attempt_back_merge(struct request *req, 176 + struct bio *bio, 177 + unsigned int nr_segs); 178 + enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, 179 + struct request *req, 180 + struct bio *bio); 184 181 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 185 182 unsigned int nr_segs, struct request **same_queue_rq); 183 + bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, 184 + struct bio *bio, unsigned int nr_segs); 186 185 187 186 void blk_account_io_start(struct request *req); 188 187 void blk_account_io_done(struct request *req, u64 now); ··· 361 350 #define ADDPART_FLAG_NONE 0 362 351 #define ADDPART_FLAG_RAID 1 363 352 #define ADDPART_FLAG_WHOLEDISK 2 364 - void delete_partition(struct gendisk *disk, struct hd_struct *part); 353 + void delete_partition(struct hd_struct *part); 365 354 int bdev_add_partition(struct block_device *bdev, int partno, 366 355 sector_t start, sector_t length); 367 356 int bdev_del_partition(struct block_device *bdev, int partno);

+1 -1

block/bsg-lib.c

··· 207 207 208 208 BUG_ON(!req->nr_phys_segments); 209 209 210 - buf->sg_list = kzalloc(sz, GFP_KERNEL); 210 + buf->sg_list = kmalloc(sz, GFP_KERNEL); 211 211 if (!buf->sg_list) 212 212 return -ENOMEM; 213 213 sg_init_table(buf->sg_list, req->nr_phys_segments);

+89 -67

block/genhd.c

··· 50 50 * zero and will not be set to zero 51 51 */ 52 52 void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, 53 - bool revalidate) 53 + bool update_bdev) 54 54 { 55 55 sector_t capacity = get_capacity(disk); 56 56 57 57 set_capacity(disk, size); 58 - 59 - if (revalidate) 60 - revalidate_disk(disk); 58 + if (update_bdev) 59 + revalidate_disk_size(disk, true); 61 60 62 61 if (capacity != size && capacity != 0 && size != 0) { 63 62 char *envp[] = { "RESIZE=1", NULL }; ··· 109 110 } 110 111 } 111 112 112 - static unsigned int part_in_flight(struct request_queue *q, 113 - struct hd_struct *part) 113 + static unsigned int part_in_flight(struct hd_struct *part) 114 114 { 115 115 unsigned int inflight = 0; 116 116 int cpu; ··· 124 126 return inflight; 125 127 } 126 128 127 - static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, 128 - unsigned int inflight[2]) 129 + static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) 129 130 { 130 131 int cpu; 131 132 ··· 673 676 return 0; 674 677 } 675 678 679 + static void disk_scan_partitions(struct gendisk *disk) 680 + { 681 + struct block_device *bdev; 682 + 683 + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) 684 + return; 685 + 686 + set_bit(GD_NEED_PART_SCAN, &disk->state); 687 + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); 688 + if (!IS_ERR(bdev)) 689 + blkdev_put(bdev, FMODE_READ); 690 + } 691 + 676 692 static void register_disk(struct device *parent, struct gendisk *disk, 677 693 const struct attribute_group **groups) 678 694 { 679 695 struct device *ddev = disk_to_dev(disk); 680 - struct block_device *bdev; 681 696 struct disk_part_iter piter; 682 697 struct hd_struct *part; 683 698 int err; ··· 731 722 return; 732 723 } 733 724 734 - /* No minors to use for partitions */ 735 - if (!disk_part_scan_enabled(disk)) 736 - goto exit; 725 + disk_scan_partitions(disk); 737 726 738 - /* No such device (e.g., media were just 
removed) */ 739 - if (!get_capacity(disk)) 740 - goto exit; 741 - 742 - bdev = bdget_disk(disk, 0); 743 - if (!bdev) 744 - goto exit; 745 - 746 - bdev->bd_invalidated = 1; 747 - err = blkdev_get(bdev, FMODE_READ, NULL); 748 - if (err < 0) 749 - goto exit; 750 - blkdev_put(bdev, FMODE_READ); 751 - 752 - exit: 753 727 /* announce disk after possible partitions are created */ 754 728 dev_set_uevent_suppress(ddev, 0); 755 729 kobject_uevent(&ddev->kobj, KOBJ_ADD); ··· 905 913 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); 906 914 while ((part = disk_part_iter_next(&piter))) { 907 915 invalidate_partition(disk, part->partno); 908 - delete_partition(disk, part); 916 + delete_partition(part); 909 917 } 910 918 disk_part_iter_exit(&piter); 911 919 ··· 1293 1301 if (queue_is_mq(q)) 1294 1302 inflight = blk_mq_in_flight(q, p); 1295 1303 else 1296 - inflight = part_in_flight(q, p); 1304 + inflight = part_in_flight(p); 1297 1305 1298 1306 return sprintf(buf, 1299 1307 "%8lu %8lu %8llu %8u " ··· 1335 1343 if (queue_is_mq(q)) 1336 1344 blk_mq_in_flight_rw(q, p, inflight); 1337 1345 else 1338 - part_in_flight_rw(q, p, inflight); 1346 + part_in_flight_rw(p, inflight); 1339 1347 1340 1348 return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); 1341 1349 } ··· 1615 1623 if (queue_is_mq(gp->queue)) 1616 1624 inflight = blk_mq_in_flight(gp->queue, hd); 1617 1625 else 1618 - inflight = part_in_flight(gp->queue, hd); 1626 + inflight = part_in_flight(hd); 1619 1627 1620 1628 seq_printf(seqf, "%4d %7d %s " 1621 1629 "%lu %lu %lu %u " ··· 1721 1729 } 1722 1730 1723 1731 disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); 1724 - if (disk) { 1725 - disk->part0.dkstats = alloc_percpu(struct disk_stats); 1726 - if (!disk->part0.dkstats) { 1727 - kfree(disk); 1728 - return NULL; 1729 - } 1730 - init_rwsem(&disk->lookup_sem); 1731 - disk->node_id = node_id; 1732 - if (disk_expand_part_tbl(disk, 0)) { 1733 - free_percpu(disk->part0.dkstats); 1734 - kfree(disk); 1735 - return NULL; 
1736 - } 1737 - ptbl = rcu_dereference_protected(disk->part_tbl, 1); 1738 - rcu_assign_pointer(ptbl->part[0], &disk->part0); 1732 + if (!disk) 1733 + return NULL; 1739 1734 1740 - /* 1741 - * set_capacity() and get_capacity() currently don't use 1742 - * seqcounter to read/update the part0->nr_sects. Still init 1743 - * the counter as we can read the sectors in IO submission 1744 - * patch using seqence counters. 1745 - * 1746 - * TODO: Ideally set_capacity() and get_capacity() should be 1747 - * converted to make use of bd_mutex and sequence counters. 1748 - */ 1749 - hd_sects_seq_init(&disk->part0); 1750 - if (hd_ref_init(&disk->part0)) { 1751 - hd_free_part(&disk->part0); 1752 - kfree(disk); 1753 - return NULL; 1754 - } 1735 + disk->part0.dkstats = alloc_percpu(struct disk_stats); 1736 + if (!disk->part0.dkstats) 1737 + goto out_free_disk; 1755 1738 1756 - disk->minors = minors; 1757 - rand_initialize_disk(disk); 1758 - disk_to_dev(disk)->class = &block_class; 1759 - disk_to_dev(disk)->type = &disk_type; 1760 - device_initialize(disk_to_dev(disk)); 1739 + init_rwsem(&disk->lookup_sem); 1740 + disk->node_id = node_id; 1741 + if (disk_expand_part_tbl(disk, 0)) { 1742 + free_percpu(disk->part0.dkstats); 1743 + goto out_free_disk; 1761 1744 } 1745 + 1746 + ptbl = rcu_dereference_protected(disk->part_tbl, 1); 1747 + rcu_assign_pointer(ptbl->part[0], &disk->part0); 1748 + 1749 + /* 1750 + * set_capacity() and get_capacity() currently don't use 1751 + * seqcounter to read/update the part0->nr_sects. Still init 1752 + * the counter as we can read the sectors in IO submission 1753 + * patch using seqence counters. 1754 + * 1755 + * TODO: Ideally set_capacity() and get_capacity() should be 1756 + * converted to make use of bd_mutex and sequence counters. 
1757 + */ 1758 + hd_sects_seq_init(&disk->part0); 1759 + if (hd_ref_init(&disk->part0)) 1760 + goto out_free_part0; 1761 + 1762 + disk->minors = minors; 1763 + rand_initialize_disk(disk); 1764 + disk_to_dev(disk)->class = &block_class; 1765 + disk_to_dev(disk)->type = &disk_type; 1766 + device_initialize(disk_to_dev(disk)); 1762 1767 return disk; 1768 + 1769 + out_free_part0: 1770 + hd_free_part(&disk->part0); 1771 + out_free_disk: 1772 + kfree(disk); 1773 + return NULL; 1763 1774 } 1764 1775 EXPORT_SYMBOL(__alloc_disk_node); 1765 1776 ··· 2047 2052 * CONTEXT: 2048 2053 * Might sleep. 2049 2054 */ 2050 - unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) 2055 + static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) 2051 2056 { 2052 2057 struct disk_events *ev = disk->ev; 2053 2058 unsigned int pending; ··· 2084 2089 2085 2090 return pending; 2086 2091 } 2092 + 2093 + /** 2094 + * bdev_check_media_change - check if a removable media has been changed 2095 + * @bdev: block device to check 2096 + * 2097 + * Check whether a removable media has been changed, and attempt to free all 2098 + * dentries and inodes and invalidates all block device page cache entries in 2099 + * that case. 2100 + * 2101 + * Returns %true if the block device changed, or %false if not. 
2102 + */ 2103 + bool bdev_check_media_change(struct block_device *bdev) 2104 + { 2105 + unsigned int events; 2106 + 2107 + events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | 2108 + DISK_EVENT_EJECT_REQUEST); 2109 + if (!(events & DISK_EVENT_MEDIA_CHANGE)) 2110 + return false; 2111 + 2112 + if (__invalidate_device(bdev, true)) 2113 + pr_warn("VFS: busy inodes on changed media %s\n", 2114 + bdev->bd_disk->disk_name); 2115 + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); 2116 + return true; 2117 + } 2118 + EXPORT_SYMBOL(bdev_check_media_change); 2087 2119 2088 2120 /* 2089 2121 * Separate this part out so that a different pointer for clearing_ptr can be

+16 -13

block/ioctl.c

··· 112 112 uint64_t range[2]; 113 113 uint64_t start, len; 114 114 struct request_queue *q = bdev_get_queue(bdev); 115 - struct address_space *mapping = bdev->bd_inode->i_mapping; 116 - 115 + int err; 117 116 118 117 if (!(mode & FMODE_WRITE)) 119 118 return -EBADF; ··· 133 134 134 135 if (start + len > i_size_read(bdev->bd_inode)) 135 136 return -EINVAL; 136 - truncate_inode_pages_range(mapping, start, start + len - 1); 137 + 138 + err = truncate_bdev_range(bdev, mode, start, start + len - 1); 139 + if (err) 140 + return err; 141 + 137 142 return blkdev_issue_discard(bdev, start >> 9, len >> 9, 138 143 GFP_KERNEL, flags); 139 144 } ··· 146 143 unsigned long arg) 147 144 { 148 145 uint64_t range[2]; 149 - struct address_space *mapping; 150 146 uint64_t start, end, len; 147 + int err; 151 148 152 149 if (!(mode & FMODE_WRITE)) 153 150 return -EBADF; ··· 169 166 return -EINVAL; 170 167 171 168 /* Invalidate the page cache, including dirty pages */ 172 - mapping = bdev->bd_inode->i_mapping; 173 - truncate_inode_pages_range(mapping, start, end); 169 + err = truncate_bdev_range(bdev, mode, start, end); 170 + if (err) 171 + return err; 174 172 175 173 return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, 176 174 BLKDEV_ZERO_NOUNMAP); ··· 478 474 if (get_user(n, argp)) 479 475 return -EFAULT; 480 476 481 - if (!(mode & FMODE_EXCL)) { 482 - bdgrab(bdev); 483 - if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) 484 - return -EBUSY; 485 - } 477 + if (mode & FMODE_EXCL) 478 + return set_blocksize(bdev, n); 486 479 480 + if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode | FMODE_EXCL, &bdev))) 481 + return -EBUSY; 487 482 ret = set_blocksize(bdev, n); 488 - if (!(mode & FMODE_EXCL)) 489 - blkdev_put(bdev, mode | FMODE_EXCL); 483 + blkdev_put(bdev, mode | FMODE_EXCL); 484 + 490 485 return ret; 491 486 } 492 487

+1 -1

block/ioprio.c

··· 69 69 70 70 switch (class) { 71 71 case IOPRIO_CLASS_RT: 72 - if (!capable(CAP_SYS_ADMIN)) 72 + if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN)) 73 73 return -EPERM; 74 74 fallthrough; 75 75 /* rt has prio field too */

+3 -3

block/kyber-iosched.c

··· 359 359 * All of the hardware queues have the same depth, so we can just grab 360 360 * the shift of the first one. 361 361 */ 362 - return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; 362 + return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift; 363 363 } 364 364 365 365 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) ··· 502 502 khd->batching = 0; 503 503 504 504 hctx->sched_data = khd; 505 - sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, 505 + sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags, 506 506 kqd->async_depth); 507 507 508 508 return 0; ··· 573 573 bool merged; 574 574 575 575 spin_lock(&kcq->lock); 576 - merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); 576 + merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); 577 577 spin_unlock(&kcq->lock); 578 578 579 579 return merged;

+6

block/mq-deadline.c

··· 386 386 spin_lock(&dd->lock); 387 387 rq = __dd_dispatch_request(dd); 388 388 spin_unlock(&dd->lock); 389 + if (rq) 390 + atomic_dec(&rq->mq_hctx->elevator_queued); 389 391 390 392 return rq; 391 393 } ··· 535 533 rq = list_first_entry(list, struct request, queuelist); 536 534 list_del_init(&rq->queuelist); 537 535 dd_insert_request(hctx, rq, at_head); 536 + atomic_inc(&hctx->elevator_queued); 538 537 } 539 538 spin_unlock(&dd->lock); 540 539 } ··· 581 578 static bool dd_has_work(struct blk_mq_hw_ctx *hctx) 582 579 { 583 580 struct deadline_data *dd = hctx->queue->elevator->elevator_data; 581 + 582 + if (!atomic_read(&hctx->elevator_queued)) 583 + return false; 584 584 585 585 return !list_empty_careful(&dd->dispatch) || 586 586 !list_empty_careful(&dd->fifo_list[0]) ||

+15 -12

block/partitions/core.c

··· 199 199 struct device_attribute *attr, char *buf) 200 200 { 201 201 struct hd_struct *p = dev_to_part(dev); 202 - return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); 202 + 203 + return sprintf(buf, "%u\n", 204 + queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, 205 + p->start_sect)); 203 206 } 204 207 205 208 static ssize_t part_discard_alignment_show(struct device *dev, 206 209 struct device_attribute *attr, char *buf) 207 210 { 208 211 struct hd_struct *p = dev_to_part(dev); 209 - return sprintf(buf, "%u\n", p->discard_alignment); 212 + 213 + return sprintf(buf, "%u\n", 214 + queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, 215 + p->start_sect)); 210 216 } 211 217 212 218 static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); ··· 324 318 * Must be called either with bd_mutex held, before a disk can be opened or 325 319 * after all disk users are gone. 326 320 */ 327 - void delete_partition(struct gendisk *disk, struct hd_struct *part) 321 + void delete_partition(struct hd_struct *part) 328 322 { 323 + struct gendisk *disk = part_to_disk(part); 329 324 struct disk_part_tbl *ptbl = 330 325 rcu_dereference_protected(disk->part_tbl, 1); 331 326 ··· 334 327 * ->part_tbl is referenced in this part's release handler, so 335 328 * we have to hold the disk device 336 329 */ 337 - get_device(disk_to_dev(part_to_disk(part))); 330 + get_device(disk_to_dev(disk)); 338 331 rcu_assign_pointer(ptbl->part[part->partno], NULL); 339 332 kobject_put(part->holder_dir); 340 333 device_del(part_to_dev(part)); ··· 412 405 pdev = part_to_dev(p); 413 406 414 407 p->start_sect = start; 415 - p->alignment_offset = 416 - queue_limit_alignment_offset(&disk->queue->limits, start); 417 - p->discard_alignment = 418 - queue_limit_discard_alignment(&disk->queue->limits, start); 419 408 p->nr_sects = len; 420 409 p->partno = partno; 421 410 p->policy = get_disk_ro(disk); ··· 557 554 sync_blockdev(bdevp); 558 555 invalidate_bdev(bdevp); 559 
556 560 - delete_partition(bdev->bd_disk, part); 557 + delete_partition(part); 561 558 ret = 0; 562 559 out_unlock: 563 560 mutex_unlock(&bdev->bd_mutex); ··· 595 592 if (partition_overlaps(bdev->bd_disk, start, length, partno)) 596 593 goto out_unlock; 597 594 598 - part_nr_sects_write(part, (sector_t)length); 599 - i_size_write(bdevp->bd_inode, length << SECTOR_SHIFT); 595 + part_nr_sects_write(part, length); 596 + bd_set_nr_sectors(bdevp, length); 600 597 601 598 ret = 0; 602 599 out_unlock: ··· 637 634 638 635 disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); 639 636 while ((part = disk_part_iter_next(&piter))) 640 - delete_partition(bdev->bd_disk, part); 637 + delete_partition(part); 641 638 disk_part_iter_exit(&piter); 642 639 643 640 return 0;

-2

block/scsi_ioctl.c

··· 37 37 }; 38 38 EXPORT_SYMBOL(scsi_command_size_tbl); 39 39 40 - #include <scsi/sg.h> 41 - 42 40 static int sg_get_version(int __user *p) 43 41 { 44 42 static const int sg_version_num = 30527;

+1 -1

drivers/block/amiflop.c

··· 1670 1670 } 1671 1671 1672 1672 if (mode & (FMODE_READ|FMODE_WRITE)) { 1673 - check_disk_change(bdev); 1673 + bdev_check_media_change(bdev); 1674 1674 if (mode & FMODE_WRITE) { 1675 1675 int wrprot; 1676 1676

+1 -2

drivers/block/aoe/aoeblk.c

··· 347 347 mempool_t *mp; 348 348 struct request_queue *q; 349 349 struct blk_mq_tag_set *set; 350 - enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, }; 351 350 ulong flags; 352 351 int late = 0; 353 352 int err; ··· 406 407 WARN_ON(d->gd); 407 408 WARN_ON(d->flags & DEVFL_UP); 408 409 blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); 409 - q->backing_dev_info->ra_pages = READ_AHEAD / PAGE_SIZE; 410 + blk_queue_io_opt(q, SZ_2M); 410 411 d->bufpool = mp; 411 412 d->blkq = gd->queue = q; 412 413 q->queuedata = d;

+1 -3

drivers/block/aoe/aoecmd.c

··· 900 900 ssize = get_capacity(d->gd); 901 901 bd = bdget_disk(d->gd, 0); 902 902 if (bd) { 903 - inode_lock(bd->bd_inode); 904 - i_size_write(bd->bd_inode, (loff_t)ssize<<9); 905 - inode_unlock(bd->bd_inode); 903 + bd_set_nr_sectors(bd, ssize); 906 904 bdput(bd); 907 905 } 908 906 spin_lock_irq(&d->lock);

+4 -3

drivers/block/ataflop.c

··· 1732 1732 /* invalidate the buffer track to force a reread */ 1733 1733 BufferDrive = -1; 1734 1734 set_bit(drive, &fake_change); 1735 - check_disk_change(bdev); 1735 + if (bdev_check_media_change(bdev)) 1736 + floppy_revalidate(bdev->bd_disk); 1736 1737 return 0; 1737 1738 default: 1738 1739 return -EINVAL; ··· 1910 1909 return 0; 1911 1910 1912 1911 if (mode & (FMODE_READ|FMODE_WRITE)) { 1913 - check_disk_change(bdev); 1912 + if (bdev_check_media_change(bdev)) 1913 + floppy_revalidate(bdev->bd_disk); 1914 1914 if (mode & FMODE_WRITE) { 1915 1915 if (p->wpstat) { 1916 1916 if (p->ref < 0) ··· 1955 1953 .release = floppy_release, 1956 1954 .ioctl = fd_ioctl, 1957 1955 .check_events = floppy_check_events, 1958 - .revalidate_disk= floppy_revalidate, 1959 1956 }; 1960 1957 1961 1958 static const struct blk_mq_ops ataflop_mq_ops = {

-1

drivers/block/brd.c

··· 403 403 disk->flags = GENHD_FL_EXT_DEVT; 404 404 sprintf(disk->disk_name, "ram%d", i); 405 405 set_capacity(disk, rd_size * 2); 406 - brd->brd_queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; 407 406 408 407 /* Tell the block layer that this is not a rotational device */ 409 408 blk_queue_flag_set(QUEUE_FLAG_NONROT, brd->brd_queue);

+1 -15

drivers/block/drbd/drbd_nl.c

··· 1362 1362 1363 1363 if (b) { 1364 1364 blk_stack_limits(&q->limits, &b->limits, 0); 1365 - 1366 - if (q->backing_dev_info->ra_pages != 1367 - b->backing_dev_info->ra_pages) { 1368 - drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", 1369 - q->backing_dev_info->ra_pages, 1370 - b->backing_dev_info->ra_pages); 1371 - q->backing_dev_info->ra_pages = 1372 - b->backing_dev_info->ra_pages; 1373 - } 1365 + blk_queue_update_readahead(q); 1374 1366 } 1375 1367 fixup_discard_if_not_supported(q); 1376 1368 fixup_write_zeroes(device, q); ··· 3362 3370 if (get_ldev(device)) { 3363 3371 struct drbd_md *md = &device->ldev->md; 3364 3372 u64 *history_uuids = (u64 *)s->history_uuids; 3365 - struct request_queue *q; 3366 3373 int n; 3367 3374 3368 3375 spin_lock_irq(&md->uuid_lock); ··· 3375 3384 spin_unlock_irq(&md->uuid_lock); 3376 3385 3377 3386 s->dev_disk_flags = md->flags; 3378 - q = bdev_get_queue(device->ldev->backing_bdev); 3379 - s->dev_lower_blocked = 3380 - bdi_congested(q->backing_dev_info, 3381 - (1 << WB_async_congested) | 3382 - (1 << WB_sync_congested)); 3383 3387 put_ldev(device); 3384 3388 } 3385 3389 s->dev_size = drbd_get_capacity(device->this_bdev);

+5 -3

drivers/block/floppy.c

··· 561 561 * output_byte is automatically disabled when reset is set. 562 562 */ 563 563 static void reset_fdc(void); 564 + static int floppy_revalidate(struct gendisk *disk); 564 565 565 566 /* 566 567 * These are global variables, as that's the easiest way to give ··· 3276 3275 /* invalidate the buffer track to force a reread */ 3277 3276 set_bit((long)bdev->bd_disk->private_data, &fake_change); 3278 3277 process_fd_request(); 3279 - check_disk_change(bdev); 3278 + if (bdev_check_media_change(bdev)) 3279 + floppy_revalidate(bdev->bd_disk); 3280 3280 return 0; 3281 3281 } 3282 3282 ··· 4125 4123 drive_state[drive].last_checked = 0; 4126 4124 clear_bit(FD_OPEN_SHOULD_FAIL_BIT, 4127 4125 &drive_state[drive].flags); 4128 - check_disk_change(bdev); 4126 + if (bdev_check_media_change(bdev)) 4127 + floppy_revalidate(bdev->bd_disk); 4129 4128 if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) 4130 4129 goto out; 4131 4130 if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags)) ··· 4294 4291 .ioctl = fd_ioctl, 4295 4292 .getgeo = fd_getgeo, 4296 4293 .check_events = floppy_check_events, 4297 - .revalidate_disk = floppy_revalidate, 4298 4294 #ifdef CONFIG_COMPAT 4299 4295 .compat_ioctl = fd_compat_ioctl, 4300 4296 #endif

+2 -2

drivers/block/loop.c

··· 253 253 { 254 254 struct block_device *bdev = lo->lo_device; 255 255 256 - bd_set_size(bdev, size << SECTOR_SHIFT); 256 + bd_set_nr_sectors(bdev, size); 257 257 258 258 set_capacity_revalidate_and_notify(lo->lo_disk, size, false); 259 259 } ··· 1251 1251 set_capacity(lo->lo_disk, 0); 1252 1252 loop_sysfs_exit(lo); 1253 1253 if (bdev) { 1254 - bd_set_size(bdev, 0); 1254 + bd_set_nr_sectors(bdev, 0); 1255 1255 /* let user-space know about this change */ 1256 1256 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); 1257 1257 }

+8 -7

drivers/block/nbd.c

··· 300 300 { 301 301 struct nbd_config *config = nbd->config; 302 302 struct block_device *bdev = bdget_disk(nbd->disk, 0); 303 + sector_t nr_sectors = config->bytesize >> 9; 303 304 304 305 if (config->flags & NBD_FLAG_SEND_TRIM) { 305 306 nbd->disk->queue->limits.discard_granularity = config->blksize; ··· 309 308 } 310 309 blk_queue_logical_block_size(nbd->disk->queue, config->blksize); 311 310 blk_queue_physical_block_size(nbd->disk->queue, config->blksize); 312 - set_capacity(nbd->disk, config->bytesize >> 9); 311 + set_capacity(nbd->disk, nr_sectors); 313 312 if (bdev) { 314 313 if (bdev->bd_disk) { 315 - bd_set_size(bdev, config->bytesize); 314 + bd_set_nr_sectors(bdev, nr_sectors); 316 315 set_blocksize(bdev, config->blksize); 317 316 } else 318 - bdev->bd_invalidated = 1; 317 + set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); 319 318 bdput(bdev); 320 319 } 321 320 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); ··· 1139 1138 { 1140 1139 if (bdev->bd_openers > 1) 1141 1140 return; 1142 - bd_set_size(bdev, 0); 1141 + bd_set_nr_sectors(bdev, 0); 1143 1142 } 1144 1143 1145 1144 static void nbd_parse_flags(struct nbd_device *nbd) ··· 1322 1321 return ret; 1323 1322 1324 1323 if (max_part) 1325 - bdev->bd_invalidated = 1; 1324 + set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); 1326 1325 mutex_unlock(&nbd->config_lock); 1327 1326 ret = wait_event_interruptible(config->recv_wq, 1328 1327 atomic_read(&config->recv_threads) == 0); ··· 1500 1499 refcount_set(&nbd->config_refs, 1); 1501 1500 refcount_inc(&nbd->refs); 1502 1501 mutex_unlock(&nbd->config_lock); 1503 - bdev->bd_invalidated = 1; 1502 + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); 1504 1503 } else if (nbd_disconnected(nbd->config)) { 1505 - bdev->bd_invalidated = 1; 1504 + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); 1506 1505 } 1507 1506 out: 1508 1507 mutex_unlock(&nbd_index_mutex);

+1 -1

drivers/block/paride/pcd.c

··· 233 233 struct pcd_unit *cd = bdev->bd_disk->private_data; 234 234 int ret; 235 235 236 - check_disk_change(bdev); 236 + bdev_check_media_change(bdev); 237 237 238 238 mutex_lock(&pcd_mutex); 239 239 ret = cdrom_open(&cd->info, bdev, mode);

+14 -80

drivers/block/pktcdvd.c

··· 1082 1082 } 1083 1083 } 1084 1084 1085 - /* 1086 - * recover a failed write, query for relocation if possible 1087 - * 1088 - * returns 1 if recovery is possible, or 0 if not 1089 - * 1090 - */ 1091 - static int pkt_start_recovery(struct packet_data *pkt) 1092 - { 1093 - /* 1094 - * FIXME. We need help from the file system to implement 1095 - * recovery handling. 1096 - */ 1097 - return 0; 1098 - #if 0 1099 - struct request *rq = pkt->rq; 1100 - struct pktcdvd_device *pd = rq->rq_disk->private_data; 1101 - struct block_device *pkt_bdev; 1102 - struct super_block *sb = NULL; 1103 - unsigned long old_block, new_block; 1104 - sector_t new_sector; 1105 - 1106 - pkt_bdev = bdget(kdev_t_to_nr(pd->pkt_dev)); 1107 - if (pkt_bdev) { 1108 - sb = get_super(pkt_bdev); 1109 - bdput(pkt_bdev); 1110 - } 1111 - 1112 - if (!sb) 1113 - return 0; 1114 - 1115 - if (!sb->s_op->relocate_blocks) 1116 - goto out; 1117 - 1118 - old_block = pkt->sector / (CD_FRAMESIZE >> 9); 1119 - if (sb->s_op->relocate_blocks(sb, old_block, &new_block)) 1120 - goto out; 1121 - 1122 - new_sector = new_block * (CD_FRAMESIZE >> 9); 1123 - pkt->sector = new_sector; 1124 - 1125 - bio_reset(pkt->bio); 1126 - bio_set_dev(pkt->bio, pd->bdev); 1127 - bio_set_op_attrs(pkt->bio, REQ_OP_WRITE, 0); 1128 - pkt->bio->bi_iter.bi_sector = new_sector; 1129 - pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE; 1130 - pkt->bio->bi_vcnt = pkt->frames; 1131 - 1132 - pkt->bio->bi_end_io = pkt_end_io_packet_write; 1133 - pkt->bio->bi_private = pkt; 1134 - 1135 - drop_super(sb); 1136 - return 1; 1137 - 1138 - out: 1139 - drop_super(sb); 1140 - return 0; 1141 - #endif 1142 - } 1143 - 1144 1085 static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state) 1145 1086 { 1146 1087 #if PACKET_DEBUG > 1 ··· 1298 1357 break; 1299 1358 1300 1359 case PACKET_RECOVERY_STATE: 1301 - if (pkt_start_recovery(pkt)) { 1302 - pkt_start_write(pd, pkt); 1303 - } else { 1304 - pkt_dbg(2, pd, "No recovery possible\n"); 
1305 - pkt_set_state(pkt, PACKET_FINISHED_STATE); 1306 - } 1360 + pkt_dbg(2, pd, "No recovery possible\n"); 1361 + pkt_set_state(pkt, PACKET_FINISHED_STATE); 1307 1362 break; 1308 1363 1309 1364 case PACKET_FINISHED_STATE: ··· 2110 2173 int ret; 2111 2174 long lba; 2112 2175 struct request_queue *q; 2176 + struct block_device *bdev; 2113 2177 2114 2178 /* 2115 2179 * We need to re-open the cdrom device without O_NONBLOCK to be able 2116 2180 * to read/write from/to it. It is already opened in O_NONBLOCK mode 2117 - * so bdget() can't fail. 2181 + * so open should not fail. 2118 2182 */ 2119 - bdget(pd->bdev->bd_dev); 2120 - ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd); 2121 - if (ret) 2183 + bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ | FMODE_EXCL, pd); 2184 + if (IS_ERR(bdev)) { 2185 + ret = PTR_ERR(bdev); 2122 2186 goto out; 2187 + } 2123 2188 2124 2189 ret = pkt_get_last_written(pd, &lba); 2125 2190 if (ret) { ··· 2131 2192 2132 2193 set_capacity(pd->disk, lba << 2); 2133 2194 set_capacity(pd->bdev->bd_disk, lba << 2); 2134 - bd_set_size(pd->bdev, (loff_t)lba << 11); 2195 + bd_set_nr_sectors(pd->bdev, lba << 2); 2135 2196 2136 2197 q = bdev_get_queue(pd->bdev); 2137 2198 if (write) { ··· 2165 2226 return 0; 2166 2227 2167 2228 out_putdev: 2168 - blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); 2229 + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 2169 2230 out: 2170 2231 return ret; 2171 2232 } ··· 2502 2563 static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) 2503 2564 { 2504 2565 int i; 2505 - int ret = 0; 2506 2566 char b[BDEVNAME_SIZE]; 2507 2567 struct block_device *bdev; 2508 2568 ··· 2524 2586 } 2525 2587 } 2526 2588 2527 - bdev = bdget(dev); 2528 - if (!bdev) 2529 - return -ENOMEM; 2530 - ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); 2531 - if (ret) 2532 - return ret; 2589 + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL); 2590 + if (IS_ERR(bdev)) 2591 + return PTR_ERR(bdev); 2533 2592 if 
(!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { 2534 2593 blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); 2535 2594 return -EINVAL; ··· 2544 2609 pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); 2545 2610 if (IS_ERR(pd->cdrw.thread)) { 2546 2611 pkt_err(pd, "can't start kernel thread\n"); 2547 - ret = -ENOMEM; 2548 2612 goto out_mem; 2549 2613 } 2550 2614 ··· 2555 2621 blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); 2556 2622 /* This is safe: open() is still holding a reference. */ 2557 2623 module_put(THIS_MODULE); 2558 - return ret; 2624 + return -ENOMEM; 2559 2625 } 2560 2626 2561 2627 static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)

+2 -2

drivers/block/rbd.c

··· 4921 4921 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 4922 4922 dout("setting size to %llu sectors", (unsigned long long)size); 4923 4923 set_capacity(rbd_dev->disk, size); 4924 - revalidate_disk(rbd_dev->disk); 4924 + revalidate_disk_size(rbd_dev->disk, true); 4925 4925 } 4926 4926 } 4927 4927 ··· 5022 5022 } 5023 5023 5024 5024 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 5025 - q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; 5025 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); 5026 5026 5027 5027 /* 5028 5028 * disk_release() expects a queue ref from add_disk() and will

+3 -9

drivers/block/rnbd/rnbd-clt.c

··· 102 102 static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, 103 103 size_t new_nsectors) 104 104 { 105 - int err = 0; 106 - 107 105 rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n", 108 106 dev->nsectors, new_nsectors); 109 107 dev->nsectors = new_nsectors; 110 108 set_capacity(dev->gd, dev->nsectors); 111 - err = revalidate_disk(dev->gd); 112 - if (err) 113 - rnbd_clt_err(dev, 114 - "Failed to change device size from %zu to %zu, err: %d\n", 115 - dev->nsectors, new_nsectors, err); 116 - return err; 109 + revalidate_disk_size(dev->gd, true); 110 + return 0; 117 111 } 118 112 119 113 static int process_msg_open_rsp(struct rnbd_clt_dev *dev, ··· 1174 1180 tag_set->queue_depth = sess->queue_depth; 1175 1181 tag_set->numa_node = NUMA_NO_NODE; 1176 1182 tag_set->flags = BLK_MQ_F_SHOULD_MERGE | 1177 - BLK_MQ_F_TAG_SHARED; 1183 + BLK_MQ_F_TAG_QUEUE_SHARED; 1178 1184 tag_set->cmd_size = sizeof(struct rnbd_iu); 1179 1185 tag_set->nr_hw_queues = num_online_cpus(); 1180 1186

+2 -20

drivers/block/swim.c

··· 638 638 return 0; 639 639 640 640 if (mode & (FMODE_READ|FMODE_WRITE)) { 641 - check_disk_change(bdev); 641 + if (bdev_check_media_change(bdev) && fs->disk_in) 642 + fs->ejected = 0; 642 643 if ((mode & FMODE_WRITE) && fs->write_protected) { 643 644 err = -EROFS; 644 645 goto out; ··· 736 735 return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0; 737 736 } 738 737 739 - static int floppy_revalidate(struct gendisk *disk) 740 - { 741 - struct floppy_state *fs = disk->private_data; 742 - struct swim __iomem *base = fs->swd->base; 743 - 744 - swim_drive(base, fs->location); 745 - 746 - if (fs->ejected) 747 - setup_medium(fs); 748 - 749 - if (!fs->disk_in) 750 - swim_motor(base, OFF); 751 - else 752 - fs->ejected = 0; 753 - 754 - return !fs->disk_in; 755 - } 756 - 757 738 static const struct block_device_operations floppy_fops = { 758 739 .owner = THIS_MODULE, 759 740 .open = floppy_unlocked_open, ··· 743 760 .ioctl = floppy_ioctl, 744 761 .getgeo = floppy_getgeo, 745 762 .check_events = floppy_check_events, 746 - .revalidate_disk = floppy_revalidate, 747 763 }; 748 764 749 765 static struct kobject *floppy_find(dev_t dev, int *part, void *data)

+2 -2

drivers/block/swim3.c

··· 945 945 946 946 if (err == 0 && (mode & FMODE_NDELAY) == 0 947 947 && (mode & (FMODE_READ|FMODE_WRITE))) { 948 - check_disk_change(bdev); 948 + if (bdev_check_media_change(bdev)) 949 + floppy_revalidate(bdev->bd_disk); 949 950 if (fs->ejected) 950 951 err = -ENXIO; 951 952 } ··· 1056 1055 .release = floppy_release, 1057 1056 .ioctl = floppy_ioctl, 1058 1057 .check_events = floppy_check_events, 1059 - .revalidate_disk= floppy_revalidate, 1060 1058 }; 1061 1059 1062 1060 static const struct blk_mq_ops swim3_mq_ops = {

+2 -2

drivers/block/virtio_blk.c

··· 598 598 struct virtio_blk *vblk = vdev->priv; 599 599 600 600 blk_queue_write_cache(vblk->disk->queue, writeback, false); 601 - revalidate_disk(vblk->disk); 601 + revalidate_disk_size(vblk->disk, true); 602 602 } 603 603 604 604 static const char *const virtblk_cache_types[] = { ··· 646 646 static umode_t virtblk_attrs_are_visible(struct kobject *kobj, 647 647 struct attribute *a, int n) 648 648 { 649 - struct device *dev = container_of(kobj, struct device, kobj); 649 + struct device *dev = kobj_to_dev(kobj); 650 650 struct gendisk *disk = dev_to_disk(dev); 651 651 struct virtio_blk *vblk = disk->private_data; 652 652 struct virtio_device *vdev = vblk->vdev;

+10 -16

drivers/block/xsysace.c

··· 888 888 return ace->media_change ? DISK_EVENT_MEDIA_CHANGE : 0; 889 889 } 890 890 891 - static int ace_revalidate_disk(struct gendisk *gd) 891 + static void ace_media_changed(struct ace_device *ace) 892 892 { 893 - struct ace_device *ace = gd->private_data; 894 893 unsigned long flags; 895 894 896 - dev_dbg(ace->dev, "ace_revalidate_disk()\n"); 895 + dev_dbg(ace->dev, "requesting cf id and scheduling tasklet\n"); 897 896 898 - if (ace->media_change) { 899 - dev_dbg(ace->dev, "requesting cf id and scheduling tasklet\n"); 897 + spin_lock_irqsave(&ace->lock, flags); 898 + ace->id_req_count++; 899 + spin_unlock_irqrestore(&ace->lock, flags); 900 900 901 - spin_lock_irqsave(&ace->lock, flags); 902 - ace->id_req_count++; 903 - spin_unlock_irqrestore(&ace->lock, flags); 904 - 905 - tasklet_schedule(&ace->fsm_tasklet); 906 - wait_for_completion(&ace->id_completion); 907 - } 901 + tasklet_schedule(&ace->fsm_tasklet); 902 + wait_for_completion(&ace->id_completion); 908 903 909 904 dev_dbg(ace->dev, "revalidate complete\n"); 910 - return ace->id_result; 911 905 } 912 906 913 907 static int ace_open(struct block_device *bdev, fmode_t mode) ··· 916 922 ace->users++; 917 923 spin_unlock_irqrestore(&ace->lock, flags); 918 924 919 - check_disk_change(bdev); 925 + if (bdev_check_media_change(bdev) && ace->media_change) 926 + ace_media_changed(ace); 920 927 mutex_unlock(&xsysace_mutex); 921 928 922 929 return 0; ··· 961 966 .open = ace_open, 962 967 .release = ace_release, 963 968 .check_events = ace_check_events, 964 - .revalidate_disk = ace_revalidate_disk, 965 969 .getgeo = ace_getgeo, 966 970 }; 967 971 ··· 1074 1080 (unsigned long long) ace->physaddr, ace->baseaddr, ace->irq); 1075 1081 1076 1082 ace->media_change = 1; 1077 - ace_revalidate_disk(ace->gd); 1083 + ace_media_changed(ace); 1078 1084 1079 1085 /* Make the sysace device 'live' */ 1080 1086 add_disk(ace->gd);

+19 -11

drivers/block/zram/zram_drv.c

··· 52 52 */ 53 53 static size_t huge_class_size; 54 54 55 + static const struct block_device_operations zram_devops; 56 + static const struct block_device_operations zram_wb_devops; 57 + 55 58 static void zram_free_page(struct zram *zram, size_t index); 56 59 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, 57 60 u32 index, int offset, struct bio *bio); ··· 411 408 zram->backing_dev = NULL; 412 409 zram->old_block_size = 0; 413 410 zram->bdev = NULL; 414 - zram->disk->queue->backing_dev_info->capabilities |= 415 - BDI_CAP_SYNCHRONOUS_IO; 411 + zram->disk->fops = &zram_devops; 416 412 kvfree(zram->bitmap); 417 413 zram->bitmap = NULL; 418 414 } ··· 493 491 goto out; 494 492 } 495 493 496 - bdev = bdgrab(I_BDEV(inode)); 497 - err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram); 498 - if (err < 0) { 494 + bdev = blkdev_get_by_dev(inode->i_rdev, 495 + FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram); 496 + if (IS_ERR(bdev)) { 497 + err = PTR_ERR(bdev); 499 498 bdev = NULL; 500 499 goto out; 501 500 } ··· 531 528 * freely but in fact, IO is going on so finally could cause 532 529 * use-after-free when the IO is really done. 
533 530 */ 534 - zram->disk->queue->backing_dev_info->capabilities &= 535 - ~BDI_CAP_SYNCHRONOUS_IO; 531 + zram->disk->fops = &zram_wb_devops; 536 532 up_write(&zram->init_lock); 537 533 538 534 pr_info("setup backing device %s\n", file_name); ··· 1741 1739 zram->disksize = disksize; 1742 1740 set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); 1743 1741 1744 - revalidate_disk(zram->disk); 1742 + revalidate_disk_size(zram->disk, true); 1745 1743 up_write(&zram->init_lock); 1746 1744 1747 1745 return len; ··· 1788 1786 /* Make sure all the pending I/O are finished */ 1789 1787 fsync_bdev(bdev); 1790 1788 zram_reset_device(zram); 1791 - revalidate_disk(zram->disk); 1789 + revalidate_disk_size(zram->disk, true); 1792 1790 bdput(bdev); 1793 1791 1794 1792 mutex_lock(&bdev->bd_mutex); ··· 1818 1816 .submit_bio = zram_submit_bio, 1819 1817 .swap_slot_free_notify = zram_slot_free_notify, 1820 1818 .rw_page = zram_rw_page, 1819 + .owner = THIS_MODULE 1820 + }; 1821 + 1822 + static const struct block_device_operations zram_wb_devops = { 1823 + .open = zram_open, 1824 + .submit_bio = zram_submit_bio, 1825 + .swap_slot_free_notify = zram_slot_free_notify, 1821 1826 .owner = THIS_MODULE 1822 1827 }; 1823 1828 ··· 1955 1946 if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) 1956 1947 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); 1957 1948 1958 - zram->disk->queue->backing_dev_info->capabilities |= 1959 - (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO); 1949 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); 1960 1950 device_add_disk(NULL, zram->disk, zram_disk_attr_groups); 1961 1951 1962 1952 strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));

+1 -1

drivers/cdrom/gdrom.c

··· 479 479 { 480 480 int ret; 481 481 482 - check_disk_change(bdev); 482 + bdev_check_media_change(bdev); 483 483 484 484 mutex_lock(&gdrom_mutex); 485 485 ret = cdrom_open(gd.cd_info, bdev, mode);

+24 -32

drivers/char/raw.c

··· 28 28 #include <linux/uaccess.h> 29 29 30 30 struct raw_device_data { 31 - struct block_device *binding; 31 + dev_t binding; 32 + struct block_device *bdev; 32 33 int inuse; 33 34 }; 34 35 ··· 64 63 return 0; 65 64 } 66 65 66 + pr_warn_ratelimited( 67 + "process %s (pid %d) is using the deprecated raw device\n" 68 + "support will be removed in Linux 5.14.\n", 69 + current->comm, current->pid); 70 + 67 71 mutex_lock(&raw_mutex); 68 72 69 73 /* 70 74 * All we need to do on open is check that the device is bound. 71 75 */ 72 - bdev = raw_devices[minor].binding; 73 76 err = -ENODEV; 74 - if (!bdev) 77 + if (!raw_devices[minor].binding) 75 78 goto out; 76 - bdgrab(bdev); 77 - err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open); 78 - if (err) 79 + bdev = blkdev_get_by_dev(raw_devices[minor].binding, 80 + filp->f_mode | FMODE_EXCL, raw_open); 81 + if (IS_ERR(bdev)) { 82 + err = PTR_ERR(bdev); 79 83 goto out; 84 + } 80 85 err = set_blocksize(bdev, bdev_logical_block_size(bdev)); 81 86 if (err) 82 87 goto out1; ··· 92 85 file_inode(filp)->i_mapping = 93 86 bdev->bd_inode->i_mapping; 94 87 filp->private_data = bdev; 88 + raw_devices[minor].bdev = bdev; 95 89 mutex_unlock(&raw_mutex); 96 90 return 0; 97 91 ··· 113 105 struct block_device *bdev; 114 106 115 107 mutex_lock(&raw_mutex); 116 - bdev = raw_devices[minor].binding; 108 + bdev = raw_devices[minor].bdev; 117 109 if (--raw_devices[minor].inuse == 0) 118 110 /* Here inode->i_mapping == bdev->bd_inode->i_mapping */ 119 111 inode->i_mapping = &inode->i_data; ··· 136 128 static int bind_set(int number, u64 major, u64 minor) 137 129 { 138 130 dev_t dev = MKDEV(major, minor); 131 + dev_t raw = MKDEV(RAW_MAJOR, number); 139 132 struct raw_device_data *rawdev; 140 133 int err = 0; 141 134 ··· 170 161 mutex_unlock(&raw_mutex); 171 162 return -EBUSY; 172 163 } 173 - if (rawdev->binding) { 174 - bdput(rawdev->binding); 164 + if (rawdev->binding) 175 165 module_put(THIS_MODULE); 176 - } 166 + 167 + rawdev->binding = 
dev; 177 168 if (!dev) { 178 169 /* unbind */ 179 - rawdev->binding = NULL; 180 - device_destroy(raw_class, MKDEV(RAW_MAJOR, number)); 170 + device_destroy(raw_class, raw); 181 171 } else { 182 - rawdev->binding = bdget(dev); 183 - if (rawdev->binding == NULL) { 184 - err = -ENOMEM; 185 - } else { 186 - dev_t raw = MKDEV(RAW_MAJOR, number); 187 - __module_get(THIS_MODULE); 188 - device_destroy(raw_class, raw); 189 - device_create(raw_class, NULL, raw, NULL, 190 - "raw%d", number); 191 - } 172 + __module_get(THIS_MODULE); 173 + device_destroy(raw_class, raw); 174 + device_create(raw_class, NULL, raw, NULL, "raw%d", number); 192 175 } 193 176 mutex_unlock(&raw_mutex); 194 177 return err; ··· 188 187 189 188 static int bind_get(int number, dev_t *dev) 190 189 { 191 - struct raw_device_data *rawdev; 192 - struct block_device *bdev; 193 - 194 190 if (number <= 0 || number >= max_raw_minors) 195 191 return -EINVAL; 196 - 197 - rawdev = &raw_devices[number]; 198 - 199 - mutex_lock(&raw_mutex); 200 - bdev = rawdev->binding; 201 - *dev = bdev ? bdev->bd_dev : 0; 202 - mutex_unlock(&raw_mutex); 192 + *dev = raw_devices[number].binding; 203 193 return 0; 204 194 } 205 195

+5 -11

drivers/ide/ide-cd.c

··· 1611 1611 struct cdrom_info *info; 1612 1612 int rc = -ENXIO; 1613 1613 1614 - check_disk_change(bdev); 1614 + if (bdev_check_media_change(bdev)) { 1615 + info = ide_drv_g(bdev->bd_disk, cdrom_info); 1616 + 1617 + ide_cd_read_toc(info->drive); 1618 + } 1615 1619 1616 1620 mutex_lock(&ide_cd_mutex); 1617 1621 info = ide_cd_get(bdev->bd_disk); ··· 1757 1753 return cdrom_check_events(&info->devinfo, clearing); 1758 1754 } 1759 1755 1760 - static int idecd_revalidate_disk(struct gendisk *disk) 1761 - { 1762 - struct cdrom_info *info = ide_drv_g(disk, cdrom_info); 1763 - 1764 - ide_cd_read_toc(info->drive); 1765 - 1766 - return 0; 1767 - } 1768 - 1769 1756 static const struct block_device_operations idecd_ops = { 1770 1757 .owner = THIS_MODULE, 1771 1758 .open = idecd_open, ··· 1765 1770 .compat_ioctl = IS_ENABLED(CONFIG_COMPAT) ? 1766 1771 idecd_compat_ioctl : NULL, 1767 1772 .check_events = idecd_check_events, 1768 - .revalidate_disk = idecd_revalidate_disk 1769 1773 }; 1770 1774 1771 1775 /* module options */

+1 -4

drivers/ide/ide-disk.c

··· 739 739 set_wcache(drive, 1); 740 740 741 741 if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 && 742 - (drive->head == 0 || drive->head > 16)) { 742 + (drive->head == 0 || drive->head > 16)) 743 743 printk(KERN_ERR "%s: invalid geometry: %d physical heads?\n", 744 744 drive->name, drive->head); 745 - drive->dev_flags &= ~IDE_DFLAG_ATTACH; 746 - } else 747 - drive->dev_flags |= IDE_DFLAG_ATTACH; 748 745 } 749 746 750 747 static void ide_disk_flush(ide_drive_t *drive)

-2

drivers/ide/ide-floppy.c

··· 516 516 (void) ide_floppy_get_capacity(drive); 517 517 518 518 ide_proc_register_driver(drive, floppy->driver); 519 - 520 - drive->dev_flags |= IDE_DFLAG_ATTACH; 521 519 } 522 520 523 521 static void ide_floppy_flush(ide_drive_t *drive)

+6 -42

drivers/ide/ide-gd.c

··· 225 225 * and the door_lock is irrelevant at this point. 226 226 */ 227 227 drive->disk_ops->set_doorlock(drive, disk, 1); 228 - drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED; 229 - check_disk_change(bdev); 228 + if (__invalidate_device(bdev, true)) 229 + pr_warn("VFS: busy inodes on changed media %s\n", 230 + bdev->bd_disk->disk_name); 231 + drive->disk_ops->get_capacity(drive); 232 + set_capacity(disk, ide_gd_capacity(drive)); 233 + set_bit(GD_NEED_PART_SCAN, &disk->state); 230 234 } else if (drive->dev_flags & IDE_DFLAG_FORMAT_IN_PROGRESS) { 231 235 ret = -EBUSY; 232 236 goto out_put_idkp; ··· 288 284 return 0; 289 285 } 290 286 291 - static unsigned int ide_gd_check_events(struct gendisk *disk, 292 - unsigned int clearing) 293 - { 294 - struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); 295 - ide_drive_t *drive = idkp->drive; 296 - bool ret; 297 - 298 - /* do not scan partitions twice if this is a removable device */ 299 - if (drive->dev_flags & IDE_DFLAG_ATTACH) { 300 - drive->dev_flags &= ~IDE_DFLAG_ATTACH; 301 - return 0; 302 - } 303 - 304 - /* 305 - * The following is used to force revalidation on the first open on 306 - * removeable devices, and never gets reported to userland as 307 - * DISK_EVENT_FLAG_UEVENT isn't set in genhd->event_flags. 308 - * This is intended as removable ide disk can't really detect 309 - * MEDIA_CHANGE events. 310 - */ 311 - ret = drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED; 312 - drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED; 313 - 314 - return ret ? 
DISK_EVENT_MEDIA_CHANGE : 0; 315 - } 316 - 317 287 static void ide_gd_unlock_native_capacity(struct gendisk *disk) 318 288 { 319 289 struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); ··· 296 318 297 319 if (disk_ops->unlock_native_capacity) 298 320 disk_ops->unlock_native_capacity(drive); 299 - } 300 - 301 - static int ide_gd_revalidate_disk(struct gendisk *disk) 302 - { 303 - struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); 304 - ide_drive_t *drive = idkp->drive; 305 - 306 - if (ide_gd_check_events(disk, 0)) 307 - drive->disk_ops->get_capacity(drive); 308 - 309 - set_capacity(disk, ide_gd_capacity(drive)); 310 - return 0; 311 321 } 312 322 313 323 static int ide_gd_ioctl(struct block_device *bdev, fmode_t mode, ··· 330 364 .compat_ioctl = ide_gd_compat_ioctl, 331 365 #endif 332 366 .getgeo = ide_gd_getgeo, 333 - .check_events = ide_gd_check_events, 334 367 .unlock_native_capacity = ide_gd_unlock_native_capacity, 335 - .revalidate_disk = ide_gd_revalidate_disk 336 368 }; 337 369 338 370 static int ide_gd_probe(ide_drive_t *drive)

+6 -4

drivers/md/bcache/request.c

··· 475 475 unsigned int read_dirty_data:1; 476 476 unsigned int cache_missed:1; 477 477 478 + struct hd_struct *part; 478 479 unsigned long start_time; 479 480 480 481 struct btree_op op; ··· 670 669 { 671 670 if (s->orig_bio) { 672 671 /* Count on bcache device */ 673 - disk_end_io_acct(s->d->disk, bio_op(s->orig_bio), s->start_time); 672 + part_end_io_acct(s->part, s->orig_bio, s->start_time); 674 673 675 674 trace_bcache_request_end(s->d, s->orig_bio); 676 675 s->orig_bio->bi_status = s->iop.status; ··· 732 731 s->write = op_is_write(bio_op(bio)); 733 732 s->read_dirty_data = 0; 734 733 /* Count on the bcache device */ 735 - s->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio)); 734 + s->start_time = part_start_io_acct(d->disk, &s->part, bio); 736 735 s->iop.c = d->c; 737 736 s->iop.bio = NULL; 738 737 s->iop.inode = d->id; ··· 1073 1072 unsigned long start_time; 1074 1073 bio_end_io_t *bi_end_io; 1075 1074 void *bi_private; 1075 + struct hd_struct *part; 1076 1076 }; 1077 1077 1078 1078 static void detached_dev_end_io(struct bio *bio) ··· 1085 1083 bio->bi_private = ddip->bi_private; 1086 1084 1087 1085 /* Count on the bcache device */ 1088 - disk_end_io_acct(ddip->d->disk, bio_op(bio), ddip->start_time); 1086 + part_end_io_acct(ddip->part, bio, ddip->start_time); 1089 1087 1090 1088 if (bio->bi_status) { 1091 1089 struct cached_dev *dc = container_of(ddip->d, ··· 1111 1109 ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); 1112 1110 ddip->d = d; 1113 1111 /* Count on the bcache device */ 1114 - ddip->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio)); 1112 + ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio); 1115 1113 ddip->bi_end_io = bio->bi_end_io; 1116 1114 ddip->bi_private = bio->bi_private; 1117 1115 bio->bi_end_io = detached_dev_end_io;

+2 -3

drivers/md/bcache/super.c

··· 1427 1427 if (ret) 1428 1428 return ret; 1429 1429 1430 - dc->disk.disk->queue->backing_dev_info->ra_pages = 1431 - max(dc->disk.disk->queue->backing_dev_info->ra_pages, 1432 - q->backing_dev_info->ra_pages); 1430 + blk_queue_io_opt(dc->disk.disk->queue, 1431 + max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q))); 1433 1432 1434 1433 atomic_set(&dc->io_errors, 0); 1435 1434 dc->io_disable = false;

+1 -1

drivers/md/dm-raid.c

··· 701 701 struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); 702 702 703 703 set_capacity(gendisk, rs->md.array_sectors); 704 - revalidate_disk(gendisk); 704 + revalidate_disk_size(gendisk, true); 705 705 } 706 706 707 707 /*

+4 -5

drivers/md/dm-table.c

··· 1819 1819 { 1820 1820 struct request_queue *q = bdev_get_queue(dev->bdev); 1821 1821 1822 - return q && bdi_cap_stable_pages_required(q->backing_dev_info); 1822 + return q && blk_queue_stable_writes(q); 1823 1823 } 1824 1824 1825 1825 /* ··· 1904 1904 * because they do their own checksumming. 1905 1905 */ 1906 1906 if (dm_table_requires_stable_pages(t)) 1907 - q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; 1907 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); 1908 1908 else 1909 - q->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; 1909 + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); 1910 1910 1911 1911 /* 1912 1912 * Determine whether or not this queue's I/O timings contribute ··· 1929 1929 } 1930 1930 #endif 1931 1931 1932 - /* Allow reads to exceed readahead limits */ 1933 - q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9); 1932 + blk_queue_update_readahead(q); 1934 1933 } 1935 1934 1936 1935 unsigned int dm_table_get_num_targets(struct dm_table *t)

+2 -13

drivers/md/dm.c

··· 2099 2099 } 2100 2100 2101 2101 /* 2102 - * Protected by md->suspend_lock obtained by dm_swap_table(). 2103 - */ 2104 - static void __set_size(struct mapped_device *md, sector_t size) 2105 - { 2106 - lockdep_assert_held(&md->suspend_lock); 2107 - 2108 - set_capacity(md->disk, size); 2109 - 2110 - i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2111 - } 2112 - 2113 - /* 2114 2102 * Returns old map, which caller must destroy. 2115 2103 */ 2116 2104 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, ··· 2120 2132 if (size != dm_get_size(md)) 2121 2133 memset(&md->geometry, 0, sizeof(md->geometry)); 2122 2134 2123 - __set_size(md, size); 2135 + set_capacity(md->disk, size); 2136 + bd_set_nr_sectors(md->bdev, size); 2124 2137 2125 2138 dm_table_event_callback(t, event_callback, md); 2126 2139

+3 -3

drivers/md/md-cluster.c

··· 582 582 break; 583 583 case CHANGE_CAPACITY: 584 584 set_capacity(mddev->gendisk, mddev->array_sectors); 585 - revalidate_disk(mddev->gendisk); 585 + revalidate_disk_size(mddev->gendisk, true); 586 586 break; 587 587 case RESYNCING: 588 588 set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); ··· 1296 1296 pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n", 1297 1297 __func__, __LINE__); 1298 1298 set_capacity(mddev->gendisk, mddev->array_sectors); 1299 - revalidate_disk(mddev->gendisk); 1299 + revalidate_disk_size(mddev->gendisk, true); 1300 1300 } else { 1301 1301 /* revert to previous sectors */ 1302 1302 ret = mddev->pers->resize(mddev, old_dev_sectors); 1303 1303 if (!ret) 1304 - revalidate_disk(mddev->gendisk); 1304 + revalidate_disk_size(mddev->gendisk, true); 1305 1305 ret = __sendmsg(cinfo, &cmsg); 1306 1306 if (ret) 1307 1307 pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",

+1 -1

drivers/md/md-linear.c

··· 202 202 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 203 203 set_capacity(mddev->gendisk, mddev->array_sectors); 204 204 mddev_resume(mddev); 205 - revalidate_disk(mddev->gendisk); 205 + revalidate_disk_size(mddev->gendisk, true); 206 206 kfree_rcu(oldconf, rcu); 207 207 return 0; 208 208 }

+10 -10

drivers/md/md.c

··· 464 464 bio_end_io_t *orig_bi_end_io; 465 465 void *orig_bi_private; 466 466 unsigned long start_time; 467 + struct hd_struct *part; 467 468 }; 468 469 469 470 static void md_end_io(struct bio *bio) ··· 472 471 struct md_io *md_io = bio->bi_private; 473 472 struct mddev *mddev = md_io->mddev; 474 473 475 - disk_end_io_acct(mddev->gendisk, bio_op(bio), md_io->start_time); 474 + part_end_io_acct(md_io->part, bio, md_io->start_time); 476 475 477 476 bio->bi_end_io = md_io->orig_bi_end_io; 478 477 bio->bi_private = md_io->orig_bi_private; ··· 518 517 bio->bi_end_io = md_end_io; 519 518 bio->bi_private = md_io; 520 519 521 - md_io->start_time = disk_start_io_acct(mddev->gendisk, 522 - bio_sectors(bio), 523 - bio_op(bio)); 520 + md_io->start_time = part_start_io_acct(mddev->gendisk, 521 + &md_io->part, bio); 524 522 } 525 523 526 524 /* bio could be mergeable after passing to underlayer */ ··· 5358 5358 mddev->array_sectors = sectors; 5359 5359 if (mddev->pers) { 5360 5360 set_capacity(mddev->gendisk, mddev->array_sectors); 5361 - revalidate_disk(mddev->gendisk); 5361 + revalidate_disk_size(mddev->gendisk, true); 5362 5362 } 5363 5363 } 5364 5364 mddev_unlock(mddev); ··· 6109 6109 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6110 6110 6111 6111 set_capacity(mddev->gendisk, mddev->array_sectors); 6112 - revalidate_disk(mddev->gendisk); 6112 + revalidate_disk_size(mddev->gendisk, true); 6113 6113 clear_bit(MD_NOT_READY, &mddev->flags); 6114 6114 mddev->changed = 1; 6115 6115 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); ··· 6427 6427 set_capacity(disk, 0); 6428 6428 mutex_unlock(&mddev->open_mutex); 6429 6429 mddev->changed = 1; 6430 - revalidate_disk(disk); 6430 + revalidate_disk_size(disk, true); 6431 6431 6432 6432 if (mddev->ro) 6433 6433 mddev->ro = 0; ··· 7259 7259 md_cluster_ops->update_size(mddev, old_dev_sectors); 7260 7260 else if (mddev->queue) { 7261 7261 set_capacity(mddev->gendisk, mddev->array_sectors); 7262 
- revalidate_disk(mddev->gendisk); 7262 + revalidate_disk_size(mddev->gendisk, true); 7263 7263 } 7264 7264 } 7265 7265 return rv; ··· 7848 7848 atomic_inc(&mddev->openers); 7849 7849 mutex_unlock(&mddev->open_mutex); 7850 7850 7851 - check_disk_change(bdev); 7851 + bdev_check_media_change(bdev); 7852 7852 out: 7853 7853 if (err) 7854 7854 mddev_put(mddev); ··· 9018 9018 mddev_unlock(mddev); 9019 9019 if (!mddev_is_clustered(mddev)) { 9020 9020 set_capacity(mddev->gendisk, mddev->array_sectors); 9021 - revalidate_disk(mddev->gendisk); 9021 + revalidate_disk_size(mddev->gendisk, true); 9022 9022 } 9023 9023 } 9024 9024

+1 -1

drivers/md/md.h

··· 397 397 * These locks are separate due to conflicting interactions 398 398 * with bdev->bd_mutex. 399 399 * Lock ordering is: 400 - * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk 400 + * reconfig_mutex -> bd_mutex 401 401 * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open 402 402 */ 403 403 struct mutex open_mutex;

-16

drivers/md/raid0.c

··· 410 410 mdname(mddev), 411 411 (unsigned long long)mddev->array_sectors); 412 412 413 - if (mddev->queue) { 414 - /* calculate the max read-ahead size. 415 - * For read-ahead of large files to be effective, we need to 416 - * readahead at least twice a whole stripe. i.e. number of devices 417 - * multiplied by chunk size times 2. 418 - * If an individual device has an ra_pages greater than the 419 - * chunk size, then we will not drive that device as hard as it 420 - * wants. We consider this a configuration error: a larger 421 - * chunksize should be used in that case. 422 - */ 423 - int stripe = mddev->raid_disks * 424 - (mddev->chunk_sectors << 9) / PAGE_SIZE; 425 - if (mddev->queue->backing_dev_info->ra_pages < 2* stripe) 426 - mddev->queue->backing_dev_info->ra_pages = 2* stripe; 427 - } 428 - 429 413 dump_zones(mddev); 430 414 431 415 ret = md_integrity_register(mddev);

+15 -31

drivers/md/raid10.c

··· 3703 3703 return ERR_PTR(err); 3704 3704 } 3705 3705 3706 + static void raid10_set_io_opt(struct r10conf *conf) 3707 + { 3708 + int raid_disks = conf->geo.raid_disks; 3709 + 3710 + if (!(conf->geo.raid_disks % conf->geo.near_copies)) 3711 + raid_disks /= conf->geo.near_copies; 3712 + blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * 3713 + raid_disks); 3714 + } 3715 + 3706 3716 static int raid10_run(struct mddev *mddev) 3707 3717 { 3708 3718 struct r10conf *conf; 3709 - int i, disk_idx, chunk_size; 3719 + int i, disk_idx; 3710 3720 struct raid10_info *disk; 3711 3721 struct md_rdev *rdev; 3712 3722 sector_t size; ··· 3752 3742 mddev->thread = conf->thread; 3753 3743 conf->thread = NULL; 3754 3744 3755 - chunk_size = mddev->chunk_sectors << 9; 3756 3745 if (mddev->queue) { 3757 3746 blk_queue_max_discard_sectors(mddev->queue, 3758 3747 mddev->chunk_sectors); 3759 3748 blk_queue_max_write_same_sectors(mddev->queue, 0); 3760 3749 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 3761 - blk_queue_io_min(mddev->queue, chunk_size); 3762 - if (conf->geo.raid_disks % conf->geo.near_copies) 3763 - blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3764 - else 3765 - blk_queue_io_opt(mddev->queue, chunk_size * 3766 - (conf->geo.raid_disks / conf->geo.near_copies)); 3750 + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); 3751 + raid10_set_io_opt(conf); 3767 3752 } 3768 3753 3769 3754 rdev_for_each(rdev, mddev) { ··· 3872 3867 md_set_array_sectors(mddev, size); 3873 3868 mddev->resync_max_sectors = size; 3874 3869 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); 3875 - 3876 - if (mddev->queue) { 3877 - int stripe = conf->geo.raid_disks * 3878 - ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3879 - 3880 - /* Calculate max read-ahead size. 3881 - * We need to readahead at least twice a whole stripe.... 3882 - * maybe... 
3883 - */ 3884 - stripe /= conf->geo.near_copies; 3885 - if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 3886 - mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 3887 - } 3888 3870 3889 3871 if (md_integrity_register(mddev)) 3890 3872 goto out_free_conf; ··· 4710 4718 conf->reshape_safe = MaxSector; 4711 4719 spin_unlock_irq(&conf->device_lock); 4712 4720 4713 - /* read-ahead size must cover two whole stripes, which is 4714 - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 4715 - */ 4716 - if (conf->mddev->queue) { 4717 - int stripe = conf->geo.raid_disks * 4718 - ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); 4719 - stripe /= conf->geo.near_copies; 4720 - if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 4721 - conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 4722 - } 4721 + if (conf->mddev->queue) 4722 + raid10_set_io_opt(conf); 4723 4723 conf->fullsync = 0; 4724 4724 } 4725 4725

+13 -18

drivers/md/raid5.c

··· 6638 6638 if (!conf) 6639 6639 err = -ENODEV; 6640 6640 else if (new != conf->skip_copy) { 6641 + struct request_queue *q = mddev->queue; 6642 + 6641 6643 mddev_suspend(mddev); 6642 6644 conf->skip_copy = new; 6643 6645 if (new) 6644 - mddev->queue->backing_dev_info->capabilities |= 6645 - BDI_CAP_STABLE_WRITES; 6646 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); 6646 6647 else 6647 - mddev->queue->backing_dev_info->capabilities &= 6648 - ~BDI_CAP_STABLE_WRITES; 6648 + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); 6649 6649 mddev_resume(mddev); 6650 6650 } 6651 6651 mddev_unlock(mddev); ··· 7232 7232 return 0; 7233 7233 } 7234 7234 7235 + static void raid5_set_io_opt(struct r5conf *conf) 7236 + { 7237 + blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) * 7238 + (conf->raid_disks - conf->max_degraded)); 7239 + } 7240 + 7235 7241 static int raid5_run(struct mddev *mddev) 7236 7242 { 7237 7243 struct r5conf *conf; ··· 7522 7516 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7523 7517 int stripe = data_disks * 7524 7518 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7525 - if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7526 - mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7527 7519 7528 7520 chunk_size = mddev->chunk_sectors << 9; 7529 7521 blk_queue_io_min(mddev->queue, chunk_size); 7530 - blk_queue_io_opt(mddev->queue, chunk_size * 7531 - (conf->raid_disks - conf->max_degraded)); 7522 + raid5_set_io_opt(conf); 7532 7523 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7533 7524 /* 7534 7525 * We can only discard a whole stripe. 
It doesn't make sense to ··· 8109 8106 spin_unlock_irq(&conf->device_lock); 8110 8107 wake_up(&conf->wait_for_overlap); 8111 8108 8112 - /* read-ahead size must cover two whole stripes, which is 8113 - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 8114 - */ 8115 - if (conf->mddev->queue) { 8116 - int data_disks = conf->raid_disks - conf->max_degraded; 8117 - int stripe = data_disks * ((conf->chunk_sectors << 9) 8118 - / PAGE_SIZE); 8119 - if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 8120 - conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 8121 - } 8109 + if (conf->mddev->queue) 8110 + raid5_set_io_opt(conf); 8122 8111 } 8123 8112 } 8124 8113

+1 -2

drivers/mmc/core/queue.c

··· 472 472 } 473 473 474 474 if (mmc_host_is_spi(host) && host->use_spi_crc) 475 - mq->queue->backing_dev_info->capabilities |= 476 - BDI_CAP_STABLE_WRITES; 475 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue); 477 476 478 477 mq->queue->queuedata = mq; 479 478 blk_queue_rq_timeout(mq->queue, 60 * HZ);

+2

drivers/mtd/mtdcore.c

··· 2196 2196 bdi = bdi_alloc(NUMA_NO_NODE); 2197 2197 if (!bdi) 2198 2198 return ERR_PTR(-ENOMEM); 2199 + bdi->ra_pages = 0; 2200 + bdi->io_pages = 0; 2199 2201 2200 2202 /* 2201 2203 * We put '-0' suffix to the name to get the same name format as we

+1 -2

drivers/nvdimm/blk.c

··· 226 226 static const struct block_device_operations nd_blk_fops = { 227 227 .owner = THIS_MODULE, 228 228 .submit_bio = nd_blk_submit_bio, 229 - .revalidate_disk = nvdimm_revalidate_disk, 230 229 }; 231 230 232 231 static void nd_blk_release_queue(void *q) ··· 283 284 284 285 set_capacity(disk, available_disk_size >> SECTOR_SHIFT); 285 286 device_add_disk(dev, disk, NULL); 286 - revalidate_disk(disk); 287 + nvdimm_check_and_set_ro(disk); 287 288 return 0; 288 289 } 289 290

+1 -4

drivers/nvdimm/btt.c

··· 1513 1513 .submit_bio = btt_submit_bio, 1514 1514 .rw_page = btt_rw_page, 1515 1515 .getgeo = btt_getgeo, 1516 - .revalidate_disk = nvdimm_revalidate_disk, 1517 1516 }; 1518 1517 1519 1518 static int btt_blk_init(struct btt *btt) ··· 1537 1538 btt->btt_disk->private_data = btt; 1538 1539 btt->btt_disk->queue = btt->btt_queue; 1539 1540 btt->btt_disk->flags = GENHD_FL_EXT_DEVT; 1540 - btt->btt_disk->queue->backing_dev_info->capabilities |= 1541 - BDI_CAP_SYNCHRONOUS_IO; 1542 1541 1543 1542 blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); 1544 1543 blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); ··· 1555 1558 set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); 1556 1559 device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL); 1557 1560 btt->nd_btt->size = btt->nlba * (u64)btt->sector_size; 1558 - revalidate_disk(btt->btt_disk); 1561 + nvdimm_check_and_set_ro(btt->btt_disk); 1559 1562 1560 1563 return 0; 1561 1564 }

+3 -6

drivers/nvdimm/bus.c

··· 628 628 } 629 629 EXPORT_SYMBOL(__nd_driver_register); 630 630 631 - int nvdimm_revalidate_disk(struct gendisk *disk) 631 + void nvdimm_check_and_set_ro(struct gendisk *disk) 632 632 { 633 633 struct device *dev = disk_to_dev(disk)->parent; 634 634 struct nd_region *nd_region = to_nd_region(dev->parent); ··· 639 639 * read-only if the disk is already read-only. 640 640 */ 641 641 if (disk_ro || nd_region->ro == disk_ro) 642 - return 0; 642 + return; 643 643 644 644 dev_info(dev, "%s read-only, marking %s read-only\n", 645 645 dev_name(&nd_region->dev), disk->disk_name); 646 646 set_disk_ro(disk, 1); 647 - 648 - return 0; 649 - 650 647 } 651 - EXPORT_SYMBOL(nvdimm_revalidate_disk); 648 + EXPORT_SYMBOL(nvdimm_check_and_set_ro); 652 649 653 650 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, 654 651 char *buf)

+1 -1

drivers/nvdimm/nd.h

··· 361 361 void nvdimm_bus_lock(struct device *dev); 362 362 void nvdimm_bus_unlock(struct device *dev); 363 363 bool is_nvdimm_bus_locked(struct device *dev); 364 - int nvdimm_revalidate_disk(struct gendisk *disk); 364 + void nvdimm_check_and_set_ro(struct gendisk *disk); 365 365 void nvdimm_drvdata_release(struct kref *kref); 366 366 void put_ndd(struct nvdimm_drvdata *ndd); 367 367 int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd);

+1 -3

drivers/nvdimm/pmem.c

··· 281 281 .owner = THIS_MODULE, 282 282 .submit_bio = pmem_submit_bio, 283 283 .rw_page = pmem_rw_page, 284 - .revalidate_disk = nvdimm_revalidate_disk, 285 284 }; 286 285 287 286 static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, ··· 475 476 disk->queue = q; 476 477 disk->flags = GENHD_FL_EXT_DEVT; 477 478 disk->private_data = pmem; 478 - disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; 479 479 nvdimm_namespace_disk_name(ndns, disk->disk_name); 480 480 set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) 481 481 / 512); ··· 499 501 if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) 500 502 return -ENOMEM; 501 503 502 - revalidate_disk(disk); 504 + nvdimm_check_and_set_ro(disk); 503 505 504 506 pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd, 505 507 "badblocks");

+35 -18

drivers/nvme/host/core.c

··· 94 94 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, 95 95 unsigned nsid); 96 96 97 + static void nvme_update_bdev_size(struct gendisk *disk) 98 + { 99 + struct block_device *bdev = bdget_disk(disk, 0); 100 + 101 + if (bdev) { 102 + bd_set_nr_sectors(bdev, get_capacity(disk)); 103 + bdput(bdev); 104 + } 105 + } 106 + 107 + /* 108 + * Prepare a queue for teardown. 109 + * 110 + * This must forcibly unquiesce queues to avoid blocking dispatch, and only set 111 + * the capacity to 0 after that to avoid blocking dispatchers that may be 112 + * holding bd_butex. This will end buffered writers dirtying pages that can't 113 + * be synced. 114 + */ 97 115 static void nvme_set_queue_dying(struct nvme_ns *ns) 98 116 { 99 - /* 100 - * Revalidating a dead namespace sets capacity to 0. This will end 101 - * buffered writers dirtying pages that can't be synced. 102 - */ 103 117 if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) 104 118 return; 119 + 105 120 blk_set_queue_dying(ns->queue); 106 - /* Forcibly unquiesce queues to avoid blocking dispatch */ 107 121 blk_mq_unquiesce_queue(ns->queue); 108 - /* 109 - * Revalidate after unblocking dispatchers that may be holding bd_butex 110 - */ 111 - revalidate_disk(ns->disk); 122 + 123 + set_capacity(ns->disk, 0); 124 + nvme_update_bdev_size(ns->disk); 112 125 } 113 126 114 127 static void nvme_queue_scan(struct nvme_ctrl *ctrl) ··· 2147 2134 nvme_update_disk_info(ns->head->disk, ns, id); 2148 2135 blk_stack_limits(&ns->head->disk->queue->limits, 2149 2136 &ns->queue->limits, 0); 2150 - nvme_mpath_update_disk_size(ns->head->disk); 2137 + blk_queue_update_readahead(ns->head->disk->queue); 2138 + nvme_update_bdev_size(ns->head->disk); 2151 2139 } 2152 2140 #endif 2153 2141 return 0; ··· 2353 2339 .open = nvme_open, 2354 2340 .release = nvme_release, 2355 2341 .getgeo = nvme_getgeo, 2356 - .revalidate_disk= nvme_revalidate_disk, 2357 2342 .report_zones = nvme_report_zones, 2358 2343 .pr_ops = &nvme_pr_ops, 2359 2344 
}; ··· 3937 3924 goto out_free_ns; 3938 3925 3939 3926 if (ctrl->opts && ctrl->opts->data_digest) 3940 - ns->queue->backing_dev_info->capabilities 3941 - |= BDI_CAP_STABLE_WRITES; 3927 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); 3942 3928 3943 3929 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); 3944 3930 if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) ··· 4063 4051 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) 4064 4052 { 4065 4053 struct nvme_ns *ns; 4054 + int ret; 4066 4055 4067 4056 ns = nvme_find_get_ns(ctrl, nsid); 4068 - if (ns) { 4069 - if (revalidate_disk(ns->disk)) 4070 - nvme_ns_remove(ns); 4071 - nvme_put_ns(ns); 4072 - } else 4057 + if (!ns) { 4073 4058 nvme_alloc_ns(ctrl, nsid); 4059 + return; 4060 + } 4061 + 4062 + ret = nvme_revalidate_disk(ns->disk); 4063 + revalidate_disk_size(ns->disk, ret == 0); 4064 + if (ret) 4065 + nvme_ns_remove(ns); 4066 + nvme_put_ns(ns); 4074 4067 } 4075 4068 4076 4069 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,

+3 -7

drivers/nvme/host/multipath.c

··· 673 673 nvme_mpath_set_live(ns); 674 674 } 675 675 676 - if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { 677 - struct gendisk *disk = ns->head->disk; 678 - 679 - if (disk) 680 - disk->queue->backing_dev_info->capabilities |= 681 - BDI_CAP_STABLE_WRITES; 682 - } 676 + if (blk_queue_stable_writes(ns->queue) && ns->head->disk) 677 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, 678 + ns->head->disk->queue); 683 679 } 684 680 685 681 void nvme_mpath_remove_disk(struct nvme_ns_head *head)

-13

drivers/nvme/host/nvme.h

··· 682 682 trace_block_bio_complete(ns->head->disk->queue, req->bio); 683 683 } 684 684 685 - static inline void nvme_mpath_update_disk_size(struct gendisk *disk) 686 - { 687 - struct block_device *bdev = bdget_disk(disk, 0); 688 - 689 - if (bdev) { 690 - bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT); 691 - bdput(bdev); 692 - } 693 - } 694 - 695 685 extern struct device_attribute dev_attr_ana_grpid; 696 686 extern struct device_attribute dev_attr_ana_state; 697 687 extern struct device_attribute subsys_attr_iopolicy; ··· 754 764 { 755 765 } 756 766 static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) 757 - { 758 - } 759 - static inline void nvme_mpath_update_disk_size(struct gendisk *disk) 760 767 { 761 768 } 762 769 #endif /* CONFIG_NVME_MULTIPATH */

+4 -11

drivers/s390/block/dasd_genhd.c

··· 101 101 struct block_device *bdev; 102 102 int rc; 103 103 104 - bdev = bdget_disk(block->gdp, 0); 105 - if (!bdev) { 106 - DBF_DEV_EVENT(DBF_ERR, block->base, "%s", 107 - "scan partitions error, bdget returned NULL"); 108 - return -ENODEV; 109 - } 110 - 111 - rc = blkdev_get(bdev, FMODE_READ, NULL); 112 - if (rc < 0) { 104 + bdev = blkdev_get_by_dev(disk_devt(block->gdp), FMODE_READ, NULL); 105 + if (IS_ERR(bdev)) { 113 106 DBF_DEV_EVENT(DBF_ERR, block->base, 114 - "scan partitions error, blkdev_get returned %d", 115 - rc); 107 + "scan partitions error, blkdev_get returned %ld", 108 + PTR_ERR(bdev)); 116 109 return -ENODEV; 117 110 } 118 111

+2 -7

drivers/s390/block/dasd_ioctl.c

··· 55 55 56 56 dasd_enable_device(base); 57 57 /* Formatting the dasd device can change the capacity. */ 58 - mutex_lock(&bdev->bd_mutex); 59 - i_size_write(bdev->bd_inode, 60 - (loff_t)get_capacity(base->block->gdp) << 9); 61 - mutex_unlock(&bdev->bd_mutex); 58 + bd_set_nr_sectors(bdev, get_capacity(base->block->gdp)); 62 59 dasd_put_device(base); 63 60 return 0; 64 61 } ··· 88 91 * Set i_size to zero, since read, write, etc. check against this 89 92 * value. 90 93 */ 91 - mutex_lock(&bdev->bd_mutex); 92 - i_size_write(bdev->bd_inode, 0); 93 - mutex_unlock(&bdev->bd_mutex); 94 + bd_set_nr_sectors(bdev, 0); 94 95 dasd_put_device(base); 95 96 return 0; 96 97 }

+2 -2

drivers/scsi/iscsi_tcp.c

··· 962 962 struct iscsi_conn *conn = session->leadconn; 963 963 964 964 if (conn->datadgst_en) 965 - sdev->request_queue->backing_dev_info->capabilities 966 - |= BDI_CAP_STABLE_WRITES; 965 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, 966 + sdev->request_queue); 967 967 blk_queue_dma_alignment(sdev->request_queue, 0); 968 968 return 0; 969 969 }

+8 -5

drivers/scsi/sd.c

··· 217 217 sd_print_sense_hdr(sdkp, &sshdr); 218 218 return -EINVAL; 219 219 } 220 - revalidate_disk(sdkp->disk); 220 + sd_revalidate_disk(sdkp->disk); 221 221 return count; 222 222 } 223 223 ··· 1381 1381 if (!scsi_block_when_processing_errors(sdev)) 1382 1382 goto error_out; 1383 1383 1384 - if (sdev->removable || sdkp->write_prot) 1385 - check_disk_change(bdev); 1384 + if (sdev->removable || sdkp->write_prot) { 1385 + if (bdev_check_media_change(bdev)) 1386 + sd_revalidate_disk(bdev->bd_disk); 1387 + } 1386 1388 1387 1389 /* 1388 1390 * If the drive is empty, just let the open fail. ··· 1708 1706 static void sd_rescan(struct device *dev) 1709 1707 { 1710 1708 struct scsi_disk *sdkp = dev_get_drvdata(dev); 1709 + int ret; 1711 1710 1712 - revalidate_disk(sdkp->disk); 1711 + ret = sd_revalidate_disk(sdkp->disk); 1712 + revalidate_disk_size(sdkp->disk, ret == 0); 1713 1713 } 1714 1714 1715 1715 static int sd_ioctl(struct block_device *bdev, fmode_t mode, ··· 1845 1841 .compat_ioctl = sd_compat_ioctl, 1846 1842 #endif 1847 1843 .check_events = sd_check_events, 1848 - .revalidate_disk = sd_revalidate_disk, 1849 1844 .unlock_native_capacity = sd_unlock_native_capacity, 1850 1845 .report_zones = sd_zbc_report_zones, 1851 1846 .pr_ops = &sd_pr_ops,

+14 -22

drivers/scsi/sr.c

··· 517 517 return ret; 518 518 } 519 519 520 + static void sr_revalidate_disk(struct scsi_cd *cd) 521 + { 522 + struct scsi_sense_hdr sshdr; 523 + 524 + /* if the unit is not ready, nothing more to do */ 525 + if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr)) 526 + return; 527 + sr_cd_check(&cd->cdi); 528 + get_sectorsize(cd); 529 + } 530 + 520 531 static int sr_block_open(struct block_device *bdev, fmode_t mode) 521 532 { 522 533 struct scsi_cd *cd; ··· 540 529 541 530 sdev = cd->device; 542 531 scsi_autopm_get_device(sdev); 543 - check_disk_change(bdev); 532 + if (bdev_check_media_change(bdev)) 533 + sr_revalidate_disk(cd); 544 534 545 535 mutex_lock(&cd->lock); 546 536 ret = cdrom_open(&cd->cdi, bdev, mode); ··· 670 658 return ret; 671 659 } 672 660 673 - static int sr_block_revalidate_disk(struct gendisk *disk) 674 - { 675 - struct scsi_sense_hdr sshdr; 676 - struct scsi_cd *cd; 677 - 678 - cd = scsi_cd_get(disk); 679 - if (!cd) 680 - return -ENXIO; 681 - 682 - /* if the unit is not ready, nothing more to do */ 683 - if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr)) 684 - goto out; 685 - 686 - sr_cd_check(&cd->cdi); 687 - get_sectorsize(cd); 688 - out: 689 - scsi_cd_put(cd); 690 - return 0; 691 - } 692 - 693 661 static const struct block_device_operations sr_bdops = 694 662 { 695 663 .owner = THIS_MODULE, ··· 680 688 .compat_ioctl = sr_block_compat_ioctl, 681 689 #endif 682 690 .check_events = sr_block_check_events, 683 - .revalidate_disk = sr_block_revalidate_disk, 684 691 }; 685 692 686 693 static int sr_open(struct cdrom_device_info *cdi, int purpose) ··· 793 802 794 803 dev_set_drvdata(dev, cd); 795 804 disk->flags |= GENHD_FL_REMOVABLE; 805 + sr_revalidate_disk(cd); 796 806 device_add_disk(&sdev->sdev_gendev, disk, NULL); 797 807 798 808 sdev_printk(KERN_DEBUG, sdev,

+1 -1

fs/9p/vfs_file.c

··· 625 625 626 626 inode = file_inode(vma->vm_file); 627 627 628 - if (!mapping_cap_writeback_dirty(inode->i_mapping)) 628 + if (!mapping_can_writeback(inode->i_mapping)) 629 629 wbc.nr_to_write = 0; 630 630 631 631 might_sleep();

+4 -2

fs/9p/vfs_super.c

··· 80 80 if (ret) 81 81 return ret; 82 82 83 - if (v9ses->cache) 84 - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; 83 + if (!v9ses->cache) { 84 + sb->s_bdi->ra_pages = 0; 85 + sb->s_bdi->io_pages = 0; 86 + } 85 87 86 88 sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; 87 89 if (!v9ses->cache)

-1

fs/afs/super.c

··· 456 456 ret = super_setup_bdi(sb); 457 457 if (ret) 458 458 return ret; 459 - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; 460 459 461 460 /* allocate the root inode and dentry */ 462 461 if (as->dyn_root) {

+85 -90

fs/block_dev.c

··· 103 103 } 104 104 EXPORT_SYMBOL(invalidate_bdev); 105 105 106 + /* 107 + * Drop all buffers & page cache for given bdev range. This function bails 108 + * with error if bdev has other exclusive owner (such as filesystem). 109 + */ 110 + int truncate_bdev_range(struct block_device *bdev, fmode_t mode, 111 + loff_t lstart, loff_t lend) 112 + { 113 + struct block_device *claimed_bdev = NULL; 114 + int err; 115 + 116 + /* 117 + * If we don't hold exclusive handle for the device, upgrade to it 118 + * while we discard the buffer cache to avoid discarding buffers 119 + * under live filesystem. 120 + */ 121 + if (!(mode & FMODE_EXCL)) { 122 + claimed_bdev = bdev->bd_contains; 123 + err = bd_prepare_to_claim(bdev, claimed_bdev, 124 + truncate_bdev_range); 125 + if (err) 126 + return err; 127 + } 128 + truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); 129 + if (claimed_bdev) 130 + bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range); 131 + return 0; 132 + } 133 + EXPORT_SYMBOL(truncate_bdev_range); 134 + 106 135 static void set_init_blocksize(struct block_device *bdev) 107 136 { 108 137 bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); ··· 905 876 bdev = &BDEV_I(inode)->bdev; 906 877 907 878 if (inode->i_state & I_NEW) { 879 + spin_lock_init(&bdev->bd_size_lock); 908 880 bdev->bd_contains = NULL; 909 881 bdev->bd_super = NULL; 910 882 bdev->bd_inode = inode; 911 883 bdev->bd_part_count = 0; 912 - bdev->bd_invalidated = 0; 913 884 inode->i_mode = S_IFBLK; 914 885 inode->i_rdev = dev; 915 886 inode->i_bdev = bdev; ··· 1319 1290 { 1320 1291 loff_t disk_size, bdev_size; 1321 1292 1293 + spin_lock(&bdev->bd_size_lock); 1322 1294 disk_size = (loff_t)get_capacity(disk) << 9; 1323 1295 bdev_size = i_size_read(bdev->bd_inode); 1324 1296 if (disk_size != bdev_size) { ··· 1329 1299 disk->disk_name, bdev_size, disk_size); 1330 1300 } 1331 1301 i_size_write(bdev->bd_inode, disk_size); 1332 - if (bdev_size > disk_size && 
__invalidate_device(bdev, false)) 1302 + } 1303 + spin_unlock(&bdev->bd_size_lock); 1304 + 1305 + if (bdev_size > disk_size) { 1306 + if (__invalidate_device(bdev, false)) 1333 1307 pr_warn("VFS: busy inodes on resized disk %s\n", 1334 1308 disk->disk_name); 1335 1309 } 1336 - bdev->bd_invalidated = 0; 1337 1310 } 1338 1311 1339 1312 /** 1340 - * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back 1341 - * @disk: struct gendisk to be revalidated 1313 + * revalidate_disk_size - checks for disk size change and adjusts bdev size. 1314 + * @disk: struct gendisk to check 1315 + * @verbose: if %true log a message about a size change if there is any 1342 1316 * 1343 - * This routine is a wrapper for lower-level driver's revalidate_disk 1344 - * call-backs. It is used to do common pre and post operations needed 1345 - * for all revalidate_disk operations. 1317 + * This routine checks to see if the bdev size does not match the disk size 1318 + * and adjusts it if it differs. When shrinking the bdev size, its all caches 1319 + * are freed. 1346 1320 */ 1347 - int revalidate_disk(struct gendisk *disk) 1321 + void revalidate_disk_size(struct gendisk *disk, bool verbose) 1348 1322 { 1349 - int ret = 0; 1350 - 1351 - if (disk->fops->revalidate_disk) 1352 - ret = disk->fops->revalidate_disk(disk); 1323 + struct block_device *bdev; 1353 1324 1354 1325 /* 1355 1326 * Hidden disks don't have associated bdev so there's no point in 1356 - * revalidating it. 1327 + * revalidating them. 
1357 1328 */ 1358 - if (!(disk->flags & GENHD_FL_HIDDEN)) { 1359 - struct block_device *bdev = bdget_disk(disk, 0); 1329 + if (disk->flags & GENHD_FL_HIDDEN) 1330 + return; 1360 1331 1361 - if (!bdev) 1362 - return ret; 1363 - 1364 - mutex_lock(&bdev->bd_mutex); 1365 - check_disk_size_change(disk, bdev, ret == 0); 1366 - mutex_unlock(&bdev->bd_mutex); 1332 + bdev = bdget_disk(disk, 0); 1333 + if (bdev) { 1334 + check_disk_size_change(disk, bdev, verbose); 1367 1335 bdput(bdev); 1368 1336 } 1369 - return ret; 1370 1337 } 1371 - EXPORT_SYMBOL(revalidate_disk); 1338 + EXPORT_SYMBOL(revalidate_disk_size); 1372 1339 1373 - /* 1374 - * This routine checks whether a removable media has been changed, 1375 - * and invalidates all buffer-cache-entries in that case. This 1376 - * is a relatively slow routine, so we have to try to minimize using 1377 - * it. Thus it is called only upon a 'mount' or 'open'. This 1378 - * is the best way of combining speed and utility, I think. 1379 - * People changing diskettes in the middle of an operation deserve 1380 - * to lose :-) 1381 - */ 1382 - int check_disk_change(struct block_device *bdev) 1340 + void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) 1383 1341 { 1384 - struct gendisk *disk = bdev->bd_disk; 1385 - const struct block_device_operations *bdops = disk->fops; 1386 - unsigned int events; 1387 - 1388 - events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | 1389 - DISK_EVENT_EJECT_REQUEST); 1390 - if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1391 - return 0; 1392 - 1393 - if (__invalidate_device(bdev, true)) 1394 - pr_warn("VFS: busy inodes on changed media %s\n", 1395 - disk->disk_name); 1396 - bdev->bd_invalidated = 1; 1397 - if (bdops->revalidate_disk) 1398 - bdops->revalidate_disk(bdev->bd_disk); 1399 - return 1; 1342 + spin_lock(&bdev->bd_size_lock); 1343 + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); 1344 + spin_unlock(&bdev->bd_size_lock); 1400 1345 } 1401 - 1402 - 
EXPORT_SYMBOL(check_disk_change); 1403 - 1404 - void bd_set_size(struct block_device *bdev, loff_t size) 1405 - { 1406 - inode_lock(bdev->bd_inode); 1407 - i_size_write(bdev->bd_inode, size); 1408 - inode_unlock(bdev->bd_inode); 1409 - } 1410 - EXPORT_SYMBOL(bd_set_size); 1346 + EXPORT_SYMBOL(bd_set_nr_sectors); 1411 1347 1412 1348 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); 1413 1349 ··· 1383 1387 int ret; 1384 1388 1385 1389 lockdep_assert_held(&bdev->bd_mutex); 1390 + 1391 + clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); 1386 1392 1387 1393 rescan: 1388 1394 ret = blk_drop_partitions(bdev); ··· 1444 1446 struct gendisk *disk; 1445 1447 int ret; 1446 1448 int partno; 1447 - int perm = 0; 1448 1449 bool first_open = false, unblock_events = true, need_restart; 1449 - 1450 - if (mode & FMODE_READ) 1451 - perm |= MAY_READ; 1452 - if (mode & FMODE_WRITE) 1453 - perm |= MAY_WRITE; 1454 - /* 1455 - * hooks: /n/, see "layering violations". 1456 - */ 1457 - if (!for_part) { 1458 - ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1459 - if (ret != 0) 1460 - return ret; 1461 - } 1462 1450 1463 1451 restart: 1464 1452 need_restart = false; ··· 1498 1514 } 1499 1515 1500 1516 if (!ret) { 1501 - bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1517 + bd_set_nr_sectors(bdev, get_capacity(disk)); 1502 1518 set_init_blocksize(bdev); 1503 1519 } 1504 1520 ··· 1508 1524 * The latter is necessary to prevent ghost 1509 1525 * partitions on a removed medium. 
1510 1526 */ 1511 - if (bdev->bd_invalidated && 1527 + if (test_bit(GD_NEED_PART_SCAN, &disk->state) && 1512 1528 (!ret || ret == -ENOMEDIUM)) 1513 1529 bdev_disk_changed(bdev, ret == -ENOMEDIUM); 1514 1530 ··· 1526 1542 ret = -ENXIO; 1527 1543 goto out_clear; 1528 1544 } 1529 - bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1545 + bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); 1530 1546 set_init_blocksize(bdev); 1531 1547 } 1532 1548 ··· 1538 1554 if (bdev->bd_disk->fops->open) 1539 1555 ret = bdev->bd_disk->fops->open(bdev, mode); 1540 1556 /* the same as first opener case, read comment there */ 1541 - if (bdev->bd_invalidated && 1557 + if (test_bit(GD_NEED_PART_SCAN, &disk->state) && 1542 1558 (!ret || ret == -ENOMEDIUM)) 1543 1559 bdev_disk_changed(bdev, ret == -ENOMEDIUM); 1544 1560 if (ret) ··· 1616 1632 * RETURNS: 1617 1633 * 0 on success, -errno on failure. 1618 1634 */ 1619 - int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) 1635 + static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) 1620 1636 { 1621 - int res; 1637 + int ret, perm = 0; 1622 1638 1623 - res =__blkdev_get(bdev, mode, holder, 0); 1624 - if (res) 1625 - bdput(bdev); 1626 - return res; 1639 + if (mode & FMODE_READ) 1640 + perm |= MAY_READ; 1641 + if (mode & FMODE_WRITE) 1642 + perm |= MAY_WRITE; 1643 + ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1644 + if (ret) 1645 + goto bdput; 1646 + 1647 + ret =__blkdev_get(bdev, mode, holder, 0); 1648 + if (ret) 1649 + goto bdput; 1650 + return 0; 1651 + 1652 + bdput: 1653 + bdput(bdev); 1654 + return ret; 1627 1655 } 1628 - EXPORT_SYMBOL(blkdev_get); 1629 1656 1630 1657 /** 1631 1658 * blkdev_get_by_path - open a block device by name ··· 1884 1889 if (bdev_read_only(I_BDEV(bd_inode))) 1885 1890 return -EPERM; 1886 1891 1887 - if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode)) 1892 + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) 1888 1893 return 
-ETXTBSY; 1889 1894 1890 1895 if (!iov_iter_count(from)) ··· 1964 1969 loff_t len) 1965 1970 { 1966 1971 struct block_device *bdev = I_BDEV(bdev_file_inode(file)); 1967 - struct address_space *mapping; 1968 1972 loff_t end = start + len - 1; 1969 1973 loff_t isize; 1970 1974 int error; ··· 1991 1997 return -EINVAL; 1992 1998 1993 1999 /* Invalidate the page cache, including dirty pages. */ 1994 - mapping = bdev->bd_inode->i_mapping; 1995 - truncate_inode_pages_range(mapping, start, end); 2000 + error = truncate_bdev_range(bdev, file->f_mode, start, end); 2001 + if (error) 2002 + return error; 1996 2003 1997 2004 switch (mode) { 1998 2005 case FALLOC_FL_ZERO_RANGE: ··· 2020 2025 * the caller will be given -EBUSY. The third argument is 2021 2026 * inclusive, so the rounding here is safe. 2022 2027 */ 2023 - return invalidate_inode_pages2_range(mapping, 2028 + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, 2024 2029 start >> PAGE_SHIFT, 2025 2030 end >> PAGE_SHIFT); 2026 2031 }

-2

fs/btrfs/disk-io.c

··· 3091 3091 goto fail_sb_buffer; 3092 3092 } 3093 3093 3094 - sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; 3095 - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; 3096 3094 sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); 3097 3095 sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); 3098 3096

-16

fs/buffer.c

··· 2771 2771 /* Is the page fully outside i_size? (truncate in progress) */ 2772 2772 offset = i_size & (PAGE_SIZE-1); 2773 2773 if (page->index >= end_index+1 || !offset) { 2774 - /* 2775 - * The page may have dirty, unmapped buffers. For example, 2776 - * they may have been added in ext3_writepage(). Make them 2777 - * freeable here, so the page does not leak. 2778 - */ 2779 - #if 0 2780 - /* Not really sure about this - do we need this ? */ 2781 - if (page->mapping->a_ops->invalidatepage) 2782 - page->mapping->a_ops->invalidatepage(page, offset); 2783 - #endif 2784 2774 unlock_page(page); 2785 2775 return 0; /* don't care */ 2786 2776 } ··· 2965 2975 /* Is the page fully outside i_size? (truncate in progress) */ 2966 2976 offset = i_size & (PAGE_SIZE-1); 2967 2977 if (page->index >= end_index+1 || !offset) { 2968 - /* 2969 - * The page may have dirty, unmapped buffers. For example, 2970 - * they may have been added in ext3_writepage(). Make them 2971 - * freeable here, so the page does not leak. 2972 - */ 2973 - do_invalidatepage(page, 0, PAGE_SIZE); 2974 2978 unlock_page(page); 2975 2979 return 0; /* don't care */ 2976 2980 }

+4 -3

fs/fs-writeback.c

··· 2321 2321 2322 2322 wb = locked_inode_to_wb_and_lock_list(inode); 2323 2323 2324 - WARN(bdi_cap_writeback_dirty(wb->bdi) && 2324 + WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) && 2325 2325 !test_bit(WB_registered, &wb->state), 2326 2326 "bdi-%s not registered\n", bdi_dev_name(wb->bdi)); 2327 2327 ··· 2346 2346 * to make sure background write-back happens 2347 2347 * later. 2348 2348 */ 2349 - if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) 2349 + if (wakeup_bdi && 2350 + (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) 2350 2351 wb_wakeup_delayed(wb); 2351 2352 return; 2352 2353 } ··· 2582 2581 .range_end = LLONG_MAX, 2583 2582 }; 2584 2583 2585 - if (!mapping_cap_writeback_dirty(inode->i_mapping)) 2584 + if (!mapping_can_writeback(inode->i_mapping)) 2586 2585 wbc.nr_to_write = 0; 2587 2586 2588 2587 might_sleep();

+2 -2

fs/fuse/inode.c

··· 1049 1049 if (err) 1050 1050 return err; 1051 1051 1052 - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; 1053 1052 /* fuse does it's own writeback accounting */ 1054 - sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; 1053 + sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; 1054 + sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; 1055 1055 1056 1056 /* 1057 1057 * For a single fuse filesystem use max 1% of dirty +

+2 -2

fs/namei.c

··· 568 568 { 569 569 struct super_block *sb = mnt->mnt_sb; 570 570 571 - /* Bind mounts and multi-root filesystems can have disconnected paths */ 572 - if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) 571 + /* Bind mounts can have disconnected paths */ 572 + if (mnt->mnt_root == sb->s_root) 573 573 return true; 574 574 575 575 return is_subdir(dentry, mnt->mnt_root);

+1 -8

fs/nfs/super.c

··· 1200 1200 } 1201 1201 #endif 1202 1202 1203 - static void nfs_set_readahead(struct backing_dev_info *bdi, 1204 - unsigned long iomax_pages) 1205 - { 1206 - bdi->ra_pages = VM_READAHEAD_PAGES; 1207 - bdi->io_pages = iomax_pages; 1208 - } 1209 - 1210 1203 int nfs_get_tree_common(struct fs_context *fc) 1211 1204 { 1212 1205 struct nfs_fs_context *ctx = nfs_fc2context(fc); ··· 1244 1251 MINOR(server->s_dev)); 1245 1252 if (error) 1246 1253 goto error_splat_super; 1247 - nfs_set_readahead(s->s_bdi, server->rpages); 1254 + s->s_bdi->io_pages = server->rpages; 1248 1255 server->super = s; 1249 1256 } 1250 1257

+10 -18

fs/ocfs2/cluster/heartbeat.c

··· 1766 1766 int sectsize; 1767 1767 char *p = (char *)page; 1768 1768 struct fd f; 1769 - struct inode *inode; 1770 1769 ssize_t ret = -EINVAL; 1771 1770 int live_threshold; 1772 1771 ··· 1792 1793 reg->hr_block_bytes == 0) 1793 1794 goto out2; 1794 1795 1795 - inode = igrab(f.file->f_mapping->host); 1796 - if (inode == NULL) 1796 + if (!S_ISBLK(f.file->f_mapping->host->i_mode)) 1797 1797 goto out2; 1798 1798 1799 - if (!S_ISBLK(inode->i_mode)) 1800 - goto out3; 1801 - 1802 - reg->hr_bdev = I_BDEV(f.file->f_mapping->host); 1803 - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); 1804 - if (ret) { 1799 + reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev, 1800 + FMODE_WRITE | FMODE_READ, NULL); 1801 + if (IS_ERR(reg->hr_bdev)) { 1802 + ret = PTR_ERR(reg->hr_bdev); 1805 1803 reg->hr_bdev = NULL; 1806 - goto out3; 1804 + goto out2; 1807 1805 } 1808 - inode = NULL; 1809 1806 1810 1807 bdevname(reg->hr_bdev, reg->hr_dev_name); 1811 1808 ··· 1904 1909 config_item_name(&reg->hr_item), reg->hr_dev_name); 1905 1910 1906 1911 out3: 1907 - iput(inode); 1912 + if (ret < 0) { 1913 + blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE); 1914 + reg->hr_bdev = NULL; 1915 + } 1908 1916 out2: 1909 1917 fdput(f); 1910 1918 out: 1911 - if (ret < 0) { 1912 - if (reg->hr_bdev) { 1913 - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); 1914 - reg->hr_bdev = NULL; 1915 - } 1916 - } 1917 1919 return ret; 1918 1920 } 1919 1921

+2

fs/super.c

··· 1256 1256 s->s_dev = s->s_bdev->bd_dev; 1257 1257 s->s_bdi = bdi_get(s->s_bdev->bd_bdi); 1258 1258 1259 + if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) 1260 + s->s_iflags |= SB_I_STABLE_WRITES; 1259 1261 return 0; 1260 1262 } 1261 1263

+2

fs/ubifs/super.c

··· 2177 2177 c->vi.vol_id); 2178 2178 if (err) 2179 2179 goto out_close; 2180 + sb->s_bdi->ra_pages = 0; 2181 + sb->s_bdi->io_pages = 0; 2180 2182 2181 2183 sb->s_fs_info = c; 2182 2184 sb->s_magic = UBIFS_SUPER_MAGIC;

+2

fs/vboxsf/super.c

··· 167 167 err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id); 168 168 if (err) 169 169 goto fail_free; 170 + sb->s_bdi->ra_pages = 0; 171 + sb->s_bdi->io_pages = 0; 170 172 171 173 /* Turn source into a shfl_string and map the folder */ 172 174 size = strlen(fc->source) + 1;

+13 -65

include/linux/backing-dev.h

··· 110 110 /* 111 111 * Flags in backing_dev_info::capability 112 112 * 113 - * The first three flags control whether dirty pages will contribute to the 114 - * VM's accounting and whether writepages() should be called for dirty pages 115 - * (something that would not, for example, be appropriate for ramfs) 116 - * 117 - * WARNING: these flags are closely related and should not normally be 118 - * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these 119 - * three flags into a single convenience macro. 120 - * 121 - * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting 122 - * BDI_CAP_NO_WRITEBACK: Don't write pages back 123 - * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages 124 - * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. 125 - * 126 - * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. 127 - * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be 128 - * inefficient. 113 + * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages 114 + * should contribute to accounting 115 + * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages 116 + * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold 129 117 */ 130 - #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 131 - #define BDI_CAP_NO_WRITEBACK 0x00000002 132 - #define BDI_CAP_NO_ACCT_WB 0x00000004 133 - #define BDI_CAP_STABLE_WRITES 0x00000008 134 - #define BDI_CAP_STRICTLIMIT 0x00000010 135 - #define BDI_CAP_CGROUP_WRITEBACK 0x00000020 136 - #define BDI_CAP_SYNCHRONOUS_IO 0x00000040 137 - 138 - #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ 139 - (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) 118 + #define BDI_CAP_WRITEBACK (1 << 0) 119 + #define BDI_CAP_WRITEBACK_ACCT (1 << 1) 120 + #define BDI_CAP_STRICTLIMIT (1 << 2) 140 121 141 122 extern struct backing_dev_info noop_backing_dev_info; 142 123 ··· 156 175 long congestion_wait(int sync, long timeout); 157 176 long 
wait_iff_congested(int sync, long timeout); 158 177 159 - static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) 178 + static inline bool mapping_can_writeback(struct address_space *mapping) 160 179 { 161 - return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO; 162 - } 163 - 164 - static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) 165 - { 166 - return bdi->capabilities & BDI_CAP_STABLE_WRITES; 167 - } 168 - 169 - static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) 170 - { 171 - return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK); 172 - } 173 - 174 - static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi) 175 - { 176 - return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY); 177 - } 178 - 179 - static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) 180 - { 181 - /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */ 182 - return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB | 183 - BDI_CAP_NO_WRITEBACK)); 184 - } 185 - 186 - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) 187 - { 188 - return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host)); 189 - } 190 - 191 - static inline bool mapping_cap_account_dirty(struct address_space *mapping) 192 - { 193 - return bdi_cap_account_dirty(inode_to_bdi(mapping->host)); 180 + return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; 194 181 } 195 182 196 183 static inline int bdi_sched_wait(void *word) ··· 182 233 * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode 183 234 * @inode: inode of interest 184 235 * 185 - * cgroup writeback requires support from both the bdi and filesystem. 186 - * Also, both memcg and iocg have to be on the default hierarchy. Test 187 - * whether all conditions are met. 236 + * Cgroup writeback requires support from the filesystem. Also, both memcg and 237 + * iocg have to be on the default hierarchy. 
Test whether all conditions are 238 + * met. 188 239 * 189 240 * Note that the test result may change dynamically on the same inode 190 241 * depending on how memcg and iocg are configured. ··· 195 246 196 247 return cgroup_subsys_on_dfl(memory_cgrp_subsys) && 197 248 cgroup_subsys_on_dfl(io_cgrp_subsys) && 198 - bdi_cap_account_dirty(bdi) && 199 - (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && 249 + (bdi->capabilities & BDI_CAP_WRITEBACK) && 200 250 (inode->i_sb->s_iflags & SB_I_CGROUPWB); 201 251 } 202 252

+12 -3

include/linux/blk-mq.h

··· 139 139 * shared across request queues. 140 140 */ 141 141 atomic_t nr_active; 142 + /** 143 + * @elevator_queued: Number of queued requests on hctx. 144 + */ 145 + atomic_t elevator_queued; 142 146 143 147 /** @cpuhp_online: List to store request if CPU is going to die */ 144 148 struct hlist_node cpuhp_online; ··· 235 231 * @flags: Zero or more BLK_MQ_F_* flags. 236 232 * @driver_data: Pointer to data owned by the block driver that created this 237 233 * tag set. 234 + * @__bitmap_tags: A shared tags sbitmap, used over all hctx's 235 + * @__breserved_tags: 236 + * A shared reserved tags sbitmap, used over all hctx's 238 237 * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues 239 238 * elements. 240 239 * @tag_list_lock: Serializes tag_list accesses. ··· 256 249 unsigned int timeout; 257 250 unsigned int flags; 258 251 void *driver_data; 252 + atomic_t active_queues_shared_sbitmap; 259 253 254 + struct sbitmap_queue __bitmap_tags; 255 + struct sbitmap_queue __breserved_tags; 260 256 struct blk_mq_tags **tags; 261 257 262 258 struct mutex tag_list_lock; ··· 388 378 389 379 enum { 390 380 BLK_MQ_F_SHOULD_MERGE = 1 << 0, 391 - BLK_MQ_F_TAG_SHARED = 1 << 1, 381 + BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, 392 382 /* 393 383 * Set when this device requires underlying blk-mq device for 394 384 * completing IO: 395 385 */ 396 386 BLK_MQ_F_STACKING = 1 << 2, 387 + BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, 397 388 BLK_MQ_F_BLOCKING = 1 << 5, 398 389 BLK_MQ_F_NO_SCHED = 1 << 6, 399 390 BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, ··· 500 489 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); 501 490 void blk_mq_complete_request(struct request *rq); 502 491 bool blk_mq_complete_request_remote(struct request *rq); 503 - bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, 504 - struct bio *bio, unsigned int nr_segs); 505 492 bool blk_mq_queue_stopped(struct request_queue *q); 506 493 void blk_mq_stop_hw_queue(struct 
blk_mq_hw_ctx *hctx); 507 494 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);

+3 -4

include/linux/blk_types.h

··· 20 20 struct bio_crypt_ctx; 21 21 22 22 struct block_device { 23 - dev_t bd_dev; /* not a kdev_t - it's a search key */ 23 + dev_t bd_dev; 24 24 int bd_openers; 25 25 struct inode * bd_inode; /* will die */ 26 26 struct super_block * bd_super; ··· 37 37 struct hd_struct * bd_part; 38 38 /* number of times partitions within this device have been opened. */ 39 39 unsigned bd_part_count; 40 - int bd_invalidated; 40 + 41 + spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ 41 42 struct gendisk * bd_disk; 42 43 struct backing_dev_info *bd_bdi; 43 44 ··· 256 255 BIO_NO_PAGE_REF, /* don't put release vec pages */ 257 256 BIO_CLONED, /* doesn't own data */ 258 257 BIO_BOUNCED, /* bio is a bounce bio */ 259 - BIO_USER_MAPPED, /* contains user pages */ 260 - BIO_NULL_MAPPED, /* contains invalid user pages */ 261 258 BIO_WORKINGSET, /* contains userspace workingset pages */ 262 259 BIO_QUIET, /* Make BIO Quiet */ 263 260 BIO_CHAIN, /* chained bio, ->bi_remaining in effect */

+34 -10

include/linux/blkdev.h

··· 24 24 #include <linux/percpu-refcount.h> 25 25 #include <linux/scatterlist.h> 26 26 #include <linux/blkzoned.h> 27 + #include <linux/pm.h> 27 28 28 29 struct module; 29 30 struct scsi_ioctl_command; ··· 459 458 460 459 #ifdef CONFIG_PM 461 460 struct device *dev; 462 - int rpm_status; 461 + enum rpm_status rpm_status; 463 462 unsigned int nr_pending; 464 463 #endif 465 464 ··· 484 483 485 484 struct timer_list timeout; 486 485 struct work_struct timeout_work; 486 + 487 + atomic_t nr_active_requests_shared_sbitmap; 487 488 488 489 struct list_head icq_list; 489 490 #ifdef CONFIG_BLK_CGROUP ··· 606 603 #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ 607 604 #define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */ 608 605 #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ 606 + #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ 609 607 #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ 610 608 #define QUEUE_FLAG_WC 17 /* Write back caching */ 611 609 #define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ ··· 619 615 #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ 620 616 #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ 621 617 #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ 618 + #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ 622 619 623 620 #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 624 621 (1 << QUEUE_FLAG_SAME_COMP)) ··· 636 631 #define blk_queue_noxmerges(q) \ 637 632 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) 638 633 #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) 634 + #define blk_queue_stable_writes(q) \ 635 + test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) 639 636 #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) 640 637 #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) 641 638 #define 
blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) ··· 1066 1059 static inline unsigned int blk_max_size_offset(struct request_queue *q, 1067 1060 sector_t offset) 1068 1061 { 1069 - if (!q->limits.chunk_sectors) 1062 + unsigned int chunk_sectors = q->limits.chunk_sectors; 1063 + 1064 + if (!chunk_sectors) 1070 1065 return q->limits.max_sectors; 1071 1066 1072 - return min(q->limits.max_sectors, (unsigned int)(q->limits.chunk_sectors - 1073 - (offset & (q->limits.chunk_sectors - 1)))); 1067 + if (likely(is_power_of_2(chunk_sectors))) 1068 + chunk_sectors -= offset & (chunk_sectors - 1); 1069 + else 1070 + chunk_sectors -= sector_div(offset, chunk_sectors); 1071 + 1072 + return min(q->limits.max_sectors, chunk_sectors); 1074 1073 } 1075 1074 1076 1075 static inline unsigned int blk_rq_get_max_sectors(struct request *rq, ··· 1143 1130 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); 1144 1131 extern void blk_queue_alignment_offset(struct request_queue *q, 1145 1132 unsigned int alignment); 1133 + void blk_queue_update_readahead(struct request_queue *q); 1146 1134 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); 1147 1135 extern void blk_queue_io_min(struct request_queue *q, unsigned int min); 1148 1136 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); ··· 1469 1455 1470 1456 if (q->limits.misaligned) 1471 1457 return -1; 1472 - 1473 1458 if (bdev != bdev->bd_contains) 1474 - return bdev->bd_part->alignment_offset; 1475 - 1459 + return queue_limit_alignment_offset(&q->limits, 1460 + bdev->bd_part->start_sect); 1476 1461 return q->limits.alignment_offset; 1477 1462 } 1478 1463 ··· 1511 1498 struct request_queue *q = bdev_get_queue(bdev); 1512 1499 1513 1500 if (bdev != bdev->bd_contains) 1514 - return bdev->bd_part->discard_alignment; 1515 - 1501 + return queue_limit_discard_alignment(&q->limits, 1502 + bdev->bd_part->start_sect); 1516 1503 return 
q->limits.discard_alignment; 1517 1504 } 1518 1505 ··· 1943 1930 void disk_end_io_acct(struct gendisk *disk, unsigned int op, 1944 1931 unsigned long start_time); 1945 1932 1933 + unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, 1934 + struct bio *bio); 1935 + void part_end_io_acct(struct hd_struct *part, struct bio *bio, 1936 + unsigned long start_time); 1937 + 1946 1938 /** 1947 1939 * bio_start_io_acct - start I/O accounting for bio based drivers 1948 1940 * @bio: bio to start account for ··· 1985 1967 #define BLKDEV_MAJOR_MAX 0 1986 1968 #endif 1987 1969 1988 - int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); 1989 1970 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, 1990 1971 void *holder); 1991 1972 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); ··· 2001 1984 2002 1985 #ifdef CONFIG_BLOCK 2003 1986 void invalidate_bdev(struct block_device *bdev); 1987 + int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, 1988 + loff_t lend); 2004 1989 int sync_blockdev(struct block_device *bdev); 2005 1990 #else 2006 1991 static inline void invalidate_bdev(struct block_device *bdev) 2007 1992 { 1993 + } 1994 + static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode, 1995 + loff_t lstart, loff_t lend) 1996 + { 1997 + return 0; 2008 1998 } 2009 1999 static inline int sync_blockdev(struct block_device *bdev) 2010 2000 {

+1 -1

include/linux/fs.h

··· 1385 1385 #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ 1386 1386 #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ 1387 1387 #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ 1388 - #define SB_I_MULTIROOT 0x00000008 /* Multiple roots to the dentry tree */ 1388 + #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ 1389 1389 1390 1390 /* sb->s_iflags to limit user namespace mounts */ 1391 1391 #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */

+7 -8

include/linux/genhd.h

··· 65 65 struct disk_stats __percpu *dkstats; 66 66 struct percpu_ref ref; 67 67 68 - sector_t alignment_offset; 69 - unsigned int discard_alignment; 70 68 struct device __dev; 71 69 struct kobject *holder_dir; 72 70 int policy, partno; ··· 191 193 void *private_data; 192 194 193 195 int flags; 196 + unsigned long state; 197 + #define GD_NEED_PART_SCAN 0 194 198 struct rw_semaphore lookup_sem; 195 199 struct kobject *slave_dir; 196 200 ··· 315 315 extern void disk_block_events(struct gendisk *disk); 316 316 extern void disk_unblock_events(struct gendisk *disk); 317 317 extern void disk_flush_events(struct gendisk *disk, unsigned int mask); 318 - extern void set_capacity_revalidate_and_notify(struct gendisk *disk, 319 - sector_t size, bool revalidate); 320 - extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); 318 + void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, 319 + bool update_bdev); 321 320 322 321 /* drivers/char/random.c */ 323 322 extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; ··· 371 372 int register_blkdev(unsigned int major, const char *name); 372 373 void unregister_blkdev(unsigned int major, const char *name); 373 374 374 - int revalidate_disk(struct gendisk *disk); 375 - int check_disk_change(struct block_device *bdev); 375 + void revalidate_disk_size(struct gendisk *disk, bool verbose); 376 + bool bdev_check_media_change(struct block_device *bdev); 376 377 int __invalidate_device(struct block_device *bdev, bool kill_dirty); 377 - void bd_set_size(struct block_device *bdev, loff_t size); 378 + void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); 378 379 379 380 /* for drivers/char/raw.c: */ 380 381 int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);

-2

include/linux/ide.h

··· 490 490 IDE_DFLAG_NOPROBE = BIT(9), 491 491 /* need to do check_media_change() */ 492 492 IDE_DFLAG_REMOVABLE = BIT(10), 493 - /* needed for removable devices */ 494 - IDE_DFLAG_ATTACH = BIT(11), 495 493 IDE_DFLAG_FORCED_GEOM = BIT(12), 496 494 /* disallow setting unmask bit */ 497 495 IDE_DFLAG_NO_UNMASK = BIT(13),

+2 -2

include/linux/suspend.h

··· 473 473 #endif /* CONFIG_HIBERNATION */ 474 474 475 475 #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV 476 - int is_hibernate_resume_dev(const struct inode *); 476 + int is_hibernate_resume_dev(dev_t dev); 477 477 #else 478 - static inline int is_hibernate_resume_dev(const struct inode *i) { return 0; } 478 + static inline int is_hibernate_resume_dev(dev_t dev) { return 0; } 479 479 #endif 480 480 481 481 /* Hibernation and suspend events */

+2 -1

include/linux/swap.h

··· 467 467 extern void swap_free(swp_entry_t); 468 468 extern void swapcache_free_entries(swp_entry_t *entries, int n); 469 469 extern int free_swap_and_cache(swp_entry_t); 470 - extern int swap_type_of(dev_t, sector_t, struct block_device **); 470 + int swap_type_of(dev_t device, sector_t offset); 471 + int find_first_swap(dev_t *device); 471 472 extern unsigned int count_swap_pages(int, int); 472 473 extern sector_t map_swap_page(struct page *, struct block_device **); 473 474 extern sector_t swapdev_block(int, pgoff_t);

+9 -17

include/trace/events/iocost.h

··· 26 26 __field(u64, vrate) 27 27 __field(u64, last_period) 28 28 __field(u64, cur_period) 29 - __field(u64, last_vtime) 30 29 __field(u64, vtime) 31 30 __field(u32, weight) 32 31 __field(u32, inuse) ··· 41 42 __entry->vrate = now->vrate; 42 43 __entry->last_period = last_period; 43 44 __entry->cur_period = cur_period; 44 - __entry->last_vtime = iocg->last_vtime; 45 45 __entry->vtime = vtime; 46 46 __entry->weight = iocg->weight; 47 47 __entry->inuse = iocg->inuse; ··· 49 51 ), 50 52 51 53 TP_printk("[%s:%s] now=%llu:%llu vrate=%llu " 52 - "period=%llu->%llu vtime=%llu->%llu " 54 + "period=%llu->%llu vtime=%llu " 53 55 "weight=%u/%u hweight=%llu/%llu", 54 56 __get_str(devname), __get_str(cgroup), 55 57 __entry->now, __entry->vnow, __entry->vrate, 56 58 __entry->last_period, __entry->cur_period, 57 - __entry->last_vtime, __entry->vtime, 58 - __entry->inuse, __entry->weight, 59 + __entry->vtime, __entry->inuse, __entry->weight, 59 60 __entry->hweight_inuse, __entry->hweight_active 60 61 ) 61 62 ); ··· 95 98 ) 96 99 ); 97 100 98 - DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, 101 + DEFINE_EVENT(iocg_inuse_update, iocost_inuse_shortage, 99 102 100 103 TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, 101 104 u32 old_inuse, u32 new_inuse, ··· 105 108 old_hw_inuse, new_hw_inuse) 106 109 ); 107 110 108 - DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway, 111 + DEFINE_EVENT(iocg_inuse_update, iocost_inuse_transfer, 109 112 110 113 TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, 111 114 u32 old_inuse, u32 new_inuse, ··· 115 118 old_hw_inuse, new_hw_inuse) 116 119 ); 117 120 118 - DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, 121 + DEFINE_EVENT(iocg_inuse_update, iocost_inuse_adjust, 119 122 120 123 TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, 121 124 u32 old_inuse, u32 new_inuse, ··· 128 131 TRACE_EVENT(iocost_ioc_vrate_adj, 129 132 130 133 TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 
*missed_ppm, 131 - u32 rq_wait_pct, int nr_lagging, int nr_shortages, 132 - int nr_surpluses), 134 + u32 rq_wait_pct, int nr_lagging, int nr_shortages), 133 135 134 - TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages, 135 - nr_surpluses), 136 + TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages), 136 137 137 138 TP_STRUCT__entry ( 138 139 __string(devname, ioc_name(ioc)) ··· 142 147 __field(u32, rq_wait_pct) 143 148 __field(int, nr_lagging) 144 149 __field(int, nr_shortages) 145 - __field(int, nr_surpluses) 146 150 ), 147 151 148 152 TP_fast_assign( ··· 154 160 __entry->rq_wait_pct = rq_wait_pct; 155 161 __entry->nr_lagging = nr_lagging; 156 162 __entry->nr_shortages = nr_shortages; 157 - __entry->nr_surpluses = nr_surpluses; 158 163 ), 159 164 160 - TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d", 165 + TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d", 161 166 __get_str(devname), __entry->old_vrate, __entry->new_vrate, 162 167 __entry->busy_level, 163 168 __entry->read_missed_ppm, __entry->write_missed_ppm, 164 - __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages, 165 - __entry->nr_surpluses 169 + __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages 166 170 ) 167 171 ); 168 172

+2

include/uapi/linux/capability.h

··· 288 288 processes and setting the scheduling algorithm used by another 289 289 process. */ 290 290 /* Allow setting cpu affinity on other processes */ 291 + /* Allow setting realtime ioprio class */ 292 + /* Allow setting ioprio class on other processes */ 291 293 292 294 #define CAP_SYS_NICE 23 293 295

+9 -12

kernel/power/swap.c

··· 335 335 { 336 336 int res; 337 337 338 - res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 339 - &hib_resume_bdev); 338 + if (swsusp_resume_device) 339 + res = swap_type_of(swsusp_resume_device, swsusp_resume_block); 340 + else 341 + res = find_first_swap(&swsusp_resume_device); 340 342 if (res < 0) 341 343 return res; 342 - 343 344 root_swap = res; 344 - res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); 345 - if (res) 346 - return res; 345 + 346 + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, 347 + NULL); 348 + if (IS_ERR(hib_resume_bdev)) 349 + return PTR_ERR(hib_resume_bdev); 347 350 348 351 res = set_blocksize(hib_resume_bdev, PAGE_SIZE); 349 352 if (res < 0) 350 353 blkdev_put(hib_resume_bdev, FMODE_WRITE); 351 354 352 - /* 353 - * Update the resume device to the one actually used, 354 - * so the test_resume mode can use it in case it is 355 - * invoked from hibernate() to test the snapshot. 356 - */ 357 - swsusp_resume_device = hib_resume_bdev->bd_dev; 358 355 return res; 359 356 } 360 357

+9 -17

kernel/power/user.c

··· 35 35 bool ready; 36 36 bool platform_support; 37 37 bool free_bitmaps; 38 - struct inode *bd_inode; 38 + dev_t dev; 39 39 } snapshot_state; 40 40 41 - int is_hibernate_resume_dev(const struct inode *bd_inode) 41 + int is_hibernate_resume_dev(dev_t dev) 42 42 { 43 - return hibernation_available() && snapshot_state.bd_inode == bd_inode; 43 + return hibernation_available() && snapshot_state.dev == dev; 44 44 } 45 45 46 46 static int snapshot_open(struct inode *inode, struct file *filp) ··· 69 69 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 70 70 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 71 71 /* Hibernating. The image device should be accessible. */ 72 - data->swap = swsusp_resume_device ? 73 - swap_type_of(swsusp_resume_device, 0, NULL) : -1; 72 + data->swap = swap_type_of(swsusp_resume_device, 0); 74 73 data->mode = O_RDONLY; 75 74 data->free_bitmaps = false; 76 75 error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); ··· 100 101 data->frozen = false; 101 102 data->ready = false; 102 103 data->platform_support = false; 103 - data->bd_inode = NULL; 104 + data->dev = 0; 104 105 105 106 Unlock: 106 107 unlock_system_sleep(); ··· 116 117 117 118 swsusp_free(); 118 119 data = filp->private_data; 119 - data->bd_inode = NULL; 120 + data->dev = 0; 120 121 free_all_swap_pages(data->swap); 121 122 if (data->frozen) { 122 123 pm_restore_gfp_mask(); ··· 209 210 static int snapshot_set_swap_area(struct snapshot_data *data, 210 211 void __user *argp) 211 212 { 212 - struct block_device *bdev; 213 213 sector_t offset; 214 214 dev_t swdev; 215 215 ··· 235 237 * User space encodes device types as two-byte values, 236 238 * so we need to recode them 237 239 */ 238 - if (!swdev) { 239 - data->swap = -1; 240 - return -EINVAL; 241 - } 242 - data->swap = swap_type_of(swdev, offset, &bdev); 240 + data->swap = swap_type_of(swdev, offset); 243 241 if (data->swap < 0) 244 - return -ENODEV; 245 - 246 - data->bd_inode = bdev->bd_inode; 247 - 
bdput(bdev); 242 + return swdev ? -ENODEV : -EINVAL; 243 + data->dev = swdev; 248 244 return 0; 249 245 } 250 246

+1 -1

kernel/trace/blktrace.c

··· 793 793 return cgroup_id(bio_blkcg(bio)->css.cgroup); 794 794 } 795 795 #else 796 - u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) 796 + static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) 797 797 { 798 798 return 0; 799 799 }

+7 -7

mm/backing-dev.c

··· 14 14 #include <linux/device.h> 15 15 #include <trace/events/writeback.h> 16 16 17 - struct backing_dev_info noop_backing_dev_info = { 18 - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 19 - }; 17 + struct backing_dev_info noop_backing_dev_info; 20 18 EXPORT_SYMBOL_GPL(noop_backing_dev_info); 21 19 22 20 static struct class *bdi_class; ··· 202 204 struct device_attribute *attr, 203 205 char *page) 204 206 { 205 - struct backing_dev_info *bdi = dev_get_drvdata(dev); 206 - 207 - return snprintf(page, PAGE_SIZE-1, "%d\n", 208 - bdi_cap_stable_pages_required(bdi) ? 1 : 0); 207 + dev_warn_once(dev, 208 + "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); 209 + return snprintf(page, PAGE_SIZE-1, "%d\n", 0); 209 210 } 210 211 static DEVICE_ATTR_RO(stable_pages_required); 211 212 ··· 743 746 kfree(bdi); 744 747 return NULL; 745 748 } 749 + bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; 750 + bdi->ra_pages = VM_READAHEAD_PAGES; 751 + bdi->io_pages = VM_READAHEAD_PAGES; 746 752 return bdi; 747 753 } 748 754 EXPORT_SYMBOL(bdi_alloc);

+2 -2

mm/filemap.c

··· 414 414 .range_end = end, 415 415 }; 416 416 417 - if (!mapping_cap_writeback_dirty(mapping) || 417 + if (!mapping_can_writeback(mapping) || 418 418 !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 419 419 return 0; 420 420 ··· 1800 1800 no_page: 1801 1801 if (!page && (fgp_flags & FGP_CREAT)) { 1802 1802 int err; 1803 - if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) 1803 + if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) 1804 1804 gfp_mask |= __GFP_WRITE; 1805 1805 if (fgp_flags & FGP_NOFS) 1806 1806 gfp_mask &= ~__GFP_FS;

+1 -1

mm/memcontrol.c

··· 5643 5643 if (PageDirty(page)) { 5644 5644 struct address_space *mapping = page_mapping(page); 5645 5645 5646 - if (mapping_cap_account_dirty(mapping)) { 5646 + if (mapping_can_writeback(mapping)) { 5647 5647 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 5648 5648 -nr_pages); 5649 5649 __mod_lruvec_state(to_vec, NR_FILE_DIRTY,

+1 -1

mm/memory-failure.c

··· 1006 1006 */ 1007 1007 mapping = page_mapping(hpage); 1008 1008 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && 1009 - mapping_cap_writeback_dirty(mapping)) { 1009 + mapping_can_writeback(mapping)) { 1010 1010 if (page_mkclean(hpage)) { 1011 1011 SetPageDirty(hpage); 1012 1012 } else {

+1 -1

mm/migrate.c

··· 503 503 __dec_lruvec_state(old_lruvec, NR_SHMEM); 504 504 __inc_lruvec_state(new_lruvec, NR_SHMEM); 505 505 } 506 - if (dirty && mapping_cap_account_dirty(mapping)) { 506 + if (dirty && mapping_can_writeback(mapping)) { 507 507 __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); 508 508 __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); 509 509 __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);

+1 -1

mm/mmap.c

··· 1666 1666 1667 1667 /* Can the mapping track the dirty pages? */ 1668 1668 return vma->vm_file && vma->vm_file->f_mapping && 1669 - mapping_cap_account_dirty(vma->vm_file->f_mapping); 1669 + mapping_can_writeback(vma->vm_file->f_mapping); 1670 1670 } 1671 1671 1672 1672 /*

+9 -9

mm/page-writeback.c

··· 1882 1882 int ratelimit; 1883 1883 int *p; 1884 1884 1885 - if (!bdi_cap_account_dirty(bdi)) 1885 + if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) 1886 1886 return; 1887 1887 1888 1888 if (inode_cgwb_enabled(inode)) ··· 2423 2423 2424 2424 trace_writeback_dirty_page(page, mapping); 2425 2425 2426 - if (mapping_cap_account_dirty(mapping)) { 2426 + if (mapping_can_writeback(mapping)) { 2427 2427 struct bdi_writeback *wb; 2428 2428 2429 2429 inode_attach_wb(inode, page); ··· 2450 2450 void account_page_cleaned(struct page *page, struct address_space *mapping, 2451 2451 struct bdi_writeback *wb) 2452 2452 { 2453 - if (mapping_cap_account_dirty(mapping)) { 2453 + if (mapping_can_writeback(mapping)) { 2454 2454 dec_lruvec_page_state(page, NR_FILE_DIRTY); 2455 2455 dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); 2456 2456 dec_wb_stat(wb, WB_RECLAIMABLE); ··· 2513 2513 { 2514 2514 struct address_space *mapping = page->mapping; 2515 2515 2516 - if (mapping && mapping_cap_account_dirty(mapping)) { 2516 + if (mapping && mapping_can_writeback(mapping)) { 2517 2517 struct inode *inode = mapping->host; 2518 2518 struct bdi_writeback *wb; 2519 2519 struct wb_lock_cookie cookie = {}; ··· 2625 2625 { 2626 2626 struct address_space *mapping = page_mapping(page); 2627 2627 2628 - if (mapping_cap_account_dirty(mapping)) { 2628 + if (mapping_can_writeback(mapping)) { 2629 2629 struct inode *inode = mapping->host; 2630 2630 struct bdi_writeback *wb; 2631 2631 struct wb_lock_cookie cookie = {}; ··· 2665 2665 2666 2666 VM_BUG_ON_PAGE(!PageLocked(page), page); 2667 2667 2668 - if (mapping && mapping_cap_account_dirty(mapping)) { 2668 + if (mapping && mapping_can_writeback(mapping)) { 2669 2669 struct inode *inode = mapping->host; 2670 2670 struct bdi_writeback *wb; 2671 2671 struct wb_lock_cookie cookie = {}; ··· 2738 2738 if (ret) { 2739 2739 __xa_clear_mark(&mapping->i_pages, page_index(page), 2740 2740 PAGECACHE_TAG_WRITEBACK); 2741 - if (bdi_cap_account_writeback(bdi)) { 2741 + if 
(bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { 2742 2742 struct bdi_writeback *wb = inode_to_wb(inode); 2743 2743 2744 2744 dec_wb_stat(wb, WB_WRITEBACK); ··· 2791 2791 PAGECACHE_TAG_WRITEBACK); 2792 2792 2793 2793 xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); 2794 - if (bdi_cap_account_writeback(bdi)) 2794 + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) 2795 2795 inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); 2796 2796 2797 2797 /* ··· 2849 2849 */ 2850 2850 void wait_for_stable_page(struct page *page) 2851 2851 { 2852 - if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) 2852 + if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) 2853 2853 wait_on_page_writeback(page); 2854 2854 } 2855 2855 EXPORT_SYMBOL_GPL(wait_for_stable_page);

+10 -8

mm/page_io.c

··· 403 403 goto out; 404 404 } 405 405 406 - ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); 407 - if (!ret) { 408 - if (trylock_page(page)) { 409 - swap_slot_free_notify(page); 410 - unlock_page(page); 411 - } 406 + if (sis->flags & SWP_SYNCHRONOUS_IO) { 407 + ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); 408 + if (!ret) { 409 + if (trylock_page(page)) { 410 + swap_slot_free_notify(page); 411 + unlock_page(page); 412 + } 412 413 413 - count_vm_event(PSWPIN); 414 - goto out; 414 + count_vm_event(PSWPIN); 415 + goto out; 416 + } 415 417 } 416 418 417 419 ret = 0;

+26 -23

mm/swapfile.c

··· 1801 1801 * 1802 1802 * This is needed for the suspend to disk (aka swsusp). 1803 1803 */ 1804 - int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 1804 + int swap_type_of(dev_t device, sector_t offset) 1805 1805 { 1806 - struct block_device *bdev = NULL; 1807 1806 int type; 1808 1807 1809 - if (device) 1810 - bdev = bdget(device); 1808 + if (!device) 1809 + return -1; 1811 1810 1812 1811 spin_lock(&swap_lock); 1813 1812 for (type = 0; type < nr_swapfiles; type++) { ··· 1815 1816 if (!(sis->flags & SWP_WRITEOK)) 1816 1817 continue; 1817 1818 1818 - if (!bdev) { 1819 - if (bdev_p) 1820 - *bdev_p = bdgrab(sis->bdev); 1821 - 1822 - spin_unlock(&swap_lock); 1823 - return type; 1824 - } 1825 - if (bdev == sis->bdev) { 1819 + if (device == sis->bdev->bd_dev) { 1826 1820 struct swap_extent *se = first_se(sis); 1827 1821 1828 1822 if (se->start_block == offset) { 1829 - if (bdev_p) 1830 - *bdev_p = bdgrab(sis->bdev); 1831 - 1832 1823 spin_unlock(&swap_lock); 1833 - bdput(bdev); 1834 1824 return type; 1835 1825 } 1836 1826 } 1837 1827 } 1838 1828 spin_unlock(&swap_lock); 1839 - if (bdev) 1840 - bdput(bdev); 1829 + return -ENODEV; 1830 + } 1841 1831 1832 + int find_first_swap(dev_t *device) 1833 + { 1834 + int type; 1835 + 1836 + spin_lock(&swap_lock); 1837 + for (type = 0; type < nr_swapfiles; type++) { 1838 + struct swap_info_struct *sis = swap_info[type]; 1839 + 1840 + if (!(sis->flags & SWP_WRITEOK)) 1841 + continue; 1842 + *device = sis->bdev->bd_dev; 1843 + spin_unlock(&swap_lock); 1844 + return type; 1845 + } 1846 + spin_unlock(&swap_lock); 1842 1847 return -ENODEV; 1843 1848 } 1844 1849 ··· 2923 2920 int error; 2924 2921 2925 2922 if (S_ISBLK(inode->i_mode)) { 2926 - p->bdev = bdgrab(I_BDEV(inode)); 2927 - error = blkdev_get(p->bdev, 2923 + p->bdev = blkdev_get_by_dev(inode->i_rdev, 2928 2924 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); 2929 - if (error < 0) { 2925 + if (IS_ERR(p->bdev)) { 2926 + error = PTR_ERR(p->bdev); 2930 2927 
p->bdev = NULL; 2931 2928 return error; 2932 2929 } ··· 3237 3234 goto bad_swap_unlock_inode; 3238 3235 } 3239 3236 3240 - if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) 3237 + if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) 3241 3238 p->flags |= SWP_STABLE_WRITES; 3242 3239 3243 - if (bdi_cap_synchronous_io(inode_to_bdi(inode))) 3240 + if (p->bdev && p->bdev->bd_disk->fops->rw_page) 3244 3241 p->flags |= SWP_SYNCHRONOUS_IO; 3245 3242 3246 3243 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {

+19 -35

tools/cgroup/iocost_monitor.py

··· 45 45 err('The kernel does not have iocost enabled') 46 46 47 47 IOC_RUNNING = prog['IOC_RUNNING'].value_() 48 - NR_USAGE_SLOTS = prog['NR_USAGE_SLOTS'].value_() 49 - HWEIGHT_WHOLE = prog['HWEIGHT_WHOLE'].value_() 48 + WEIGHT_ONE = prog['WEIGHT_ONE'].value_() 50 49 VTIME_PER_SEC = prog['VTIME_PER_SEC'].value_() 51 50 VTIME_PER_USEC = prog['VTIME_PER_USEC'].value_() 52 51 AUTOP_SSD_FAST = prog['AUTOP_SSD_FAST'].value_() ··· 99 100 self.period_ms = ioc.period_us.value_() / 1_000 100 101 self.period_at = ioc.period_at.value_() / 1_000_000 101 102 self.vperiod_at = ioc.period_at_vtime.value_() / VTIME_PER_SEC 102 - self.vrate_pct = ioc.vtime_rate.counter.value_() * 100 / VTIME_PER_USEC 103 + self.vrate_pct = ioc.vtime_base_rate.value_() * 100 / VTIME_PER_USEC 103 104 self.busy_level = ioc.busy_level.value_() 104 105 self.autop_idx = ioc.autop_idx.value_() 105 106 self.user_cost_model = ioc.user_cost_model.value_() ··· 135 136 136 137 def table_header_str(self): 137 138 return f'{"":25} active {"weight":>9} {"hweight%":>13} {"inflt%":>6} ' \ 138 - f'{"dbt":>3} {"delay":>6} {"usages%"}' 139 + f'{"debt":>7} {"delay":>7} {"usage%"}' 139 140 140 141 class IocgStat: 141 142 def __init__(self, iocg): ··· 143 144 blkg = iocg.pd.blkg 144 145 145 146 self.is_active = not list_empty(iocg.active_list.address_of_()) 146 - self.weight = iocg.weight.value_() 147 - self.active = iocg.active.value_() 148 - self.inuse = iocg.inuse.value_() 149 - self.hwa_pct = iocg.hweight_active.value_() * 100 / HWEIGHT_WHOLE 150 - self.hwi_pct = iocg.hweight_inuse.value_() * 100 / HWEIGHT_WHOLE 147 + self.weight = iocg.weight.value_() / WEIGHT_ONE 148 + self.active = iocg.active.value_() / WEIGHT_ONE 149 + self.inuse = iocg.inuse.value_() / WEIGHT_ONE 150 + self.hwa_pct = iocg.hweight_active.value_() * 100 / WEIGHT_ONE 151 + self.hwi_pct = iocg.hweight_inuse.value_() * 100 / WEIGHT_ONE 151 152 self.address = iocg.value_() 152 153 153 154 vdone = iocg.done_vtime.counter.value_() ··· 159 160 else: 
160 161 self.inflight_pct = 0 161 162 162 - # vdebt used to be an atomic64_t and is now u64, support both 163 - try: 164 - self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 165 - except: 166 - self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 167 - 168 - self.use_delay = blkg.use_delay.counter.value_() 169 - self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 170 - 171 - usage_idx = iocg.usage_idx.value_() 172 - self.usages = [] 173 - self.usage = 0 174 - for i in range(NR_USAGE_SLOTS): 175 - usage = iocg.usages[(usage_idx + 1 + i) % NR_USAGE_SLOTS].value_() 176 - upct = usage * 100 / HWEIGHT_WHOLE 177 - self.usages.append(upct) 178 - self.usage = max(self.usage, upct) 163 + self.usage = (100 * iocg.usage_delta_us.value_() / 164 + ioc.period_us.value_()) if self.active else 0 165 + self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 166 + if blkg.use_delay.counter.value_() != 0: 167 + self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 168 + else: 169 + self.delay_ms = 0 179 170 180 171 def dict(self, now, path): 181 172 out = { 'cgroup' : path, ··· 178 189 'hweight_inuse_pct' : self.hwi_pct, 179 190 'inflight_pct' : self.inflight_pct, 180 191 'debt_ms' : self.debt_ms, 181 - 'use_delay' : self.use_delay, 182 192 'delay_ms' : self.delay_ms, 183 193 'usage_pct' : self.usage, 184 194 'address' : self.address } 185 - for i in range(len(self.usages)): 186 - out[f'usage_pct_{i}'] = str(self.usages[i]) 187 195 return out 188 196 189 197 def table_row_str(self, path): 190 198 out = f'{path[-28:]:28} ' \ 191 199 f'{"*" if self.is_active else " "} ' \ 192 - f'{self.inuse:5}/{self.active:5} ' \ 200 + f'{round(self.inuse):5}/{round(self.active):5} ' \ 193 201 f'{self.hwi_pct:6.2f}/{self.hwa_pct:6.2f} ' \ 194 202 f'{self.inflight_pct:6.2f} ' \ 195 - f'{min(math.ceil(self.debt_ms), 999):3} ' \ 196 - f'{min(self.use_delay, 99):2}*'\ 197 - f'{min(math.ceil(self.delay_ms), 999):03} ' 198 - for u in self.usages: 199 - 
out += f'{min(round(u), 999):03d}:' 203 + f'{self.debt_ms:7.2f} ' \ 204 + f'{self.delay_ms:7.2f} '\ 205 + f'{min(self.usage, 999):6.2f}' 200 206 out = out.rstrip(':') 201 207 return out 202 208