Merge branch 'for-linus' into test · tjh.dev/kernel@3c15f3f

+3

block/blk-core.c

··· 145 145 [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, 146 146 [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, 147 147 [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, 148 + [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, 148 149 [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, 149 150 150 151 /* device mapper special case, should not leak out: */ ··· 3283 3282 { 3284 3283 if (bio_has_data(bio)) 3285 3284 rq->nr_phys_segments = bio_phys_segments(q, bio); 3285 + else if (bio_op(bio) == REQ_OP_DISCARD) 3286 + rq->nr_phys_segments = 1; 3286 3287 3287 3288 rq->__data_len = bio->bi_iter.bi_size; 3288 3289 rq->bio = rq->biotail = bio;

+26 -3

block/blk-merge.c

··· 550 550 return !q->mq_ops && req->special; 551 551 } 552 552 553 + static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, 554 + struct request *next) 555 + { 556 + unsigned short segments = blk_rq_nr_discard_segments(req); 557 + 558 + if (segments >= queue_max_discard_segments(q)) 559 + goto no_merge; 560 + if (blk_rq_sectors(req) + bio_sectors(next->bio) > 561 + blk_rq_get_max_sectors(req, blk_rq_pos(req))) 562 + goto no_merge; 563 + 564 + req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next); 565 + return true; 566 + no_merge: 567 + req_set_nomerge(q, req); 568 + return false; 569 + } 570 + 553 571 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 554 572 struct request *next) 555 573 { ··· 701 683 * If we are allowed to merge, then append bio list 702 684 * from next to rq and release next. merge_requests_fn 703 685 * will have updated segment counts, update sector 704 - * counts here. 686 + * counts here. Handle DISCARDs separately, as they 687 + * have separate settings. 705 688 */ 706 - if (!ll_merge_requests_fn(q, req, next)) 689 + if (req_op(req) == REQ_OP_DISCARD) { 690 + if (!req_attempt_discard_merge(q, req, next)) 691 + return NULL; 692 + } else if (!ll_merge_requests_fn(q, req, next)) 707 693 return NULL; 708 694 709 695 /* ··· 737 715 738 716 req->__data_len += blk_rq_bytes(next); 739 717 740 - elv_merge_requests(q, req, next); 718 + if (req_op(req) != REQ_OP_DISCARD) 719 + elv_merge_requests(q, req, next); 741 720 742 721 /* 743 722 * 'next' is going away, so update stats accordingly

+2

block/blk-mq-sched.c

··· 259 259 if (!*merged_request) 260 260 elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); 261 261 return true; 262 + case ELEVATOR_DISCARD_MERGE: 263 + return bio_attempt_discard_merge(q, rq, bio); 262 264 default: 263 265 return false; 264 266 }

+16 -4

block/blk-mq.c

··· 1162 1162 return true; 1163 1163 } 1164 1164 1165 + #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ 1166 + 1165 1167 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, 1166 1168 bool got_budget) 1167 1169 { ··· 1171 1169 struct request *rq, *nxt; 1172 1170 bool no_tag = false; 1173 1171 int errors, queued; 1172 + blk_status_t ret = BLK_STS_OK; 1174 1173 1175 1174 if (list_empty(list)) 1176 1175 return false; ··· 1184 1181 errors = queued = 0; 1185 1182 do { 1186 1183 struct blk_mq_queue_data bd; 1187 - blk_status_t ret; 1188 1184 1189 1185 rq = list_first_entry(list, struct request, queuelist); 1190 1186 if (!blk_mq_get_driver_tag(rq, &hctx, false)) { ··· 1228 1226 } 1229 1227 1230 1228 ret = q->mq_ops->queue_rq(hctx, &bd); 1231 - if (ret == BLK_STS_RESOURCE) { 1229 + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { 1232 1230 /* 1233 1231 * If an I/O scheduler has been configured and we got a 1234 1232 * driver tag for the next request already, free it ··· 1259 1257 * that is where we will continue on next queue run. 1260 1258 */ 1261 1259 if (!list_empty(list)) { 1260 + bool needs_restart; 1261 + 1262 1262 spin_lock(&hctx->lock); 1263 1263 list_splice_init(list, &hctx->dispatch); 1264 1264 spin_unlock(&hctx->lock); ··· 1284 1280 * - Some but not all block drivers stop a queue before 1285 1281 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq 1286 1282 * and dm-rq. 1283 + * 1284 + * If driver returns BLK_STS_RESOURCE and SCHED_RESTART 1285 + * bit is set, run queue after a delay to avoid IO stalls 1286 + * that could otherwise occur if the queue is idle. 
1287 1287 */ 1288 - if (!blk_mq_sched_needs_restart(hctx) || 1288 + needs_restart = blk_mq_sched_needs_restart(hctx); 1289 + if (!needs_restart || 1289 1290 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) 1290 1291 blk_mq_run_hw_queue(hctx, true); 1292 + else if (needs_restart && (ret == BLK_STS_RESOURCE)) 1293 + blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); 1291 1294 } 1292 1295 1293 1296 return (queued + errors) != 0; ··· 1775 1764 *cookie = new_cookie; 1776 1765 break; 1777 1766 case BLK_STS_RESOURCE: 1767 + case BLK_STS_DEV_RESOURCE: 1778 1768 __blk_mq_requeue_request(rq); 1779 1769 break; 1780 1770 default: ··· 1838 1826 hctx_lock(hctx, &srcu_idx); 1839 1827 1840 1828 ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); 1841 - if (ret == BLK_STS_RESOURCE) 1829 + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) 1842 1830 blk_mq_sched_insert_request(rq, false, true, false); 1843 1831 else if (ret != BLK_STS_OK) 1844 1832 blk_mq_end_request(rq, ret);

+1 -1

drivers/block/null_blk.c

··· 1230 1230 return BLK_STS_OK; 1231 1231 } else 1232 1232 /* requeue request */ 1233 - return BLK_STS_RESOURCE; 1233 + return BLK_STS_DEV_RESOURCE; 1234 1234 } 1235 1235 } 1236 1236

+1 -1

drivers/block/virtio_blk.c

··· 276 276 /* Out of mem doesn't actually happen, since we fall back 277 277 * to direct descriptors */ 278 278 if (err == -ENOMEM || err == -ENOSPC) 279 - return BLK_STS_RESOURCE; 279 + return BLK_STS_DEV_RESOURCE; 280 280 return BLK_STS_IOERR; 281 281 } 282 282

+1 -1

drivers/block/xen-blkfront.c

··· 911 911 out_busy: 912 912 blk_mq_stop_hw_queue(hctx); 913 913 spin_unlock_irqrestore(&rinfo->ring_lock, flags); 914 - return BLK_STS_RESOURCE; 914 + return BLK_STS_DEV_RESOURCE; 915 915 } 916 916 917 917 static void blkif_complete_rq(struct request *rq)

+2 -3

drivers/md/dm-rq.c

··· 408 408 409 409 clone->start_time = jiffies; 410 410 r = blk_insert_cloned_request(clone->q, clone); 411 - if (r != BLK_STS_OK && r != BLK_STS_RESOURCE) 411 + if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE) 412 412 /* must complete clone in terms of original request */ 413 413 dm_complete_request(rq, r); 414 414 return r; ··· 500 500 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 501 501 blk_rq_pos(rq)); 502 502 ret = dm_dispatch_clone_request(clone, rq); 503 - if (ret == BLK_STS_RESOURCE) { 503 + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { 504 504 blk_rq_unprep_clone(clone); 505 505 tio->ti->type->release_clone_rq(clone); 506 506 tio->clone = NULL; ··· 772 772 /* Undo dm_start_request() before requeuing */ 773 773 rq_end_stats(md, rq); 774 774 rq_completed(md, rq_data_dir(rq), false); 775 - blk_mq_delay_run_hw_queue(hctx, 100/*ms*/); 776 775 return BLK_STS_RESOURCE; 777 776 } 778 777

+2 -10

drivers/nvme/host/fc.c

··· 35 35 NVME_FC_Q_LIVE, 36 36 }; 37 37 38 - #define NVMEFC_QUEUE_DELAY 3 /* ms units */ 39 - 40 38 #define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ 41 39 42 40 struct nvme_fc_queue { ··· 2229 2231 * the target device is present 2230 2232 */ 2231 2233 if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) 2232 - goto busy; 2234 + return BLK_STS_RESOURCE; 2233 2235 2234 2236 if (!nvme_fc_ctrl_get(ctrl)) 2235 2237 return BLK_STS_IOERR; ··· 2309 2311 ret != -EBUSY) 2310 2312 return BLK_STS_IOERR; 2311 2313 2312 - goto busy; 2314 + return BLK_STS_RESOURCE; 2313 2315 } 2314 2316 2315 2317 return BLK_STS_OK; 2316 - 2317 - busy: 2318 - if (!(op->flags & FCOP_FLAGS_AEN) && queue->hctx) 2319 - blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY); 2320 - 2321 - return BLK_STS_RESOURCE; 2322 2318 } 2323 2319 2324 2320 static inline blk_status_t nvme_fc_is_ready(struct nvme_fc_queue *queue,

+3 -3

drivers/scsi/scsi_lib.c

··· 2046 2046 case BLK_STS_OK: 2047 2047 break; 2048 2048 case BLK_STS_RESOURCE: 2049 - if (atomic_read(&sdev->device_busy) == 0 && 2050 - !scsi_device_blocked(sdev)) 2051 - blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); 2049 + if (atomic_read(&sdev->device_busy) || 2050 + scsi_device_blocked(sdev)) 2051 + ret = BLK_STS_DEV_RESOURCE; 2052 2052 break; 2053 2053 default: 2054 2054 /*

+18

include/linux/blk_types.h

··· 39 39 40 40 #define BLK_STS_AGAIN ((__force blk_status_t)12) 41 41 42 + /* 43 + * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if 44 + * device related resources are unavailable, but the driver can guarantee 45 + * that the queue will be rerun in the future once resources become 46 + * available again. This is typically the case for device specific 47 + * resources that are consumed for IO. If the driver fails allocating these 48 + * resources, we know that inflight (or pending) IO will free these 49 + * resources upon completion. 50 + * 51 + * This is different from BLK_STS_RESOURCE in that it explicitly references 52 + * a device specific resource. For resources of wider scope, allocation 53 + * failure can happen without having pending IO. This means that we can't 54 + * rely on request completions freeing these resources, as IO may not be in 55 + * flight. Examples of that are kernel memory allocations, DMA mappings, or 56 + * any other system wide resources. 57 + */ 58 + #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) 59 + 42 60 /** 43 61 * blk_path_error - returns true if error may be path related 44 62 * @error: status the request was completed with

Configure Feed

Configure Feed