Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'for-3.10/core' of git://git.kernel.dk/linux-block

Pull block core updates from Jens Axboe:

- Major bit is Kent's prep work for immutable bio vecs.

- Stable candidate fix for a scheduling-while-atomic in the queue
bypass operation.

- Fix for a hang when merging discard bios pushes rq->datalen past the
32-bit unsigned limit.

- Tejun's changes to convert the writeback thread pool to the generic
workqueue mechanism.

- Runtime PM framework; SCSI patches exist on top of these in James'
tree.

- A few random fixes.

* 'for-3.10/core' of git://git.kernel.dk/linux-block: (40 commits)
relay: move remove_buf_file inside relay_close_buf
partitions/efi.c: replace useless kzalloc's by kmalloc's
fs/block_dev.c: fix iov_shorten() criteria in blkdev_aio_read()
block: fix max discard sectors limit
blkcg: fix "scheduling while atomic" in blk_queue_bypass_start
Documentation: cfq-iosched: update documentation help for cfq tunables
writeback: expose the bdi_wq workqueue
writeback: replace custom worker pool implementation with unbound workqueue
writeback: remove unused bdi_pending_list
aoe: Fix unitialized var usage
bio-integrity: Add explicit field for owner of bip_buf
block: Add an explicit bio flag for bios that own their bvec
block: Add bio_alloc_pages()
block: Convert some code to bio_for_each_segment_all()
block: Add bio_for_each_segment_all()
bounce: Refactor __blk_queue_bounce to not use bi_io_vec
raid1: use bio_copy_data()
pktcdvd: Use bio_reset() in disabled code to kill bi_idx usage
pktcdvd: use bio_copy_data()
block: Add bio_copy_data()
...

+999 -955
+44 -3
Documentation/block/cfq-iosched.txt
···
 I/O bandwidth for all the processes which requests an I/O operation.

 CFQ maintains the per process queue for the processes which request I/O
-operation(syncronous requests). In case of asynchronous requests, all the
+operation(synchronous requests). In case of asynchronous requests, all the
 requests from all the processes are batched together according to their
 process's I/O priority.
···
 value of this is 124ms. In case to favor synchronous requests over asynchronous
 one, this value should be decreased relative to fifo_expire_async.

+group_idle
+-----------
+This parameter forces idling at the CFQ group level instead of CFQ
+queue level. This was introduced after a bottleneck was observed
+in higher end storage due to idle on sequential queue and allow dispatch
+from a single queue. The idea with this parameter is that it can be run with
+slice_idle=0 and group_idle=8, so that idling does not happen on individual
+queues in the group but happens overall on the group and thus still keeps the
+IO controller working.
+Not idling on individual queues in the group will dispatch requests from
+multiple queues in the group at the same time and achieve higher throughput
+on higher end storage.
+
+Default value for this parameter is 8ms.
+
+latency
+-------
+This parameter is used to enable/disable the latency mode of the CFQ
+scheduler. If latency mode (called low_latency) is enabled, CFQ tries
+to recompute the slice time for each process based on the target_latency set
+for the system. This favors fairness over throughput. Disabling low
+latency (setting it to 0) ignores target latency, allowing each process in the
+system to get a full time slice.
+
+By default low latency mode is enabled.
+
+target_latency
+--------------
+This parameter is used to calculate the time slice for a process if cfq's
+latency mode is enabled. It will ensure that sync requests have an estimated
+latency. But if sequential workload is higher (e.g. sequential read),
+then to meet the latency constraints, throughput may decrease because of less
+time for each process to issue I/O request before the cfq queue is switched.
+
+Though this can be overcome by disabling the latency mode, it may increase
+the read latency for some applications. This parameter allows for changing
+target_latency through the sysfs interface which can provide the balanced
+throughput and read latency.
+
+Default value for target_latency is 300ms.
+
 slice_async
 -----------
 This parameter is same as of slice_sync but for asynchronous queue. The
···
 request.

 In case of storage with several disk, this setting can limit the parallel
-processing of request. Therefore, increasing the value can imporve the
-performace although this can cause the latency of some I/O to increase due
+processing of request. Therefore, increasing the value can improve the
+performance although this can cause the latency of some I/O to increase due
 to more number of requests.

 CFQ Group scheduling
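The tunables documented above are exposed per-device through sysfs. A minimal sketch of inspecting and adjusting them, assuming a hypothetical device `sda` that is actually using the cfq scheduler (device name and values are illustrative, not from the patch):

```shell
# Inspect the current CFQ tunables for the device
grep . /sys/block/sda/queue/iosched/group_idle \
       /sys/block/sda/queue/iosched/low_latency \
       /sys/block/sda/queue/iosched/target_latency

# Favor group-level idling as described above: disable per-queue idling
# and keep the 8ms group idle window
echo 0 > /sys/block/sda/queue/iosched/slice_idle
echo 8 > /sys/block/sda/queue/iosched/group_idle

# Trade latency for throughput: turn off low_latency so each process
# gets a full time slice regardless of target_latency
echo 0 > /sys/block/sda/queue/iosched/low_latency
```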
+2 -2
block/blk-cgroup.c
···
 	if (!new_blkg)
 		return -ENOMEM;

-	preloaded = !radix_tree_preload(GFP_KERNEL);
-
 	blk_queue_bypass_start(q);
+
+	preloaded = !radix_tree_preload(GFP_KERNEL);

 	/*
 	 * Make sure the root blkg exists and count the existing blkgs. As
+196 -69
block/blk-core.c
···
 #include <linux/list_sort.h>
 #include <linux/delay.h>
 #include <linux/ratelimit.h>
+#include <linux/pm_runtime.h>

 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
···
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;

-	if (unlikely(nbytes > bio->bi_size)) {
-		printk(KERN_ERR "%s: want %u bytes done, %u left\n",
-		       __func__, nbytes, bio->bi_size);
-		nbytes = bio->bi_size;
-	}
-
 	if (unlikely(rq->cmd_flags & REQ_QUIET))
 		set_bit(BIO_QUIET, &bio->bi_flags);

-	bio->bi_size -= nbytes;
-	bio->bi_sector += (nbytes >> 9);
-
-	if (bio_integrity(bio))
-		bio_integrity_advance(bio, nbytes);
+	bio_advance(bio, nbytes);

 	/* don't actually finish bio if it's part of flush sequence */
 	if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
···
 }
 EXPORT_SYMBOL_GPL(part_round_stats);

+#ifdef CONFIG_PM_RUNTIME
+static void blk_pm_put_request(struct request *rq)
+{
+	if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
+		pm_runtime_mark_last_busy(rq->q->dev);
+}
+#else
+static inline void blk_pm_put_request(struct request *rq) {}
+#endif
+
 /*
  * queue lock must be held
  */
···
 		return;
 	if (unlikely(--req->ref_count))
 		return;
+
+	blk_pm_put_request(req);

 	elv_completed_request(q, req);
···
 	printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
 			bdevname(bio->bi_bdev, b),
 			bio->bi_rw,
-			(unsigned long long)bio->bi_sector + bio_sectors(bio),
+			(unsigned long long)bio_end_sector(bio),
 			(long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));

 	set_bit(BIO_EOF, &bio->bi_flags);
···
 	}
 }

+#ifdef CONFIG_PM_RUNTIME
+/*
+ * Don't process normal requests when queue is suspended
+ * or in the process of suspending/resuming
+ */
+static struct request *blk_pm_peek_request(struct request_queue *q,
+					   struct request *rq)
+{
+	if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
+	    (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM))))
+		return NULL;
+	else
+		return rq;
+}
+#else
+static inline struct request *blk_pm_peek_request(struct request_queue *q,
+						  struct request *rq)
+{
+	return rq;
+}
+#endif
+
 /**
  * blk_peek_request - peek at the top of a request queue
  * @q: request queue to peek at
···
 	int ret;

 	while ((rq = __elv_next_request(q)) != NULL) {
+
+		rq = blk_pm_peek_request(q, rq);
+		if (!rq)
+			break;
+
 		if (!(rq->cmd_flags & REQ_STARTED)) {
 			/*
 			 * This is the first time the device driver
···
 **/
 bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 {
-	int total_bytes, bio_nbytes, next_idx = 0;
-	struct bio *bio;
+	int total_bytes;

 	if (!req->bio)
 		return false;
···
 	blk_account_io_completion(req, nr_bytes);

-	total_bytes = bio_nbytes = 0;
-	while ((bio = req->bio) != NULL) {
-		int nbytes;
+	total_bytes = 0;
+	while (req->bio) {
+		struct bio *bio = req->bio;
+		unsigned bio_bytes = min(bio->bi_size, nr_bytes);

-		if (nr_bytes >= bio->bi_size) {
+		if (bio_bytes == bio->bi_size)
 			req->bio = bio->bi_next;
-			nbytes = bio->bi_size;
-			req_bio_endio(req, bio, nbytes, error);
-			next_idx = 0;
-			bio_nbytes = 0;
-		} else {
-			int idx = bio->bi_idx + next_idx;

-			if (unlikely(idx >= bio->bi_vcnt)) {
-				blk_dump_rq_flags(req, "__end_that");
-				printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
-				       __func__, idx, bio->bi_vcnt);
-				break;
-			}
+		req_bio_endio(req, bio, bio_bytes, error);

-			nbytes = bio_iovec_idx(bio, idx)->bv_len;
-			BIO_BUG_ON(nbytes > bio->bi_size);
+		total_bytes += bio_bytes;
+		nr_bytes -= bio_bytes;

-			/*
-			 * not a complete bvec done
-			 */
-			if (unlikely(nbytes > nr_bytes)) {
-				bio_nbytes += nr_bytes;
-				total_bytes += nr_bytes;
-				break;
-			}
-
-			/*
-			 * advance to the next vector
-			 */
-			next_idx++;
-			bio_nbytes += nbytes;
-		}
-
-		total_bytes += nbytes;
-		nr_bytes -= nbytes;
-
-		bio = req->bio;
-		if (bio) {
-			/*
-			 * end more in this run, or just return 'not-done'
-			 */
-			if (unlikely(nr_bytes <= 0))
-				break;
-		}
+		if (!nr_bytes)
+			break;
 	}

 	/*
···
 		 */
 		req->__data_len = 0;
 		return false;
-	}
-
-	/*
-	 * if the request wasn't completed, update state
-	 */
-	if (bio_nbytes) {
-		req_bio_endio(req, bio, bio_nbytes, error);
-		bio->bi_idx += next_idx;
-		bio_iovec(bio)->bv_offset += nr_bytes;
-		bio_iovec(bio)->bv_len -= nr_bytes;
 	}

 	req->__data_len -= total_bytes;
···
 	current->plug = NULL;
 }
 EXPORT_SYMBOL(blk_finish_plug);
+
+#ifdef CONFIG_PM_RUNTIME
+/**
+ * blk_pm_runtime_init - Block layer runtime PM initialization routine
+ * @q: the queue of the device
+ * @dev: the device the queue belongs to
+ *
+ * Description:
+ *    Initialize runtime-PM-related fields for @q and start auto suspend for
+ *    @dev. Drivers that want to take advantage of request-based runtime PM
+ *    should call this function after @dev has been initialized, and its
+ *    request queue @q has been allocated, and runtime PM for it can not happen
+ *    yet (either due to disabled/forbidden or its usage_count > 0). In most
+ *    cases, driver should call this function before any I/O has taken place.
+ *
+ *    This function takes care of setting up using auto suspend for the device,
+ *    the autosuspend delay is set to -1 to make runtime suspend impossible
+ *    until an updated value is either set by user or by driver. Drivers do
+ *    not need to touch other autosuspend settings.
+ *
+ *    The block layer runtime PM is request based, so only works for drivers
+ *    that use request as their IO unit instead of those directly use bio's.
+ */
+void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
+{
+	q->dev = dev;
+	q->rpm_status = RPM_ACTIVE;
+	pm_runtime_set_autosuspend_delay(q->dev, -1);
+	pm_runtime_use_autosuspend(q->dev);
+}
+EXPORT_SYMBOL(blk_pm_runtime_init);
+
+/**
+ * blk_pre_runtime_suspend - Pre runtime suspend check
+ * @q: the queue of the device
+ *
+ * Description:
+ *    This function will check if runtime suspend is allowed for the device
+ *    by examining if there are any requests pending in the queue. If there
+ *    are requests pending, the device can not be runtime suspended; otherwise,
+ *    the queue's status will be updated to SUSPENDING and the driver can
+ *    proceed to suspend the device.
+ *
+ *    For the not allowed case, we mark last busy for the device so that
+ *    runtime PM core will try to autosuspend it some time later.
+ *
+ *    This function should be called near the start of the device's
+ *    runtime_suspend callback.
+ *
+ * Return:
+ *    0		- OK to runtime suspend the device
+ *    -EBUSY	- Device should not be runtime suspended
+ */
+int blk_pre_runtime_suspend(struct request_queue *q)
+{
+	int ret = 0;
+
+	spin_lock_irq(q->queue_lock);
+	if (q->nr_pending) {
+		ret = -EBUSY;
+		pm_runtime_mark_last_busy(q->dev);
+	} else {
+		q->rpm_status = RPM_SUSPENDING;
+	}
+	spin_unlock_irq(q->queue_lock);
+	return ret;
+}
+EXPORT_SYMBOL(blk_pre_runtime_suspend);
+
+/**
+ * blk_post_runtime_suspend - Post runtime suspend processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_suspend function
+ *
+ * Description:
+ *    Update the queue's runtime status according to the return value of the
+ *    device's runtime suspend function and mark last busy for the device so
+ *    that PM core will try to auto suspend the device at a later time.
+ *
+ *    This function should be called near the end of the device's
+ *    runtime_suspend callback.
+ */
+void blk_post_runtime_suspend(struct request_queue *q, int err)
+{
+	spin_lock_irq(q->queue_lock);
+	if (!err) {
+		q->rpm_status = RPM_SUSPENDED;
+	} else {
+		q->rpm_status = RPM_ACTIVE;
+		pm_runtime_mark_last_busy(q->dev);
+	}
+	spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_post_runtime_suspend);
+
+/**
+ * blk_pre_runtime_resume - Pre runtime resume processing
+ * @q: the queue of the device
+ *
+ * Description:
+ *    Update the queue's runtime status to RESUMING in preparation for the
+ *    runtime resume of the device.
+ *
+ *    This function should be called near the start of the device's
+ *    runtime_resume callback.
+ */
+void blk_pre_runtime_resume(struct request_queue *q)
+{
+	spin_lock_irq(q->queue_lock);
+	q->rpm_status = RPM_RESUMING;
+	spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_pre_runtime_resume);
+
+/**
+ * blk_post_runtime_resume - Post runtime resume processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_resume function
+ *
+ * Description:
+ *    Update the queue's runtime status according to the return value of the
+ *    device's runtime_resume function. If it is successfully resumed, process
+ *    the requests that are queued into the device's queue when it is resuming
+ *    and then mark last busy and initiate autosuspend for it.
+ *
+ *    This function should be called near the end of the device's
+ *    runtime_resume callback.
+ */
+void blk_post_runtime_resume(struct request_queue *q, int err)
+{
+	spin_lock_irq(q->queue_lock);
+	if (!err) {
+		q->rpm_status = RPM_ACTIVE;
+		__blk_run_queue(q);
+		pm_runtime_mark_last_busy(q->dev);
+		pm_runtime_autosuspend(q->dev);
+	} else {
+		q->rpm_status = RPM_SUSPENDED;
+	}
+	spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_post_runtime_resume);
+#endif

 int __init blk_dev_init(void)
 {
+2 -5
block/cfq-iosched.c
···
 		return NULL;

 	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
-	if (cfqq) {
-		sector_t sector = bio->bi_sector + bio_sectors(bio);
-
-		return elv_rb_find(&cfqq->sort_list, sector);
-	}
+	if (cfqq)
+		return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));

 	return NULL;
 }
+1 -1
block/deadline-iosched.c
···
 	 * check for front merge
 	 */
 	if (dd->front_merges) {
-		sector_t sector = bio->bi_sector + bio_sectors(bio);
+		sector_t sector = bio_end_sector(bio);

 		__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
 		if (__rq) {
+26
block/elevator.c
···
 #include <linux/blktrace_api.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
+#include <linux/pm_runtime.h>

 #include <trace/events/block.h>
···
 		e->type->ops.elevator_bio_merged_fn(q, rq, bio);
 }

+#ifdef CONFIG_PM_RUNTIME
+static void blk_pm_requeue_request(struct request *rq)
+{
+	if (rq->q->dev && !(rq->cmd_flags & REQ_PM))
+		rq->q->nr_pending--;
+}
+
+static void blk_pm_add_request(struct request_queue *q, struct request *rq)
+{
+	if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 &&
+	    (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
+		pm_request_resume(q->dev);
+}
+#else
+static inline void blk_pm_requeue_request(struct request *rq) {}
+static inline void blk_pm_add_request(struct request_queue *q,
+				      struct request *rq)
+{
+}
+#endif
+
 void elv_requeue_request(struct request_queue *q, struct request *rq)
 {
 	/*
···
 	}

 	rq->cmd_flags &= ~REQ_STARTED;
+
+	blk_pm_requeue_request(rq);

 	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
 }
···
 void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 {
 	trace_block_rq_insert(q, rq);
+
+	blk_pm_add_request(q, rq);

 	rq->q = q;
+2 -2
block/partitions/efi.c
···
 		le32_to_cpu(gpt->sizeof_partition_entry);
 	if (!count)
 		return NULL;
-	pte = kzalloc(count, GFP_KERNEL);
+	pte = kmalloc(count, GFP_KERNEL);
 	if (!pte)
 		return NULL;
···
 	gpt_header *gpt;
 	unsigned ssz = bdev_logical_block_size(state->bdev);

-	gpt = kzalloc(ssz, GFP_KERNEL);
+	gpt = kmalloc(ssz, GFP_KERNEL);
 	if (!gpt)
 		return NULL;
+1 -1
drivers/block/aoe/aoecmd.c
···
 	buf->resid = bio->bi_size;
 	buf->sector = bio->bi_sector;
 	bio_pageinc(bio);
-	buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
+	buf->bv = bv = bio_iovec(bio);
 	buf->bv_resid = bv->bv_len;
 	WARN_ON(buf->bv_resid == 0);
 }
+1 -2
drivers/block/brd.c
···
 	int err = -EIO;

 	sector = bio->bi_sector;
-	if (sector + (bio->bi_size >> SECTOR_SHIFT) >
-			get_capacity(bdev->bd_disk))
+	if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
 		goto out;

 	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
-1
drivers/block/floppy.c
···
 	bio_vec.bv_len = size;
 	bio_vec.bv_offset = 0;
 	bio.bi_vcnt = 1;
-	bio.bi_idx = 0;
 	bio.bi_size = size;
 	bio.bi_bdev = bdev;
 	bio.bi_sector = 0;
+23 -79
drivers/block/pktcdvd.c
···
 		pd->iosched.successive_reads += bio->bi_size >> 10;
 	else {
 		pd->iosched.successive_reads = 0;
-		pd->iosched.last_write = bio->bi_sector + bio_sectors(bio);
+		pd->iosched.last_write = bio_end_sector(bio);
 	}
 	if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
 		if (pd->read_speed == pd->write_speed) {
···
 	} else {
 		printk(DRIVER_NAME": cdrom max_phys_segments too small\n");
 		return -EIO;
-	}
-}
-
-/*
- * Copy CD_FRAMESIZE bytes from src_bio into a destination page
- */
-static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct page *dst_page, int dst_offs)
-{
-	unsigned int copy_size = CD_FRAMESIZE;
-
-	while (copy_size > 0) {
-		struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
-		void *vfrom = kmap_atomic(src_bvl->bv_page) +
-			src_bvl->bv_offset + offs;
-		void *vto = page_address(dst_page) + dst_offs;
-		int len = min_t(int, copy_size, src_bvl->bv_len - offs);
-
-		BUG_ON(len < 0);
-		memcpy(vto, vfrom, len);
-		kunmap_atomic(vfrom);
-
-		seg++;
-		offs = 0;
-		dst_offs += len;
-		copy_size -= len;
 	}
 }
···
 	new_sector = new_block * (CD_FRAMESIZE >> 9);
 	pkt->sector = new_sector;

+	bio_reset(pkt->bio);
+	pkt->bio->bi_bdev = pd->bdev;
+	pkt->bio->bi_rw = REQ_WRITE;
 	pkt->bio->bi_sector = new_sector;
-	pkt->bio->bi_next = NULL;
-	pkt->bio->bi_flags = 1 << BIO_UPTODATE;
-	pkt->bio->bi_idx = 0;
+	pkt->bio->bi_size = pkt->frames * CD_FRAMESIZE;
+	pkt->bio->bi_vcnt = pkt->frames;

-	BUG_ON(pkt->bio->bi_rw != REQ_WRITE);
-	BUG_ON(pkt->bio->bi_vcnt != pkt->frames);
-	BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE);
-	BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write);
-	BUG_ON(pkt->bio->bi_private != pkt);
+	pkt->bio->bi_end_io = pkt_end_io_packet_write;
+	pkt->bio->bi_private = pkt;

 	drop_super(sb);
 	return 1;
···
  */
 static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 {
-	struct bio *bio;
 	int f;
-	int frames_write;
 	struct bio_vec *bvec = pkt->w_bio->bi_io_vec;

+	bio_reset(pkt->w_bio);
+	pkt->w_bio->bi_sector = pkt->sector;
+	pkt->w_bio->bi_bdev = pd->bdev;
+	pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+	pkt->w_bio->bi_private = pkt;
+
+	/* XXX: locking? */
 	for (f = 0; f < pkt->frames; f++) {
 		bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
 		bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+		if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
+			BUG();
 	}
+	VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);

 	/*
 	 * Fill-in bvec with data from orig_bios.
 	 */
-	frames_write = 0;
 	spin_lock(&pkt->lock);
-	bio_list_for_each(bio, &pkt->orig_bios) {
-		int segment = bio->bi_idx;
-		int src_offs = 0;
-		int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
-		int num_frames = bio->bi_size / CD_FRAMESIZE;
-		BUG_ON(first_frame < 0);
-		BUG_ON(first_frame + num_frames > pkt->frames);
-		for (f = first_frame; f < first_frame + num_frames; f++) {
-			struct bio_vec *src_bvl = bio_iovec_idx(bio, segment);
+	bio_copy_data(pkt->w_bio, pkt->orig_bios.head);

-			while (src_offs >= src_bvl->bv_len) {
-				src_offs -= src_bvl->bv_len;
-				segment++;
-				BUG_ON(segment >= bio->bi_vcnt);
-				src_bvl = bio_iovec_idx(bio, segment);
-			}
-
-			if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) {
-				bvec[f].bv_page = src_bvl->bv_page;
-				bvec[f].bv_offset = src_bvl->bv_offset + src_offs;
-			} else {
-				pkt_copy_bio_data(bio, segment, src_offs,
-						  bvec[f].bv_page, bvec[f].bv_offset);
-			}
-			src_offs += CD_FRAMESIZE;
-			frames_write++;
-		}
-	}
 	pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
 	spin_unlock(&pkt->lock);

 	VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n",
-		frames_write, (unsigned long long)pkt->sector);
-	BUG_ON(frames_write != pkt->write_size);
+		pkt->write_size, (unsigned long long)pkt->sector);

 	if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
 		pkt_make_local_copy(pkt, bvec);
···
 	}

 	/* Start the write request */
-	bio_reset(pkt->w_bio);
-	pkt->w_bio->bi_sector = pkt->sector;
-	pkt->w_bio->bi_bdev = pd->bdev;
-	pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
-	pkt->w_bio->bi_private = pkt;
-	for (f = 0; f < pkt->frames; f++)
-		if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
-			BUG();
-	VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
-
 	atomic_set(&pkt->io_wait, 1);
 	pkt->w_bio->bi_rw = WRITE;
 	pkt_queue_bio(pd, pkt->w_bio);
···
 		cloned_bio->bi_bdev = pd->bdev;
 		cloned_bio->bi_private = psd;
 		cloned_bio->bi_end_io = pkt_end_io_read_cloned;
-		pd->stats.secs_r += bio->bi_size >> 9;
+		pd->stats.secs_r += bio_sectors(bio);
 		pkt_queue_bio(pd, cloned_bio);
 		return;
 	}
···
 	zone = ZONE(bio->bi_sector, pd);
 	VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n",
 		(unsigned long long)bio->bi_sector,
-		(unsigned long long)(bio->bi_sector + bio_sectors(bio)));
+		(unsigned long long)bio_end_sector(bio));

 	/* Check if we have to split the bio */
 	{
···
 		sector_t last_zone;
 		int first_sectors;

-		last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd);
+		last_zone = ZONE(bio_end_sector(bio) - 1, pd);
 		if (last_zone != zone) {
 			BUG_ON(last_zone != zone + pd->settings.size);
 			first_sectors = last_zone - bio->bi_sector;
+1 -1
drivers/block/rbd.c
···
 	/* Find first affected segment... */

 	resid = offset;
-	__bio_for_each_segment(bv, bio_src, idx, 0) {
+	bio_for_each_segment(bv, bio_src, idx) {
 		if (resid < bv->bv_len)
 			break;
 		resid -= bv->bv_len;
+1 -2
drivers/md/dm-crypt.c
···
 	unsigned int i;
 	struct bio_vec *bv;

-	for (i = 0; i < clone->bi_vcnt; i++) {
-		bv = bio_iovec_idx(clone, i);
+	bio_for_each_segment_all(bv, clone, i) {
 		BUG_ON(!bv->bv_page);
 		mempool_free(bv->bv_page, cc->page_pool);
 		bv->bv_page = NULL;
+1 -1
drivers/md/dm-raid1.c
···
 {
 	io->bdev = m->dev->bdev;
 	io->sector = map_sector(m, bio);
-	io->count = bio->bi_size >> 9;
+	io->count = bio_sectors(bio);
 }

 static void hold_bio(struct mirror_set *ms, struct bio *bio)
+1 -1
drivers/md/dm-stripe.c
···
 	sector_t begin, end;

 	stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin);
-	stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio),
+	stripe_map_range_sector(sc, bio_end_sector(bio),
 				target_stripe, &end);
 	if (begin < end) {
 		bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
+2 -2
drivers/md/dm-verity.c
···
 		return -EIO;
 	}

-	if ((bio->bi_sector + bio_sectors(bio)) >>
+	if (bio_end_sector(bio) >>
 	    (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
 		DMERR_LIMIT("io out of range");
 		return -EIO;
···
 	bio->bi_end_io = verity_end_io;
 	bio->bi_private = io;
-	io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
+	io->io_vec_size = bio_segments(bio);
 	if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
 		io->io_vec = io->io_vec_inline;
 	else
+2 -4
drivers/md/faulty.c
···
 		return;
 	}

-	if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
-		    WRITE))
+	if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE))
 		failit = 1;
 	if (check_mode(conf, WritePersistent)) {
 		add_sector(conf, bio->bi_sector, WritePersistent);
···
 			failit = 1;
 	} else {
 		/* read request */
-		if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
-			    READ))
+		if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ))
 			failit = 1;
 		if (check_mode(conf, ReadTransient))
 			failit = 1;
+1 -2
drivers/md/linear.c
···
 		bio_io_error(bio);
 		return;
 	}
-	if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
-		     tmp_dev->end_sector)) {
+	if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) {
 		/* This bio crosses a device boundary, so we have to
 		 * split it.
 		 */
+4 -13
drivers/md/md.c
···
 	if (offset == 0 && size == bio->bi_size)
 		return;

-	bio->bi_sector += offset;
-	bio->bi_size = size;
-	offset <<= 9;
 	clear_bit(BIO_SEG_VALID, &bio->bi_flags);

-	while (bio->bi_idx < bio->bi_vcnt &&
-	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
-		/* remove this whole bio_vec */
-		offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
-		bio->bi_idx++;
-	}
-	if (bio->bi_idx < bio->bi_vcnt) {
-		bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
-		bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
-	}
+	bio_advance(bio, offset << 9);
+
+	bio->bi_size = size;
+
 	/* avoid any complications with bi_idx being non-zero*/
 	if (bio->bi_idx) {
 		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+4 -5
drivers/md/raid0.c
···
 {
 	if (likely(is_power_of_2(chunk_sects))) {
 		return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
-					+ (bio->bi_size >> 9));
+					+ bio_sectors(bio));
 	} else{
 		sector_t sector = bio->bi_sector;
 		return chunk_sects >= (sector_div(sector, chunk_sects)
-				+ (bio->bi_size >> 9));
+				+ bio_sectors(bio));
 	}
 }
···
 		sector_t sector = bio->bi_sector;
 		struct bio_pair *bp;
 		/* Sanity check -- queue functions should prevent this happening */
-		if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-		    bio->bi_idx != 0)
+		if (bio_segments(bio) > 1)
 			goto bad_map;
 		/* This is a one page bio that upper layers
 		 * refuse to split for us, so we need to split it.
···
 	printk("md/raid0:%s: make_request bug: can't convert block across chunks"
 	       " or bigger than %dk %llu %d\n",
 	       mdname(mddev), chunk_sects / 2,
-	       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+	       (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);

 	bio_io_error(bio);
 	return;
+43 -90
drivers/md/raid1.c
··· 92 92 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 93 93 { 94 94 struct pool_info *pi = data; 95 - struct page *page; 96 95 struct r1bio *r1_bio; 97 96 struct bio *bio; 98 97 int i, j; ··· 121 122 j = 1; 122 123 while(j--) { 123 124 bio = r1_bio->bios[j]; 124 - for (i = 0; i < RESYNC_PAGES; i++) { 125 - page = alloc_page(gfp_flags); 126 - if (unlikely(!page)) 127 - goto out_free_pages; 125 + bio->bi_vcnt = RESYNC_PAGES; 128 126 129 - bio->bi_io_vec[i].bv_page = page; 130 - bio->bi_vcnt = i+1; 131 - } 127 + if (bio_alloc_pages(bio, gfp_flags)) 128 + goto out_free_bio; 132 129 } 133 130 /* If not user-requests, copy the page pointers to all bios */ 134 131 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { ··· 138 143 139 144 return r1_bio; 140 145 141 - out_free_pages: 142 - for (j=0 ; j < pi->raid_disks; j++) 143 - for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++) 144 - put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); 145 - j = -1; 146 146 out_free_bio: 147 147 while (++j < pi->raid_disks) 148 148 bio_put(r1_bio->bios[j]); ··· 257 267 (bio_data_dir(bio) == WRITE) ? 
"write" : "read", 258 268 (unsigned long long) bio->bi_sector, 259 269 (unsigned long long) bio->bi_sector + 260 - (bio->bi_size >> 9) - 1); 270 + bio_sectors(bio) - 1); 261 271 262 272 call_bio_endio(r1_bio); 263 273 } ··· 448 458 " %llu-%llu\n", 449 459 (unsigned long long) mbio->bi_sector, 450 460 (unsigned long long) mbio->bi_sector + 451 - (mbio->bi_size >> 9) - 1); 461 + bio_sectors(mbio) - 1); 452 462 call_bio_endio(r1_bio); 453 463 } 454 464 } ··· 915 925 if (unlikely(!bvecs)) 916 926 return; 917 927 918 - bio_for_each_segment(bvec, bio, i) { 928 + bio_for_each_segment_all(bvec, bio, i) { 919 929 bvecs[i] = *bvec; 920 930 bvecs[i].bv_page = alloc_page(GFP_NOIO); 921 931 if (unlikely(!bvecs[i].bv_page)) ··· 1013 1023 md_write_start(mddev, bio); /* wait on superblock update early */ 1014 1024 1015 1025 if (bio_data_dir(bio) == WRITE && 1016 - bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && 1026 + bio_end_sector(bio) > mddev->suspend_lo && 1017 1027 bio->bi_sector < mddev->suspend_hi) { 1018 1028 /* As the suspend_* range is controlled by 1019 1029 * userspace, we want an interruptible ··· 1024 1034 flush_signals(current); 1025 1035 prepare_to_wait(&conf->wait_barrier, 1026 1036 &w, TASK_INTERRUPTIBLE); 1027 - if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || 1037 + if (bio_end_sector(bio) <= mddev->suspend_lo || 1028 1038 bio->bi_sector >= mddev->suspend_hi) 1029 1039 break; 1030 1040 schedule(); ··· 1044 1054 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1045 1055 1046 1056 r1_bio->master_bio = bio; 1047 - r1_bio->sectors = bio->bi_size >> 9; 1057 + r1_bio->sectors = bio_sectors(bio); 1048 1058 r1_bio->state = 0; 1049 1059 r1_bio->mddev = mddev; 1050 1060 r1_bio->sector = bio->bi_sector; ··· 1122 1132 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1123 1133 1124 1134 r1_bio->master_bio = bio; 1125 - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; 1135 + r1_bio->sectors = bio_sectors(bio) - sectors_handled; 1126 1136 
r1_bio->state = 0; 1127 1137 r1_bio->mddev = mddev; 1128 1138 r1_bio->sector = bio->bi_sector + sectors_handled; ··· 1279 1289 struct bio_vec *bvec; 1280 1290 int j; 1281 1291 1282 - /* Yes, I really want the '__' version so that 1283 - * we clear any unused pointer in the io_vec, rather 1284 - * than leave them unchanged. This is important 1285 - * because when we come to free the pages, we won't 1286 - * know the original bi_idx, so we just free 1287 - * them all 1292 + /* 1293 + * We trimmed the bio, so _all is legit 1288 1294 */ 1289 - __bio_for_each_segment(bvec, mbio, j, 0) 1295 + bio_for_each_segment_all(bvec, mbio, j) 1290 1296 bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; 1291 1297 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 1292 1298 atomic_inc(&r1_bio->behind_remaining); ··· 1320 1334 /* Mustn't call r1_bio_write_done before this next test, 1321 1335 * as it could result in the bio being freed. 1322 1336 */ 1323 - if (sectors_handled < (bio->bi_size >> 9)) { 1337 + if (sectors_handled < bio_sectors(bio)) { 1324 1338 r1_bio_write_done(r1_bio); 1325 1339 /* We need another r1_bio. 
It has already been counted 1326 1340 * in bio->bi_phys_segments 1327 1341 */ 1328 1342 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1329 1343 r1_bio->master_bio = bio; 1330 - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; 1344 + r1_bio->sectors = bio_sectors(bio) - sectors_handled; 1331 1345 r1_bio->state = 0; 1332 1346 r1_bio->mddev = mddev; 1333 1347 r1_bio->sector = bio->bi_sector + sectors_handled; ··· 1853 1867 struct bio *sbio = r1_bio->bios[i]; 1854 1868 int size; 1855 1869 1856 - if (r1_bio->bios[i]->bi_end_io != end_sync_read) 1870 + if (sbio->bi_end_io != end_sync_read) 1857 1871 continue; 1858 1872 1859 1873 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { ··· 1878 1892 continue; 1879 1893 } 1880 1894 /* fixup the bio for reuse */ 1895 + bio_reset(sbio); 1881 1896 sbio->bi_vcnt = vcnt; 1882 1897 sbio->bi_size = r1_bio->sectors << 9; 1883 - sbio->bi_idx = 0; 1884 - sbio->bi_phys_segments = 0; 1885 - sbio->bi_flags &= ~(BIO_POOL_MASK - 1); 1886 - sbio->bi_flags |= 1 << BIO_UPTODATE; 1887 - sbio->bi_next = NULL; 1888 1898 sbio->bi_sector = r1_bio->sector + 1889 1899 conf->mirrors[i].rdev->data_offset; 1890 1900 sbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1901 + sbio->bi_end_io = end_sync_read; 1902 + sbio->bi_private = r1_bio; 1903 + 1891 1904 size = sbio->bi_size; 1892 1905 for (j = 0; j < vcnt ; j++) { 1893 1906 struct bio_vec *bi; ··· 1897 1912 else 1898 1913 bi->bv_len = size; 1899 1914 size -= PAGE_SIZE; 1900 - memcpy(page_address(bi->bv_page), 1901 - page_address(pbio->bi_io_vec[j].bv_page), 1902 - PAGE_SIZE); 1903 1915 } 1916 + 1917 + bio_copy_data(sbio, pbio); 1904 1918 } 1905 1919 return 0; 1906 1920 } ··· 1936 1952 wbio->bi_rw = WRITE; 1937 1953 wbio->bi_end_io = end_sync_write; 1938 1954 atomic_inc(&r1_bio->remaining); 1939 - md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 1955 + md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); 1940 1956 1941 1957 generic_make_request(wbio); 1942 1958 } ··· 2048 2064 } 
2049 2065 } 2050 2066 2051 - static void bi_complete(struct bio *bio, int error) 2052 - { 2053 - complete((struct completion *)bio->bi_private); 2054 - } 2055 - 2056 - static int submit_bio_wait(int rw, struct bio *bio) 2057 - { 2058 - struct completion event; 2059 - rw |= REQ_SYNC; 2060 - 2061 - init_completion(&event); 2062 - bio->bi_private = &event; 2063 - bio->bi_end_io = bi_complete; 2064 - submit_bio(rw, bio); 2065 - wait_for_completion(&event); 2066 - 2067 - return test_bit(BIO_UPTODATE, &bio->bi_flags); 2068 - } 2069 - 2070 2067 static int narrow_write_error(struct r1bio *r1_bio, int i) 2071 2068 { 2072 2069 struct mddev *mddev = r1_bio->mddev; 2073 2070 struct r1conf *conf = mddev->private; 2074 2071 struct md_rdev *rdev = conf->mirrors[i].rdev; 2075 - int vcnt, idx; 2076 - struct bio_vec *vec; 2077 2072 2078 2073 /* bio has the data to be written to device 'i' where 2079 2074 * we just recently had a write error. ··· 2080 2117 & ~(sector_t)(block_sectors - 1)) 2081 2118 - sector; 2082 2119 2083 - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 2084 - vcnt = r1_bio->behind_page_count; 2085 - vec = r1_bio->behind_bvecs; 2086 - idx = 0; 2087 - while (vec[idx].bv_page == NULL) 2088 - idx++; 2089 - } else { 2090 - vcnt = r1_bio->master_bio->bi_vcnt; 2091 - vec = r1_bio->master_bio->bi_io_vec; 2092 - idx = r1_bio->master_bio->bi_idx; 2093 - } 2094 2120 while (sect_to_write) { 2095 2121 struct bio *wbio; 2096 2122 if (sectors > sect_to_write) 2097 2123 sectors = sect_to_write; 2098 2124 /* Write at 'sector' for 'sectors'*/ 2099 2125 2100 - wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); 2101 - memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); 2102 - wbio->bi_sector = r1_bio->sector; 2126 + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 2127 + unsigned vcnt = r1_bio->behind_page_count; 2128 + struct bio_vec *vec = r1_bio->behind_bvecs; 2129 + 2130 + while (!vec->bv_page) { 2131 + vec++; 2132 + vcnt--; 2133 + } 2134 + 2135 + wbio = 
bio_alloc_mddev(GFP_NOIO, vcnt, mddev); 2136 + memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); 2137 + 2138 + wbio->bi_vcnt = vcnt; 2139 + } else { 2140 + wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); 2141 + } 2142 + 2103 2143 wbio->bi_rw = WRITE; 2104 - wbio->bi_vcnt = vcnt; 2144 + wbio->bi_sector = r1_bio->sector; 2105 2145 wbio->bi_size = r1_bio->sectors << 9; 2106 - wbio->bi_idx = idx; 2107 2146 2108 2147 md_trim_bio(wbio, sector - r1_bio->sector, sectors); 2109 2148 wbio->bi_sector += rdev->data_offset; ··· 2254 2289 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 2255 2290 2256 2291 r1_bio->master_bio = mbio; 2257 - r1_bio->sectors = (mbio->bi_size >> 9) 2258 - - sectors_handled; 2292 + r1_bio->sectors = bio_sectors(mbio) - sectors_handled; 2259 2293 r1_bio->state = 0; 2260 2294 set_bit(R1BIO_ReadError, &r1_bio->state); 2261 2295 r1_bio->mddev = mddev; ··· 2428 2464 for (i = 0; i < conf->raid_disks * 2; i++) { 2429 2465 struct md_rdev *rdev; 2430 2466 bio = r1_bio->bios[i]; 2431 - 2432 - /* take from bio_init */ 2433 - bio->bi_next = NULL; 2434 - bio->bi_flags &= ~(BIO_POOL_MASK-1); 2435 - bio->bi_flags |= 1 << BIO_UPTODATE; 2436 - bio->bi_rw = READ; 2437 - bio->bi_vcnt = 0; 2438 - bio->bi_idx = 0; 2439 - bio->bi_phys_segments = 0; 2440 - bio->bi_size = 0; 2441 - bio->bi_end_io = NULL; 2442 - bio->bi_private = NULL; 2467 + bio_reset(bio); 2443 2468 2444 2469 rdev = rcu_dereference(conf->mirrors[i].rdev); 2445 2470 if (rdev == NULL ||
+21 -57
drivers/md/raid10.c
··· 1174 1174 /* If this request crosses a chunk boundary, we need to 1175 1175 * split it. This will only happen for 1 PAGE (or less) requests. 1176 1176 */ 1177 - if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) 1177 + if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio) 1178 1178 > chunk_sects 1179 1179 && (conf->geo.near_copies < conf->geo.raid_disks 1180 1180 || conf->prev.near_copies < conf->prev.raid_disks))) { 1181 1181 struct bio_pair *bp; 1182 1182 /* Sanity check -- queue functions should prevent this happening */ 1183 - if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 1184 - bio->bi_idx != 0) 1183 + if (bio_segments(bio) > 1) 1185 1184 goto bad_map; 1186 1185 /* This is a one page bio that upper layers 1187 1186 * refuse to split for us, so we need to split it. ··· 1213 1214 bad_map: 1214 1215 printk("md/raid10:%s: make_request bug: can't convert block across chunks" 1215 1216 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, 1216 - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 1217 + (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); 1217 1218 1218 1219 bio_io_error(bio); 1219 1220 return; ··· 1228 1229 */ 1229 1230 wait_barrier(conf); 1230 1231 1231 - sectors = bio->bi_size >> 9; 1232 + sectors = bio_sectors(bio); 1232 1233 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1233 1234 bio->bi_sector < conf->reshape_progress && 1234 1235 bio->bi_sector + sectors > conf->reshape_progress) { ··· 1330 1331 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1331 1332 1332 1333 r10_bio->master_bio = bio; 1333 - r10_bio->sectors = ((bio->bi_size >> 9) 1334 - - sectors_handled); 1334 + r10_bio->sectors = bio_sectors(bio) - sectors_handled; 1335 1335 r10_bio->state = 0; 1336 1336 r10_bio->mddev = mddev; 1337 1337 r10_bio->sector = bio->bi_sector + sectors_handled; ··· 1572 1574 * after checking if we need to go around again. 
1573 1575 */ 1574 1576 1575 - if (sectors_handled < (bio->bi_size >> 9)) { 1577 + if (sectors_handled < bio_sectors(bio)) { 1576 1578 one_write_done(r10_bio); 1577 1579 /* We need another r10_bio. It has already been counted 1578 1580 * in bio->bi_phys_segments. ··· 1580 1582 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1581 1583 1582 1584 r10_bio->master_bio = bio; 1583 - r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; 1585 + r10_bio->sectors = bio_sectors(bio) - sectors_handled; 1584 1586 1585 1587 r10_bio->mddev = mddev; 1586 1588 r10_bio->sector = bio->bi_sector + sectors_handled; ··· 2082 2084 * First we need to fixup bv_offset, bv_len and 2083 2085 * bi_vecs, as the read request might have corrupted these 2084 2086 */ 2087 + bio_reset(tbio); 2088 + 2085 2089 tbio->bi_vcnt = vcnt; 2086 2090 tbio->bi_size = r10_bio->sectors << 9; 2087 - tbio->bi_idx = 0; 2088 - tbio->bi_phys_segments = 0; 2089 - tbio->bi_flags &= ~(BIO_POOL_MASK - 1); 2090 - tbio->bi_flags |= 1 << BIO_UPTODATE; 2091 - tbio->bi_next = NULL; 2092 2091 tbio->bi_rw = WRITE; 2093 2092 tbio->bi_private = r10_bio; 2094 2093 tbio->bi_sector = r10_bio->devs[i].addr; ··· 2103 2108 d = r10_bio->devs[i].devnum; 2104 2109 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2105 2110 atomic_inc(&r10_bio->remaining); 2106 - md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); 2111 + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); 2107 2112 2108 2113 tbio->bi_sector += conf->mirrors[d].rdev->data_offset; 2109 2114 tbio->bi_bdev = conf->mirrors[d].rdev->bdev; ··· 2128 2133 d = r10_bio->devs[i].devnum; 2129 2134 atomic_inc(&r10_bio->remaining); 2130 2135 md_sync_acct(conf->mirrors[d].replacement->bdev, 2131 - tbio->bi_size >> 9); 2136 + bio_sectors(tbio)); 2132 2137 generic_make_request(tbio); 2133 2138 } 2134 2139 ··· 2254 2259 wbio2 = r10_bio->devs[1].repl_bio; 2255 2260 if (wbio->bi_end_io) { 2256 2261 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2257 - 
md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 2262 + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); 2258 2263 generic_make_request(wbio); 2259 2264 } 2260 2265 if (wbio2 && wbio2->bi_end_io) { 2261 2266 atomic_inc(&conf->mirrors[d].replacement->nr_pending); 2262 2267 md_sync_acct(conf->mirrors[d].replacement->bdev, 2263 - wbio2->bi_size >> 9); 2268 + bio_sectors(wbio2)); 2264 2269 generic_make_request(wbio2); 2265 2270 } 2266 2271 } ··· 2531 2536 } 2532 2537 } 2533 2538 2534 - static void bi_complete(struct bio *bio, int error) 2535 - { 2536 - complete((struct completion *)bio->bi_private); 2537 - } 2538 - 2539 - static int submit_bio_wait(int rw, struct bio *bio) 2540 - { 2541 - struct completion event; 2542 - rw |= REQ_SYNC; 2543 - 2544 - init_completion(&event); 2545 - bio->bi_private = &event; 2546 - bio->bi_end_io = bi_complete; 2547 - submit_bio(rw, bio); 2548 - wait_for_completion(&event); 2549 - 2550 - return test_bit(BIO_UPTODATE, &bio->bi_flags); 2551 - } 2552 - 2553 2539 static int narrow_write_error(struct r10bio *r10_bio, int i) 2554 2540 { 2555 2541 struct bio *bio = r10_bio->master_bio; ··· 2671 2695 r10_bio = mempool_alloc(conf->r10bio_pool, 2672 2696 GFP_NOIO); 2673 2697 r10_bio->master_bio = mbio; 2674 - r10_bio->sectors = (mbio->bi_size >> 9) 2675 - - sectors_handled; 2698 + r10_bio->sectors = bio_sectors(mbio) - sectors_handled; 2676 2699 r10_bio->state = 0; 2677 2700 set_bit(R10BIO_ReadError, 2678 2701 &r10_bio->state); ··· 3108 3133 } 3109 3134 } 3110 3135 bio = r10_bio->devs[0].bio; 3136 + bio_reset(bio); 3111 3137 bio->bi_next = biolist; 3112 3138 biolist = bio; 3113 3139 bio->bi_private = r10_bio; ··· 3133 3157 rdev = mirror->rdev; 3134 3158 if (!test_bit(In_sync, &rdev->flags)) { 3135 3159 bio = r10_bio->devs[1].bio; 3160 + bio_reset(bio); 3136 3161 bio->bi_next = biolist; 3137 3162 biolist = bio; 3138 3163 bio->bi_private = r10_bio; ··· 3162 3185 if (rdev == NULL || bio == NULL || 3163 3186 test_bit(Faulty, 
&rdev->flags)) 3164 3187 break; 3188 + bio_reset(bio); 3165 3189 bio->bi_next = biolist; 3166 3190 biolist = bio; 3167 3191 bio->bi_private = r10_bio; ··· 3261 3283 r10_bio->devs[i].repl_bio->bi_end_io = NULL; 3262 3284 3263 3285 bio = r10_bio->devs[i].bio; 3264 - bio->bi_end_io = NULL; 3286 + bio_reset(bio); 3265 3287 clear_bit(BIO_UPTODATE, &bio->bi_flags); 3266 3288 if (conf->mirrors[d].rdev == NULL || 3267 3289 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) ··· 3298 3320 3299 3321 /* Need to set up for writing to the replacement */ 3300 3322 bio = r10_bio->devs[i].repl_bio; 3323 + bio_reset(bio); 3301 3324 clear_bit(BIO_UPTODATE, &bio->bi_flags); 3302 3325 3303 3326 sector = r10_bio->devs[i].addr; ··· 3330 3351 biolist = NULL; 3331 3352 goto giveup; 3332 3353 } 3333 - } 3334 - 3335 - for (bio = biolist; bio ; bio=bio->bi_next) { 3336 - 3337 - bio->bi_flags &= ~(BIO_POOL_MASK - 1); 3338 - if (bio->bi_end_io) 3339 - bio->bi_flags |= 1 << BIO_UPTODATE; 3340 - bio->bi_vcnt = 0; 3341 - bio->bi_idx = 0; 3342 - bio->bi_phys_segments = 0; 3343 - bio->bi_size = 0; 3344 3354 } 3345 3355 3346 3356 nr_sectors = 0; ··· 4379 4411 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); 4380 4412 read_bio->bi_flags |= 1 << BIO_UPTODATE; 4381 4413 read_bio->bi_vcnt = 0; 4382 - read_bio->bi_idx = 0; 4383 4414 read_bio->bi_size = 0; 4384 4415 r10_bio->master_bio = read_bio; 4385 4416 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; ··· 4402 4435 } 4403 4436 if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4404 4437 continue; 4438 + 4439 + bio_reset(b); 4405 4440 b->bi_bdev = rdev2->bdev; 4406 4441 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; 4407 4442 b->bi_private = r10_bio; 4408 4443 b->bi_end_io = end_reshape_write; 4409 4444 b->bi_rw = WRITE; 4410 - b->bi_flags &= ~(BIO_POOL_MASK - 1); 4411 - b->bi_flags |= 1 << BIO_UPTODATE; 4412 4445 b->bi_next = blist; 4413 - b->bi_vcnt = 0; 4414 - b->bi_idx = 0; 4415 - b->bi_size = 0; 4416 4446 blist = b; 4417 4447 } 
4418 4448
+24 -25
drivers/md/raid5.c
··· 90 90 */ 91 91 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 92 92 { 93 - int sectors = bio->bi_size >> 9; 93 + int sectors = bio_sectors(bio); 94 94 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 95 95 return bio->bi_next; 96 96 else ··· 569 569 bi = &sh->dev[i].req; 570 570 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 571 571 572 - bi->bi_rw = rw; 573 - rbi->bi_rw = rw; 574 - if (rw & WRITE) { 575 - bi->bi_end_io = raid5_end_write_request; 576 - rbi->bi_end_io = raid5_end_write_request; 577 - } else 578 - bi->bi_end_io = raid5_end_read_request; 579 - 580 572 rcu_read_lock(); 581 573 rrdev = rcu_dereference(conf->disks[i].replacement); 582 574 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ ··· 643 651 644 652 set_bit(STRIPE_IO_STARTED, &sh->state); 645 653 654 + bio_reset(bi); 646 655 bi->bi_bdev = rdev->bdev; 656 + bi->bi_rw = rw; 657 + bi->bi_end_io = (rw & WRITE) 658 + ? raid5_end_write_request 659 + : raid5_end_read_request; 660 + bi->bi_private = sh; 661 + 647 662 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 648 663 __func__, (unsigned long long)sh->sector, 649 664 bi->bi_rw, i); ··· 664 665 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 665 666 bi->bi_rw |= REQ_FLUSH; 666 667 667 - bi->bi_flags = 1 << BIO_UPTODATE; 668 - bi->bi_idx = 0; 669 668 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 670 669 bi->bi_io_vec[0].bv_offset = 0; 671 670 bi->bi_size = STRIPE_SIZE; 672 - bi->bi_next = NULL; 673 671 if (rrdev) 674 672 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 675 673 ··· 683 687 684 688 set_bit(STRIPE_IO_STARTED, &sh->state); 685 689 690 + bio_reset(rbi); 686 691 rbi->bi_bdev = rrdev->bdev; 692 + rbi->bi_rw = rw; 693 + BUG_ON(!(rw & WRITE)); 694 + rbi->bi_end_io = raid5_end_write_request; 695 + rbi->bi_private = sh; 696 + 687 697 pr_debug("%s: for %llu schedule op %ld on " 688 698 "replacement disc %d\n", 689 699 __func__, (unsigned long long)sh->sector, ··· 701 699 else 702 700 
rbi->bi_sector = (sh->sector 703 701 + rrdev->data_offset); 704 - rbi->bi_flags = 1 << BIO_UPTODATE; 705 - rbi->bi_idx = 0; 706 702 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 707 703 rbi->bi_io_vec[0].bv_offset = 0; 708 704 rbi->bi_size = STRIPE_SIZE; 709 - rbi->bi_next = NULL; 710 705 if (conf->mddev->gendisk) 711 706 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 712 707 rbi, disk_devt(conf->mddev->gendisk), ··· 2401 2402 } else 2402 2403 bip = &sh->dev[dd_idx].toread; 2403 2404 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2404 - if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2405 + if (bio_end_sector(*bip) > bi->bi_sector) 2405 2406 goto overlap; 2406 2407 bip = & (*bip)->bi_next; 2407 2408 } 2408 - if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2409 + if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) 2409 2410 goto overlap; 2410 2411 2411 2412 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); ··· 2421 2422 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2422 2423 bi && bi->bi_sector <= sector; 2423 2424 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2424 - if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2425 - sector = bi->bi_sector + (bi->bi_size>>9); 2425 + if (bio_end_sector(bi) >= sector) 2426 + sector = bio_end_sector(bi); 2426 2427 } 2427 2428 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2428 2429 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); ··· 3848 3849 { 3849 3850 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3850 3851 unsigned int chunk_sectors = mddev->chunk_sectors; 3851 - unsigned int bio_sectors = bio->bi_size >> 9; 3852 + unsigned int bio_sectors = bio_sectors(bio); 3852 3853 3853 3854 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3854 3855 chunk_sectors = mddev->new_chunk_sectors; ··· 3940 3941 { 3941 3942 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3942 3943 3943 - if ((bi->bi_size>>9) > queue_max_sectors(q)) 3944 + if (bio_sectors(bi) > 
queue_max_sectors(q)) 3944 3945 return 0; 3945 3946 blk_recount_segments(q, bi); 3946 3947 if (bi->bi_phys_segments > queue_max_segments(q)) ··· 3987 3988 0, 3988 3989 &dd_idx, NULL); 3989 3990 3990 - end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); 3991 + end_sector = bio_end_sector(align_bi); 3991 3992 rcu_read_lock(); 3992 3993 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3993 3994 if (!rdev || test_bit(Faulty, &rdev->flags) || ··· 4010 4011 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4011 4012 4012 4013 if (!bio_fits_rdev(align_bi) || 4013 - is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 4014 + is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), 4014 4015 &first_bad, &bad_sectors)) { 4015 4016 /* too big in some way, or has a known bad block */ 4016 4017 bio_put(align_bi); ··· 4272 4273 } 4273 4274 4274 4275 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4275 - last_sector = bi->bi_sector + (bi->bi_size>>9); 4276 + last_sector = bio_end_sector(bi); 4276 4277 bi->bi_next = NULL; 4277 4278 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4278 4279 ··· 4738 4739 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4739 4740 sector = raid5_compute_sector(conf, logical_sector, 4740 4741 0, &dd_idx, NULL); 4741 - last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4742 + last_sector = bio_end_sector(raid_bio); 4742 4743 4743 4744 for (; logical_sector < last_sector; 4744 4745 logical_sector += STRIPE_SECTORS,
+3 -3
drivers/message/fusion/mptsas.c
··· 2235 2235 } 2236 2236 2237 2237 /* do we need to support multiple segments? */ 2238 - if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) { 2238 + if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) { 2239 2239 printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n", 2240 - ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req), 2241 - rsp->bio->bi_vcnt, blk_rq_bytes(rsp)); 2240 + ioc->name, __func__, bio_segments(req->bio), blk_rq_bytes(req), 2241 + bio_segments(rsp->bio), blk_rq_bytes(rsp)); 2242 2242 return -EINVAL; 2243 2243 } 2244 2244
+1 -2
drivers/s390/block/dcssblk.c
··· 822 822 if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0) 823 823 /* Request is not page-aligned. */ 824 824 goto fail; 825 - if (((bio->bi_size >> 9) + bio->bi_sector) 826 - > get_capacity(bio->bi_bdev->bd_disk)) { 825 + if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) { 827 826 /* Request beyond end of DCSS segment. */ 828 827 goto fail; 829 828 }
+3 -3
drivers/scsi/libsas/sas_expander.c
··· 2163 2163 } 2164 2164 2165 2165 /* do we need to support multiple segments? */ 2166 - if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) { 2166 + if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) { 2167 2167 printk("%s: multiple segments req %u %u, rsp %u %u\n", 2168 - __func__, req->bio->bi_vcnt, blk_rq_bytes(req), 2169 - rsp->bio->bi_vcnt, blk_rq_bytes(rsp)); 2168 + __func__, bio_segments(req->bio), blk_rq_bytes(req), 2169 + bio_segments(rsp->bio), blk_rq_bytes(rsp)); 2170 2170 return -EINVAL; 2171 2171 } 2172 2172
+5 -5
drivers/scsi/mpt2sas/mpt2sas_transport.c
··· 1939 1939 ioc->transport_cmds.status = MPT2_CMD_PENDING; 1940 1940 1941 1941 /* Check if the request is split across multiple segments */ 1942 - if (req->bio->bi_vcnt > 1) { 1942 + if (bio_segments(req->bio) > 1) { 1943 1943 u32 offset = 0; 1944 1944 1945 1945 /* Allocate memory and copy the request */ ··· 1971 1971 1972 1972 /* Check if the response needs to be populated across 1973 1973 * multiple segments */ 1974 - if (rsp->bio->bi_vcnt > 1) { 1974 + if (bio_segments(rsp->bio) > 1) { 1975 1975 pci_addr_in = pci_alloc_consistent(ioc->pdev, blk_rq_bytes(rsp), 1976 1976 &pci_dma_in); 1977 1977 if (!pci_addr_in) { ··· 2038 2038 sgl_flags = (MPI2_SGE_FLAGS_SIMPLE_ELEMENT | 2039 2039 MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC); 2040 2040 sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; 2041 - if (req->bio->bi_vcnt > 1) { 2041 + if (bio_segments(req->bio) > 1) { 2042 2042 ioc->base_add_sg_single(psge, sgl_flags | 2043 2043 (blk_rq_bytes(req) - 4), pci_dma_out); 2044 2044 } else { ··· 2054 2054 MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | 2055 2055 MPI2_SGE_FLAGS_END_OF_LIST); 2056 2056 sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; 2057 - if (rsp->bio->bi_vcnt > 1) { 2057 + if (bio_segments(rsp->bio) > 1) { 2058 2058 ioc->base_add_sg_single(psge, sgl_flags | 2059 2059 (blk_rq_bytes(rsp) + 4), pci_dma_in); 2060 2060 } else { ··· 2099 2099 le16_to_cpu(mpi_reply->ResponseDataLength); 2100 2100 /* check if the resp needs to be copied from the allocated 2101 2101 * pci mem */ 2102 - if (rsp->bio->bi_vcnt > 1) { 2102 + if (bio_segments(rsp->bio) > 1) { 2103 2103 u32 offset = 0; 2104 2104 u32 bytes_to_copy = 2105 2105 le16_to_cpu(mpi_reply->ResponseDataLength);
+53 -89
fs/bio-integrity.c
··· 27 27 #include <linux/workqueue.h> 28 28 #include <linux/slab.h> 29 29 30 - struct integrity_slab { 31 - struct kmem_cache *slab; 32 - unsigned short nr_vecs; 33 - char name[8]; 34 - }; 30 + #define BIP_INLINE_VECS 4 35 31 36 - #define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) } 37 - struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = { 38 - IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES), 39 - }; 40 - #undef IS 41 - 32 + static struct kmem_cache *bip_slab; 42 33 static struct workqueue_struct *kintegrityd_wq; 43 - 44 - static inline unsigned int vecs_to_idx(unsigned int nr) 45 - { 46 - switch (nr) { 47 - case 1: 48 - return 0; 49 - case 2 ... 4: 50 - return 1; 51 - case 5 ... 16: 52 - return 2; 53 - case 17 ... 64: 54 - return 3; 55 - case 65 ... 128: 56 - return 4; 57 - case 129 ... BIO_MAX_PAGES: 58 - return 5; 59 - default: 60 - BUG(); 61 - } 62 - } 63 - 64 - static inline int use_bip_pool(unsigned int idx) 65 - { 66 - if (idx == BIOVEC_MAX_IDX) 67 - return 1; 68 - 69 - return 0; 70 - } 71 34 72 35 /** 73 36 * bio_integrity_alloc - Allocate integrity payload and attach it to bio ··· 47 84 unsigned int nr_vecs) 48 85 { 49 86 struct bio_integrity_payload *bip; 50 - unsigned int idx = vecs_to_idx(nr_vecs); 51 87 struct bio_set *bs = bio->bi_pool; 88 + unsigned long idx = BIO_POOL_NONE; 89 + unsigned inline_vecs; 52 90 53 - if (!bs) 54 - bs = fs_bio_set; 55 - 56 - BUG_ON(bio == NULL); 57 - bip = NULL; 58 - 59 - /* Lower order allocations come straight from slab */ 60 - if (!use_bip_pool(idx)) 61 - bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask); 62 - 63 - /* Use mempool if lower order alloc failed or max vecs were requested */ 64 - if (bip == NULL) { 65 - idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ 91 + if (!bs) { 92 + bip = kmalloc(sizeof(struct bio_integrity_payload) + 93 + sizeof(struct bio_vec) * nr_vecs, gfp_mask); 94 + inline_vecs = nr_vecs; 95 + } else { 66 96 bip = 
mempool_alloc(bs->bio_integrity_pool, gfp_mask); 67 - 68 - if (unlikely(bip == NULL)) { 69 - printk(KERN_ERR "%s: could not alloc bip\n", __func__); 70 - return NULL; 71 - } 97 + inline_vecs = BIP_INLINE_VECS; 72 98 } 73 99 100 + if (unlikely(!bip)) 101 + return NULL; 102 + 74 103 memset(bip, 0, sizeof(*bip)); 104 + 105 + if (nr_vecs > inline_vecs) { 106 + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, 107 + bs->bvec_integrity_pool); 108 + if (!bip->bip_vec) 109 + goto err; 110 + } else { 111 + bip->bip_vec = bip->bip_inline_vecs; 112 + } 75 113 76 114 bip->bip_slab = idx; 77 115 bip->bip_bio = bio; 78 116 bio->bi_integrity = bip; 79 117 80 118 return bip; 119 + err: 120 + mempool_free(bip, bs->bio_integrity_pool); 121 + return NULL; 81 122 } 82 123 EXPORT_SYMBOL(bio_integrity_alloc); 83 124 ··· 97 130 struct bio_integrity_payload *bip = bio->bi_integrity; 98 131 struct bio_set *bs = bio->bi_pool; 99 132 100 - if (!bs) 101 - bs = fs_bio_set; 102 - 103 - BUG_ON(bip == NULL); 104 - 105 - /* A cloned bio doesn't own the integrity metadata */ 106 - if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) 107 - && bip->bip_buf != NULL) 133 + if (bip->bip_owns_buf) 108 134 kfree(bip->bip_buf); 109 135 110 - if (use_bip_pool(bip->bip_slab)) 136 + if (bs) { 137 + if (bip->bip_slab != BIO_POOL_NONE) 138 + bvec_free(bs->bvec_integrity_pool, bip->bip_vec, 139 + bip->bip_slab); 140 + 111 141 mempool_free(bip, bs->bio_integrity_pool); 112 - else 113 - kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); 142 + } else { 143 + kfree(bip); 144 + } 114 145 115 146 bio->bi_integrity = NULL; 116 147 } ··· 384 419 return -EIO; 385 420 } 386 421 422 + bip->bip_owns_buf = 1; 387 423 bip->bip_buf = buf; 388 424 bip->bip_size = len; 389 425 bip->bip_sector = bio->bi_sector; ··· 660 694 bp->bio1.bi_integrity = &bp->bip1; 661 695 bp->bio2.bi_integrity = &bp->bip2; 662 696 663 - bp->iv1 = bip->bip_vec[0]; 664 - bp->iv2 = bip->bip_vec[0]; 697 + bp->iv1 = 
bip->bip_vec[bip->bip_idx]; 698 + bp->iv2 = bip->bip_vec[bip->bip_idx]; 665 699 666 - bp->bip1.bip_vec[0] = bp->iv1; 667 - bp->bip2.bip_vec[0] = bp->iv2; 700 + bp->bip1.bip_vec = &bp->iv1; 701 + bp->bip2.bip_vec = &bp->iv2; 668 702 669 703 bp->iv1.bv_len = sectors * bi->tuple_size; 670 704 bp->iv2.bv_offset += sectors * bi->tuple_size; ··· 712 746 713 747 int bioset_integrity_create(struct bio_set *bs, int pool_size) 714 748 { 715 - unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); 716 - 717 749 if (bs->bio_integrity_pool) 718 750 return 0; 719 751 720 - bs->bio_integrity_pool = 721 - mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); 752 + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); 753 + 754 + bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); 755 + if (!bs->bvec_integrity_pool) 756 + return -1; 722 757 723 758 if (!bs->bio_integrity_pool) 724 759 return -1; ··· 732 765 { 733 766 if (bs->bio_integrity_pool) 734 767 mempool_destroy(bs->bio_integrity_pool); 768 + 769 + if (bs->bvec_integrity_pool) 770 + mempool_destroy(bs->bio_integrity_pool); 735 771 } 736 772 EXPORT_SYMBOL(bioset_integrity_free); 737 773 738 774 void __init bio_integrity_init(void) 739 775 { 740 - unsigned int i; 741 - 742 776 /* 743 777 * kintegrityd won't block much but may burn a lot of CPU cycles. 744 778 * Make it highpri CPU intensive wq with max concurrency of 1. 
··· 749 781 if (!kintegrityd_wq) 750 782 panic("Failed to create kintegrityd\n"); 751 783 752 - for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { 753 - unsigned int size; 754 - 755 - size = sizeof(struct bio_integrity_payload) 756 - + bip_slab[i].nr_vecs * sizeof(struct bio_vec); 757 - 758 - bip_slab[i].slab = 759 - kmem_cache_create(bip_slab[i].name, size, 0, 760 - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 761 - } 784 + bip_slab = kmem_cache_create("bio_integrity_payload", 785 + sizeof(struct bio_integrity_payload) + 786 + sizeof(struct bio_vec) * BIP_INLINE_VECS, 787 + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 788 + if (!bip_slab) 789 + panic("Failed to create slab\n"); 762 790 }
+324 -42
fs/bio.c
··· 161 161 return bvec_slabs[idx].nr_vecs; 162 162 } 163 163 164 - void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) 164 + void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) 165 165 { 166 166 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); 167 167 168 168 if (idx == BIOVEC_MAX_IDX) 169 - mempool_free(bv, bs->bvec_pool); 169 + mempool_free(bv, pool); 170 170 else { 171 171 struct biovec_slab *bvs = bvec_slabs + idx; 172 172 ··· 174 174 } 175 175 } 176 176 177 - struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, 178 - struct bio_set *bs) 177 + struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, 178 + mempool_t *pool) 179 179 { 180 180 struct bio_vec *bvl; 181 181 ··· 211 211 */ 212 212 if (*idx == BIOVEC_MAX_IDX) { 213 213 fallback: 214 - bvl = mempool_alloc(bs->bvec_pool, gfp_mask); 214 + bvl = mempool_alloc(pool, gfp_mask); 215 215 } else { 216 216 struct biovec_slab *bvs = bvec_slabs + *idx; 217 217 gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); ··· 253 253 __bio_free(bio); 254 254 255 255 if (bs) { 256 - if (bio_has_allocated_vec(bio)) 257 - bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); 256 + if (bio_flagged(bio, BIO_OWNS_VEC)) 257 + bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); 258 258 259 259 /* 260 260 * If we have front padding, adjust the bio pointer before freeing ··· 298 298 } 299 299 EXPORT_SYMBOL(bio_reset); 300 300 301 + static void bio_alloc_rescue(struct work_struct *work) 302 + { 303 + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); 304 + struct bio *bio; 305 + 306 + while (1) { 307 + spin_lock(&bs->rescue_lock); 308 + bio = bio_list_pop(&bs->rescue_list); 309 + spin_unlock(&bs->rescue_lock); 310 + 311 + if (!bio) 312 + break; 313 + 314 + generic_make_request(bio); 315 + } 316 + } 317 + 318 + static void punt_bios_to_rescuer(struct bio_set *bs) 319 + { 320 + struct bio_list punt, nopunt; 321 + struct bio *bio; 322 + 323 + 
/* 324 + * In order to guarantee forward progress we must punt only bios that 325 + * were allocated from this bio_set; otherwise, if there was a bio on 326 + * there for a stacking driver higher up in the stack, processing it 327 + * could require allocating bios from this bio_set, and doing that from 328 + * our own rescuer would be bad. 329 + * 330 + * Since bio lists are singly linked, pop them all instead of trying to 331 + * remove from the middle of the list: 332 + */ 333 + 334 + bio_list_init(&punt); 335 + bio_list_init(&nopunt); 336 + 337 + while ((bio = bio_list_pop(current->bio_list))) 338 + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); 339 + 340 + *current->bio_list = nopunt; 341 + 342 + spin_lock(&bs->rescue_lock); 343 + bio_list_merge(&bs->rescue_list, &punt); 344 + spin_unlock(&bs->rescue_lock); 345 + 346 + queue_work(bs->rescue_workqueue, &bs->rescue_work); 347 + } 348 + 301 349 /** 302 350 * bio_alloc_bioset - allocate a bio for I/O 303 351 * @gfp_mask: the GFP_ mask given to the slab allocator ··· 363 315 * previously allocated bio for IO before attempting to allocate a new one. 364 316 * Failure to do so can cause deadlocks under memory pressure. 365 317 * 318 + * Note that when running under generic_make_request() (i.e. any block 319 + * driver), bios are not submitted until after you return - see the code in 320 + * generic_make_request() that converts recursion into iteration, to prevent 321 + * stack overflows. 322 + * 323 + * This would normally mean allocating multiple bios under 324 + * generic_make_request() would be susceptible to deadlocks, but we have 325 + * deadlock avoidance code that resubmits any blocked bios from a rescuer 326 + * thread. 327 + * 328 + * However, we do not guarantee forward progress for allocations from other 329 + * mempools. Doing multiple allocations from the same mempool under 330 + * generic_make_request() should be avoided - instead, use bio_set's front_pad 331 + * for per bio allocations. 
332 + * 366 333 * RETURNS: 367 334 * Pointer to new bio on success, NULL on failure. 368 335 */ 369 336 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 370 337 { 338 + gfp_t saved_gfp = gfp_mask; 371 339 unsigned front_pad; 372 340 unsigned inline_vecs; 373 341 unsigned long idx = BIO_POOL_NONE; ··· 401 337 front_pad = 0; 402 338 inline_vecs = nr_iovecs; 403 339 } else { 340 + /* 341 + * generic_make_request() converts recursion to iteration; this 342 + * means if we're running beneath it, any bios we allocate and 343 + * submit will not be submitted (and thus freed) until after we 344 + * return. 345 + * 346 + * This exposes us to a potential deadlock if we allocate 347 + * multiple bios from the same bio_set() while running 348 + * underneath generic_make_request(). If we were to allocate 349 + * multiple bios (say a stacking block driver that was splitting 350 + * bios), we would deadlock if we exhausted the mempool's 351 + * reserve. 352 + * 353 + * We solve this, and guarantee forward progress, with a rescuer 354 + * workqueue per bio_set. If we go to allocate and there are 355 + * bios on current->bio_list, we first try the allocation 356 + * without __GFP_WAIT; if that fails, we punt those bios we 357 + * would be blocking to the rescuer workqueue before we retry 358 + * with the original gfp_flags. 
359 + */ 360 + 361 + if (current->bio_list && !bio_list_empty(current->bio_list)) 362 + gfp_mask &= ~__GFP_WAIT; 363 + 404 364 p = mempool_alloc(bs->bio_pool, gfp_mask); 365 + if (!p && gfp_mask != saved_gfp) { 366 + punt_bios_to_rescuer(bs); 367 + gfp_mask = saved_gfp; 368 + p = mempool_alloc(bs->bio_pool, gfp_mask); 369 + } 370 + 405 371 front_pad = bs->front_pad; 406 372 inline_vecs = BIO_INLINE_VECS; 407 373 } ··· 443 349 bio_init(bio); 444 350 445 351 if (nr_iovecs > inline_vecs) { 446 - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 352 + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); 353 + if (!bvl && gfp_mask != saved_gfp) { 354 + punt_bios_to_rescuer(bs); 355 + gfp_mask = saved_gfp; 356 + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); 357 + } 358 + 447 359 if (unlikely(!bvl)) 448 360 goto err_free; 361 + 362 + bio->bi_flags |= 1 << BIO_OWNS_VEC; 449 363 } else if (nr_iovecs) { 450 364 bvl = bio->bi_inline_vecs; 451 365 } ··· 755 653 } 756 654 EXPORT_SYMBOL(bio_add_page); 757 655 656 + struct submit_bio_ret { 657 + struct completion event; 658 + int error; 659 + }; 660 + 661 + static void submit_bio_wait_endio(struct bio *bio, int error) 662 + { 663 + struct submit_bio_ret *ret = bio->bi_private; 664 + 665 + ret->error = error; 666 + complete(&ret->event); 667 + } 668 + 669 + /** 670 + * submit_bio_wait - submit a bio, and wait until it completes 671 + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 672 + * @bio: The &struct bio which describes the I/O 673 + * 674 + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from 675 + * bio_endio() on failure. 
676 + */ 677 + int submit_bio_wait(int rw, struct bio *bio) 678 + { 679 + struct submit_bio_ret ret; 680 + 681 + rw |= REQ_SYNC; 682 + init_completion(&ret.event); 683 + bio->bi_private = &ret; 684 + bio->bi_end_io = submit_bio_wait_endio; 685 + submit_bio(rw, bio); 686 + wait_for_completion(&ret.event); 687 + 688 + return ret.error; 689 + } 690 + EXPORT_SYMBOL(submit_bio_wait); 691 + 692 + /** 693 + * bio_advance - increment/complete a bio by some number of bytes 694 + * @bio: bio to advance 695 + * @bytes: number of bytes to complete 696 + * 697 + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to 698 + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will 699 + * be updated on the last bvec as well. 700 + * 701 + * @bio will then represent the remaining, uncompleted portion of the io. 702 + */ 703 + void bio_advance(struct bio *bio, unsigned bytes) 704 + { 705 + if (bio_integrity(bio)) 706 + bio_integrity_advance(bio, bytes); 707 + 708 + bio->bi_sector += bytes >> 9; 709 + bio->bi_size -= bytes; 710 + 711 + if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) 712 + return; 713 + 714 + while (bytes) { 715 + if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 716 + WARN_ONCE(1, "bio idx %d >= vcnt %d\n", 717 + bio->bi_idx, bio->bi_vcnt); 718 + break; 719 + } 720 + 721 + if (bytes >= bio_iovec(bio)->bv_len) { 722 + bytes -= bio_iovec(bio)->bv_len; 723 + bio->bi_idx++; 724 + } else { 725 + bio_iovec(bio)->bv_len -= bytes; 726 + bio_iovec(bio)->bv_offset += bytes; 727 + bytes = 0; 728 + } 729 + } 730 + } 731 + EXPORT_SYMBOL(bio_advance); 732 + 733 + /** 734 + * bio_alloc_pages - allocates a single page for each bvec in a bio 735 + * @bio: bio to allocate pages for 736 + * @gfp_mask: flags for allocation 737 + * 738 + * Allocates pages up to @bio->bi_vcnt. 739 + * 740 + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are 741 + * freed. 
742 + */ 743 + int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) 744 + { 745 + int i; 746 + struct bio_vec *bv; 747 + 748 + bio_for_each_segment_all(bv, bio, i) { 749 + bv->bv_page = alloc_page(gfp_mask); 750 + if (!bv->bv_page) { 751 + while (--bv >= bio->bi_io_vec) 752 + __free_page(bv->bv_page); 753 + return -ENOMEM; 754 + } 755 + } 756 + 757 + return 0; 758 + } 759 + EXPORT_SYMBOL(bio_alloc_pages); 760 + 761 + /** 762 + * bio_copy_data - copy contents of data buffers from one chain of bios to 763 + * another 764 + * @src: source bio list 765 + * @dst: destination bio list 766 + * 767 + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats 768 + * @src and @dst as linked lists of bios. 769 + * 770 + * Stops when it reaches the end of either @src or @dst - that is, copies 771 + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). 772 + */ 773 + void bio_copy_data(struct bio *dst, struct bio *src) 774 + { 775 + struct bio_vec *src_bv, *dst_bv; 776 + unsigned src_offset, dst_offset, bytes; 777 + void *src_p, *dst_p; 778 + 779 + src_bv = bio_iovec(src); 780 + dst_bv = bio_iovec(dst); 781 + 782 + src_offset = src_bv->bv_offset; 783 + dst_offset = dst_bv->bv_offset; 784 + 785 + while (1) { 786 + if (src_offset == src_bv->bv_offset + src_bv->bv_len) { 787 + src_bv++; 788 + if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { 789 + src = src->bi_next; 790 + if (!src) 791 + break; 792 + 793 + src_bv = bio_iovec(src); 794 + } 795 + 796 + src_offset = src_bv->bv_offset; 797 + } 798 + 799 + if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { 800 + dst_bv++; 801 + if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { 802 + dst = dst->bi_next; 803 + if (!dst) 804 + break; 805 + 806 + dst_bv = bio_iovec(dst); 807 + } 808 + 809 + dst_offset = dst_bv->bv_offset; 810 + } 811 + 812 + bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, 813 + src_bv->bv_offset + src_bv->bv_len - src_offset); 814 + 815 + src_p = 
kmap_atomic(src_bv->bv_page); 816 + dst_p = kmap_atomic(dst_bv->bv_page); 817 + 818 + memcpy(dst_p + dst_bv->bv_offset, 819 + src_p + src_bv->bv_offset, 820 + bytes); 821 + 822 + kunmap_atomic(dst_p); 823 + kunmap_atomic(src_p); 824 + 825 + src_offset += bytes; 826 + dst_offset += bytes; 827 + } 828 + } 829 + EXPORT_SYMBOL(bio_copy_data); 830 + 758 831 struct bio_map_data { 759 832 struct bio_vec *iovecs; 760 833 struct sg_iovec *sgvecs; ··· 992 715 int iov_idx = 0; 993 716 unsigned int iov_off = 0; 994 717 995 - __bio_for_each_segment(bvec, bio, i, 0) { 718 + bio_for_each_segment_all(bvec, bio, i) { 996 719 char *bv_addr = page_address(bvec->bv_page); 997 720 unsigned int bv_len = iovecs[i].bv_len; 998 721 ··· 1174 897 return bio; 1175 898 cleanup: 1176 899 if (!map_data) 1177 - bio_for_each_segment(bvec, bio, i) 900 + bio_for_each_segment_all(bvec, bio, i) 1178 901 __free_page(bvec->bv_page); 1179 902 1180 903 bio_put(bio); ··· 1388 1111 /* 1389 1112 * make sure we dirty pages we wrote to 1390 1113 */ 1391 - __bio_for_each_segment(bvec, bio, i, 0) { 1114 + bio_for_each_segment_all(bvec, bio, i) { 1392 1115 if (bio_data_dir(bio) == READ) 1393 1116 set_page_dirty_lock(bvec->bv_page); 1394 1117 ··· 1494 1217 int i; 1495 1218 char *p = bmd->sgvecs[0].iov_base; 1496 1219 1497 - __bio_for_each_segment(bvec, bio, i, 0) { 1220 + bio_for_each_segment_all(bvec, bio, i) { 1498 1221 char *addr = page_address(bvec->bv_page); 1499 1222 int len = bmd->iovecs[i].bv_len; 1500 1223 ··· 1534 1257 if (!reading) { 1535 1258 void *p = data; 1536 1259 1537 - bio_for_each_segment(bvec, bio, i) { 1260 + bio_for_each_segment_all(bvec, bio, i) { 1538 1261 char *addr = page_address(bvec->bv_page); 1539 1262 1540 1263 memcpy(addr, p, bvec->bv_len); ··· 1579 1302 */ 1580 1303 void bio_set_pages_dirty(struct bio *bio) 1581 1304 { 1582 - struct bio_vec *bvec = bio->bi_io_vec; 1305 + struct bio_vec *bvec; 1583 1306 int i; 1584 1307 1585 - for (i = 0; i < bio->bi_vcnt; i++) { 1586 - struct page 
*page = bvec[i].bv_page; 1308 + bio_for_each_segment_all(bvec, bio, i) { 1309 + struct page *page = bvec->bv_page; 1587 1310 1588 1311 if (page && !PageCompound(page)) 1589 1312 set_page_dirty_lock(page); ··· 1592 1315 1593 1316 static void bio_release_pages(struct bio *bio) 1594 1317 { 1595 - struct bio_vec *bvec = bio->bi_io_vec; 1318 + struct bio_vec *bvec; 1596 1319 int i; 1597 1320 1598 - for (i = 0; i < bio->bi_vcnt; i++) { 1599 - struct page *page = bvec[i].bv_page; 1321 + bio_for_each_segment_all(bvec, bio, i) { 1322 + struct page *page = bvec->bv_page; 1600 1323 1601 1324 if (page) 1602 1325 put_page(page); ··· 1645 1368 1646 1369 void bio_check_pages_dirty(struct bio *bio) 1647 1370 { 1648 - struct bio_vec *bvec = bio->bi_io_vec; 1371 + struct bio_vec *bvec; 1649 1372 int nr_clean_pages = 0; 1650 1373 int i; 1651 1374 1652 - for (i = 0; i < bio->bi_vcnt; i++) { 1653 - struct page *page = bvec[i].bv_page; 1375 + bio_for_each_segment_all(bvec, bio, i) { 1376 + struct page *page = bvec->bv_page; 1654 1377 1655 1378 if (PageDirty(page) || PageCompound(page)) { 1656 1379 page_cache_release(page); 1657 - bvec[i].bv_page = NULL; 1380 + bvec->bv_page = NULL; 1658 1381 } else { 1659 1382 nr_clean_pages++; 1660 1383 } ··· 1755 1478 trace_block_split(bdev_get_queue(bi->bi_bdev), bi, 1756 1479 bi->bi_sector + first_sectors); 1757 1480 1758 - BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0); 1759 - BUG_ON(bi->bi_idx != 0); 1481 + BUG_ON(bio_segments(bi) > 1); 1760 1482 atomic_set(&bp->cnt, 3); 1761 1483 bp->error = 0; 1762 1484 bp->bio1 = *bi; ··· 1765 1489 bp->bio1.bi_size = first_sectors << 9; 1766 1490 1767 1491 if (bi->bi_vcnt != 0) { 1768 - bp->bv1 = bi->bi_io_vec[0]; 1769 - bp->bv2 = bi->bi_io_vec[0]; 1492 + bp->bv1 = *bio_iovec(bi); 1493 + bp->bv2 = *bio_iovec(bi); 1770 1494 1771 1495 if (bio_is_rw(bi)) { 1772 1496 bp->bv2.bv_offset += first_sectors << 9; ··· 1818 1542 if (index >= bio->bi_idx) 1819 1543 index = bio->bi_vcnt - 1; 1820 1544 1821 - 
__bio_for_each_segment(bv, bio, i, 0) { 1545 + bio_for_each_segment_all(bv, bio, i) { 1822 1546 if (i == index) { 1823 1547 if (offset > bv->bv_offset) 1824 1548 sectors += (offset - bv->bv_offset) / sector_sz; ··· 1836 1560 * create memory pools for biovec's in a bio_set. 1837 1561 * use the global biovec slabs created for general use. 1838 1562 */ 1839 - static int biovec_create_pools(struct bio_set *bs, int pool_entries) 1563 + mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) 1840 1564 { 1841 1565 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; 1842 1566 1843 - bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); 1844 - if (!bs->bvec_pool) 1845 - return -ENOMEM; 1846 - 1847 - return 0; 1848 - } 1849 - 1850 - static void biovec_free_pools(struct bio_set *bs) 1851 - { 1852 - mempool_destroy(bs->bvec_pool); 1567 + return mempool_create_slab_pool(pool_entries, bp->slab); 1853 1568 } 1854 1569 1855 1570 void bioset_free(struct bio_set *bs) 1856 1571 { 1572 + if (bs->rescue_workqueue) 1573 + destroy_workqueue(bs->rescue_workqueue); 1574 + 1857 1575 if (bs->bio_pool) 1858 1576 mempool_destroy(bs->bio_pool); 1859 1577 1578 + if (bs->bvec_pool) 1579 + mempool_destroy(bs->bvec_pool); 1580 + 1860 1581 bioset_integrity_free(bs); 1861 - biovec_free_pools(bs); 1862 1582 bio_put_slab(bs); 1863 1583 1864 1584 kfree(bs); ··· 1885 1613 1886 1614 bs->front_pad = front_pad; 1887 1615 1616 + spin_lock_init(&bs->rescue_lock); 1617 + bio_list_init(&bs->rescue_list); 1618 + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); 1619 + 1888 1620 bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); 1889 1621 if (!bs->bio_slab) { 1890 1622 kfree(bs); ··· 1899 1623 if (!bs->bio_pool) 1900 1624 goto bad; 1901 1625 1902 - if (!biovec_create_pools(bs, pool_size)) 1903 - return bs; 1626 + bs->bvec_pool = biovec_create_pool(bs, pool_size); 1627 + if (!bs->bvec_pool) 1628 + goto bad; 1904 1629 1630 + bs->rescue_workqueue = alloc_workqueue("bioset", 
WQ_MEM_RECLAIM, 0); 1631 + if (!bs->rescue_workqueue) 1632 + goto bad; 1633 + 1634 + return bs; 1905 1635 bad: 1906 1636 bioset_free(bs); 1907 1637 return NULL;
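The new punt_bios_to_rescuer() above works around the singly linked bio_list: it pops every entry off current->bio_list, partitions entries into a "punt" list (bios owned by this bio_set) and a "nopunt" list (everyone else's), then puts the nopunt entries back. A minimal userspace sketch of that pop-all-and-partition pattern, with illustrative types (`struct node`, `pool_id`) standing in for the kernel's `struct bio` and `bi_pool`:

```c
#include <assert.h>
#include <stddef.h>

/* Minimal singly linked list mimicking the kernel's struct bio_list;
 * all names here are illustrative, not the kernel API. */
struct node { struct node *next; int pool_id; };
struct list { struct node *head, *tail; };

static void list_init(struct list *l) { l->head = l->tail = NULL; }

static void list_add(struct list *l, struct node *n)
{
    n->next = NULL;
    if (l->tail)
        l->tail->next = n;
    else
        l->head = n;
    l->tail = n;
}

static struct node *list_pop(struct list *l)
{
    struct node *n = l->head;
    if (n) {
        l->head = n->next;
        if (!l->head)
            l->tail = NULL;
        n->next = NULL;
    }
    return n;
}

/* Like punt_bios_to_rescuer(): the list is singly linked, so instead of
 * removing from the middle we pop everything, partition into "punt"
 * (entries from our pool) and "nopunt" (the rest), and put the nopunt
 * entries back. Returns the punt list. */
static struct list punt_matching(struct list *current_list, int pool_id)
{
    struct list punt, nopunt;
    struct node *n;

    list_init(&punt);
    list_init(&nopunt);

    while ((n = list_pop(current_list)))
        list_add(n->pool_id == pool_id ? &punt : &nopunt, n);

    *current_list = nopunt;
    return punt;
}
```

In the kernel the punt list is then merged onto bs->rescue_list under bs->rescue_lock and handed to the WQ_MEM_RECLAIM rescue workqueue created at the end of bioset_create() above.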
+1 -1
fs/block_dev.c
··· 1556 1556 return 0; 1557 1557 1558 1558 size -= pos; 1559 - if (size < INT_MAX) 1559 + if (size < iocb->ki_left) 1560 1560 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); 1561 1561 return generic_file_aio_read(iocb, iov, nr_segs, pos); 1562 1562 }
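The blkdev_aio_read() fix changes the shorten criterion from the constant INT_MAX to iocb->ki_left: the iovec only needs trimming when the bytes remaining before end-of-device (`size`) are fewer than the bytes the caller asked for. A hedged sketch of that logic with a hypothetical `shorten_iov()` helper (my reconstruction of iov_shorten()'s behavior, not the kernel function):

```c
#include <assert.h>
#include <stddef.h>

struct iovec_s { void *base; size_t len; };

/* Illustrative analog of iov_shorten(): trim an iovec array so it
 * covers at most 'to' bytes, returning the new segment count. */
static unsigned long shorten_iov(struct iovec_s *iov, unsigned long nr_segs,
                                 size_t to)
{
    unsigned long seg = 0;
    size_t len = 0;

    while (seg < nr_segs) {
        seg++;
        len += iov[seg - 1].len;
        if (len >= to)
            break;
    }
    if (len > to)
        iov[seg - 1].len -= len - to;
    return seg;
}

/* The fixed criterion from the hunk above: shorten only when fewer
 * bytes remain on the device ('size') than were requested. */
static int should_shorten(long long size, size_t requested)
{
    return size >= 0 && (size_t)size < requested;
}
```

With the old `size < INT_MAX` test, any device smaller than 2 GiB truncated every read to the remaining size even when the request was already shorter.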
+1 -2
fs/btrfs/extent_io.c
··· 2560 2560 if (old_compressed) 2561 2561 contig = bio->bi_sector == sector; 2562 2562 else 2563 - contig = bio->bi_sector + (bio->bi_size >> 9) == 2564 - sector; 2563 + contig = bio_end_sector(bio) == sector; 2565 2564 2566 2565 if (prev_bio_flags != bio_flags || !contig || 2567 2566 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
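The hunk above replaces the open-coded `bi_sector + (bi_size >> 9)` with the new bio_end_sector() macro from include/linux/bio.h. A small userspace demo of the sector arithmetic, with `struct fake_bio` as an illustrative stand-in for `struct bio`:

```c
#include <assert.h>

typedef unsigned long long sector_t;

/* Illustrative stand-in for the two struct bio fields involved. */
struct fake_bio { sector_t bi_sector; unsigned int bi_size; };

/* Userspace analogs of the helpers: bio_sectors() converts the byte
 * count to 512-byte sectors, bio_end_sector() gives the first sector
 * past the end of the bio. */
#define bio_sectors(bio)    ((bio)->bi_size >> 9)
#define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors(bio))

/* The btrfs contiguity test from the hunk: a new extent at 'sector'
 * can be appended iff it starts exactly where the bio now ends. */
static int contig(const struct fake_bio *bio, sector_t sector)
{
    return bio_end_sector(bio) == sector;
}
```

The gfs2/lops.c hunk further down makes the identical substitution when deciding whether a block can be added to the current log bio.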
+1 -1
fs/btrfs/volumes.c
··· 5177 5177 } 5178 5178 5179 5179 prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 5180 - if ((bio->bi_size >> 9) > max_sectors) 5180 + if (bio_sectors(bio) > max_sectors) 5181 5181 return 0; 5182 5182 5183 5183 if (!q->merge_bvec_fn)
-1
fs/buffer.c
··· 2977 2977 bio->bi_io_vec[0].bv_offset = bh_offset(bh); 2978 2978 2979 2979 bio->bi_vcnt = 1; 2980 - bio->bi_idx = 0; 2981 2980 bio->bi_size = bh->b_size; 2982 2981 2983 2982 bio->bi_end_io = end_bio_bh_io_sync;
+4 -4
fs/direct-io.c
··· 442 442 static int dio_bio_complete(struct dio *dio, struct bio *bio) 443 443 { 444 444 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 445 - struct bio_vec *bvec = bio->bi_io_vec; 446 - int page_no; 445 + struct bio_vec *bvec; 446 + unsigned i; 447 447 448 448 if (!uptodate) 449 449 dio->io_error = -EIO; ··· 451 451 if (dio->is_async && dio->rw == READ) { 452 452 bio_check_pages_dirty(bio); /* transfers ownership */ 453 453 } else { 454 - for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { 455 - struct page *page = bvec[page_no].bv_page; 454 + bio_for_each_segment_all(bvec, bio, i) { 455 + struct page *page = bvec->bv_page; 456 456 457 457 if (dio->rw == READ && !PageCompound(page)) 458 458 set_page_dirty_lock(page);
+1 -1
fs/exofs/ore.c
··· 401 401 struct bio_vec *bv; 402 402 unsigned i; 403 403 404 - __bio_for_each_segment(bv, bio, i, 0) { 404 + bio_for_each_segment_all(bv, bio, i) { 405 405 unsigned this_count = bv->bv_len; 406 406 407 407 if (likely(PAGE_SIZE == this_count))
+1 -1
fs/exofs/ore_raid.c
··· 432 432 if (!bio) 433 433 continue; 434 434 435 - __bio_for_each_segment(bv, bio, i, 0) { 435 + bio_for_each_segment_all(bv, bio, i) { 436 436 struct page *page = bv->bv_page; 437 437 438 438 SetPageUptodate(page);
+32 -70
fs/fs-writeback.c
··· 22 22 #include <linux/mm.h> 23 23 #include <linux/pagemap.h> 24 24 #include <linux/kthread.h> 25 - #include <linux/freezer.h> 26 25 #include <linux/writeback.h> 27 26 #include <linux/blkdev.h> 28 27 #include <linux/backing-dev.h> ··· 87 88 #define CREATE_TRACE_POINTS 88 89 #include <trace/events/writeback.h> 89 90 90 - /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ 91 - static void bdi_wakeup_flusher(struct backing_dev_info *bdi) 92 - { 93 - if (bdi->wb.task) { 94 - wake_up_process(bdi->wb.task); 95 - } else { 96 - /* 97 - * The bdi thread isn't there, wake up the forker thread which 98 - * will create and run it. 99 - */ 100 - wake_up_process(default_backing_dev_info.wb.task); 101 - } 102 - } 103 - 104 91 static void bdi_queue_work(struct backing_dev_info *bdi, 105 92 struct wb_writeback_work *work) 106 93 { ··· 94 109 95 110 spin_lock_bh(&bdi->wb_lock); 96 111 list_add_tail(&work->list, &bdi->work_list); 97 - if (!bdi->wb.task) 98 - trace_writeback_nothread(bdi, work); 99 - bdi_wakeup_flusher(bdi); 100 112 spin_unlock_bh(&bdi->wb_lock); 113 + 114 + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 101 115 } 102 116 103 117 static void ··· 111 127 */ 112 128 work = kzalloc(sizeof(*work), GFP_ATOMIC); 113 129 if (!work) { 114 - if (bdi->wb.task) { 115 - trace_writeback_nowork(bdi); 116 - wake_up_process(bdi->wb.task); 117 - } 130 + trace_writeback_nowork(bdi); 131 + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 118 132 return; 119 133 } 120 134 ··· 159 177 * writeback as soon as there is no other work to do. 160 178 */ 161 179 trace_writeback_wake_background(bdi); 162 - spin_lock_bh(&bdi->wb_lock); 163 - bdi_wakeup_flusher(bdi); 164 - spin_unlock_bh(&bdi->wb_lock); 180 + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 165 181 } 166 182 167 183 /* ··· 1000 1020 1001 1021 /* 1002 1022 * Handle writeback of dirty data for the device backed by this bdi. Also 1003 - * wakes up periodically and does kupdated style flushing. 
1023 + * reschedules periodically and does kupdated style flushing. 1004 1024 */ 1005 - int bdi_writeback_thread(void *data) 1025 + void bdi_writeback_workfn(struct work_struct *work) 1006 1026 { 1007 - struct bdi_writeback *wb = data; 1027 + struct bdi_writeback *wb = container_of(to_delayed_work(work), 1028 + struct bdi_writeback, dwork); 1008 1029 struct backing_dev_info *bdi = wb->bdi; 1009 1030 long pages_written; 1010 1031 1011 1032 set_worker_desc("flush-%s", dev_name(bdi->dev)); 1012 1033 current->flags |= PF_SWAPWRITE; 1013 - set_freezable(); 1014 - wb->last_active = jiffies; 1015 1034 1016 - /* 1017 - * Our parent may run at a different priority, just set us to normal 1018 - */ 1019 - set_user_nice(current, 0); 1020 - 1021 - trace_writeback_thread_start(bdi); 1022 - 1023 - while (!kthread_freezable_should_stop(NULL)) { 1035 + if (likely(!current_is_workqueue_rescuer() || 1036 + list_empty(&bdi->bdi_list))) { 1024 1037 /* 1025 - * Remove own delayed wake-up timer, since we are already awake 1026 - * and we'll take care of the periodic write-back. 1038 + * The normal path. Keep writing back @bdi until its 1039 + * work_list is empty. Note that this path is also taken 1040 + * if @bdi is shutting down even when we're running off the 1041 + * rescuer as work_list needs to be drained. 1027 1042 */ 1028 - del_timer(&wb->wakeup_timer); 1029 - 1030 - pages_written = wb_do_writeback(wb, 0); 1031 - 1043 + do { 1044 + pages_written = wb_do_writeback(wb, 0); 1045 + trace_writeback_pages_written(pages_written); 1046 + } while (!list_empty(&bdi->work_list)); 1047 + } else { 1048 + /* 1049 + * bdi_wq can't get enough workers and we're running off 1050 + * the emergency worker. Don't hog it. Hopefully, 1024 is 1051 + * enough for efficient IO. 
1052 + */ 1053 + pages_written = writeback_inodes_wb(&bdi->wb, 1024, 1054 + WB_REASON_FORKER_THREAD); 1032 1055 trace_writeback_pages_written(pages_written); 1033 - 1034 - if (pages_written) 1035 - wb->last_active = jiffies; 1036 - 1037 - set_current_state(TASK_INTERRUPTIBLE); 1038 - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { 1039 - __set_current_state(TASK_RUNNING); 1040 - continue; 1041 - } 1042 - 1043 - if (wb_has_dirty_io(wb) && dirty_writeback_interval) 1044 - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 1045 - else { 1046 - /* 1047 - * We have nothing to do, so can go sleep without any 1048 - * timeout and save power. When a work is queued or 1049 - * something is made dirty - we will be woken up. 1050 - */ 1051 - schedule(); 1052 - } 1053 1056 } 1054 1057 1055 - /* Flush any work that raced with us exiting */ 1056 - if (!list_empty(&bdi->work_list)) 1057 - wb_do_writeback(wb, 1); 1058 + if (!list_empty(&bdi->work_list) || 1059 + (wb_has_dirty_io(wb) && dirty_writeback_interval)) 1060 + queue_delayed_work(bdi_wq, &wb->dwork, 1061 + msecs_to_jiffies(dirty_writeback_interval * 10)); 1058 1062 1059 - trace_writeback_thread_stop(bdi); 1060 - return 0; 1063 + current->flags &= ~PF_SWAPWRITE; 1061 1064 } 1062 - 1063 1065 1064 1066 /* 1065 1067 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
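The new bdi_writeback_workfn() recovers its bdi_writeback from the bare work pointer the workqueue hands it, via `container_of(to_delayed_work(work), struct bdi_writeback, dwork)`. That pointer-to-member-to-container climb can be demonstrated in userspace; the struct names below are illustrative stand-ins, only the `container_of` definition matches the kernel's:

```c
#include <assert.h>
#include <stddef.h>

/* Userspace container_of, as in the kernel: recover the address of a
 * struct from a pointer to one of its members. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

/* Illustrative stand-ins for struct work_struct / delayed_work /
 * bdi_writeback. */
struct work_s { int pending; };
struct delayed_work_s { struct work_s work; long timer; };
struct writeback_s {
    int id;
    struct delayed_work_s dwork;
};

/* Mirrors the shape of bdi_writeback_workfn(): the workqueue passes
 * only the embedded work item; two container_of steps climb back out
 * to the delayed work, then to the writeback structure. */
static struct writeback_s *wb_from_work(struct work_s *w)
{
    struct delayed_work_s *dw = container_of(w, struct delayed_work_s, work);
    return container_of(dw, struct writeback_s, dwork);
}
```

This is what lets the conversion drop the dedicated flusher kthread: the generic worker needs no per-bdi `void *data`, since the work item itself identifies its bdi_writeback.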
+1 -1
fs/gfs2/lops.c
··· 300 300 u64 nblk; 301 301 302 302 if (bio) { 303 - nblk = bio->bi_sector + bio_sectors(bio); 303 + nblk = bio_end_sector(bio); 304 304 nblk >>= sdp->sd_fsb2bb_shift; 305 305 if (blkno == nblk) 306 306 return bio;
-2
fs/jfs/jfs_logmgr.c
··· 2005 2005 bio->bi_io_vec[0].bv_offset = bp->l_offset; 2006 2006 2007 2007 bio->bi_vcnt = 1; 2008 - bio->bi_idx = 0; 2009 2008 bio->bi_size = LOGPSIZE; 2010 2009 2011 2010 bio->bi_end_io = lbmIODone; ··· 2145 2146 bio->bi_io_vec[0].bv_offset = bp->l_offset; 2146 2147 2147 2148 bio->bi_vcnt = 1; 2148 - bio->bi_idx = 0; 2149 2149 bio->bi_size = LOGPSIZE; 2150 2150 2151 2151 bio->bi_end_io = lbmIODone;
-5
fs/logfs/dev_bdev.c
··· 32 32 bio_vec.bv_len = PAGE_SIZE; 33 33 bio_vec.bv_offset = 0; 34 34 bio.bi_vcnt = 1; 35 - bio.bi_idx = 0; 36 35 bio.bi_size = PAGE_SIZE; 37 36 bio.bi_bdev = bdev; 38 37 bio.bi_sector = page->index * (PAGE_SIZE >> 9); ··· 107 108 if (i >= max_pages) { 108 109 /* Block layer cannot split bios :( */ 109 110 bio->bi_vcnt = i; 110 - bio->bi_idx = 0; 111 111 bio->bi_size = i * PAGE_SIZE; 112 112 bio->bi_bdev = super->s_bdev; 113 113 bio->bi_sector = ofs >> 9; ··· 134 136 unlock_page(page); 135 137 } 136 138 bio->bi_vcnt = nr_pages; 137 - bio->bi_idx = 0; 138 139 bio->bi_size = nr_pages * PAGE_SIZE; 139 140 bio->bi_bdev = super->s_bdev; 140 141 bio->bi_sector = ofs >> 9; ··· 199 202 if (i >= max_pages) { 200 203 /* Block layer cannot split bios :( */ 201 204 bio->bi_vcnt = i; 202 - bio->bi_idx = 0; 203 205 bio->bi_size = i * PAGE_SIZE; 204 206 bio->bi_bdev = super->s_bdev; 205 207 bio->bi_sector = ofs >> 9; ··· 220 224 bio->bi_io_vec[i].bv_offset = 0; 221 225 } 222 226 bio->bi_vcnt = nr_pages; 223 - bio->bi_idx = 0; 224 227 bio->bi_size = nr_pages * PAGE_SIZE; 225 228 bio->bi_bdev = super->s_bdev; 226 229 bio->bi_sector = ofs >> 9;
+5 -11
include/linux/backing-dev.h
··· 18 18 #include <linux/writeback.h> 19 19 #include <linux/atomic.h> 20 20 #include <linux/sysctl.h> 21 + #include <linux/workqueue.h> 21 22 22 23 struct page; 23 24 struct device; ··· 28 27 * Bits in backing_dev_info.state 29 28 */ 30 29 enum bdi_state { 31 - BDI_pending, /* On its way to being activated */ 32 30 BDI_wb_alloc, /* Default embedded wb allocated */ 33 31 BDI_async_congested, /* The async (write) queue is getting full */ 34 32 BDI_sync_congested, /* The sync queue is getting full */ ··· 53 53 unsigned int nr; 54 54 55 55 unsigned long last_old_flush; /* last old data flush */ 56 - unsigned long last_active; /* last time bdi thread was active */ 57 56 58 - struct task_struct *task; /* writeback thread */ 59 - struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ 57 + struct delayed_work dwork; /* work item used for writeback */ 60 58 struct list_head b_dirty; /* dirty inodes */ 61 59 struct list_head b_io; /* parked for writeback */ 62 60 struct list_head b_more_io; /* parked for more writeback */ ··· 121 123 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 122 124 enum wb_reason reason); 123 125 void bdi_start_background_writeback(struct backing_dev_info *bdi); 124 - int bdi_writeback_thread(void *data); 126 + void bdi_writeback_workfn(struct work_struct *work); 125 127 int bdi_has_dirty_io(struct backing_dev_info *bdi); 126 128 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); 127 129 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); 128 130 129 131 extern spinlock_t bdi_lock; 130 132 extern struct list_head bdi_list; 131 - extern struct list_head bdi_pending_list; 133 + 134 + extern struct workqueue_struct *bdi_wq; 132 135 133 136 static inline int wb_has_dirty_io(struct bdi_writeback *wb) 134 137 { ··· 333 334 static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) 334 335 { 335 336 return bdi->capabilities & BDI_CAP_SWAP_BACKED; 336 - } 337 - 338 - static 
inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) 339 - { 340 - return bdi == &default_backing_dev_info; 341 337 } 342 338 343 339 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
+71 -44
include/linux/bio.h
··· 67 67 #define bio_offset(bio) bio_iovec((bio))->bv_offset 68 68 #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) 69 69 #define bio_sectors(bio) ((bio)->bi_size >> 9) 70 + #define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors((bio))) 70 71 71 72 static inline unsigned int bio_cur_bytes(struct bio *bio) 72 73 { ··· 83 82 return page_address(bio_page(bio)) + bio_offset(bio); 84 83 85 84 return NULL; 86 - } 87 - 88 - static inline int bio_has_allocated_vec(struct bio *bio) 89 - { 90 - return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs; 91 85 } 92 86 93 87 /* ··· 132 136 #define bio_io_error(bio) bio_endio((bio), -EIO) 133 137 134 138 /* 135 - * drivers should not use the __ version unless they _really_ want to 136 - * run through the entire bio and not just pending pieces 139 + * drivers should not use the __ version unless they _really_ know what 140 + * they're doing 137 141 */ 138 142 #define __bio_for_each_segment(bvl, bio, i, start_idx) \ 139 143 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ 140 144 i < (bio)->bi_vcnt; \ 141 145 bvl++, i++) 142 146 147 + /* 148 + * drivers should _never_ use the all version - the bio may have been split 149 + * before it got to the driver and the driver won't own all of it 150 + */ 151 + #define bio_for_each_segment_all(bvl, bio, i) \ 152 + for (i = 0; \ 153 + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ 154 + i++) 155 + 143 156 #define bio_for_each_segment(bvl, bio, i) \ 144 - __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) 157 + for (i = (bio)->bi_idx; \ 158 + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ 159 + i++) 145 160 146 161 /* 147 162 * get a reference to a bio, so it won't disappear. 
the intended use is ··· 187 180 unsigned short bip_slab; /* slab the bip came from */ 188 181 unsigned short bip_vcnt; /* # of integrity bio_vecs */ 189 182 unsigned short bip_idx; /* current bip_vec index */ 183 + unsigned bip_owns_buf:1; /* should free bip_buf */ 190 184 191 185 struct work_struct bip_work; /* I/O completion */ 192 - struct bio_vec bip_vec[0]; /* embedded bvec array */ 186 + 187 + struct bio_vec *bip_vec; 188 + struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ 193 189 }; 194 190 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 195 191 ··· 221 211 222 212 extern struct bio_set *bioset_create(unsigned int, unsigned int); 223 213 extern void bioset_free(struct bio_set *); 214 + extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); 224 215 225 216 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); 226 217 extern void bio_put(struct bio *); ··· 255 244 extern void bio_endio(struct bio *, int); 256 245 struct request_queue; 257 246 extern int bio_phys_segments(struct request_queue *, struct bio *); 247 + 248 + extern int submit_bio_wait(int rw, struct bio *bio); 249 + extern void bio_advance(struct bio *, unsigned); 258 250 259 251 extern void bio_init(struct bio *); 260 252 extern void bio_reset(struct bio *); ··· 293 279 } 294 280 #endif 295 281 282 + extern void bio_copy_data(struct bio *dst, struct bio *src); 283 + extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); 284 + 296 285 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, 297 286 unsigned long, unsigned int, int, gfp_t); 298 287 extern struct bio *bio_copy_user_iov(struct request_queue *, ··· 303 286 int, int, gfp_t); 304 287 extern int bio_uncopy_user(struct bio *); 305 288 void zero_fill_bio(struct bio *bio); 306 - extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); 307 - extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); 289 + extern struct bio_vec 
*bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); 290 + extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); 308 291 extern unsigned int bvec_nr_vecs(unsigned short idx); 309 292 310 293 #ifdef CONFIG_BLK_CGROUP ··· 314 297 static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } 315 298 static inline void bio_disassociate_task(struct bio *bio) { } 316 299 #endif /* CONFIG_BLK_CGROUP */ 317 - 318 - /* 319 - * bio_set is used to allow other portions of the IO system to 320 - * allocate their own private memory pools for bio and iovec structures. 321 - * These memory pools in turn all allocate from the bio_slab 322 - * and the bvec_slabs[]. 323 - */ 324 - #define BIO_POOL_SIZE 2 325 - #define BIOVEC_NR_POOLS 6 326 - #define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) 327 - 328 - struct bio_set { 329 - struct kmem_cache *bio_slab; 330 - unsigned int front_pad; 331 - 332 - mempool_t *bio_pool; 333 - #if defined(CONFIG_BLK_DEV_INTEGRITY) 334 - mempool_t *bio_integrity_pool; 335 - #endif 336 - mempool_t *bvec_pool; 337 - }; 338 - 339 - struct biovec_slab { 340 - int nr_vecs; 341 - char *name; 342 - struct kmem_cache *slab; 343 - }; 344 - 345 - /* 346 - * a small number of entries is fine, not going to be performance critical. 347 - * basically we just need to survive 348 - */ 349 - #define BIO_SPLIT_ENTRIES 2 350 300 351 301 #ifdef CONFIG_HIGHMEM 352 302 /* ··· 510 526 511 527 return bio; 512 528 } 529 + 530 + /* 531 + * bio_set is used to allow other portions of the IO system to 532 + * allocate their own private memory pools for bio and iovec structures. 533 + * These memory pools in turn all allocate from the bio_slab 534 + * and the bvec_slabs[]. 
535 + */ 536 + #define BIO_POOL_SIZE 2 537 + #define BIOVEC_NR_POOLS 6 538 + #define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) 539 + 540 + struct bio_set { 541 + struct kmem_cache *bio_slab; 542 + unsigned int front_pad; 543 + 544 + mempool_t *bio_pool; 545 + mempool_t *bvec_pool; 546 + #if defined(CONFIG_BLK_DEV_INTEGRITY) 547 + mempool_t *bio_integrity_pool; 548 + mempool_t *bvec_integrity_pool; 549 + #endif 550 + 551 + /* 552 + * Deadlock avoidance for stacking block drivers: see comments in 553 + * bio_alloc_bioset() for details 554 + */ 555 + spinlock_t rescue_lock; 556 + struct bio_list rescue_list; 557 + struct work_struct rescue_work; 558 + struct workqueue_struct *rescue_workqueue; 559 + }; 560 + 561 + struct biovec_slab { 562 + int nr_vecs; 563 + char *name; 564 + struct kmem_cache *slab; 565 + }; 566 + 567 + /* 568 + * a small number of entries is fine, not going to be performance critical. 569 + * basically we just need to survive 570 + */ 571 + #define BIO_SPLIT_ENTRIES 2 513 572 514 573 #if defined(CONFIG_BLK_DEV_INTEGRITY) 515 574
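The bio.h hunk above splits the integrity payload's vec array into a pointer (`bip_vec`) plus trailing inline storage (`bip_inline_vecs[0]`): small payloads keep using the memory allocated right after the struct, while larger ones can point at a separately allocated, mempool-backed array. A userspace sketch of that pattern under stated assumptions — `payload`, `vec`, and `INLINE_VECS` are illustrative names, not kernel identifiers:

```c
#include <assert.h>
#include <stdlib.h>

/* Simplified stand-ins for the kernel types (names are illustrative). */
struct vec { void *page; unsigned len, offset; };

struct payload {
    unsigned short vcnt;      /* # of vecs in use */
    struct vec *vec;          /* points at inline_vecs or a heap array */
    struct vec inline_vecs[]; /* storage placed right after the struct */
};

#define INLINE_VECS 4

/* Small requests use the trailing inline storage in a single allocation;
 * larger ones get a separate array (the kernel would use a mempool). */
static struct payload *payload_alloc(unsigned nr)
{
    struct payload *p;

    if (nr <= INLINE_VECS) {
        p = malloc(sizeof(*p) + nr * sizeof(struct vec));
        if (!p)
            return NULL;
        p->vec = p->inline_vecs;    /* no second allocation needed */
    } else {
        p = malloc(sizeof(*p));
        if (!p)
            return NULL;
        p->vec = calloc(nr, sizeof(struct vec));
        if (!p->vec) {
            free(p);
            return NULL;
        }
    }
    p->vcnt = nr;
    return p;
}

static void payload_free(struct payload *p)
{
    if (p->vec != p->inline_vecs)
        free(p->vec);
    free(p);
}
```

Freeing checks whether the pointer still aims at the inline storage, mirroring how the owner flag in the real patch decides whether a separate bvec array must be released.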
+5
include/linux/blk_types.h
··· 118 118 * BIO_POOL_IDX() 119 119 */ 120 120 #define BIO_RESET_BITS 13 121 + #define BIO_OWNS_VEC 13 /* bio_free() should free bvec */ 121 122 122 123 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) 123 124 ··· 177 176 __REQ_IO_STAT, /* account I/O stat */ 178 177 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 179 178 __REQ_KERNEL, /* direct IO to kernel pages */ 179 + __REQ_PM, /* runtime pm request */ 180 180 __REQ_NR_BITS, /* stops here */ 181 181 }; 182 182 ··· 199 197 REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \ 200 198 REQ_SECURE) 201 199 #define REQ_CLONE_MASK REQ_COMMON_MASK 200 + 201 + #define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME) 202 202 203 203 /* This mask is used for both bio and request merge checking */ 204 204 #define REQ_NOMERGE_FLAGS \ ··· 228 224 #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) 229 225 #define REQ_SECURE (1 << __REQ_SECURE) 230 226 #define REQ_KERNEL (1 << __REQ_KERNEL) 227 + #define REQ_PM (1 << __REQ_PM) 231 228 232 229 #endif /* __LINUX_BLK_TYPES_H */
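The new `BIO_NO_ADVANCE_ITER_MASK` groups `REQ_DISCARD` and `REQ_WRITE_SAME`, the request types whose bios carry no per-segment payload to step through. A minimal userspace sketch of testing such a mask; the bit positions below are illustrative, not the kernel's actual enum values:

```c
#include <assert.h>

/* Bit positions loosely mirroring the __REQ_* enum (subset, illustrative). */
enum { __REQ_WRITE = 0, __REQ_DISCARD = 7, __REQ_WRITE_SAME = 8, __REQ_PM = 30 };

#define REQ_WRITE      (1u << __REQ_WRITE)
#define REQ_DISCARD    (1u << __REQ_DISCARD)
#define REQ_WRITE_SAME (1u << __REQ_WRITE_SAME)
#define REQ_PM         (1u << __REQ_PM)

/* Discard and write-same bios describe a range, not data pages, so the
 * bio iterator must not be advanced through bi_io_vec for them. */
#define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD | REQ_WRITE_SAME)

static int bio_no_advance_iter(unsigned long rw)
{
    return (rw & BIO_NO_ADVANCE_ITER_MASK) != 0;
}
```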
+28 -1
include/linux/blkdev.h
··· 361 361 */ 362 362 struct kobject kobj; 363 363 364 + #ifdef CONFIG_PM_RUNTIME 365 + struct device *dev; 366 + int rpm_status; 367 + unsigned int nr_pending; 368 + #endif 369 + 364 370 /* 365 371 * queue settings 366 372 */ ··· 844 838 unsigned int cmd_flags) 845 839 { 846 840 if (unlikely(cmd_flags & REQ_DISCARD)) 847 - return q->limits.max_discard_sectors; 841 + return min(q->limits.max_discard_sectors, UINT_MAX >> 9); 848 842 849 843 if (unlikely(cmd_flags & REQ_WRITE_SAME)) 850 844 return q->limits.max_write_same_sectors; ··· 965 959 struct request_queue *blk_alloc_queue(gfp_t); 966 960 struct request_queue *blk_alloc_queue_node(gfp_t, int); 967 961 extern void blk_put_queue(struct request_queue *); 962 + 963 + /* 964 + * block layer runtime pm functions 965 + */ 966 + #ifdef CONFIG_PM_RUNTIME 967 + extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev); 968 + extern int blk_pre_runtime_suspend(struct request_queue *q); 969 + extern void blk_post_runtime_suspend(struct request_queue *q, int err); 970 + extern void blk_pre_runtime_resume(struct request_queue *q); 971 + extern void blk_post_runtime_resume(struct request_queue *q, int err); 972 + #else 973 + static inline void blk_pm_runtime_init(struct request_queue *q, 974 + struct device *dev) {} 975 + static inline int blk_pre_runtime_suspend(struct request_queue *q) 976 + { 977 + return -ENOSYS; 978 + } 979 + static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {} 980 + static inline void blk_pre_runtime_resume(struct request_queue *q) {} 981 + static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} 982 + #endif 968 983 969 984 /* 970 985 * blk_plug permits building a queue of related requests by holding the I/O
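The one-line fix in `blk_queue_get_max_sectors()` clamps the discard limit because a request's data length is a 32-bit byte count: any sector limit above `UINT_MAX >> 9` would overflow it once shifted into bytes as discard bios merge. A standalone sketch of why that is the safe ceiling (the function name is illustrative):

```c
#include <assert.h>
#include <limits.h>

/* rq->__data_len is a 32-bit byte count and a sector is 512 bytes, so a
 * per-request limit above UINT_MAX >> 9 lets (sectors << 9) wrap when
 * merged discards reach the limit. Clamping keeps the byte count safe. */
static unsigned int clamp_discard_sectors(unsigned int max_discard_sectors)
{
    return max_discard_sectors < (UINT_MAX >> 9) ?
           max_discard_sectors : (UINT_MAX >> 9);
}
```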
+6 -6
include/trace/events/block.h
··· 244 244 __entry->dev = bio->bi_bdev ? 245 245 bio->bi_bdev->bd_dev : 0; 246 246 __entry->sector = bio->bi_sector; 247 - __entry->nr_sector = bio->bi_size >> 9; 247 + __entry->nr_sector = bio_sectors(bio); 248 248 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 249 249 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 250 250 ), ··· 281 281 TP_fast_assign( 282 282 __entry->dev = bio->bi_bdev->bd_dev; 283 283 __entry->sector = bio->bi_sector; 284 - __entry->nr_sector = bio->bi_size >> 9; 284 + __entry->nr_sector = bio_sectors(bio); 285 285 __entry->error = error; 286 286 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 287 287 ), ··· 309 309 TP_fast_assign( 310 310 __entry->dev = bio->bi_bdev->bd_dev; 311 311 __entry->sector = bio->bi_sector; 312 - __entry->nr_sector = bio->bi_size >> 9; 312 + __entry->nr_sector = bio_sectors(bio); 313 313 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 314 314 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 315 315 ), ··· 376 376 TP_fast_assign( 377 377 __entry->dev = bio->bi_bdev->bd_dev; 378 378 __entry->sector = bio->bi_sector; 379 - __entry->nr_sector = bio->bi_size >> 9; 379 + __entry->nr_sector = bio_sectors(bio); 380 380 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 381 381 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 382 382 ), ··· 404 404 TP_fast_assign( 405 405 __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; 406 406 __entry->sector = bio ? bio->bi_sector : 0; 407 - __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; 407 + __entry->nr_sector = bio ? bio_sectors(bio) : 0; 408 408 blk_fill_rwbs(__entry->rwbs, 409 409 bio ? 
bio->bi_rw : 0, __entry->nr_sector); 410 410 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ··· 580 580 TP_fast_assign( 581 581 __entry->dev = bio->bi_bdev->bd_dev; 582 582 __entry->sector = bio->bi_sector; 583 - __entry->nr_sector = bio->bi_size >> 9; 583 + __entry->nr_sector = bio_sectors(bio); 584 584 __entry->old_dev = dev; 585 585 __entry->old_sector = from; 586 586 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
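Every trace-event hunk above replaces the open-coded `bio->bi_size >> 9` with the existing `bio_sectors()` helper; the computed value is unchanged. A trivial stand-in showing the equivalence, where the struct carries only the one field the helper reads:

```c
#include <assert.h>

/* Minimal stand-in: the only field bio_sectors() needs. */
struct bio {
    unsigned int bi_size; /* residual I/O size in bytes */
};

/* Same computation the open-coded trace sites used; centralizing it in a
 * helper is all the hunks above change. 512-byte sectors, hence >> 9. */
#define bio_sectors(bio) ((bio)->bi_size >> 9)
```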
-5
include/trace/events/writeback.h
··· 183 183 DEFINE_EVENT(writeback_work_class, name, \ 184 184 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ 185 185 TP_ARGS(bdi, work)) 186 - DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); 187 186 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); 188 187 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); 189 188 DEFINE_WRITEBACK_WORK_EVENT(writeback_start); ··· 221 222 222 223 DEFINE_WRITEBACK_EVENT(writeback_nowork); 223 224 DEFINE_WRITEBACK_EVENT(writeback_wake_background); 224 - DEFINE_WRITEBACK_EVENT(writeback_wake_thread); 225 - DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); 226 225 DEFINE_WRITEBACK_EVENT(writeback_bdi_register); 227 226 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); 228 - DEFINE_WRITEBACK_EVENT(writeback_thread_start); 229 - DEFINE_WRITEBACK_EVENT(writeback_thread_stop); 230 227 231 228 DECLARE_EVENT_CLASS(wbc_class, 232 229 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
+1 -1
kernel/relay.c
··· 234 234 static void relay_remove_buf(struct kref *kref) 235 235 { 236 236 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); 237 - buf->chan->cb->remove_buf_file(buf->dentry); 238 237 relay_destroy_buf(buf); 239 238 } 240 239 ··· 483 484 { 484 485 buf->finalized = 1; 485 486 del_timer_sync(&buf->timer); 487 + buf->chan->cb->remove_buf_file(buf->dentry); 486 488 kref_put(&buf->kref, relay_remove_buf); 487 489 } 488 490
+29 -230
mm/backing-dev.c
··· 31 31 static struct class *bdi_class; 32 32 33 33 /* 34 - * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as 35 - * reader side protection for bdi_pending_list. bdi_list has RCU reader side 34 + * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side 36 35 * locking. 37 36 */ 38 37 DEFINE_SPINLOCK(bdi_lock); 39 38 LIST_HEAD(bdi_list); 40 - LIST_HEAD(bdi_pending_list); 39 + 40 + /* bdi_wq serves all asynchronous writeback tasks */ 41 + struct workqueue_struct *bdi_wq; 41 42 42 43 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) 43 44 { ··· 258 257 { 259 258 int err; 260 259 260 + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | 261 + WQ_UNBOUND | WQ_SYSFS, 0); 262 + if (!bdi_wq) 263 + return -ENOMEM; 264 + 261 265 err = bdi_init(&default_backing_dev_info); 262 266 if (!err) 263 267 bdi_register(&default_backing_dev_info, NULL, "default"); ··· 275 269 int bdi_has_dirty_io(struct backing_dev_info *bdi) 276 270 { 277 271 return wb_has_dirty_io(&bdi->wb); 278 - } 279 - 280 - static void wakeup_timer_fn(unsigned long data) 281 - { 282 - struct backing_dev_info *bdi = (struct backing_dev_info *)data; 283 - 284 - spin_lock_bh(&bdi->wb_lock); 285 - if (bdi->wb.task) { 286 - trace_writeback_wake_thread(bdi); 287 - wake_up_process(bdi->wb.task); 288 - } else if (bdi->dev) { 289 - /* 290 - * When bdi tasks are inactive for long time, they are killed. 291 - * In this case we have to wake-up the forker thread which 292 - * should create and run the bdi thread. 
293 - */ 294 - trace_writeback_wake_forker_thread(bdi); 295 - wake_up_process(default_backing_dev_info.wb.task); 296 - } 297 - spin_unlock_bh(&bdi->wb_lock); 298 272 } 299 273 300 274 /* ··· 293 307 unsigned long timeout; 294 308 295 309 timeout = msecs_to_jiffies(dirty_writeback_interval * 10); 296 - mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); 297 - } 298 - 299 - /* 300 - * Calculate the longest interval (jiffies) bdi threads are allowed to be 301 - * inactive. 302 - */ 303 - static unsigned long bdi_longest_inactive(void) 304 - { 305 - unsigned long interval; 306 - 307 - interval = msecs_to_jiffies(dirty_writeback_interval * 10); 308 - return max(5UL * 60 * HZ, interval); 309 - } 310 - 311 - /* 312 - * Clear pending bit and wakeup anybody waiting for flusher thread creation or 313 - * shutdown 314 - */ 315 - static void bdi_clear_pending(struct backing_dev_info *bdi) 316 - { 317 - clear_bit(BDI_pending, &bdi->state); 318 - smp_mb__after_clear_bit(); 319 - wake_up_bit(&bdi->state, BDI_pending); 320 - } 321 - 322 - static int bdi_forker_thread(void *ptr) 323 - { 324 - struct bdi_writeback *me = ptr; 325 - 326 - current->flags |= PF_SWAPWRITE; 327 - set_freezable(); 328 - 329 - /* 330 - * Our parent may run at a different priority, just set us to normal 331 - */ 332 - set_user_nice(current, 0); 333 - 334 - for (;;) { 335 - struct task_struct *task = NULL; 336 - struct backing_dev_info *bdi; 337 - enum { 338 - NO_ACTION, /* Nothing to do */ 339 - FORK_THREAD, /* Fork bdi thread */ 340 - KILL_THREAD, /* Kill inactive bdi thread */ 341 - } action = NO_ACTION; 342 - 343 - /* 344 - * Temporary measure, we want to make sure we don't see 345 - * dirty data on the default backing_dev_info 346 - */ 347 - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { 348 - del_timer(&me->wakeup_timer); 349 - wb_do_writeback(me, 0); 350 - } 351 - 352 - spin_lock_bh(&bdi_lock); 353 - /* 354 - * In the following loop we are going to check whether we have 355 - * some 
work to do without any synchronization with tasks 356 - * waking us up to do work for them. Set the task state here 357 - * so that we don't miss wakeups after verifying conditions. 358 - */ 359 - set_current_state(TASK_INTERRUPTIBLE); 360 - 361 - list_for_each_entry(bdi, &bdi_list, bdi_list) { 362 - bool have_dirty_io; 363 - 364 - if (!bdi_cap_writeback_dirty(bdi) || 365 - bdi_cap_flush_forker(bdi)) 366 - continue; 367 - 368 - WARN(!test_bit(BDI_registered, &bdi->state), 369 - "bdi %p/%s is not registered!\n", bdi, bdi->name); 370 - 371 - have_dirty_io = !list_empty(&bdi->work_list) || 372 - wb_has_dirty_io(&bdi->wb); 373 - 374 - /* 375 - * If the bdi has work to do, but the thread does not 376 - * exist - create it. 377 - */ 378 - if (!bdi->wb.task && have_dirty_io) { 379 - /* 380 - * Set the pending bit - if someone will try to 381 - * unregister this bdi - it'll wait on this bit. 382 - */ 383 - set_bit(BDI_pending, &bdi->state); 384 - action = FORK_THREAD; 385 - break; 386 - } 387 - 388 - spin_lock(&bdi->wb_lock); 389 - 390 - /* 391 - * If there is no work to do and the bdi thread was 392 - * inactive long enough - kill it. The wb_lock is taken 393 - * to make sure no-one adds more work to this bdi and 394 - * wakes the bdi thread up. 
395 - */ 396 - if (bdi->wb.task && !have_dirty_io && 397 - time_after(jiffies, bdi->wb.last_active + 398 - bdi_longest_inactive())) { 399 - task = bdi->wb.task; 400 - bdi->wb.task = NULL; 401 - spin_unlock(&bdi->wb_lock); 402 - set_bit(BDI_pending, &bdi->state); 403 - action = KILL_THREAD; 404 - break; 405 - } 406 - spin_unlock(&bdi->wb_lock); 407 - } 408 - spin_unlock_bh(&bdi_lock); 409 - 410 - /* Keep working if default bdi still has things to do */ 411 - if (!list_empty(&me->bdi->work_list)) 412 - __set_current_state(TASK_RUNNING); 413 - 414 - switch (action) { 415 - case FORK_THREAD: 416 - __set_current_state(TASK_RUNNING); 417 - task = kthread_create(bdi_writeback_thread, &bdi->wb, 418 - "flush-%s", dev_name(bdi->dev)); 419 - if (IS_ERR(task)) { 420 - /* 421 - * If thread creation fails, force writeout of 422 - * the bdi from the thread. Hopefully 1024 is 423 - * large enough for efficient IO. 424 - */ 425 - writeback_inodes_wb(&bdi->wb, 1024, 426 - WB_REASON_FORKER_THREAD); 427 - } else { 428 - /* 429 - * The spinlock makes sure we do not lose 430 - * wake-ups when racing with 'bdi_queue_work()'. 431 - * And as soon as the bdi thread is visible, we 432 - * can start it. 433 - */ 434 - spin_lock_bh(&bdi->wb_lock); 435 - bdi->wb.task = task; 436 - spin_unlock_bh(&bdi->wb_lock); 437 - wake_up_process(task); 438 - } 439 - bdi_clear_pending(bdi); 440 - break; 441 - 442 - case KILL_THREAD: 443 - __set_current_state(TASK_RUNNING); 444 - kthread_stop(task); 445 - bdi_clear_pending(bdi); 446 - break; 447 - 448 - case NO_ACTION: 449 - if (!wb_has_dirty_io(me) || !dirty_writeback_interval) 450 - /* 451 - * There are no dirty data. The only thing we 452 - * should now care about is checking for 453 - * inactive bdi threads and killing them. Thus, 454 - * let's sleep for longer time, save energy and 455 - * be friendly for battery-driven devices. 
456 - */ 457 - schedule_timeout(bdi_longest_inactive()); 458 - else 459 - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 460 - try_to_freeze(); 461 - break; 462 - } 463 - } 464 - 465 - return 0; 310 + mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); 466 311 } 467 312 468 313 /* ··· 306 489 spin_unlock_bh(&bdi_lock); 307 490 308 491 synchronize_rcu_expedited(); 492 + 493 + /* bdi_list is now unused, clear it to mark @bdi dying */ 494 + INIT_LIST_HEAD(&bdi->bdi_list); 309 495 } 310 496 311 497 int bdi_register(struct backing_dev_info *bdi, struct device *parent, ··· 327 507 return PTR_ERR(dev); 328 508 329 509 bdi->dev = dev; 330 - 331 - /* 332 - * Just start the forker thread for our default backing_dev_info, 333 - * and add other bdi's to the list. They will get a thread created 334 - * on-demand when they need it. 335 - */ 336 - if (bdi_cap_flush_forker(bdi)) { 337 - struct bdi_writeback *wb = &bdi->wb; 338 - 339 - wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", 340 - dev_name(dev)); 341 - if (IS_ERR(wb->task)) 342 - return PTR_ERR(wb->task); 343 - } 344 510 345 511 bdi_debug_register(bdi, dev_name(dev)); 346 512 set_bit(BDI_registered, &bdi->state); ··· 351 545 */ 352 546 static void bdi_wb_shutdown(struct backing_dev_info *bdi) 353 547 { 354 - struct task_struct *task; 355 - 356 548 if (!bdi_cap_writeback_dirty(bdi)) 357 549 return; 358 550 ··· 360 556 bdi_remove_from_list(bdi); 361 557 362 558 /* 363 - * If setup is pending, wait for that to complete first 559 + * Drain work list and shutdown the delayed_work. At this point, 560 + * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi 561 + * is dying and its work_list needs to be drained no matter what.

364 562 */ 365 - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, 366 - TASK_UNINTERRUPTIBLE); 563 + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 564 + flush_delayed_work(&bdi->wb.dwork); 565 + WARN_ON(!list_empty(&bdi->work_list)); 367 566 368 567 /* 369 - * Finally, kill the kernel thread. We don't need to be RCU 370 - * safe anymore, since the bdi is gone from visibility. 568 + * This shouldn't be necessary unless @bdi for some reason has 569 + * unflushed dirty IO after work_list is drained. Do it anyway 570 + * just in case. 371 571 */ 372 - spin_lock_bh(&bdi->wb_lock); 373 - task = bdi->wb.task; 374 - bdi->wb.task = NULL; 375 - spin_unlock_bh(&bdi->wb_lock); 376 - 377 - if (task) 378 - kthread_stop(task); 572 + cancel_delayed_work_sync(&bdi->wb.dwork); 379 573 } 380 574 381 575 /* ··· 399 597 bdi_set_min_ratio(bdi, 0); 400 598 trace_writeback_bdi_unregister(bdi); 401 599 bdi_prune_sb(bdi); 402 - del_timer_sync(&bdi->wb.wakeup_timer); 403 600 404 - if (!bdi_cap_flush_forker(bdi)) 405 - bdi_wb_shutdown(bdi); 601 + bdi_wb_shutdown(bdi); 406 602 bdi_debug_unregister(bdi); 407 603 408 604 spin_lock_bh(&bdi->wb_lock); ··· 422 622 INIT_LIST_HEAD(&wb->b_io); 423 623 INIT_LIST_HEAD(&wb->b_more_io); 424 624 spin_lock_init(&wb->list_lock); 425 - setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 625 + INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); 426 626 } 427 627 428 628 /* ··· 495 695 bdi_unregister(bdi); 496 696 497 697 /* 498 - * If bdi_unregister() had already been called earlier, the 499 - * wakeup_timer could still be armed because bdi_prune_sb() 500 - * can race with the bdi_wakeup_thread_delayed() calls from 501 - * __mark_inode_dirty(). 698 + * If bdi_unregister() had already been called earlier, the dwork 699 + * could still be pending because bdi_prune_sb() can race with the 700 + * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). 
502 701 */ 503 - del_timer_sync(&bdi->wb.wakeup_timer); 702 + cancel_delayed_work_sync(&bdi->wb.dwork); 504 703 505 704 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 506 705 percpu_counter_destroy(&bdi->bdi_stat[i]);
+20 -55
mm/bounce.c
··· 101 101 struct bio_vec *tovec, *fromvec; 102 102 int i; 103 103 104 - __bio_for_each_segment(tovec, to, i, 0) { 104 + bio_for_each_segment(tovec, to, i) { 105 105 fromvec = from->bi_io_vec + i; 106 106 107 107 /* ··· 134 134 /* 135 135 * free up bounce indirect pages used 136 136 */ 137 - __bio_for_each_segment(bvec, bio, i, 0) { 137 + bio_for_each_segment_all(bvec, bio, i) { 138 138 org_vec = bio_orig->bi_io_vec + i; 139 139 if (bvec->bv_page == org_vec->bv_page) 140 140 continue; ··· 199 199 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 200 200 mempool_t *pool, int force) 201 201 { 202 - struct page *page; 203 - struct bio *bio = NULL; 204 - int i, rw = bio_data_dir(*bio_orig); 202 + struct bio *bio; 203 + int rw = bio_data_dir(*bio_orig); 205 204 struct bio_vec *to, *from; 205 + unsigned i; 206 206 207 - bio_for_each_segment(from, *bio_orig, i) { 208 - page = from->bv_page; 207 + bio_for_each_segment(from, *bio_orig, i) 208 + if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) 209 + goto bounce; 209 210 210 - /* 211 - * is destination page below bounce pfn? 
212 - */ 211 + return; 212 + bounce: 213 + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); 214 + 215 + bio_for_each_segment_all(to, bio, i) { 216 + struct page *page = to->bv_page; 217 + 213 218 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) 214 219 continue; 215 220 216 - /* 217 - * irk, bounce it 218 - */ 219 - if (!bio) { 220 - unsigned int cnt = (*bio_orig)->bi_vcnt; 221 - 222 - bio = bio_alloc(GFP_NOIO, cnt); 223 - memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec)); 224 - } 225 - 226 - 227 - to = bio->bi_io_vec + i; 228 - 229 - to->bv_page = mempool_alloc(pool, q->bounce_gfp); 230 - to->bv_len = from->bv_len; 231 - to->bv_offset = from->bv_offset; 232 221 inc_zone_page_state(to->bv_page, NR_BOUNCE); 222 + to->bv_page = mempool_alloc(pool, q->bounce_gfp); 233 223 234 224 if (rw == WRITE) { 235 225 char *vto, *vfrom; 236 226 237 - flush_dcache_page(from->bv_page); 227 + flush_dcache_page(page); 228 + 238 229 vto = page_address(to->bv_page) + to->bv_offset; 239 - vfrom = kmap(from->bv_page) + from->bv_offset; 230 + vfrom = kmap_atomic(page) + to->bv_offset; 240 231 memcpy(vto, vfrom, to->bv_len); 241 - kunmap(from->bv_page); 232 + kunmap_atomic(vfrom); 242 233 } 243 234 } 244 - 245 - /* 246 - * no pages bounced 247 - */ 248 - if (!bio) 249 - return; 250 235 251 236 trace_block_bio_bounce(q, *bio_orig); 252 237 253 - /* 254 - * at least one page was bounced, fill in possible non-highmem 255 - * pages 256 - */ 257 - __bio_for_each_segment(from, *bio_orig, i, 0) { 258 - to = bio_iovec_idx(bio, i); 259 - if (!to->bv_page) { 260 - to->bv_page = from->bv_page; 261 - to->bv_len = from->bv_len; 262 - to->bv_offset = from->bv_offset; 263 - } 264 - } 265 - 266 - bio->bi_bdev = (*bio_orig)->bi_bdev; 267 238 bio->bi_flags |= (1 << BIO_BOUNCED); 268 - bio->bi_sector = (*bio_orig)->bi_sector; 269 - bio->bi_rw = (*bio_orig)->bi_rw; 270 - 271 - bio->bi_vcnt = (*bio_orig)->bi_vcnt; 272 - bio->bi_idx = (*bio_orig)->bi_idx; 273 - bio->bi_size = 
(*bio_orig)->bi_size; 274 239 275 240 if (pool == page_pool) { 276 241 bio->bi_end_io = bounce_end_io_write;
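The refactored `__blk_queue_bounce()` above now scans the original bio's segments first and returns early when nothing lies above the bounce pfn, cloning the bio only on the bounce path. A userspace sketch of that early-return scan; `needs_bounce` and `BOUNCE_PFN` are illustrative, and a bare page frame number stands in for a bio segment here:

```c
#include <assert.h>
#include <stddef.h>

#define BOUNCE_PFN 4096u /* hypothetical stand-in for queue_bounce_pfn(q) */

/* Mirror of the early-return scan: only clone and bounce when at least
 * one segment sits above the bounce limit; otherwise do nothing. */
static int needs_bounce(const unsigned *pfns, size_t n)
{
    for (size_t i = 0; i < n; i++)
        if (pfns[i] > BOUNCE_PFN)
            return 1;
    return 0;
}
```

Doing the cheap scan before any allocation is what lets the common no-bounce case skip the clone entirely, instead of building a partially filled bio and backfilling it afterwards as the removed code did.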
-1
mm/page_io.c
··· 36 36 bio->bi_io_vec[0].bv_len = PAGE_SIZE; 37 37 bio->bi_io_vec[0].bv_offset = 0; 38 38 bio->bi_vcnt = 1; 39 - bio->bi_idx = 0; 40 39 bio->bi_size = PAGE_SIZE; 41 40 bio->bi_end_io = end_io; 42 41 }