Merge branch 'for-3.1/core' of git://git.kernel.dk/linux-block

+7 -3

Documentation/block/queue-sysfs.txt

··· 45 45 46 46 rq_affinity (RW) 47 47 ---------------- 48 - If this option is enabled, the block layer will migrate request completions 49 - to the CPU that originally submitted the request. For some workloads 50 - this provides a significant reduction in CPU cycles due to caching effects. 48 + If this option is '1', the block layer will migrate request completions to the 49 + cpu "group" that originally submitted the request. For some workloads this 50 + provides a significant reduction in CPU cycles due to caching effects. 51 + 52 + For storage configurations that need to maximize distribution of completion 53 + processing setting this option to '2' forces the completion to run on the 54 + requesting cpu (bypassing the "group" aggregation logic). 51 55 52 56 scheduler (RW) 53 57 --------------

+7 -4

block/blk-core.c

··· 1282 1282 init_request_from_bio(req, bio); 1283 1283 1284 1284 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1285 - bio_flagged(bio, BIO_CPU_AFFINE)) { 1286 - req->cpu = blk_cpu_to_group(get_cpu()); 1287 - put_cpu(); 1288 - } 1285 + bio_flagged(bio, BIO_CPU_AFFINE)) 1286 + req->cpu = smp_processor_id(); 1289 1287 1290 1288 plug = current->plug; 1291 1289 if (plug) { ··· 1303 1305 plug->should_sort = 1; 1304 1306 } 1305 1307 list_add_tail(&req->queuelist, &plug->list); 1308 + plug->count++; 1306 1309 drive_stat_acct(req, 1); 1310 + if (plug->count >= BLK_MAX_REQUEST_COUNT) 1311 + blk_flush_plug_list(plug, false); 1307 1312 } else { 1308 1313 spin_lock_irq(q->queue_lock); 1309 1314 add_acct_request(q, req, where); ··· 2630 2629 INIT_LIST_HEAD(&plug->list); 2631 2630 INIT_LIST_HEAD(&plug->cb_list); 2632 2631 plug->should_sort = 0; 2632 + plug->count = 0; 2633 2633 2634 2634 /* 2635 2635 * If this is a nested plug, don't actually assign it. It will be ··· 2714 2712 return; 2715 2713 2716 2714 list_splice_init(&plug->list, &list); 2715 + plug->count = 0; 2717 2716 2718 2717 if (plug->should_sort) { 2719 2718 list_sort(NULL, &list, plug_rq_cmp);

+20 -20

block/blk-ioc.c

··· 82 82 83 83 struct io_context *alloc_io_context(gfp_t gfp_flags, int node) 84 84 { 85 - struct io_context *ret; 85 + struct io_context *ioc; 86 86 87 - ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 88 - if (ret) { 89 - atomic_long_set(&ret->refcount, 1); 90 - atomic_set(&ret->nr_tasks, 1); 91 - spin_lock_init(&ret->lock); 92 - ret->ioprio_changed = 0; 93 - ret->ioprio = 0; 94 - ret->last_waited = 0; /* doesn't matter... */ 95 - ret->nr_batch_requests = 0; /* because this is 0 */ 96 - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); 97 - INIT_HLIST_HEAD(&ret->cic_list); 98 - ret->ioc_data = NULL; 87 + ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 88 + if (ioc) { 89 + atomic_long_set(&ioc->refcount, 1); 90 + atomic_set(&ioc->nr_tasks, 1); 91 + spin_lock_init(&ioc->lock); 92 + ioc->ioprio_changed = 0; 93 + ioc->ioprio = 0; 94 + ioc->last_waited = 0; /* doesn't matter... */ 95 + ioc->nr_batch_requests = 0; /* because this is 0 */ 96 + INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); 97 + INIT_HLIST_HEAD(&ioc->cic_list); 98 + ioc->ioc_data = NULL; 99 99 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 100 - ret->cgroup_changed = 0; 100 + ioc->cgroup_changed = 0; 101 101 #endif 102 102 } 103 103 104 - return ret; 104 + return ioc; 105 105 } 106 106 107 107 /* ··· 139 139 */ 140 140 struct io_context *get_io_context(gfp_t gfp_flags, int node) 141 141 { 142 - struct io_context *ret = NULL; 142 + struct io_context *ioc = NULL; 143 143 144 144 /* 145 145 * Check for unlikely race with exiting task. ioc ref count is 146 146 * zero when ioc is being detached. 147 147 */ 148 148 do { 149 - ret = current_io_context(gfp_flags, node); 150 - if (unlikely(!ret)) 149 + ioc = current_io_context(gfp_flags, node); 150 + if (unlikely(!ioc)) 151 151 break; 152 - } while (!atomic_long_inc_not_zero(&ret->refcount)); 152 + } while (!atomic_long_inc_not_zero(&ioc->refcount)); 153 153 154 - return ret; 154 + return ioc; 155 155 } 156 156 EXPORT_SYMBOL(get_io_context); 157 157

+4 -1

block/blk-lib.c

··· 59 59 * granularity 60 60 */ 61 61 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); 62 - if (q->limits.discard_granularity) { 62 + if (unlikely(!max_discard_sectors)) { 63 + /* Avoid infinite loop below. Being cautious never hurts. */ 64 + return -EOPNOTSUPP; 65 + } else if (q->limits.discard_granularity) { 63 66 unsigned int disc_sects = q->limits.discard_granularity >> 9; 64 67 65 68 max_discard_sectors &= ~(disc_sects - 1);

+7 -4

block/blk-softirq.c

··· 103 103 104 104 void __blk_complete_request(struct request *req) 105 105 { 106 + int ccpu, cpu, group_cpu = NR_CPUS; 106 107 struct request_queue *q = req->q; 107 108 unsigned long flags; 108 - int ccpu, cpu, group_cpu; 109 109 110 110 BUG_ON(!q->softirq_done_fn); 111 111 112 112 local_irq_save(flags); 113 113 cpu = smp_processor_id(); 114 - group_cpu = blk_cpu_to_group(cpu); 115 114 116 115 /* 117 116 * Select completion CPU 118 117 */ 119 - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) 118 + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) { 120 119 ccpu = req->cpu; 121 - else 120 + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { 121 + ccpu = blk_cpu_to_group(ccpu); 122 + group_cpu = blk_cpu_to_group(cpu); 123 + } 124 + } else 122 125 ccpu = cpu; 123 126 124 127 if (ccpu == cpu || ccpu == group_cpu) {

+9 -4

block/blk-sysfs.c

··· 244 244 static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) 245 245 { 246 246 bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); 247 + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); 247 248 248 - return queue_var_show(set, page); 249 + return queue_var_show(set << force, page); 249 250 } 250 251 251 252 static ssize_t ··· 258 257 259 258 ret = queue_var_store(&val, page, count); 260 259 spin_lock_irq(q->queue_lock); 261 - if (val) 260 + if (val) { 262 261 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 263 - else 264 - queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); 262 + if (val == 2) 263 + queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); 264 + } else { 265 + queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); 266 + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); 267 + } 265 268 spin_unlock_irq(q->queue_lock); 266 269 #endif 267 270 return ret;

+4 -4

block/blk-throttle.c

··· 142 142 return NULL; 143 143 } 144 144 145 - static inline int total_nr_queued(struct throtl_data *td) 145 + static inline unsigned int total_nr_queued(struct throtl_data *td) 146 146 { 147 - return (td->nr_queued[0] + td->nr_queued[1]); 147 + return td->nr_queued[0] + td->nr_queued[1]; 148 148 } 149 149 150 150 static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) ··· 927 927 928 928 bio_list_init(&bio_list_on_stack); 929 929 930 - throtl_log(td, "dispatch nr_queued=%d read=%u write=%u", 930 + throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", 931 931 total_nr_queued(td), td->nr_queued[READ], 932 932 td->nr_queued[WRITE]); 933 933 ··· 970 970 struct delayed_work *dwork = &td->throtl_work; 971 971 972 972 /* schedule work if limits changed even if no bio is queued */ 973 - if (total_nr_queued(td) > 0 || td->limits_changed) { 973 + if (total_nr_queued(td) || td->limits_changed) { 974 974 /* 975 975 * We might have a work scheduled to be executed in future. 976 976 * Cancel that and schedule a new one.

+76 -76

block/cfq-iosched.c

··· 87 87 unsigned count; 88 88 unsigned total_weight; 89 89 u64 min_vdisktime; 90 + struct cfq_ttime ttime; 90 91 }; 91 - #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ 92 - .count = 0, .min_vdisktime = 0, } 92 + #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \ 93 + .ttime = {.last_end_request = jiffies,},} 93 94 94 95 /* 95 96 * Per process-grouping structure ··· 130 129 unsigned long slice_end; 131 130 long slice_resid; 132 131 133 - /* pending metadata requests */ 134 - int meta_pending; 135 132 /* number of requests that are on the dispatch list or inside driver */ 136 133 int dispatched; 137 134 138 135 /* io prio of this group */ 139 136 unsigned short ioprio, org_ioprio; 140 - unsigned short ioprio_class, org_ioprio_class; 137 + unsigned short ioprio_class; 141 138 142 139 pid_t pid; 143 140 ··· 211 212 #endif 212 213 /* number of requests that are on the dispatch list or inside driver */ 213 214 int dispatched; 215 + struct cfq_ttime ttime; 214 216 }; 215 217 216 218 /* ··· 393 393 j++, st = i < IDLE_WORKLOAD ? \ 394 394 &cfqg->service_trees[i][j]: NULL) \ 395 395 396 + static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, 397 + struct cfq_ttime *ttime, bool group_idle) 398 + { 399 + unsigned long slice; 400 + if (!sample_valid(ttime->ttime_samples)) 401 + return false; 402 + if (group_idle) 403 + slice = cfqd->cfq_group_idle; 404 + else 405 + slice = cfqd->cfq_slice_idle; 406 + return ttime->ttime_mean > slice; 407 + } 396 408 397 409 static inline bool iops_mode(struct cfq_data *cfqd) 398 410 { ··· 681 669 682 670 if (rq_is_sync(rq1) != rq_is_sync(rq2)) 683 671 return rq_is_sync(rq1) ? rq1 : rq2; 684 - 685 - if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) 686 - return rq1->cmd_flags & REQ_META ? rq1 : rq2; 687 672 688 673 s1 = blk_rq_pos(rq1); 689 674 s2 = blk_rq_pos(rq2); ··· 1014 1005 return NULL; 1015 1006 } 1016 1007 1017 - void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, 1018 - unsigned int weight) 1008 + static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, 1009 + unsigned int weight) 1019 1010 { 1020 1011 struct cfq_group *cfqg = cfqg_of_blkg(blkg); 1021 1012 cfqg->new_weight = weight; ··· 1067 1058 for_each_cfqg_st(cfqg, i, j, st) 1068 1059 *st = CFQ_RB_ROOT; 1069 1060 RB_CLEAR_NODE(&cfqg->rb_node); 1061 + 1062 + cfqg->ttime.last_end_request = jiffies; 1070 1063 1071 1064 /* 1072 1065 * Take the initial reference that will be released on destroy ··· 1246 1235 * it should not be NULL as even if elevator was exiting, cgroup deltion 1247 1236 * path got to it first. 1248 1237 */ 1249 - void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) 1238 + static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) 1250 1239 { 1251 1240 unsigned long flags; 1252 1241 struct cfq_data *cfqd = key; ··· 1513 1502 { 1514 1503 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1515 1504 struct cfq_data *cfqd = cfqq->cfqd; 1516 - struct request *__alias, *prev; 1505 + struct request *prev; 1517 1506 1518 1507 cfqq->queued[rq_is_sync(rq)]++; 1519 1508 1520 - /* 1521 - * looks a little odd, but the first insert might return an alias. 1522 - * if that happens, put the alias on the dispatch list 1523 - */ 1524 - while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) 1525 - cfq_dispatch_insert(cfqd->queue, __alias); 1509 + elv_rb_add(&cfqq->sort_list, rq); 1526 1510 1527 1511 if (!cfq_cfqq_on_rr(cfqq)) 1528 1512 cfq_add_cfqq_rr(cfqd, cfqq); ··· 1604 1598 cfqq->cfqd->rq_queued--; 1605 1599 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1606 1600 rq_data_dir(rq), rq_is_sync(rq)); 1607 - if (rq->cmd_flags & REQ_META) { 1608 - WARN_ON(!cfqq->meta_pending); 1609 - cfqq->meta_pending--; 1610 - } 1611 1601 } 1612 1602 1613 1603 static int cfq_merge(struct request_queue *q, struct request **req, ··· 1971 1969 * Otherwise, we do only if they are the last ones 1972 1970 * in their service tree. 1973 1971 */ 1974 - if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) 1972 + if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) && 1973 + !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false)) 1975 1974 return true; 1976 1975 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", 1977 1976 service_tree->count); ··· 2025 2022 * slice, then don't idle. This avoids overrunning the allotted 2026 2023 * time slice. 2027 2024 */ 2028 - if (sample_valid(cic->ttime_samples) && 2029 - (cfqq->slice_end - jiffies < cic->ttime_mean)) { 2025 + if (sample_valid(cic->ttime.ttime_samples) && 2026 + (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) { 2030 2027 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", 2031 - cic->ttime_mean); 2028 + cic->ttime.ttime_mean); 2032 2029 return; 2033 2030 } 2034 2031 ··· 2384 2381 * this group, wait for requests to complete. 2385 2382 */ 2386 2383 check_group_idle: 2387 - if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 2388 - && cfqq->cfqg->dispatched) { 2384 + if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && 2385 + cfqq->cfqg->dispatched && 2386 + !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { 2389 2387 cfqq = NULL; 2390 2388 goto keep_queue; 2391 2389 } ··· 2837 2833 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 2838 2834 cfqd->queue->node); 2839 2835 if (cic) { 2840 - cic->last_end_request = jiffies; 2836 + cic->ttime.last_end_request = jiffies; 2841 2837 INIT_LIST_HEAD(&cic->queue_list); 2842 2838 INIT_HLIST_NODE(&cic->cic_list); 2843 2839 cic->dtor = cfq_free_io_context; ··· 2887 2883 * elevate the priority of this queue 2888 2884 */ 2889 2885 cfqq->org_ioprio = cfqq->ioprio; 2890 - cfqq->org_ioprio_class = cfqq->ioprio_class; 2891 2886 cfq_clear_cfqq_prio_changed(cfqq); 2892 2887 } 2893 2888 ··· 3224 3221 } 3225 3222 3226 3223 static void 3227 - cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 3224 + __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) 3228 3225 { 3229 - unsigned long elapsed = jiffies - cic->last_end_request; 3230 - unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 3226 + unsigned long elapsed = jiffies - ttime->last_end_request; 3227 + elapsed = min(elapsed, 2UL * slice_idle); 3231 3228 3232 - cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 3233 - cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 3234 - cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 3229 + ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; 3230 + ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8; 3231 + ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples; 3232 + } 3233 + 3234 + static void 3235 + cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3236 + struct cfq_io_context *cic) 3237 + { 3238 + if (cfq_cfqq_sync(cfqq)) { 3239 + __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); 3240 + __cfq_update_io_thinktime(&cfqq->service_tree->ttime, 3241 + cfqd->cfq_slice_idle); 3242 + } 3243 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 3244 + __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); 3245 + #endif 3235 3246 } 3236 3247 3237 3248 static void ··· 3294 3277 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3295 3278 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3296 3279 enable_idle = 0; 3297 - else if (sample_valid(cic->ttime_samples)) { 3298 - if (cic->ttime_mean > cfqd->cfq_slice_idle) 3280 + else if (sample_valid(cic->ttime.ttime_samples)) { 3281 + if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) 3299 3282 enable_idle = 0; 3300 3283 else 3301 3284 enable_idle = 1; ··· 3354 3337 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3355 3338 new_cfqq->service_tree->count == 2 && 3356 3339 RB_EMPTY_ROOT(&cfqq->sort_list)) 3357 - return true; 3358 - 3359 - /* 3360 - * So both queues are sync. Let the new request get disk time if 3361 - * it's a metadata request and the current queue is doing regular IO. 3362 - */ 3363 - if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) 3364 3340 return true; 3365 3341 3366 3342 /* ··· 3420 3410 struct cfq_io_context *cic = RQ_CIC(rq); 3421 3411 3422 3412 cfqd->rq_queued++; 3423 - if (rq->cmd_flags & REQ_META) 3424 - cfqq->meta_pending++; 3425 3413 3426 - cfq_update_io_thinktime(cfqd, cic); 3414 + cfq_update_io_thinktime(cfqd, cfqq, cic); 3427 3415 cfq_update_io_seektime(cfqd, cfqq, rq); 3428 3416 cfq_update_idle_window(cfqd, cfqq, cic); 3429 3417 ··· 3528 3520 if (cfqq->cfqg->nr_cfqq > 1) 3529 3521 return false; 3530 3522 3523 + /* the only queue in the group, but think time is big */ 3524 + if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) 3525 + return false; 3526 + 3531 3527 if (cfq_slice_used(cfqq)) 3532 3528 return true; 3533 3529 3534 3530 /* if slice left is less than think time, wait busy */ 3535 - if (cic && sample_valid(cic->ttime_samples) 3536 - && (cfqq->slice_end - jiffies < cic->ttime_mean)) 3531 + if (cic && sample_valid(cic->ttime.ttime_samples) 3532 + && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) 3537 3533 return true; 3538 3534 3539 3535 /* ··· 3578 3566 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3579 3567 3580 3568 if (sync) { 3581 - RQ_CIC(rq)->last_end_request = now; 3569 + struct cfq_rb_root *service_tree; 3570 + 3571 + RQ_CIC(rq)->ttime.last_end_request = now; 3572 + 3573 + if (cfq_cfqq_on_rr(cfqq)) 3574 + service_tree = cfqq->service_tree; 3575 + else 3576 + service_tree = service_tree_for(cfqq->cfqg, 3577 + cfqq_prio(cfqq), cfqq_type(cfqq)); 3578 + service_tree->ttime.last_end_request = now; 3582 3579 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 3583 3580 cfqd->last_delayed_sync = now; 3584 3581 } 3582 + 3583 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 3584 + cfqq->cfqg->ttime.last_end_request = now; 3585 + #endif 3585 3586 3586 3587 /* 3587 3588 * If this is the active queue, check if it needs to be expired, ··· 3641 3616 cfq_schedule_dispatch(cfqd); 3642 3617 } 3643 3618 3644 - /* 3645 - * we temporarily boost lower priority queues if they are holding fs exclusive 3646 - * resources. they are boosted to normal prio (CLASS_BE/4) 3647 - */ 3648 - static void cfq_prio_boost(struct cfq_queue *cfqq) 3649 - { 3650 - if (has_fs_excl()) { 3651 - /* 3652 - * boost idle prio on transactions that would lock out other 3653 - * users of the filesystem 3654 - */ 3655 - if (cfq_class_idle(cfqq)) 3656 - cfqq->ioprio_class = IOPRIO_CLASS_BE; 3657 - if (cfqq->ioprio > IOPRIO_NORM) 3658 - cfqq->ioprio = IOPRIO_NORM; 3659 - } else { 3660 - /* 3661 - * unboost the queue (if needed) 3662 - */ 3663 - cfqq->ioprio_class = cfqq->org_ioprio_class; 3664 - cfqq->ioprio = cfqq->org_ioprio; 3665 - } 3666 - } 3667 - 3668 3619 static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3669 3620 { 3670 3621 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { ··· 3671 3670 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3672 3671 if (cfqq) { 3673 3672 cfq_init_prio_data(cfqq, cic->ioc); 3674 - cfq_prio_boost(cfqq); 3675 3673 3676 3674 return __cfq_may_queue(cfqq); 3677 3675 }

-14

block/compat_ioctl.c

··· 208 208 #define BLKBSZSET_32 _IOW(0x12, 113, int) 209 209 #define BLKGETSIZE64_32 _IOR(0x12, 114, int) 210 210 211 - struct compat_floppy_struct { 212 - compat_uint_t size; 213 - compat_uint_t sect; 214 - compat_uint_t head; 215 - compat_uint_t track; 216 - compat_uint_t stretch; 217 - unsigned char gap; 218 - unsigned char rate; 219 - unsigned char spec1; 220 - unsigned char fmt_gap; 221 - const compat_caddr_t name; 222 - }; 223 - 224 211 struct compat_floppy_drive_params { 225 212 char cmos; 226 213 compat_ulong_t max_dtr; ··· 275 288 276 289 #define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct) 277 290 #define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct) 278 - #define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct) 279 291 #define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params) 280 292 #define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params) 281 293 #define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct)

+1 -3

block/deadline-iosched.c

··· 77 77 deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) 78 78 { 79 79 struct rb_root *root = deadline_rb_root(dd, rq); 80 - struct request *__alias; 81 80 82 - while (unlikely(__alias = elv_rb_add(root, rq))) 83 - deadline_move_request(dd, __alias); 81 + elv_rb_add(root, rq); 84 82 } 85 83 86 84 static inline void

+2 -5

block/elevator.c

··· 353 353 * RB-tree support functions for inserting/lookup/removal of requests 354 354 * in a sorted RB tree. 355 355 */ 356 - struct request *elv_rb_add(struct rb_root *root, struct request *rq) 356 + void elv_rb_add(struct rb_root *root, struct request *rq) 357 357 { 358 358 struct rb_node **p = &root->rb_node; 359 359 struct rb_node *parent = NULL; ··· 365 365 366 366 if (blk_rq_pos(rq) < blk_rq_pos(__rq)) 367 367 p = &(*p)->rb_left; 368 - else if (blk_rq_pos(rq) > blk_rq_pos(__rq)) 368 + else if (blk_rq_pos(rq) >= blk_rq_pos(__rq)) 369 369 p = &(*p)->rb_right; 370 - else 371 - return __rq; 372 370 } 373 371 374 372 rb_link_node(&rq->rb_node, parent, p); 375 373 rb_insert_color(&rq->rb_node, root); 376 - return NULL; 377 374 } 378 375 EXPORT_SYMBOL(elv_rb_add); 379 376

+15 -13

block/genhd.c

··· 602 602 disk->major = MAJOR(devt); 603 603 disk->first_minor = MINOR(devt); 604 604 605 - /* Register BDI before referencing it from bdev */ 605 + /* Register BDI before referencing it from bdev */ 606 606 bdi = &disk->queue->backing_dev_info; 607 607 bdi_register_dev(bdi, disk_devt(disk)); 608 608 ··· 1140 1140 "wsect wuse running use aveq" 1141 1141 "\n\n"); 1142 1142 */ 1143 - 1143 + 1144 1144 disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); 1145 1145 while ((hd = disk_part_iter_next(&piter))) { 1146 1146 cpu = part_stat_lock(); ··· 1164 1164 ); 1165 1165 } 1166 1166 disk_part_iter_exit(&piter); 1167 - 1167 + 1168 1168 return 0; 1169 1169 } 1170 1170 ··· 1492 1492 } 1493 1493 1494 1494 /** 1495 - * disk_check_events - schedule immediate event checking 1496 - * @disk: disk to check events for 1495 + * disk_flush_events - schedule immediate event checking and flushing 1496 + * @disk: disk to check and flush events for 1497 + * @mask: events to flush 1497 1498 * 1498 - * Schedule immediate event checking on @disk if not blocked. 1499 + * Schedule immediate event checking on @disk if not blocked. Events in 1500 + * @mask are scheduled to be cleared from the driver. Note that this 1501 + * doesn't clear the events from @disk->ev. 1499 1502 * 1500 1503 * CONTEXT: 1501 - * Don't care. Safe to call from irq context. 1504 + * If @mask is non-zero must be called with bdev->bd_mutex held. 1502 1505 */ 1503 - void disk_check_events(struct gendisk *disk) 1506 + void disk_flush_events(struct gendisk *disk, unsigned int mask) 1504 1507 { 1505 1508 struct disk_events *ev = disk->ev; 1506 - unsigned long flags; 1507 1509 1508 1510 if (!ev) 1509 1511 return; 1510 1512 1511 - spin_lock_irqsave(&ev->lock, flags); 1513 + spin_lock_irq(&ev->lock); 1514 + ev->clearing |= mask; 1512 1515 if (!ev->block) { 1513 1516 cancel_delayed_work(&ev->dwork); 1514 1517 queue_delayed_work(system_nrt_wq, &ev->dwork, 0); 1515 1518 } 1516 - spin_unlock_irqrestore(&ev->lock, flags); 1519 + spin_unlock_irq(&ev->lock); 1517 1520 } 1518 - EXPORT_SYMBOL_GPL(disk_check_events); 1519 1521 1520 1522 /** 1521 1523 * disk_clear_events - synchronously check, clear and return pending events ··· 1707 1705 mutex_lock(&disk_events_mutex); 1708 1706 1709 1707 list_for_each_entry(ev, &disk_events, node) 1710 - disk_check_events(ev->disk); 1708 + disk_flush_events(ev->disk, 0); 1711 1709 1712 1710 mutex_unlock(&disk_events_mutex); 1713 1711

+14 -9

fs/block_dev.c

··· 1448 1448 1449 1449 int blkdev_put(struct block_device *bdev, fmode_t mode) 1450 1450 { 1451 + mutex_lock(&bdev->bd_mutex); 1452 + 1451 1453 if (mode & FMODE_EXCL) { 1452 1454 bool bdev_free; 1453 1455 ··· 1458 1456 * are protected with bdev_lock. bd_mutex is to 1459 1457 * synchronize disk_holder unlinking. 1460 1458 */ 1461 - mutex_lock(&bdev->bd_mutex); 1462 1459 spin_lock(&bdev_lock); 1463 1460 1464 1461 WARN_ON_ONCE(--bdev->bd_holders < 0); ··· 1475 1474 * If this was the last claim, remove holder link and 1476 1475 * unblock evpoll if it was a write holder. 1477 1476 */ 1478 - if (bdev_free) { 1479 - if (bdev->bd_write_holder) { 1480 - disk_unblock_events(bdev->bd_disk); 1481 - disk_check_events(bdev->bd_disk); 1482 - bdev->bd_write_holder = false; 1483 - } 1477 + if (bdev_free && bdev->bd_write_holder) { 1478 + disk_unblock_events(bdev->bd_disk); 1479 + bdev->bd_write_holder = false; 1484 1480 } 1485 - 1486 - mutex_unlock(&bdev->bd_mutex); 1487 1481 } 1482 + 1483 + /* 1484 + * Trigger event checking and tell drivers to flush MEDIA_CHANGE 1485 + * event. This is to ensure detection of media removal commanded 1486 + * from userland - e.g. eject(1). 1487 + */ 1488 + disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); 1489 + 1490 + mutex_unlock(&bdev->bd_mutex); 1488 1491 1489 1492 return __blkdev_put(bdev, mode, 0); 1490 1493 }

+5

fs/compat_ioctl.c

··· 68 68 69 69 #ifdef CONFIG_BLOCK 70 70 #include <linux/loop.h> 71 + #include <linux/cdrom.h> 72 + #include <linux/fd.h> 71 73 #include <scsi/scsi.h> 72 74 #include <scsi/scsi_ioctl.h> 73 75 #include <scsi/sg.h> ··· 946 944 IGNORE_IOCTL(LOOP_CLR_FD) 947 945 /* md calls this on random blockdevs */ 948 946 IGNORE_IOCTL(RAID_VERSION) 947 + /* qemu/qemu-img might call these two on plain files for probing */ 948 + IGNORE_IOCTL(CDROM_DRIVE_STATUS) 949 + IGNORE_IOCTL(FDGETPRM32) 949 950 /* SG stuff */ 950 951 COMPATIBLE_IOCTL(SG_SET_TIMEOUT) 951 952 COMPATIBLE_IOCTL(SG_GET_TIMEOUT)

+6 -6

fs/partitions/check.c

··· 237 237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 238 238 } 239 239 240 - ssize_t part_ro_show(struct device *dev, 241 - struct device_attribute *attr, char *buf) 240 + static ssize_t part_ro_show(struct device *dev, 241 + struct device_attribute *attr, char *buf) 242 242 { 243 243 struct hd_struct *p = dev_to_part(dev); 244 244 return sprintf(buf, "%d\n", p->policy ? 1 : 0); 245 245 } 246 246 247 - ssize_t part_alignment_offset_show(struct device *dev, 248 - struct device_attribute *attr, char *buf) 247 + static ssize_t part_alignment_offset_show(struct device *dev, 248 + struct device_attribute *attr, char *buf) 249 249 { 250 250 struct hd_struct *p = dev_to_part(dev); 251 251 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); 252 252 } 253 253 254 - ssize_t part_discard_alignment_show(struct device *dev, 255 - struct device_attribute *attr, char *buf) 254 + static ssize_t part_discard_alignment_show(struct device *dev, 255 + struct device_attribute *attr, char *buf) 256 256 { 257 257 struct hd_struct *p = dev_to_part(dev); 258 258 return sprintf(buf, "%u\n", p->discard_alignment);

-13

fs/reiserfs/journal.c

··· 678 678 static void write_chunk(struct buffer_chunk *chunk) 679 679 { 680 680 int i; 681 - get_fs_excl(); 682 681 for (i = 0; i < chunk->nr; i++) { 683 682 submit_logged_buffer(chunk->bh[i]); 684 683 } 685 684 chunk->nr = 0; 686 - put_fs_excl(); 687 685 } 688 686 689 687 static void write_ordered_chunk(struct buffer_chunk *chunk) 690 688 { 691 689 int i; 692 - get_fs_excl(); 693 690 for (i = 0; i < chunk->nr; i++) { 694 691 submit_ordered_buffer(chunk->bh[i]); 695 692 } 696 693 chunk->nr = 0; 697 - put_fs_excl(); 698 694 } 699 695 700 696 static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, ··· 982 986 return 0; 983 987 } 984 988 985 - get_fs_excl(); 986 - 987 989 /* before we can put our commit blocks on disk, we have to make sure everyone older than 988 990 ** us is on disk too 989 991 */ ··· 1139 1145 if (retval) 1140 1146 reiserfs_abort(s, retval, "Journal write error in %s", 1141 1147 __func__); 1142 - put_fs_excl(); 1143 1148 return retval; 1144 1149 } 1145 1150 ··· 1367 1374 return 0; 1368 1375 } 1369 1376 1370 - get_fs_excl(); 1371 - 1372 1377 /* if all the work is already done, get out of here */ 1373 1378 if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 1374 1379 atomic_read(&(jl->j_commit_left)) <= 0) { ··· 1588 1597 put_journal_list(s, jl); 1589 1598 if (flushall) 1590 1599 mutex_unlock(&journal->j_flush_mutex); 1591 - put_fs_excl(); 1592 1600 return err; 1593 1601 } 1594 1602 ··· 3098 3108 th->t_trans_id = journal->j_trans_id; 3099 3109 unlock_journal(sb); 3100 3110 INIT_LIST_HEAD(&th->t_list); 3101 - get_fs_excl(); 3102 3111 return 0; 3103 3112 3104 3113 out_fail: ··· 3953 3964 flush = flags & FLUSH_ALL; 3954 3965 wait_on_commit = flags & WAIT; 3955 3966 3956 - put_fs_excl(); 3957 3967 current->journal_info = th->t_handle_save; 3958 3968 reiserfs_check_lock_depth(sb, "journal end"); 3959 3969 if (journal->j_len == 0) { ··· 4304 4316 dump_stack(); 4305 4317 #endif 4306 4318 } 4307 -

-4

fs/super.c

··· 351 351 */ 352 352 void lock_super(struct super_block * sb) 353 353 { 354 - get_fs_excl(); 355 354 mutex_lock(&sb->s_lock); 356 355 } 357 356 358 357 void unlock_super(struct super_block * sb) 359 358 { 360 - put_fs_excl(); 361 359 mutex_unlock(&sb->s_lock); 362 360 } 363 361 ··· 383 385 if (sb->s_root) { 384 386 shrink_dcache_for_umount(sb); 385 387 sync_filesystem(sb); 386 - get_fs_excl(); 387 388 sb->s_flags &= ~MS_ACTIVE; 388 389 389 390 fsnotify_unmount_inodes(&sb->s_inodes); ··· 397 400 "Self-destruct in 5 seconds. Have a nice day...\n", 398 401 sb->s_id); 399 402 } 400 - put_fs_excl(); 401 403 } 402 404 spin_lock(&sb_lock); 403 405 /* should be initialized for __put_super_and_need_restart() */

+19 -10

include/linux/blkdev.h

··· 73 73 74 74 /* 75 75 * try to put the fields that are referenced together in the same cacheline. 76 - * if you modify this structure, be sure to check block/blk-core.c:rq_init() 76 + * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() 77 77 * as well! 78 78 */ 79 79 struct request { ··· 260 260 unsigned char discard_zeroes_data; 261 261 }; 262 262 263 - struct request_queue 264 - { 263 + struct request_queue { 265 264 /* 266 265 * Together with queue_head for cacheline sharing 267 266 */ ··· 303 304 void *queuedata; 304 305 305 306 /* 306 - * queue needs bounce pages for pages above this limit 307 - */ 308 - gfp_t bounce_gfp; 309 - 310 - /* 311 307 * various queue flags, see QUEUE_* below 312 308 */ 313 309 unsigned long queue_flags; 310 + 311 + /* 312 + * queue needs bounce pages for pages above this limit 313 + */ 314 + gfp_t bounce_gfp; 314 315 315 316 /* 316 317 * protects queue structures from reentrancy. ->__queue_lock should ··· 333 334 unsigned int nr_congestion_off; 334 335 unsigned int nr_batching; 335 336 336 - void *dma_drain_buffer; 337 337 unsigned int dma_drain_size; 338 + void *dma_drain_buffer; 338 339 unsigned int dma_pad_mask; 339 340 unsigned int dma_alignment; 340 341 ··· 392 393 #define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ 393 394 #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ 394 395 #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ 395 - #define QUEUE_FLAG_SAME_COMP 9 /* force complete on same CPU */ 396 + #define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ 396 397 #define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */ 397 398 #define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */ 398 399 #define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */ ··· 402 403 #define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ 403 404 #define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ 404 405 #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ 406 + #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ 405 407 406 408 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 407 409 (1 << QUEUE_FLAG_STACKABLE) | \ ··· 857 857 struct request_queue *blk_alloc_queue_node(gfp_t, int); 858 858 extern void blk_put_queue(struct request_queue *); 859 859 860 + /* 861 + * Note: Code in between changing the blk_plug list/cb_list or element of such 862 + * lists is preemptable, but such code can't do sleep (or be very careful), 863 + * otherwise data is corrupted. For details, please check schedule() where 864 + * blk_schedule_flush_plug() is called. 865 + */ 860 866 struct blk_plug { 861 867 unsigned long magic; 862 868 struct list_head list; 863 869 struct list_head cb_list; 864 870 unsigned int should_sort; 871 + unsigned int count; 865 872 }; 873 + #define BLK_MAX_REQUEST_COUNT 16 874 + 866 875 struct blk_plug_cb { 867 876 struct list_head list; 868 877 void (*callback)(struct blk_plug_cb *);

+1 -1

include/linux/elevator.h

··· 146 146 /* 147 147 * rb support functions. 148 148 */ 149 - extern struct request *elv_rb_add(struct rb_root *, struct request *); 149 + extern void elv_rb_add(struct rb_root *, struct request *); 150 150 extern void elv_rb_del(struct rb_root *, struct request *); 151 151 extern struct request *elv_rb_find(struct rb_root *, sector_t); 152 152

+22

include/linux/fd.h

··· 377 377 #define FDEJECT _IO(2, 0x5a) 378 378 /* eject the disk */ 379 379 380 + 381 + #ifdef __KERNEL__ 382 + #ifdef CONFIG_COMPAT 383 + #include <linux/compat.h> 384 + 385 + struct compat_floppy_struct { 386 + compat_uint_t size; 387 + compat_uint_t sect; 388 + compat_uint_t head; 389 + compat_uint_t track; 390 + compat_uint_t stretch; 391 + unsigned char gap; 392 + unsigned char rate; 393 + unsigned char spec1; 394 + unsigned char fmt_gap; 395 + const compat_caddr_t name; 396 + }; 397 + 398 + #define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct) 399 + #endif 400 + #endif 401 + 380 402 #endif

-4

include/linux/fs.h

··· 1469 1469 #define vfs_check_frozen(sb, level) \ 1470 1470 wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) 1471 1471 1472 - #define get_fs_excl() atomic_inc(&current->fs_excl) 1473 - #define put_fs_excl() atomic_dec(&current->fs_excl) 1474 - #define has_fs_excl() atomic_read(&current->fs_excl) 1475 - 1476 1472 /* 1477 1473 * until VFS tracks user namespaces for inodes, just make all files 1478 1474 * belong to init_user_ns

+1 -1

include/linux/genhd.h

··· 420 420 421 421 extern void disk_block_events(struct gendisk *disk); 422 422 extern void disk_unblock_events(struct gendisk *disk); 423 - extern void disk_check_events(struct gendisk *disk); 423 + extern void disk_flush_events(struct gendisk *disk, unsigned int mask); 424 424 extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); 425 425 426 426 /* drivers/char/random.c */

-1

include/linux/init_task.h

··· 176 176 .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ 177 177 .journal_info = NULL, \ 178 178 .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ 179 - .fs_excl = ATOMIC_INIT(0), \ 180 179 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ 181 180 .timer_slack_ns = 50000, /* 50 usec default slack */ \ 182 181 .pids = { \

+9 -5

include/linux/iocontext.h

··· 5 5 #include <linux/rcupdate.h> 6 6 7 7 struct cfq_queue; 8 + struct cfq_ttime { 9 + unsigned long last_end_request; 10 + 11 + unsigned long ttime_total; 12 + unsigned long ttime_samples; 13 + unsigned long ttime_mean; 14 + }; 15 + 8 16 struct cfq_io_context { 9 17 void *key; 10 18 ··· 20 12 21 13 struct io_context *ioc; 22 14 23 - unsigned long last_end_request; 24 - 25 - unsigned long ttime_total; 26 - unsigned long ttime_samples; 27 - unsigned long ttime_mean; 15 + struct cfq_ttime ttime; 28 16 29 17 struct list_head queue_list; 30 18 struct hlist_node cic_list;

-1

include/linux/sched.h

··· 1512 1512 short il_next; 1513 1513 short pref_node_fork; 1514 1514 #endif 1515 - atomic_t fs_excl; /* holding fs exclusive resources */ 1516 1515 struct rcu_head rcu; 1517 1516 1518 1517 /*

-1

kernel/exit.c

··· 898 898 899 899 profile_task_exit(tsk); 900 900 901 - WARN_ON(atomic_read(&tsk->fs_excl)); 902 901 WARN_ON(blk_needs_flush_plug(tsk)); 903 902 904 903 if (unlikely(in_interrupt()))

-1

kernel/fork.c

··· 290 290 291 291 /* One for us, one for whoever does the "release_task()" (usually parent) */ 292 292 atomic_set(&tsk->usage,2); 293 - atomic_set(&tsk->fs_excl, 0); 294 293 #ifdef CONFIG_BLK_DEV_IO_TRACE 295 294 tsk->btrace_seq = 0; 296 295 #endif

+1 -1

mm/backing-dev.c

··· 505 505 list_del_rcu(&bdi->bdi_list); 506 506 spin_unlock_bh(&bdi_lock); 507 507 508 - synchronize_rcu(); 508 + synchronize_rcu_expedited(); 509 509 } 510 510 511 511 int bdi_register(struct backing_dev_info *bdi, struct device *parent,