Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block

* 'for-linus' of git://git.kernel.dk/linux-2.6-block: (28 commits)
cfq-iosched: add close cooperator code
cfq-iosched: log responsible 'cfqq' in idle timer arm
cfq-iosched: tweak kick logic a bit more
cfq-iosched: no need to save interrupts in cfq_kick_queue()
brd: fix cacheflushing
brd: support barriers
swap: Remove code handling bio_alloc failure with __GFP_WAIT
gfs2: Remove code handling bio_alloc failure with __GFP_WAIT
ext4: Remove code handling bio_alloc failure with __GFP_WAIT
dio: Remove code handling bio_alloc failure with __GFP_WAIT
block: Remove code handling bio_alloc failure with __GFP_WAIT
bio: add documentation to bio_alloc()
splice: add helpers for locking pipe inode
splice: remove generic_file_splice_write_nolock()
ocfs2: fix i_mutex locking in ocfs2_splice_to_file()
splice: fix i_mutex locking in generic_splice_write()
splice: remove i_mutex locking in splice_from_pipe()
splice: split up __splice_from_pipe()
block: fix SG_IO to return a proper error value
cfq-iosched: don't delay queue kick for a merged request
...

+826 -534
+6 -13
Documentation/block/biodoc.txt
··· 1040 1040 iii. Plugging the queue to batch requests in anticipation of opportunities for 1041 1041 merge/sort optimizations 1042 1042 1043 - This is just the same as in 2.4 so far, though per-device unplugging 1044 - support is anticipated for 2.5. Also with a priority-based i/o scheduler, 1045 - such decisions could be based on request priorities. 1046 - 1047 1043 Plugging is an approach that the current i/o scheduling algorithm resorts to so 1048 1044 that it collects up enough requests in the queue to be able to take 1049 1045 advantage of the sorting/merging logic in the elevator. If the 1050 1046 queue is empty when a request comes in, then it plugs the request queue 1051 - (sort of like plugging the bottom of a vessel to get fluid to build up) 1047 + (sort of like plugging the bath tub of a vessel to get fluid to build up) 1052 1048 till it fills up with a few more requests, before starting to service 1053 1049 the requests. This provides an opportunity to merge/sort the requests before 1054 1050 passing them down to the device. There are various conditions when the queue is 1055 1051 unplugged (to open up the flow again), either through a scheduled task or 1056 1052 could be on demand. For example wait_on_buffer sets the unplugging going 1057 - (by running tq_disk) so the read gets satisfied soon. So in the read case, 1058 - the queue gets explicitly unplugged as part of waiting for completion, 1059 - in fact all queues get unplugged as a side-effect. 1053 + through sync_buffer() running blk_run_address_space(mapping). Or the caller 1054 + can do it explicity through blk_unplug(bdev). So in the read case, 1055 + the queue gets explicitly unplugged as part of waiting for completion on that 1056 + buffer. For page driven IO, the address space ->sync_page() takes care of 1057 + doing the blk_run_address_space(). 1060 1058 1061 1059 Aside: 1062 1060 This is kind of controversial territory, as it's not clear if plugging is ··· 1064 1066 balance between when to plug and when to open up. Also now that we have 1065 1067 multi-page bios being queued in one shot, we may not need to wait to merge 1066 1068 a big request from the broken up pieces coming by. 1067 - 1068 - Per-queue granularity unplugging (still a Todo) may help reduce some of the 1069 - concerns with just a single tq_disk flush approach. Something like 1070 - blk_kick_queue() to unplug a specific queue (right away ?) 1071 - or optionally, all queues, is in the plan. 1072 1069 1073 1070 4.4 I/O contexts 1074 1071 I/O contexts provide a dynamically allocated per process data area. They may
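A minimal, hedged sketch of the explicit unplug the text describes, using the blk_run_address_space() helper it names; the surrounding function and its arguments are illustrative only, not from the patch:

#include <linux/blkdev.h>
#include <linux/completion.h>

static void example_kick_and_wait(struct block_device *bdev,
				  struct completion *done)
{
	/*
	 * The request we just queued may be sitting behind a plugged
	 * queue. Kick the queue so it is dispatched now instead of
	 * waiting for the unplug timer or threshold to fire.
	 */
	blk_run_address_space(bdev->bd_inode->i_mapping);
	wait_for_completion(done);
}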
+57 -59
block/as-iosched.c
··· 17 17 #include <linux/rbtree.h> 18 18 #include <linux/interrupt.h> 19 19 20 - #define REQ_SYNC 1 21 - #define REQ_ASYNC 0 22 - 23 20 /* 24 21 * See Documentation/block/as-iosched.txt 25 22 */ ··· 90 93 struct list_head fifo_list[2]; 91 94 92 95 struct request *next_rq[2]; /* next in sort order */ 93 - sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */ 96 + sector_t last_sector[2]; /* last SYNC & ASYNC sectors */ 94 97 95 98 unsigned long exit_prob; /* probability a task will exit while 96 99 being waited on */ ··· 106 109 unsigned long last_check_fifo[2]; 107 110 int changed_batch; /* 1: waiting for old batch to end */ 108 111 int new_batch; /* 1: waiting on first read complete */ 109 - int batch_data_dir; /* current batch REQ_SYNC / REQ_ASYNC */ 112 + int batch_data_dir; /* current batch SYNC / ASYNC */ 110 113 int write_batch_count; /* max # of reqs in a write batch */ 111 114 int current_write_count; /* how many requests left this batch */ 112 115 int write_batch_idled; /* has the write batch gone idle? */ ··· 551 554 if (aic == NULL) 552 555 return; 553 556 554 - if (data_dir == REQ_SYNC) { 557 + if (data_dir == BLK_RW_SYNC) { 555 558 unsigned long in_flight = atomic_read(&aic->nr_queued) 556 559 + atomic_read(&aic->nr_dispatched); 557 560 spin_lock(&aic->lock); ··· 808 811 */ 809 812 static void update_write_batch(struct as_data *ad) 810 813 { 811 - unsigned long batch = ad->batch_expire[REQ_ASYNC]; 814 + unsigned long batch = ad->batch_expire[BLK_RW_ASYNC]; 812 815 long write_time; 813 816 814 817 write_time = (jiffies - ad->current_batch_expires) + batch; ··· 852 855 kblockd_schedule_work(q, &ad->antic_work); 853 856 ad->changed_batch = 0; 854 857 855 - if (ad->batch_data_dir == REQ_SYNC) 858 + if (ad->batch_data_dir == BLK_RW_SYNC) 856 859 ad->new_batch = 1; 857 860 } 858 861 WARN_ON(ad->nr_dispatched == 0); ··· 866 869 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { 867 870 update_write_batch(ad); 868 871 ad->current_batch_expires = jiffies + 869 - ad->batch_expire[REQ_SYNC]; 872 + ad->batch_expire[BLK_RW_SYNC]; 870 873 ad->new_batch = 0; 871 874 } 872 875 ··· 957 960 if (ad->changed_batch || ad->new_batch) 958 961 return 0; 959 962 960 - if (ad->batch_data_dir == REQ_SYNC) 963 + if (ad->batch_data_dir == BLK_RW_SYNC) 961 964 /* TODO! add a check so a complete fifo gets written? */ 962 965 return time_after(jiffies, ad->current_batch_expires); 963 966 ··· 983 986 */ 984 987 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors; 985 988 986 - if (data_dir == REQ_SYNC) { 989 + if (data_dir == BLK_RW_SYNC) { 987 990 struct io_context *ioc = RQ_IOC(rq); 988 991 /* In case we have to anticipate after this */ 989 992 copy_io_context(&ad->io_context, &ioc); ··· 1022 1025 static int as_dispatch_request(struct request_queue *q, int force) 1023 1026 { 1024 1027 struct as_data *ad = q->elevator->elevator_data; 1025 - const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]); 1026 - const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]); 1028 + const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]); 1029 + const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]); 1027 1030 struct request *rq; 1028 1031 1029 1032 if (unlikely(force)) { 1030 1033 /* 1031 1034 * Forced dispatch, accounting is useless. Reset 1032 1035 * accounting states and dump fifo_lists. 
Note that 1033 - * batch_data_dir is reset to REQ_SYNC to avoid 1036 + * batch_data_dir is reset to BLK_RW_SYNC to avoid 1034 1037 * screwing write batch accounting as write batch 1035 1038 * accounting occurs on W->R transition. 1036 1039 */ 1037 1040 int dispatched = 0; 1038 1041 1039 - ad->batch_data_dir = REQ_SYNC; 1042 + ad->batch_data_dir = BLK_RW_SYNC; 1040 1043 ad->changed_batch = 0; 1041 1044 ad->new_batch = 0; 1042 1045 1043 - while (ad->next_rq[REQ_SYNC]) { 1044 - as_move_to_dispatch(ad, ad->next_rq[REQ_SYNC]); 1046 + while (ad->next_rq[BLK_RW_SYNC]) { 1047 + as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]); 1045 1048 dispatched++; 1046 1049 } 1047 - ad->last_check_fifo[REQ_SYNC] = jiffies; 1050 + ad->last_check_fifo[BLK_RW_SYNC] = jiffies; 1048 1051 1049 - while (ad->next_rq[REQ_ASYNC]) { 1050 - as_move_to_dispatch(ad, ad->next_rq[REQ_ASYNC]); 1052 + while (ad->next_rq[BLK_RW_ASYNC]) { 1053 + as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]); 1051 1054 dispatched++; 1052 1055 } 1053 - ad->last_check_fifo[REQ_ASYNC] = jiffies; 1056 + ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; 1054 1057 1055 1058 return dispatched; 1056 1059 } 1057 1060 1058 1061 /* Signal that the write batch was uncontended, so we can't time it */ 1059 - if (ad->batch_data_dir == REQ_ASYNC && !reads) { 1062 + if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) { 1060 1063 if (ad->current_write_count == 0 || !writes) 1061 1064 ad->write_batch_idled = 1; 1062 1065 } ··· 1073 1076 */ 1074 1077 rq = ad->next_rq[ad->batch_data_dir]; 1075 1078 1076 - if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) { 1077 - if (as_fifo_expired(ad, REQ_SYNC)) 1079 + if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) { 1080 + if (as_fifo_expired(ad, BLK_RW_SYNC)) 1078 1081 goto fifo_expired; 1079 1082 1080 1083 if (as_can_anticipate(ad, rq)) { ··· 1087 1090 /* we have a "next request" */ 1088 1091 if (reads && !writes) 1089 1092 ad->current_batch_expires = 1090 - jiffies + ad->batch_expire[REQ_SYNC]; 1093 + jiffies + ad->batch_expire[BLK_RW_SYNC]; 1091 1094 goto dispatch_request; 1092 1095 } 1093 1096 } ··· 1098 1101 */ 1099 1102 1100 1103 if (reads) { 1101 - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_SYNC])); 1104 + BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC])); 1102 1105 1103 - if (writes && ad->batch_data_dir == REQ_SYNC) 1106 + if (writes && ad->batch_data_dir == BLK_RW_SYNC) 1104 1107 /* 1105 1108 * Last batch was a read, switch to writes 1106 1109 */ 1107 1110 goto dispatch_writes; 1108 1111 1109 - if (ad->batch_data_dir == REQ_ASYNC) { 1112 + if (ad->batch_data_dir == BLK_RW_ASYNC) { 1110 1113 WARN_ON(ad->new_batch); 1111 1114 ad->changed_batch = 1; 1112 1115 } 1113 - ad->batch_data_dir = REQ_SYNC; 1114 - rq = rq_entry_fifo(ad->fifo_list[REQ_SYNC].next); 1116 + ad->batch_data_dir = BLK_RW_SYNC; 1117 + rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next); 1115 1118 ad->last_check_fifo[ad->batch_data_dir] = jiffies; 1116 1119 goto dispatch_request; 1117 1120 } ··· 1122 1125 1123 1126 if (writes) { 1124 1127 dispatch_writes: 1125 - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_ASYNC])); 1128 + BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC])); 1126 1129 1127 - if (ad->batch_data_dir == REQ_SYNC) { 1130 + if (ad->batch_data_dir == BLK_RW_SYNC) { 1128 1131 ad->changed_batch = 1; 1129 1132 1130 1133 /* ··· 1134 1137 */ 1135 1138 ad->new_batch = 0; 1136 1139 } 1137 - ad->batch_data_dir = REQ_ASYNC; 1140 + ad->batch_data_dir = BLK_RW_ASYNC; 1138 1141 ad->current_write_count = ad->write_batch_count; 1139 1142 
ad->write_batch_idled = 0; 1140 - rq = rq_entry_fifo(ad->fifo_list[REQ_ASYNC].next); 1141 - ad->last_check_fifo[REQ_ASYNC] = jiffies; 1143 + rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next); 1144 + ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; 1142 1145 goto dispatch_request; 1143 1146 } 1144 1147 ··· 1161 1164 if (ad->nr_dispatched) 1162 1165 return 0; 1163 1166 1164 - if (ad->batch_data_dir == REQ_ASYNC) 1167 + if (ad->batch_data_dir == BLK_RW_ASYNC) 1165 1168 ad->current_batch_expires = jiffies + 1166 - ad->batch_expire[REQ_ASYNC]; 1169 + ad->batch_expire[BLK_RW_ASYNC]; 1167 1170 else 1168 1171 ad->new_batch = 1; 1169 1172 ··· 1235 1238 { 1236 1239 struct as_data *ad = q->elevator->elevator_data; 1237 1240 1238 - return list_empty(&ad->fifo_list[REQ_ASYNC]) 1239 - && list_empty(&ad->fifo_list[REQ_SYNC]); 1241 + return list_empty(&ad->fifo_list[BLK_RW_ASYNC]) 1242 + && list_empty(&ad->fifo_list[BLK_RW_SYNC]); 1240 1243 } 1241 1244 1242 1245 static int ··· 1343 1346 del_timer_sync(&ad->antic_timer); 1344 1347 cancel_work_sync(&ad->antic_work); 1345 1348 1346 - BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); 1347 - BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); 1349 + BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC])); 1350 + BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC])); 1348 1351 1349 1352 put_io_context(ad->io_context); 1350 1353 kfree(ad); ··· 1369 1372 init_timer(&ad->antic_timer); 1370 1373 INIT_WORK(&ad->antic_work, as_work_handler); 1371 1374 1372 - INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]); 1373 - INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]); 1374 - ad->sort_list[REQ_SYNC] = RB_ROOT; 1375 - ad->sort_list[REQ_ASYNC] = RB_ROOT; 1376 - ad->fifo_expire[REQ_SYNC] = default_read_expire; 1377 - ad->fifo_expire[REQ_ASYNC] = default_write_expire; 1375 + INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]); 1376 + INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]); 1377 + ad->sort_list[BLK_RW_SYNC] = RB_ROOT; 1378 + ad->sort_list[BLK_RW_ASYNC] = RB_ROOT; 1379 + ad->fifo_expire[BLK_RW_SYNC] = default_read_expire; 1380 + ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire; 1378 1381 ad->antic_expire = default_antic_expire; 1379 - ad->batch_expire[REQ_SYNC] = default_read_batch_expire; 1380 - ad->batch_expire[REQ_ASYNC] = default_write_batch_expire; 1382 + ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire; 1383 + ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire; 1381 1384 1382 - ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC]; 1383 - ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10; 1385 + ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC]; 1386 + ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10; 1384 1387 if (ad->write_batch_count < 2) 1385 1388 ad->write_batch_count = 2; 1386 1389 ··· 1429 1432 struct as_data *ad = e->elevator_data; \ 1430 1433 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ 1431 1434 } 1432 - SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]); 1433 - SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]); 1435 + SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]); 1436 + SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]); 1434 1437 SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire); 1435 - SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]); 1436 - SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]); 1438 + SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]); 1439 + 
SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]); 1437 1440 #undef SHOW_FUNCTION 1438 1441 1439 1442 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ ··· 1448 1451 *(__PTR) = msecs_to_jiffies(*(__PTR)); \ 1449 1452 return ret; \ 1450 1453 } 1451 - STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); 1452 - STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX); 1454 + STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX); 1455 + STORE_FUNCTION(as_write_expire_store, 1456 + &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX); 1453 1457 STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX); 1454 1458 STORE_FUNCTION(as_read_batch_expire_store, 1455 - &ad->batch_expire[REQ_SYNC], 0, INT_MAX); 1459 + &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX); 1456 1460 STORE_FUNCTION(as_write_batch_expire_store, 1457 - &ad->batch_expire[REQ_ASYNC], 0, INT_MAX); 1461 + &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX); 1458 1462 #undef STORE_FUNCTION 1459 1463 1460 1464 #define AS_ATTR(name) \
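The rename above is mechanical: the scheduler's per-direction arrays are now indexed with the BLK_RW_SYNC/BLK_RW_ASYNC constants the block layer already provides, instead of private REQ_SYNC/REQ_ASYNC defines. A hedged sketch of the indexing pattern (the struct and values here are illustrative, not from as-iosched.c):

#include <linux/blkdev.h>	/* BLK_RW_SYNC, BLK_RW_ASYNC */
#include <linux/list.h>

struct example_sched_data {
	/* one FIFO per direction, indexed by BLK_RW_SYNC / BLK_RW_ASYNC */
	struct list_head fifo_list[2];
	unsigned long fifo_expire[2];
};

static void example_sched_init(struct example_sched_data *d)
{
	INIT_LIST_HEAD(&d->fifo_list[BLK_RW_SYNC]);
	INIT_LIST_HEAD(&d->fifo_list[BLK_RW_ASYNC]);
	d->fifo_expire[BLK_RW_SYNC] = HZ / 8;	/* arbitrary example values */
	d->fifo_expire[BLK_RW_ASYNC] = HZ / 4;
}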
-3
block/blk-barrier.c
··· 319 319 return -ENXIO; 320 320 321 321 bio = bio_alloc(GFP_KERNEL, 0); 322 - if (!bio) 323 - return -ENOMEM; 324 - 325 322 bio->bi_end_io = bio_end_empty_barrier; 326 323 bio->bi_private = &wait; 327 324 bio->bi_bdev = bdev;
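This hunk and the swap/gfs2/ext4/dio hunks below all rely on the same guarantee, spelled out in the new fs/bio.c comment further down: bio_alloc() with __GFP_WAIT in the mask is backed by a mempool and does not return NULL. A hedged sketch of the resulting pattern (function names here are illustrative):

#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/completion.h>

static void example_barrier_end_io(struct bio *bio, int err)
{
	complete(bio->bi_private);	/* wake the waiter, which drops the bio */
}

static void example_issue_empty_barrier(struct block_device *bdev)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	struct bio *bio;

	/*
	 * GFP_KERNEL includes __GFP_WAIT, so bio_alloc() dips into its
	 * mempool under memory pressure and never returns NULL; no
	 * -ENOMEM path is needed.
	 */
	bio = bio_alloc(GFP_KERNEL, 0);
	bio->bi_end_io = example_barrier_end_io;
	bio->bi_private = &wait;
	bio->bi_bdev = bdev;

	submit_bio(WRITE_BARRIER, bio);
	wait_for_completion(&wait);
	bio_put(bio);
}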
+2 -2
block/blk-sysfs.c
··· 209 209 ssize_t ret = queue_var_store(&stats, page, count); 210 210 211 211 spin_lock_irq(q->queue_lock); 212 - elv_quisce_start(q); 212 + elv_quiesce_start(q); 213 213 214 214 if (stats) 215 215 queue_flag_set(QUEUE_FLAG_IO_STAT, q); 216 216 else 217 217 queue_flag_clear(QUEUE_FLAG_IO_STAT, q); 218 218 219 - elv_quisce_end(q); 219 + elv_quiesce_end(q); 220 220 spin_unlock_irq(q->queue_lock); 221 221 222 222 return ret;
+2 -2
block/blk.h
··· 70 70 71 71 int blk_dev_init(void); 72 72 73 - void elv_quisce_start(struct request_queue *q); 74 - void elv_quisce_end(struct request_queue *q); 73 + void elv_quiesce_start(struct request_queue *q); 74 + void elv_quiesce_end(struct request_queue *q); 75 75 76 76 77 77 /*
+225 -45
block/cfq-iosched.c
··· 56 56 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 57 57 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 58 58 59 - #define ASYNC (0) 60 - #define SYNC (1) 61 - 62 59 #define sample_valid(samples) ((samples) > 80) 63 60 64 61 /* ··· 80 83 * rr list of queues with requests and the count of them 81 84 */ 82 85 struct cfq_rb_root service_tree; 86 + 87 + /* 88 + * Each priority tree is sorted by next_request position. These 89 + * trees are used when determining if two or more queues are 90 + * interleaving requests (see cfq_close_cooperator). 91 + */ 92 + struct rb_root prio_trees[CFQ_PRIO_LISTS]; 93 + 83 94 unsigned int busy_queues; 84 95 /* 85 96 * Used to track any pending rt requests so we can pre-empt current ··· 152 147 struct rb_node rb_node; 153 148 /* service_tree key */ 154 149 unsigned long rb_key; 150 + /* prio tree member */ 151 + struct rb_node p_node; 155 152 /* sorted list of pending requests */ 156 153 struct rb_root sort_list; 157 154 /* if fifo isn't expired, next request to serve */ ··· 192 185 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 193 186 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 194 187 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 188 + CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ 195 189 }; 196 190 197 191 #define CFQ_CFQQ_FNS(name) \ ··· 219 211 CFQ_CFQQ_FNS(prio_changed); 220 212 CFQ_CFQQ_FNS(slice_new); 221 213 CFQ_CFQQ_FNS(sync); 214 + CFQ_CFQQ_FNS(coop); 222 215 #undef CFQ_CFQQ_FNS 223 216 224 217 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ ··· 428 419 return NULL; 429 420 } 430 421 422 + static void rb_erase_init(struct rb_node *n, struct rb_root *root) 423 + { 424 + rb_erase(n, root); 425 + RB_CLEAR_NODE(n); 426 + } 427 + 431 428 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 432 429 { 433 430 if (root->left == n) 434 431 root->left = NULL; 435 - 436 - rb_erase(n, &root->rb); 437 - RB_CLEAR_NODE(n); 432 + rb_erase_init(n, &root->rb); 438 433 } 439 434 440 435 /* ··· 483 470 * requests waiting to be processed. It is sorted in the order that 484 471 * we will service the queues. 485 472 */ 486 - static void cfq_service_tree_add(struct cfq_data *cfqd, 487 - struct cfq_queue *cfqq, int add_front) 473 + static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 474 + int add_front) 488 475 { 489 476 struct rb_node **p, *parent; 490 477 struct cfq_queue *__cfqq; ··· 557 544 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 558 545 } 559 546 547 + static struct cfq_queue * 548 + cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector, 549 + struct rb_node **ret_parent, struct rb_node ***rb_link) 550 + { 551 + struct rb_root *root = &cfqd->prio_trees[ioprio]; 552 + struct rb_node **p, *parent; 553 + struct cfq_queue *cfqq = NULL; 554 + 555 + parent = NULL; 556 + p = &root->rb_node; 557 + while (*p) { 558 + struct rb_node **n; 559 + 560 + parent = *p; 561 + cfqq = rb_entry(parent, struct cfq_queue, p_node); 562 + 563 + /* 564 + * Sort strictly based on sector. Smallest to the left, 565 + * largest to the right. 
566 + */ 567 + if (sector > cfqq->next_rq->sector) 568 + n = &(*p)->rb_right; 569 + else if (sector < cfqq->next_rq->sector) 570 + n = &(*p)->rb_left; 571 + else 572 + break; 573 + p = n; 574 + } 575 + 576 + *ret_parent = parent; 577 + if (rb_link) 578 + *rb_link = p; 579 + return NULL; 580 + } 581 + 582 + static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 583 + { 584 + struct rb_root *root = &cfqd->prio_trees[cfqq->ioprio]; 585 + struct rb_node **p, *parent; 586 + struct cfq_queue *__cfqq; 587 + 588 + if (!RB_EMPTY_NODE(&cfqq->p_node)) 589 + rb_erase_init(&cfqq->p_node, root); 590 + 591 + if (cfq_class_idle(cfqq)) 592 + return; 593 + if (!cfqq->next_rq) 594 + return; 595 + 596 + __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->ioprio, cfqq->next_rq->sector, 597 + &parent, &p); 598 + BUG_ON(__cfqq); 599 + 600 + rb_link_node(&cfqq->p_node, parent, p); 601 + rb_insert_color(&cfqq->p_node, root); 602 + } 603 + 560 604 /* 561 605 * Update cfqq's position in the service tree. 562 606 */ ··· 622 552 /* 623 553 * Resorting requires the cfqq to be on the RR list already. 624 554 */ 625 - if (cfq_cfqq_on_rr(cfqq)) 555 + if (cfq_cfqq_on_rr(cfqq)) { 626 556 cfq_service_tree_add(cfqd, cfqq, 0); 557 + cfq_prio_tree_add(cfqd, cfqq); 558 + } 627 559 } 628 560 629 561 /* ··· 656 584 657 585 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 658 586 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 587 + if (!RB_EMPTY_NODE(&cfqq->p_node)) 588 + rb_erase_init(&cfqq->p_node, &cfqd->prio_trees[cfqq->ioprio]); 659 589 660 590 BUG_ON(!cfqd->busy_queues); 661 591 cfqd->busy_queues--; ··· 687 613 { 688 614 struct cfq_queue *cfqq = RQ_CFQQ(rq); 689 615 struct cfq_data *cfqd = cfqq->cfqd; 690 - struct request *__alias; 616 + struct request *__alias, *prev; 691 617 692 618 cfqq->queued[rq_is_sync(rq)]++; 693 619 ··· 704 630 /* 705 631 * check if this request is a better next-serve candidate 706 632 */ 633 + prev = cfqq->next_rq; 707 634 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 635 + 636 + /* 637 + * adjust priority tree position, if ->next_rq changes 638 + */ 639 + if (prev != cfqq->next_rq) 640 + cfq_prio_tree_add(cfqd, cfqq); 641 + 708 642 BUG_ON(!cfqq->next_rq); 709 643 } 710 644 ··· 925 843 /* 926 844 * Get and set a new active queue for service. 927 845 */ 928 - static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) 846 + static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 847 + struct cfq_queue *cfqq) 929 848 { 930 - struct cfq_queue *cfqq; 849 + if (!cfqq) { 850 + cfqq = cfq_get_next_queue(cfqd); 851 + if (cfqq) 852 + cfq_clear_cfqq_coop(cfqq); 853 + } 931 854 932 - cfqq = cfq_get_next_queue(cfqd); 933 855 __cfq_set_active_queue(cfqd, cfqq); 934 856 return cfqq; 935 857 } ··· 957 871 return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; 958 872 } 959 873 960 - static int cfq_close_cooperator(struct cfq_data *cfq_data, 961 - struct cfq_queue *cfqq) 874 + static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 875 + struct cfq_queue *cur_cfqq) 962 876 { 877 + struct rb_root *root = &cfqd->prio_trees[cur_cfqq->ioprio]; 878 + struct rb_node *parent, *node; 879 + struct cfq_queue *__cfqq; 880 + sector_t sector = cfqd->last_position; 881 + 882 + if (RB_EMPTY_ROOT(root)) 883 + return NULL; 884 + 885 + /* 886 + * First, if we find a request starting at the end of the last 887 + * request, choose it. 
888 + */ 889 + __cfqq = cfq_prio_tree_lookup(cfqd, cur_cfqq->ioprio, 890 + sector, &parent, NULL); 891 + if (__cfqq) 892 + return __cfqq; 893 + 894 + /* 895 + * If the exact sector wasn't found, the parent of the NULL leaf 896 + * will contain the closest sector. 897 + */ 898 + __cfqq = rb_entry(parent, struct cfq_queue, p_node); 899 + if (cfq_rq_close(cfqd, __cfqq->next_rq)) 900 + return __cfqq; 901 + 902 + if (__cfqq->next_rq->sector < sector) 903 + node = rb_next(&__cfqq->p_node); 904 + else 905 + node = rb_prev(&__cfqq->p_node); 906 + if (!node) 907 + return NULL; 908 + 909 + __cfqq = rb_entry(node, struct cfq_queue, p_node); 910 + if (cfq_rq_close(cfqd, __cfqq->next_rq)) 911 + return __cfqq; 912 + 913 + return NULL; 914 + } 915 + 916 + /* 917 + * cfqd - obvious 918 + * cur_cfqq - passed in so that we don't decide that the current queue is 919 + * closely cooperating with itself. 920 + * 921 + * So, basically we're assuming that that cur_cfqq has dispatched at least 922 + * one request, and that cfqd->last_position reflects a position on the disk 923 + * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 924 + * assumption. 925 + */ 926 + static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 927 + struct cfq_queue *cur_cfqq, 928 + int probe) 929 + { 930 + struct cfq_queue *cfqq; 931 + 932 + /* 933 + * A valid cfq_io_context is necessary to compare requests against 934 + * the seek_mean of the current cfqq. 935 + */ 936 + if (!cfqd->active_cic) 937 + return NULL; 938 + 963 939 /* 964 940 * We should notice if some of the queues are cooperating, eg 965 941 * working closely on the same area of the disk. In that case, 966 942 * we can group them together and don't waste time idling. 967 943 */ 968 - return 0; 944 + cfqq = cfqq_close(cfqd, cur_cfqq); 945 + if (!cfqq) 946 + return NULL; 947 + 948 + if (cfq_cfqq_coop(cfqq)) 949 + return NULL; 950 + 951 + if (!probe) 952 + cfq_mark_cfqq_coop(cfqq); 953 + return cfqq; 969 954 } 955 + 970 956 971 957 #define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) 972 958 ··· 1078 920 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1079 921 return; 1080 922 1081 - /* 1082 - * See if this prio level has a good candidate 1083 - */ 1084 - if (cfq_close_cooperator(cfqd, cfqq) && 1085 - (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2)) 1086 - return; 1087 - 1088 923 cfq_mark_cfqq_wait_request(cfqq); 1089 924 1090 925 /* ··· 1090 939 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); 1091 940 1092 941 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1093 - cfq_log(cfqd, "arm_idle: %lu", sl); 942 + cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1094 943 } 1095 944 1096 945 /* ··· 1154 1003 */ 1155 1004 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 1156 1005 { 1157 - struct cfq_queue *cfqq; 1006 + struct cfq_queue *cfqq, *new_cfqq = NULL; 1158 1007 1159 1008 cfqq = cfqd->active_queue; 1160 1009 if (!cfqq) ··· 1188 1037 goto keep_queue; 1189 1038 1190 1039 /* 1040 + * If another queue has a request waiting within our mean seek 1041 + * distance, let it run. The expire code will check for close 1042 + * cooperators and put the close queue at the front of the service 1043 + * tree. 1044 + */ 1045 + new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); 1046 + if (new_cfqq) 1047 + goto expire; 1048 + 1049 + /* 1191 1050 * No requests pending. 
If the active queue still has requests in 1192 1051 * flight or is idling for a new request, allow either of these 1193 1052 * conditions to happen (or time out) before selecting a new queue. ··· 1211 1050 expire: 1212 1051 cfq_slice_expired(cfqd, 0); 1213 1052 new_queue: 1214 - cfqq = cfq_set_active_queue(cfqd); 1053 + cfqq = cfq_set_active_queue(cfqd, new_cfqq); 1215 1054 keep_queue: 1216 1055 return cfqq; 1217 1056 } ··· 1494 1333 if (ioc->ioc_data == cic) 1495 1334 rcu_assign_pointer(ioc->ioc_data, NULL); 1496 1335 1497 - if (cic->cfqq[ASYNC]) { 1498 - cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); 1499 - cic->cfqq[ASYNC] = NULL; 1336 + if (cic->cfqq[BLK_RW_ASYNC]) { 1337 + cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 1338 + cic->cfqq[BLK_RW_ASYNC] = NULL; 1500 1339 } 1501 1340 1502 - if (cic->cfqq[SYNC]) { 1503 - cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]); 1504 - cic->cfqq[SYNC] = NULL; 1341 + if (cic->cfqq[BLK_RW_SYNC]) { 1342 + cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 1343 + cic->cfqq[BLK_RW_SYNC] = NULL; 1505 1344 } 1506 1345 } 1507 1346 ··· 1610 1449 1611 1450 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1612 1451 1613 - cfqq = cic->cfqq[ASYNC]; 1452 + cfqq = cic->cfqq[BLK_RW_ASYNC]; 1614 1453 if (cfqq) { 1615 1454 struct cfq_queue *new_cfqq; 1616 - new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc, GFP_ATOMIC); 1455 + new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 1456 + GFP_ATOMIC); 1617 1457 if (new_cfqq) { 1618 - cic->cfqq[ASYNC] = new_cfqq; 1458 + cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 1619 1459 cfq_put_queue(cfqq); 1620 1460 } 1621 1461 } 1622 1462 1623 - cfqq = cic->cfqq[SYNC]; 1463 + cfqq = cic->cfqq[BLK_RW_SYNC]; 1624 1464 if (cfqq) 1625 1465 cfq_mark_cfqq_prio_changed(cfqq); 1626 1466 ··· 1672 1510 } 1673 1511 1674 1512 RB_CLEAR_NODE(&cfqq->rb_node); 1513 + RB_CLEAR_NODE(&cfqq->p_node); 1675 1514 INIT_LIST_HEAD(&cfqq->fifo); 1676 1515 1677 1516 atomic_set(&cfqq->ref, 0); ··· 2068 1905 * Remember that we saw a request from this process, but 2069 1906 * don't start queuing just yet. Otherwise we risk seeing lots 2070 1907 * of tiny requests, because we disrupt the normal plugging 2071 - * and merging. 1908 + * and merging. If the request is already larger than a single 1909 + * page, let it rip immediately. For that case we assume that 1910 + * merging is already done. Ditto for a busy system that 1911 + * has other work pending, don't risk delaying until the 1912 + * idle timer unplug to continue working. 2072 1913 */ 2073 - if (cfq_cfqq_wait_request(cfqq)) 1914 + if (cfq_cfqq_wait_request(cfqq)) { 1915 + if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 1916 + cfqd->busy_queues > 1) { 1917 + del_timer(&cfqd->idle_slice_timer); 1918 + blk_start_queueing(cfqd->queue); 1919 + } 2074 1920 cfq_mark_cfqq_must_dispatch(cfqq); 1921 + } 2075 1922 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 2076 1923 /* 2077 1924 * not the active queue - expire current slice if it is ··· 2165 1992 * or if we want to idle in case it has no pending requests. 2166 1993 */ 2167 1994 if (cfqd->active_queue == cfqq) { 1995 + const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 1996 + 2168 1997 if (cfq_cfqq_slice_new(cfqq)) { 2169 1998 cfq_set_prio_slice(cfqd, cfqq); 2170 1999 cfq_clear_cfqq_slice_new(cfqq); 2171 2000 } 2001 + /* 2002 + * If there are no requests waiting in this queue, and 2003 + * there are other queues ready to issue requests, AND 2004 + * those other queues are issuing requests within our 2005 + * mean seek distance, give them a chance to run instead 2006 + * of idling. 
2007 + */ 2172 2008 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 2173 2009 cfq_slice_expired(cfqd, 1); 2174 - else if (sync && !rq_noidle(rq) && 2175 - RB_EMPTY_ROOT(&cfqq->sort_list)) { 2010 + else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && 2011 + sync && !rq_noidle(rq)) 2176 2012 cfq_arm_slice_timer(cfqd); 2177 - } 2178 2013 } 2179 2014 2180 2015 if (!cfqd->rq_in_driver) ··· 2243 2062 if (!cic) 2244 2063 return ELV_MQUEUE_MAY; 2245 2064 2246 - cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC); 2065 + cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 2247 2066 if (cfqq) { 2248 2067 cfq_init_prio_data(cfqq, cic->ioc); 2249 2068 cfq_prio_boost(cfqq); ··· 2333 2152 struct cfq_data *cfqd = 2334 2153 container_of(work, struct cfq_data, unplug_work); 2335 2154 struct request_queue *q = cfqd->queue; 2336 - unsigned long flags; 2337 2155 2338 - spin_lock_irqsave(q->queue_lock, flags); 2156 + spin_lock_irq(q->queue_lock); 2339 2157 blk_start_queueing(q); 2340 - spin_unlock_irqrestore(q->queue_lock, flags); 2158 + spin_unlock_irq(q->queue_lock); 2341 2159 } 2342 2160 2343 2161 /*
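The heart of the close cooperator code is a per-priority rbtree of queues keyed by the sector of each queue's next request, so cfq can cheaply find another queue issuing I/O near the current head position. A simplified, hedged sketch of that lookup shape (types and names here are illustrative, not the cfq code itself):

#include <linux/rbtree.h>
#include <linux/types.h>

struct example_queue {
	struct rb_node p_node;
	sector_t next_sector;	/* sector of this queue's next request */
};

/*
 * Return the queue whose next request starts exactly at @sector, or NULL
 * with *ret_parent set so the caller can walk rb_prev()/rb_next() from
 * the closest neighbour and apply its own "close enough" seek test.
 */
static struct example_queue *example_lookup(struct rb_root *root,
					    sector_t sector,
					    struct rb_node **ret_parent)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;
	struct example_queue *q;

	while (*p) {
		parent = *p;
		q = rb_entry(parent, struct example_queue, p_node);
		if (sector > q->next_sector)
			p = &(*p)->rb_right;
		else if (sector < q->next_sector)
			p = &(*p)->rb_left;
		else
			return q;	/* a request starts right where we are */
	}

	*ret_parent = parent;
	return NULL;
}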
+4 -4
block/elevator.c
··· 590 590 /* 591 591 * Call with queue lock held, interrupts disabled 592 592 */ 593 - void elv_quisce_start(struct request_queue *q) 593 + void elv_quiesce_start(struct request_queue *q) 594 594 { 595 595 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); 596 596 ··· 607 607 } 608 608 } 609 609 610 - void elv_quisce_end(struct request_queue *q) 610 + void elv_quiesce_end(struct request_queue *q) 611 611 { 612 612 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); 613 613 } ··· 1126 1126 * Turn on BYPASS and drain all requests w/ elevator private data 1127 1127 */ 1128 1128 spin_lock_irq(q->queue_lock); 1129 - elv_quisce_start(q); 1129 + elv_quiesce_start(q); 1130 1130 1131 1131 /* 1132 1132 * Remember old elevator. ··· 1150 1150 */ 1151 1151 elevator_exit(old_elevator); 1152 1152 spin_lock_irq(q->queue_lock); 1153 - elv_quisce_end(q); 1153 + elv_quiesce_end(q); 1154 1154 spin_unlock_irq(q->queue_lock); 1155 1155 1156 1156 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
-2
block/ioctl.c
··· 146 146 struct bio *bio; 147 147 148 148 bio = bio_alloc(GFP_KERNEL, 0); 149 - if (!bio) 150 - return -ENOMEM; 151 149 152 150 bio->bi_end_io = blk_ioc_discard_endio; 153 151 bio->bi_bdev = bdev;
+4 -2
block/scsi_ioctl.c
··· 217 217 static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, 218 218 struct bio *bio) 219 219 { 220 - int ret = 0; 220 + int r, ret = 0; 221 221 222 222 /* 223 223 * fill in all the output members ··· 242 242 ret = -EFAULT; 243 243 } 244 244 245 - blk_rq_unmap_user(bio); 245 + r = blk_rq_unmap_user(bio); 246 + if (!ret) 247 + ret = r; 246 248 blk_put_request(rq); 247 249 248 250 return ret;
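The SG_IO fix boils down to not letting the unmap step silently swallow or clobber an earlier error; a hedged sketch of that rule (names other than the two block layer calls are illustrative):

static int example_complete(struct request *rq, struct bio *bio, int ret)
{
	int r;

	/* blk_rq_unmap_user() can itself fail, e.g. -EFAULT on copy-back */
	r = blk_rq_unmap_user(bio);
	if (!ret)
		ret = r;	/* keep the first error, never overwrite it */

	blk_put_request(rq);
	return ret;
}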
+4 -1
drivers/block/brd.c
··· 275 275 if (rw == READ) { 276 276 copy_from_brd(mem + off, brd, sector, len); 277 277 flush_dcache_page(page); 278 - } else 278 + } else { 279 + flush_dcache_page(page); 279 280 copy_to_brd(brd, mem + off, sector, len); 281 + } 280 282 kunmap_atomic(mem, KM_USER0); 281 283 282 284 out: ··· 438 436 if (!brd->brd_queue) 439 437 goto out_free_dev; 440 438 blk_queue_make_request(brd->brd_queue, brd_make_request); 439 + blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL); 441 440 blk_queue_max_sectors(brd->brd_queue, 1024); 442 441 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); 443 442
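The brd cacheflush fix follows the usual dcache aliasing rule for pages user space may have mapped: flush after the kernel writes into the page (READ from the ramdisk) and before the kernel reads from it (WRITE to the ramdisk). A hedged sketch of that rule, with illustrative names:

#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/fs.h>		/* READ / WRITE */

static void example_rw_page(void *dev_buf, struct page *page,
			    unsigned int off, unsigned int len, int rw)
{
	void *mem = kmap_atomic(page, KM_USER0);

	if (rw == READ) {
		memcpy(mem + off, dev_buf, len);
		flush_dcache_page(page);	/* kernel wrote, user reads next */
	} else {
		flush_dcache_page(page);	/* user wrote, kernel reads next */
		memcpy(dev_buf, mem + off, len);
	}

	kunmap_atomic(mem, KM_USER0);
}

The second brd hunk simply advertises barrier support with blk_queue_ordered(..., QUEUE_ORDERED_TAG, ...), which a RAM-backed device can do cheaply since it has no volatile write cache to drain.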
-117
drivers/md/dm-bio-list.h
··· 1 - /* 2 - * Copyright (C) 2004 Red Hat UK Ltd. 3 - * 4 - * This file is released under the GPL. 5 - */ 6 - 7 - #ifndef DM_BIO_LIST_H 8 - #define DM_BIO_LIST_H 9 - 10 - #include <linux/bio.h> 11 - 12 - #ifdef CONFIG_BLOCK 13 - 14 - struct bio_list { 15 - struct bio *head; 16 - struct bio *tail; 17 - }; 18 - 19 - static inline int bio_list_empty(const struct bio_list *bl) 20 - { 21 - return bl->head == NULL; 22 - } 23 - 24 - static inline void bio_list_init(struct bio_list *bl) 25 - { 26 - bl->head = bl->tail = NULL; 27 - } 28 - 29 - #define bio_list_for_each(bio, bl) \ 30 - for (bio = (bl)->head; bio; bio = bio->bi_next) 31 - 32 - static inline unsigned bio_list_size(const struct bio_list *bl) 33 - { 34 - unsigned sz = 0; 35 - struct bio *bio; 36 - 37 - bio_list_for_each(bio, bl) 38 - sz++; 39 - 40 - return sz; 41 - } 42 - 43 - static inline void bio_list_add(struct bio_list *bl, struct bio *bio) 44 - { 45 - bio->bi_next = NULL; 46 - 47 - if (bl->tail) 48 - bl->tail->bi_next = bio; 49 - else 50 - bl->head = bio; 51 - 52 - bl->tail = bio; 53 - } 54 - 55 - static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) 56 - { 57 - bio->bi_next = bl->head; 58 - 59 - bl->head = bio; 60 - 61 - if (!bl->tail) 62 - bl->tail = bio; 63 - } 64 - 65 - static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) 66 - { 67 - if (!bl2->head) 68 - return; 69 - 70 - if (bl->tail) 71 - bl->tail->bi_next = bl2->head; 72 - else 73 - bl->head = bl2->head; 74 - 75 - bl->tail = bl2->tail; 76 - } 77 - 78 - static inline void bio_list_merge_head(struct bio_list *bl, 79 - struct bio_list *bl2) 80 - { 81 - if (!bl2->head) 82 - return; 83 - 84 - if (bl->head) 85 - bl2->tail->bi_next = bl->head; 86 - else 87 - bl->tail = bl2->tail; 88 - 89 - bl->head = bl2->head; 90 - } 91 - 92 - static inline struct bio *bio_list_pop(struct bio_list *bl) 93 - { 94 - struct bio *bio = bl->head; 95 - 96 - if (bio) { 97 - bl->head = bl->head->bi_next; 98 - if (!bl->head) 99 - bl->tail = NULL; 100 - 101 - bio->bi_next = NULL; 102 - } 103 - 104 - return bio; 105 - } 106 - 107 - static inline struct bio *bio_list_get(struct bio_list *bl) 108 - { 109 - struct bio *bio = bl->head; 110 - 111 - bl->head = bl->tail = NULL; 112 - 113 - return bio; 114 - } 115 - 116 - #endif /* CONFIG_BLOCK */ 117 - #endif
-2
drivers/md/dm-delay.c
··· 15 15 16 16 #include <linux/device-mapper.h> 17 17 18 - #include "dm-bio-list.h" 19 - 20 18 #define DM_MSG_PREFIX "delay" 21 19 22 20 struct delay_c {
-1
drivers/md/dm-mpath.c
··· 8 8 #include <linux/device-mapper.h> 9 9 10 10 #include "dm-path-selector.h" 11 - #include "dm-bio-list.h" 12 11 #include "dm-bio-record.h" 13 12 #include "dm-uevent.h" 14 13
-1
drivers/md/dm-raid1.c
··· 5 5 * This file is released under the GPL. 6 6 */ 7 7 8 - #include "dm-bio-list.h" 9 8 #include "dm-bio-record.h" 10 9 11 10 #include <linux/init.h>
-1
drivers/md/dm-region-hash.c
··· 14 14 #include <linux/vmalloc.h> 15 15 16 16 #include "dm.h" 17 - #include "dm-bio-list.h" 18 17 19 18 #define DM_MSG_PREFIX "region hash" 20 19
-1
drivers/md/dm-snap.c
··· 22 22 #include <linux/workqueue.h> 23 23 24 24 #include "dm-exception-store.h" 25 - #include "dm-bio-list.h" 26 25 27 26 #define DM_MSG_PREFIX "snapshots" 28 27
-1
drivers/md/dm.c
··· 6 6 */ 7 7 8 8 #include "dm.h" 9 - #include "dm-bio-list.h" 10 9 #include "dm-uevent.h" 11 10 12 11 #include <linux/init.h>
-1
drivers/md/raid1.c
··· 35 35 #include <linux/blkdev.h> 36 36 #include <linux/seq_file.h> 37 37 #include "md.h" 38 - #include "dm-bio-list.h" 39 38 #include "raid1.h" 40 39 #include "bitmap.h" 41 40
-1
drivers/md/raid10.c
··· 22 22 #include <linux/blkdev.h> 23 23 #include <linux/seq_file.h> 24 24 #include "md.h" 25 - #include "dm-bio-list.h" 26 25 #include "raid10.h" 27 26 #include "bitmap.h" 28 27
+18
fs/bio.c
··· 348 348 return NULL; 349 349 } 350 350 351 + /** 352 + * bio_alloc - allocate a bio for I/O 353 + * @gfp_mask: the GFP_ mask given to the slab allocator 354 + * @nr_iovecs: number of iovecs to pre-allocate 355 + * 356 + * Description: 357 + * bio_alloc will allocate a bio and associated bio_vec array that can hold 358 + * at least @nr_iovecs entries. Allocations will be done from the 359 + * fs_bio_set. Also see @bio_alloc_bioset. 360 + * 361 + * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate 362 + * a bio. This is due to the mempool guarantees. To make this work, callers 363 + * must never allocate more than 1 bio at the time from this pool. Callers 364 + * that need to allocate more than 1 bio must always submit the previously 365 + * allocate bio for IO before attempting to allocate a new one. Failure to 366 + * do so can cause livelocks under memory pressure. 367 + * 368 + **/ 351 369 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) 352 370 { 353 371 struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
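A hedged sketch of the rule the new comment states, i.e. submit each mempool-backed bio before allocating the next one; the function, its arguments and the end_io here are illustrative:

#include <linux/bio.h>
#include <linux/fs.h>		/* WRITE */

static void example_end_io(struct bio *bio, int err)
{
	bio_put(bio);		/* drop the allocation reference */
}

static void example_write_pages(struct block_device *bdev, struct page **pages,
				int nr, sector_t sector)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* never fails with GFP_NOIO, thanks to the fs_bio_set mempool */
		struct bio *bio = bio_alloc(GFP_NOIO, 1);

		bio->bi_bdev = bdev;
		bio->bi_sector = sector + i * (PAGE_SIZE >> 9);
		bio->bi_end_io = example_end_io;
		bio_add_page(bio, pages[i], PAGE_SIZE, 0);

		/*
		 * Submit before looping back to bio_alloc(): holding one
		 * unsubmitted mempool bio while waiting for another is the
		 * livelock the comment warns about.
		 */
		submit_bio(WRITE, bio);
	}
}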
+9 -2
fs/buffer.c
··· 547 547 return err; 548 548 } 549 549 550 - void do_thaw_all(unsigned long unused) 550 + void do_thaw_all(struct work_struct *work) 551 551 { 552 552 struct super_block *sb; 553 553 char b[BDEVNAME_SIZE]; ··· 567 567 goto restart; 568 568 } 569 569 spin_unlock(&sb_lock); 570 + kfree(work); 570 571 printk(KERN_WARNING "Emergency Thaw complete\n"); 571 572 } 572 573 ··· 578 577 */ 579 578 void emergency_thaw_all(void) 580 579 { 581 - pdflush_operation(do_thaw_all, 0); 580 + struct work_struct *work; 581 + 582 + work = kmalloc(sizeof(*work), GFP_ATOMIC); 583 + if (work) { 584 + INIT_WORK(work, do_thaw_all); 585 + schedule_work(work); 586 + } 582 587 } 583 588 584 589 /**
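pdflush_operation() is gone here in favour of a self-freeing work item; a hedged sketch of that general pattern (handler and caller names are illustrative):

#include <linux/workqueue.h>
#include <linux/slab.h>

static void example_handler(struct work_struct *work)
{
	/* ... do the deferred work ... */

	kfree(work);		/* the handler owns and frees the allocation */
}

static void example_fire_and_forget(void)
{
	struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

	if (work) {
		INIT_WORK(work, example_handler);
		schedule_work(work);	/* example_handler() runs later in process context */
	}
}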
-2
fs/direct-io.c
··· 307 307 struct bio *bio; 308 308 309 309 bio = bio_alloc(GFP_KERNEL, nr_vecs); 310 - if (bio == NULL) 311 - return -ENOMEM; 312 310 313 311 bio->bi_bdev = bdev; 314 312 bio->bi_sector = first_sector;
-2
fs/ext4/extents.c
··· 2416 2416 len = ee_len; 2417 2417 2418 2418 bio = bio_alloc(GFP_NOIO, len); 2419 - if (!bio) 2420 - return -ENOMEM; 2421 2419 bio->bi_sector = ee_pblock; 2422 2420 bio->bi_bdev = inode->i_sb->s_bdev; 2423 2421
-5
fs/gfs2/ops_fstype.c
··· 272 272 lock_page(page); 273 273 274 274 bio = bio_alloc(GFP_NOFS, 1); 275 - if (unlikely(!bio)) { 276 - __free_page(page); 277 - return -ENOBUFS; 278 - } 279 - 280 275 bio->bi_sector = sector * (sb->s_blocksize >> 9); 281 276 bio->bi_bdev = sb->s_bdev; 282 277 bio_add_page(bio, page, PAGE_SIZE, 0);
-36
fs/inode.c
··· 1470 1470 spin_lock(&inode_lock); 1471 1471 } 1472 1472 1473 - /* 1474 - * We rarely want to lock two inodes that do not have a parent/child 1475 - * relationship (such as directory, child inode) simultaneously. The 1476 - * vast majority of file systems should be able to get along fine 1477 - * without this. Do not use these functions except as a last resort. 1478 - */ 1479 - void inode_double_lock(struct inode *inode1, struct inode *inode2) 1480 - { 1481 - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { 1482 - if (inode1) 1483 - mutex_lock(&inode1->i_mutex); 1484 - else if (inode2) 1485 - mutex_lock(&inode2->i_mutex); 1486 - return; 1487 - } 1488 - 1489 - if (inode1 < inode2) { 1490 - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); 1491 - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); 1492 - } else { 1493 - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1494 - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1495 - } 1496 - } 1497 - EXPORT_SYMBOL(inode_double_lock); 1498 - 1499 - void inode_double_unlock(struct inode *inode1, struct inode *inode2) 1500 - { 1501 - if (inode1) 1502 - mutex_unlock(&inode1->i_mutex); 1503 - 1504 - if (inode2 && inode2 != inode1) 1505 - mutex_unlock(&inode2->i_mutex); 1506 - } 1507 - EXPORT_SYMBOL(inode_double_unlock); 1508 - 1509 1473 static __initdata unsigned long ihash_entries; 1510 1474 static int __init set_ihash_entries(char *str) 1511 1475 {
+76 -22
fs/ocfs2/file.c
··· 1912 1912 return written ? written : ret; 1913 1913 } 1914 1914 1915 + static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 1916 + struct file *out, 1917 + struct splice_desc *sd) 1918 + { 1919 + int ret; 1920 + 1921 + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 1922 + sd->total_len, 0, NULL); 1923 + if (ret < 0) { 1924 + mlog_errno(ret); 1925 + return ret; 1926 + } 1927 + 1928 + return splice_from_pipe_feed(pipe, sd, pipe_to_file); 1929 + } 1930 + 1915 1931 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1916 1932 struct file *out, 1917 1933 loff_t *ppos, ··· 1935 1919 unsigned int flags) 1936 1920 { 1937 1921 int ret; 1938 - struct inode *inode = out->f_path.dentry->d_inode; 1922 + struct address_space *mapping = out->f_mapping; 1923 + struct inode *inode = mapping->host; 1924 + struct splice_desc sd = { 1925 + .total_len = len, 1926 + .flags = flags, 1927 + .pos = *ppos, 1928 + .u.file = out, 1929 + }; 1939 1930 1940 1931 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1941 1932 (unsigned int)len, 1942 1933 out->f_path.dentry->d_name.len, 1943 1934 out->f_path.dentry->d_name.name); 1944 1935 1945 - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 1946 - 1947 - ret = ocfs2_rw_lock(inode, 1); 1948 - if (ret < 0) { 1949 - mlog_errno(ret); 1950 - goto out; 1951 - } 1952 - 1953 - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 1954 - NULL); 1955 - if (ret < 0) { 1956 - mlog_errno(ret); 1957 - goto out_unlock; 1958 - } 1959 - 1960 1936 if (pipe->inode) 1961 - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); 1962 - ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1937 + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 1938 + 1939 + splice_from_pipe_begin(&sd); 1940 + do { 1941 + ret = splice_from_pipe_next(pipe, &sd); 1942 + if (ret <= 0) 1943 + break; 1944 + 1945 + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 1946 + ret = ocfs2_rw_lock(inode, 1); 1947 + if (ret < 0) 1948 + mlog_errno(ret); 1949 + else { 1950 + ret = ocfs2_splice_to_file(pipe, out, &sd); 1951 + ocfs2_rw_unlock(inode, 1); 1952 + } 1953 + mutex_unlock(&inode->i_mutex); 1954 + } while (ret > 0); 1955 + splice_from_pipe_end(pipe, &sd); 1956 + 1963 1957 if (pipe->inode) 1964 1958 mutex_unlock(&pipe->inode->i_mutex); 1965 1959 1966 - out_unlock: 1967 - ocfs2_rw_unlock(inode, 1); 1968 - out: 1969 - mutex_unlock(&inode->i_mutex); 1960 + if (sd.num_spliced) 1961 + ret = sd.num_spliced; 1962 + 1963 + if (ret > 0) { 1964 + unsigned long nr_pages; 1965 + 1966 + *ppos += ret; 1967 + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1968 + 1969 + /* 1970 + * If file or inode is SYNC and we actually wrote some data, 1971 + * sync it. 1972 + */ 1973 + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 1974 + int err; 1975 + 1976 + mutex_lock(&inode->i_mutex); 1977 + err = ocfs2_rw_lock(inode, 1); 1978 + if (err < 0) { 1979 + mlog_errno(err); 1980 + } else { 1981 + err = generic_osync_inode(inode, mapping, 1982 + OSYNC_METADATA|OSYNC_DATA); 1983 + ocfs2_rw_unlock(inode, 1); 1984 + } 1985 + mutex_unlock(&inode->i_mutex); 1986 + 1987 + if (err) 1988 + ret = err; 1989 + } 1990 + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 1991 + } 1970 1992 1971 1993 mlog_exit(ret); 1972 1994 return ret;
+38 -4
fs/pipe.c
··· 37 37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 38 38 */ 39 39 40 + static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) 41 + { 42 + if (pipe->inode) 43 + mutex_lock_nested(&pipe->inode->i_mutex, subclass); 44 + } 45 + 46 + void pipe_lock(struct pipe_inode_info *pipe) 47 + { 48 + /* 49 + * pipe_lock() nests non-pipe inode locks (for writing to a file) 50 + */ 51 + pipe_lock_nested(pipe, I_MUTEX_PARENT); 52 + } 53 + EXPORT_SYMBOL(pipe_lock); 54 + 55 + void pipe_unlock(struct pipe_inode_info *pipe) 56 + { 57 + if (pipe->inode) 58 + mutex_unlock(&pipe->inode->i_mutex); 59 + } 60 + EXPORT_SYMBOL(pipe_unlock); 61 + 62 + void pipe_double_lock(struct pipe_inode_info *pipe1, 63 + struct pipe_inode_info *pipe2) 64 + { 65 + BUG_ON(pipe1 == pipe2); 66 + 67 + if (pipe1 < pipe2) { 68 + pipe_lock_nested(pipe1, I_MUTEX_PARENT); 69 + pipe_lock_nested(pipe2, I_MUTEX_CHILD); 70 + } else { 71 + pipe_lock_nested(pipe2, I_MUTEX_CHILD); 72 + pipe_lock_nested(pipe1, I_MUTEX_PARENT); 73 + } 74 + } 75 + 40 76 /* Drop the inode semaphore and wait for a pipe event, atomically */ 41 77 void pipe_wait(struct pipe_inode_info *pipe) 42 78 { ··· 83 47 * is considered a noninteractive wait: 84 48 */ 85 49 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); 86 - if (pipe->inode) 87 - mutex_unlock(&pipe->inode->i_mutex); 50 + pipe_unlock(pipe); 88 51 schedule(); 89 52 finish_wait(&pipe->wait, &wait); 90 - if (pipe->inode) 91 - mutex_lock(&pipe->inode->i_mutex); 53 + pipe_lock(pipe); 92 54 } 93 55 94 56 static int
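With inode_double_lock() removed from fs/inode.c above, two-pipe users such as the tee/link path order their locks by pipe address through the new helper. A minimal hedged sketch of the calling shape (the function name is illustrative):

static void example_link_pipes(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe)
{
	/* lower address is locked first, so A->B and B->A cannot deadlock */
	pipe_double_lock(ipipe, opipe);

	/* ... move or copy buffers from ipipe to opipe ... */

	pipe_unlock(ipipe);
	pipe_unlock(opipe);
}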
+196 -195
fs/splice.c
··· 182 182 do_wakeup = 0; 183 183 page_nr = 0; 184 184 185 - if (pipe->inode) 186 - mutex_lock(&pipe->inode->i_mutex); 185 + pipe_lock(pipe); 187 186 188 187 for (;;) { 189 188 if (!pipe->readers) { ··· 244 245 pipe->waiting_writers--; 245 246 } 246 247 247 - if (pipe->inode) { 248 - mutex_unlock(&pipe->inode->i_mutex); 248 + pipe_unlock(pipe); 249 249 250 - if (do_wakeup) { 251 - smp_mb(); 252 - if (waitqueue_active(&pipe->wait)) 253 - wake_up_interruptible(&pipe->wait); 254 - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 255 - } 250 + if (do_wakeup) { 251 + smp_mb(); 252 + if (waitqueue_active(&pipe->wait)) 253 + wake_up_interruptible(&pipe->wait); 254 + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 256 255 } 257 256 258 257 while (page_nr < spd_pages) ··· 552 555 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 553 556 * a new page in the output file page cache and fill/dirty that. 554 557 */ 555 - static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 556 - struct splice_desc *sd) 558 + int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 559 + struct splice_desc *sd) 557 560 { 558 561 struct file *file = sd->u.file; 559 562 struct address_space *mapping = file->f_mapping; ··· 597 600 out: 598 601 return ret; 599 602 } 603 + EXPORT_SYMBOL(pipe_to_file); 604 + 605 + static void wakeup_pipe_writers(struct pipe_inode_info *pipe) 606 + { 607 + smp_mb(); 608 + if (waitqueue_active(&pipe->wait)) 609 + wake_up_interruptible(&pipe->wait); 610 + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 611 + } 612 + 613 + /** 614 + * splice_from_pipe_feed - feed available data from a pipe to a file 615 + * @pipe: pipe to splice from 616 + * @sd: information to @actor 617 + * @actor: handler that splices the data 618 + * 619 + * Description: 620 + 621 + * This function loops over the pipe and calls @actor to do the 622 + * actual moving of a single struct pipe_buffer to the desired 623 + * destination. It returns when there's no more buffers left in 624 + * the pipe or if the requested number of bytes (@sd->total_len) 625 + * have been copied. It returns a positive number (one) if the 626 + * pipe needs to be filled with more data, zero if the required 627 + * number of bytes have been copied and -errno on error. 628 + * 629 + * This, together with splice_from_pipe_{begin,end,next}, may be 630 + * used to implement the functionality of __splice_from_pipe() when 631 + * locking is required around copying the pipe buffers to the 632 + * destination. 
633 + */ 634 + int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, 635 + splice_actor *actor) 636 + { 637 + int ret; 638 + 639 + while (pipe->nrbufs) { 640 + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; 641 + const struct pipe_buf_operations *ops = buf->ops; 642 + 643 + sd->len = buf->len; 644 + if (sd->len > sd->total_len) 645 + sd->len = sd->total_len; 646 + 647 + ret = actor(pipe, buf, sd); 648 + if (ret <= 0) { 649 + if (ret == -ENODATA) 650 + ret = 0; 651 + return ret; 652 + } 653 + buf->offset += ret; 654 + buf->len -= ret; 655 + 656 + sd->num_spliced += ret; 657 + sd->len -= ret; 658 + sd->pos += ret; 659 + sd->total_len -= ret; 660 + 661 + if (!buf->len) { 662 + buf->ops = NULL; 663 + ops->release(pipe, buf); 664 + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 665 + pipe->nrbufs--; 666 + if (pipe->inode) 667 + sd->need_wakeup = true; 668 + } 669 + 670 + if (!sd->total_len) 671 + return 0; 672 + } 673 + 674 + return 1; 675 + } 676 + EXPORT_SYMBOL(splice_from_pipe_feed); 677 + 678 + /** 679 + * splice_from_pipe_next - wait for some data to splice from 680 + * @pipe: pipe to splice from 681 + * @sd: information about the splice operation 682 + * 683 + * Description: 684 + * This function will wait for some data and return a positive 685 + * value (one) if pipe buffers are available. It will return zero 686 + * or -errno if no more data needs to be spliced. 687 + */ 688 + int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) 689 + { 690 + while (!pipe->nrbufs) { 691 + if (!pipe->writers) 692 + return 0; 693 + 694 + if (!pipe->waiting_writers && sd->num_spliced) 695 + return 0; 696 + 697 + if (sd->flags & SPLICE_F_NONBLOCK) 698 + return -EAGAIN; 699 + 700 + if (signal_pending(current)) 701 + return -ERESTARTSYS; 702 + 703 + if (sd->need_wakeup) { 704 + wakeup_pipe_writers(pipe); 705 + sd->need_wakeup = false; 706 + } 707 + 708 + pipe_wait(pipe); 709 + } 710 + 711 + return 1; 712 + } 713 + EXPORT_SYMBOL(splice_from_pipe_next); 714 + 715 + /** 716 + * splice_from_pipe_begin - start splicing from pipe 717 + * @pipe: pipe to splice from 718 + * 719 + * Description: 720 + * This function should be called before a loop containing 721 + * splice_from_pipe_next() and splice_from_pipe_feed() to 722 + * initialize the necessary fields of @sd. 723 + */ 724 + void splice_from_pipe_begin(struct splice_desc *sd) 725 + { 726 + sd->num_spliced = 0; 727 + sd->need_wakeup = false; 728 + } 729 + EXPORT_SYMBOL(splice_from_pipe_begin); 730 + 731 + /** 732 + * splice_from_pipe_end - finish splicing from pipe 733 + * @pipe: pipe to splice from 734 + * @sd: information about the splice operation 735 + * 736 + * Description: 737 + * This function will wake up pipe writers if necessary. It should 738 + * be called after a loop containing splice_from_pipe_next() and 739 + * splice_from_pipe_feed(). 
740 + */ 741 + void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) 742 + { 743 + if (sd->need_wakeup) 744 + wakeup_pipe_writers(pipe); 745 + } 746 + EXPORT_SYMBOL(splice_from_pipe_end); 600 747 601 748 /** 602 749 * __splice_from_pipe - splice data from a pipe to given actor ··· 758 617 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 759 618 splice_actor *actor) 760 619 { 761 - int ret, do_wakeup, err; 620 + int ret; 762 621 763 - ret = 0; 764 - do_wakeup = 0; 622 + splice_from_pipe_begin(sd); 623 + do { 624 + ret = splice_from_pipe_next(pipe, sd); 625 + if (ret > 0) 626 + ret = splice_from_pipe_feed(pipe, sd, actor); 627 + } while (ret > 0); 628 + splice_from_pipe_end(pipe, sd); 765 629 766 - for (;;) { 767 - if (pipe->nrbufs) { 768 - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; 769 - const struct pipe_buf_operations *ops = buf->ops; 770 - 771 - sd->len = buf->len; 772 - if (sd->len > sd->total_len) 773 - sd->len = sd->total_len; 774 - 775 - err = actor(pipe, buf, sd); 776 - if (err <= 0) { 777 - if (!ret && err != -ENODATA) 778 - ret = err; 779 - 780 - break; 781 - } 782 - 783 - ret += err; 784 - buf->offset += err; 785 - buf->len -= err; 786 - 787 - sd->len -= err; 788 - sd->pos += err; 789 - sd->total_len -= err; 790 - if (sd->len) 791 - continue; 792 - 793 - if (!buf->len) { 794 - buf->ops = NULL; 795 - ops->release(pipe, buf); 796 - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 797 - pipe->nrbufs--; 798 - if (pipe->inode) 799 - do_wakeup = 1; 800 - } 801 - 802 - if (!sd->total_len) 803 - break; 804 - } 805 - 806 - if (pipe->nrbufs) 807 - continue; 808 - if (!pipe->writers) 809 - break; 810 - if (!pipe->waiting_writers) { 811 - if (ret) 812 - break; 813 - } 814 - 815 - if (sd->flags & SPLICE_F_NONBLOCK) { 816 - if (!ret) 817 - ret = -EAGAIN; 818 - break; 819 - } 820 - 821 - if (signal_pending(current)) { 822 - if (!ret) 823 - ret = -ERESTARTSYS; 824 - break; 825 - } 826 - 827 - if (do_wakeup) { 828 - smp_mb(); 829 - if (waitqueue_active(&pipe->wait)) 830 - wake_up_interruptible_sync(&pipe->wait); 831 - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 832 - do_wakeup = 0; 833 - } 834 - 835 - pipe_wait(pipe); 836 - } 837 - 838 - if (do_wakeup) { 839 - smp_mb(); 840 - if (waitqueue_active(&pipe->wait)) 841 - wake_up_interruptible(&pipe->wait); 842 - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 843 - } 844 - 845 - return ret; 630 + return sd->num_spliced ? sd->num_spliced : ret; 846 631 } 847 632 EXPORT_SYMBOL(__splice_from_pipe); 848 633 ··· 782 715 * @actor: handler that splices the data 783 716 * 784 717 * Description: 785 - * See __splice_from_pipe. This function locks the input and output inodes, 718 + * See __splice_from_pipe. This function locks the pipe inode, 786 719 * otherwise it's identical to __splice_from_pipe(). 787 720 * 788 721 */ ··· 791 724 splice_actor *actor) 792 725 { 793 726 ssize_t ret; 794 - struct inode *inode = out->f_mapping->host; 795 727 struct splice_desc sd = { 796 728 .total_len = len, 797 729 .flags = flags, ··· 798 732 .u.file = out, 799 733 }; 800 734 801 - /* 802 - * The actor worker might be calling ->write_begin and 803 - * ->write_end. Most of the time, these expect i_mutex to 804 - * be held. Since this may result in an ABBA deadlock with 805 - * pipe->inode, we have to order lock acquiry here. 806 - * 807 - * Outer lock must be inode->i_mutex, as pipe_wait() will 808 - * release and reacquire pipe->inode->i_mutex, AND inode must 809 - * never be a pipe. 
810 - */ 811 - WARN_ON(S_ISFIFO(inode->i_mode)); 812 - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 813 - if (pipe->inode) 814 - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); 735 + pipe_lock(pipe); 815 736 ret = __splice_from_pipe(pipe, &sd, actor); 816 - if (pipe->inode) 817 - mutex_unlock(&pipe->inode->i_mutex); 818 - mutex_unlock(&inode->i_mutex); 737 + pipe_unlock(pipe); 819 738 820 739 return ret; 821 740 } 822 - 823 - /** 824 - * generic_file_splice_write_nolock - generic_file_splice_write without mutexes 825 - * @pipe: pipe info 826 - * @out: file to write to 827 - * @ppos: position in @out 828 - * @len: number of bytes to splice 829 - * @flags: splice modifier flags 830 - * 831 - * Description: 832 - * Will either move or copy pages (determined by @flags options) from 833 - * the given pipe inode to the given file. The caller is responsible 834 - * for acquiring i_mutex on both inodes. 835 - * 836 - */ 837 - ssize_t 838 - generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, 839 - loff_t *ppos, size_t len, unsigned int flags) 840 - { 841 - struct address_space *mapping = out->f_mapping; 842 - struct inode *inode = mapping->host; 843 - struct splice_desc sd = { 844 - .total_len = len, 845 - .flags = flags, 846 - .pos = *ppos, 847 - .u.file = out, 848 - }; 849 - ssize_t ret; 850 - int err; 851 - 852 - err = file_remove_suid(out); 853 - if (unlikely(err)) 854 - return err; 855 - 856 - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); 857 - if (ret > 0) { 858 - unsigned long nr_pages; 859 - 860 - *ppos += ret; 861 - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 862 - 863 - /* 864 - * If file or inode is SYNC and we actually wrote some data, 865 - * sync it. 866 - */ 867 - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 868 - err = generic_osync_inode(inode, mapping, 869 - OSYNC_METADATA|OSYNC_DATA); 870 - 871 - if (err) 872 - ret = err; 873 - } 874 - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 875 - } 876 - 877 - return ret; 878 - } 879 - 880 - EXPORT_SYMBOL(generic_file_splice_write_nolock); 881 741 882 742 /** 883 743 * generic_file_splice_write - splice data from a pipe to a file ··· 832 840 }; 833 841 ssize_t ret; 834 842 835 - WARN_ON(S_ISFIFO(inode->i_mode)); 836 - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 837 - ret = file_remove_suid(out); 838 - if (likely(!ret)) { 839 - if (pipe->inode) 840 - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); 841 - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); 842 - if (pipe->inode) 843 - mutex_unlock(&pipe->inode->i_mutex); 844 - } 845 - mutex_unlock(&inode->i_mutex); 843 + pipe_lock(pipe); 844 + 845 + splice_from_pipe_begin(&sd); 846 + do { 847 + ret = splice_from_pipe_next(pipe, &sd); 848 + if (ret <= 0) 849 + break; 850 + 851 + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 852 + ret = file_remove_suid(out); 853 + if (!ret) 854 + ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); 855 + mutex_unlock(&inode->i_mutex); 856 + } while (ret > 0); 857 + splice_from_pipe_end(pipe, &sd); 858 + 859 + pipe_unlock(pipe); 860 + 861 + if (sd.num_spliced) 862 + ret = sd.num_spliced; 863 + 846 864 if (ret > 0) { 847 865 unsigned long nr_pages; 848 866 ··· 1341 1339 if (!pipe) 1342 1340 return -EBADF; 1343 1341 1344 - if (pipe->inode) 1345 - mutex_lock(&pipe->inode->i_mutex); 1342 + pipe_lock(pipe); 1346 1343 1347 1344 error = ret = 0; 1348 1345 while (nr_segs) { ··· 1396 1395 iov++; 1397 1396 } 1398 1397 1399 - if (pipe->inode) 1400 - 
mutex_unlock(&pipe->inode->i_mutex); 1398 + pipe_unlock(pipe); 1401 1399 1402 1400 if (!ret) 1403 1401 ret = error; ··· 1524 1524 return 0; 1525 1525 1526 1526 ret = 0; 1527 - mutex_lock(&pipe->inode->i_mutex); 1527 + pipe_lock(pipe); 1528 1528 1529 1529 while (!pipe->nrbufs) { 1530 1530 if (signal_pending(current)) { ··· 1542 1542 pipe_wait(pipe); 1543 1543 } 1544 1544 1545 - mutex_unlock(&pipe->inode->i_mutex); 1545 + pipe_unlock(pipe); 1546 1546 return ret; 1547 1547 } 1548 1548 ··· 1562 1562 return 0; 1563 1563 1564 1564 ret = 0; 1565 - mutex_lock(&pipe->inode->i_mutex); 1565 + pipe_lock(pipe); 1566 1566 1567 1567 while (pipe->nrbufs >= PIPE_BUFFERS) { 1568 1568 if (!pipe->readers) { ··· 1583 1583 pipe->waiting_writers--; 1584 1584 } 1585 1585 1586 - mutex_unlock(&pipe->inode->i_mutex); 1586 + pipe_unlock(pipe); 1587 1587 return ret; 1588 1588 } 1589 1589 ··· 1599 1599 1600 1600 /* 1601 1601 * Potential ABBA deadlock, work around it by ordering lock 1602 - * grabbing by inode address. Otherwise two different processes 1602 + * grabbing by pipe info address. Otherwise two different processes 1603 1603 * could deadlock (one doing tee from A -> B, the other from B -> A). 1604 1604 */ 1605 - inode_double_lock(ipipe->inode, opipe->inode); 1605 + pipe_double_lock(ipipe, opipe); 1606 1606 1607 1607 do { 1608 1608 if (!opipe->readers) { ··· 1653 1653 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1654 1654 ret = -EAGAIN; 1655 1655 1656 - inode_double_unlock(ipipe->inode, opipe->inode); 1656 + pipe_unlock(ipipe); 1657 + pipe_unlock(opipe); 1657 1658 1658 1659 /* 1659 1660 * If we put data in the output pipe, wakeup any potential readers.
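
The splice.c changes above replace direct pipe->inode->i_mutex handling with the new pipe_lock()/pipe_unlock() primitives (declared in the pipe_fs_i.h hunk below), which also cover internal pipes that have no backing inode. A minimal sketch of a caller using the new interface; the helper name is hypothetical and only illustrates the locking pattern:

#include <linux/pipe_fs_i.h>

/*
 * Hypothetical helper: sample the number of buffers queued in a pipe.
 * pipe_lock()/pipe_unlock() work whether or not the pipe is backed by
 * an inode, so the old "if (pipe->inode) mutex_lock(...)" dance goes away.
 */
static int count_pipe_bufs(struct pipe_inode_info *pipe)
{
	int nrbufs;

	pipe_lock(pipe);
	nrbufs = pipe->nrbufs;
	pipe_unlock(pipe);

	return nrbufs;
}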
+109
include/linux/bio.h
··· 504 504 return bio && bio->bi_io_vec != NULL; 505 505 } 506 506 507 + /* 508 + * BIO list managment for use by remapping drivers (e.g. DM or MD). 509 + * 510 + * A bio_list anchors a singly-linked list of bios chained through the bi_next 511 + * member of the bio. The bio_list also caches the last list member to allow 512 + * fast access to the tail. 513 + */ 514 + struct bio_list { 515 + struct bio *head; 516 + struct bio *tail; 517 + }; 518 + 519 + static inline int bio_list_empty(const struct bio_list *bl) 520 + { 521 + return bl->head == NULL; 522 + } 523 + 524 + static inline void bio_list_init(struct bio_list *bl) 525 + { 526 + bl->head = bl->tail = NULL; 527 + } 528 + 529 + #define bio_list_for_each(bio, bl) \ 530 + for (bio = (bl)->head; bio; bio = bio->bi_next) 531 + 532 + static inline unsigned bio_list_size(const struct bio_list *bl) 533 + { 534 + unsigned sz = 0; 535 + struct bio *bio; 536 + 537 + bio_list_for_each(bio, bl) 538 + sz++; 539 + 540 + return sz; 541 + } 542 + 543 + static inline void bio_list_add(struct bio_list *bl, struct bio *bio) 544 + { 545 + bio->bi_next = NULL; 546 + 547 + if (bl->tail) 548 + bl->tail->bi_next = bio; 549 + else 550 + bl->head = bio; 551 + 552 + bl->tail = bio; 553 + } 554 + 555 + static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) 556 + { 557 + bio->bi_next = bl->head; 558 + 559 + bl->head = bio; 560 + 561 + if (!bl->tail) 562 + bl->tail = bio; 563 + } 564 + 565 + static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) 566 + { 567 + if (!bl2->head) 568 + return; 569 + 570 + if (bl->tail) 571 + bl->tail->bi_next = bl2->head; 572 + else 573 + bl->head = bl2->head; 574 + 575 + bl->tail = bl2->tail; 576 + } 577 + 578 + static inline void bio_list_merge_head(struct bio_list *bl, 579 + struct bio_list *bl2) 580 + { 581 + if (!bl2->head) 582 + return; 583 + 584 + if (bl->head) 585 + bl2->tail->bi_next = bl->head; 586 + else 587 + bl->tail = bl2->tail; 588 + 589 + bl->head = bl2->head; 590 + } 591 + 592 + static inline struct bio *bio_list_pop(struct bio_list *bl) 593 + { 594 + struct bio *bio = bl->head; 595 + 596 + if (bio) { 597 + bl->head = bl->head->bi_next; 598 + if (!bl->head) 599 + bl->tail = NULL; 600 + 601 + bio->bi_next = NULL; 602 + } 603 + 604 + return bio; 605 + } 606 + 607 + static inline struct bio *bio_list_get(struct bio_list *bl) 608 + { 609 + struct bio *bio = bl->head; 610 + 611 + bl->head = bl->tail = NULL; 612 + 613 + return bio; 614 + } 615 + 507 616 #if defined(CONFIG_BLK_DEV_INTEGRITY) 508 617 509 618 #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
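
The bio_list helpers above give remapping drivers a small singly-linked queue of bios; the list itself does no locking, so the caller supplies its own synchronization. A sketch of the intended usage, assuming a hypothetical driver with a spinlock-protected deferred list; the structure and function names are illustrative only:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/spinlock.h>

/* Hypothetical per-device state for a remapping driver. */
struct remap_dev {
	spinlock_t lock;
	struct bio_list deferred;	/* bios queued for later handling */
};

/* Queue a bio for deferred remapping. */
static void remap_defer_bio(struct remap_dev *rd, struct bio *bio)
{
	spin_lock(&rd->lock);
	bio_list_add(&rd->deferred, bio);	/* append at the tail */
	spin_unlock(&rd->lock);
}

/* Drain the deferred list, e.g. from a workqueue. */
static void remap_run_deferred(struct remap_dev *rd)
{
	struct bio_list local;
	struct bio *bio;

	bio_list_init(&local);

	spin_lock(&rd->lock);
	bio_list_merge(&local, &rd->deferred);	/* steal the whole list */
	bio_list_init(&rd->deferred);
	spin_unlock(&rd->lock);

	while ((bio = bio_list_pop(&local)))
		generic_make_request(bio);	/* resubmit after remapping */
}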
+59 -5
include/linux/fs.h
··· 87 87 */ 88 88 #define FMODE_NOCMTIME ((__force fmode_t)2048) 89 89 90 + /* 91 + * The below are the various read and write types that we support. Some of 92 + * them include behavioral modifiers that send information down to the 93 + * block layer and IO scheduler. Terminology: 94 + * 95 + * The block layer uses device plugging to defer IO a little bit, in 96 + * the hope that we will see more IO very shortly. This increases 97 + * coalescing of adjacent IO and thus reduces the number of IOs we 98 + * have to send to the device. It also allows for better queuing, 99 + * if the IO isn't mergeable. If the caller is going to be waiting 100 + * for the IO, then he must ensure that the device is unplugged so 101 + * that the IO is dispatched to the driver. 102 + * 103 + * All IO is handled async in Linux. This is fine for background 104 + * writes, but for reads or writes that someone waits for completion 105 + * on, we want to notify the block layer and IO scheduler so that they 106 + * know about it. That allows them to make better scheduling 107 + * decisions. So when the below references 'sync' and 'async', it 108 + * is referencing this priority hint. 109 + * 110 + * With that in mind, the available types are: 111 + * 112 + * READ A normal read operation. Device will be plugged. 113 + * READ_SYNC A synchronous read. Device is not plugged, caller can 114 + * immediately wait on this read without caring about 115 + * unplugging. 116 + * READA Used for read-ahead operations. Lower priority, and the 117 + * block layer could (in theory) choose to ignore this 118 + * request if it runs into resource problems. 119 + * WRITE A normal async write. Device will be plugged. 120 + * SWRITE Like WRITE, but a special case for ll_rw_block() that 121 + * tells it to lock the buffer first. Normally a buffer 122 + * must be locked before doing IO. 123 + * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down 124 + * the hint that someone will be waiting on this IO 125 + * shortly. The device must still be unplugged explicitly, 126 + * WRITE_SYNC_PLUG does not do this as we could be 127 + * submitting more writes before we actually wait on any 128 + * of them. 129 + * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device 130 + * immediately after submission. The write equivalent 131 + * of READ_SYNC. 132 + * WRITE_ODIRECT Special case write for O_DIRECT only. 133 + * SWRITE_SYNC 134 + * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. 135 + * See SWRITE. 136 + * WRITE_BARRIER Like WRITE, but tells the block layer that all 137 + * previously submitted writes must be safely on storage 138 + * before this one is started. Also guarantees that when 139 + * this write is complete, it itself is also safely on 140 + * storage. Prevents reordering of writes on both sides 141 + * of this IO. 142 + * 143 + */ 90 144 #define RW_MASK 1 91 145 #define RWA_MASK 2 92 146 #define READ 0 ··· 156 102 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 157 103 #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 158 104 #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) 105 + 106 + /* 107 + * These aren't really reads or writes, they pass down information about 108 + * parts of device that are now unused by the file system. 
109 + */ 159 110 #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) 160 111 #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) 161 112 ··· 796 737 I_MUTEX_XATTR, 797 738 I_MUTEX_QUOTA 798 739 }; 799 - 800 - extern void inode_double_lock(struct inode *inode1, struct inode *inode2); 801 - extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); 802 740 803 741 /* 804 742 * NOTE: in a 32bit arch with a preemptable kernel and ··· 2205 2149 extern ssize_t generic_file_splice_read(struct file *, loff_t *, 2206 2150 struct pipe_inode_info *, size_t, unsigned int); 2207 2151 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, 2208 - struct file *, loff_t *, size_t, unsigned int); 2209 - extern ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *, 2210 2152 struct file *, loff_t *, size_t, unsigned int); 2211 2153 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, 2212 2154 struct file *out, loff_t *, size_t len, unsigned int flags);
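
The comment block added to fs.h spells out the sync/async and plugging hints. In practice a submitter picks plain WRITE when it will not wait (and lets the plug merge more IO behind it), and WRITE_SYNC when it is about to wait on the buffer. A hedged sketch of that decision, loosely modelled on the sync_dirty_buffer() pattern; the helper name and error handling are illustrative:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/*
 * Hypothetical helper: write one buffer, optionally waiting for it.
 * When we are going to wait, pass the sync hint so the IO scheduler
 * treats the request as synchronous and dispatches it promptly;
 * otherwise a plain WRITE lets the block layer keep the queue plugged
 * and merge more IO behind it.
 */
static int write_one_buffer(struct buffer_head *bh, int wait)
{
	lock_buffer(bh);
	get_bh(bh);
	bh->b_end_io = end_buffer_write_sync;

	submit_bh(wait ? WRITE_SYNC : WRITE, bh);

	if (wait) {
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			return -EIO;
	}
	return 0;
}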
+5
include/linux/pipe_fs_i.h
··· 134 134 memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ 135 135 #define PIPE_SIZE PAGE_SIZE 136 136 137 + /* Pipe lock and unlock operations */ 138 + void pipe_lock(struct pipe_inode_info *); 139 + void pipe_unlock(struct pipe_inode_info *); 140 + void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); 141 + 137 142 /* Drop the inode semaphore and wait for a pipe event, atomically */ 138 143 void pipe_wait(struct pipe_inode_info *pipe); 139 144
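
pipe_double_lock() exists for the two-pipe paths (tee and pipe-to-pipe splice): it takes both pipe mutexes in a fixed order, by pipe address, so concurrent A->B and B->A operations cannot deadlock. A minimal, hypothetical sketch; note that there is no pipe_double_unlock(), each side is dropped individually:

#include <linux/pipe_fs_i.h>

/* Hypothetical: run some operation with both pipes locked. */
static void with_both_pipes_locked(struct pipe_inode_info *ipipe,
				   struct pipe_inode_info *opipe)
{
	pipe_double_lock(ipipe, opipe);	/* ordered by pipe address */

	/* ... inspect or move buffers between ipipe and opipe ... */

	pipe_unlock(ipipe);
	pipe_unlock(opipe);
}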
+12
include/linux/splice.h
··· 36 36 void *data; /* cookie */ 37 37 } u; 38 38 loff_t pos; /* file position */ 39 + size_t num_spliced; /* number of bytes already spliced */ 40 + bool need_wakeup; /* need to wake up writer */ 39 41 }; 40 42 41 43 struct partial_page { ··· 68 66 splice_actor *); 69 67 extern ssize_t __splice_from_pipe(struct pipe_inode_info *, 70 68 struct splice_desc *, splice_actor *); 69 + extern int splice_from_pipe_feed(struct pipe_inode_info *, struct splice_desc *, 70 + splice_actor *); 71 + extern int splice_from_pipe_next(struct pipe_inode_info *, 72 + struct splice_desc *); 73 + extern void splice_from_pipe_begin(struct splice_desc *); 74 + extern void splice_from_pipe_end(struct pipe_inode_info *, 75 + struct splice_desc *); 76 + extern int pipe_to_file(struct pipe_inode_info *, struct pipe_buffer *, 77 + struct splice_desc *); 78 + 71 79 extern ssize_t splice_to_pipe(struct pipe_inode_info *, 72 80 struct splice_pipe_desc *); 73 81 extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
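
The newly exported splice_from_pipe_begin/next/feed/end helpers and the pipe_to_file actor let a filesystem reproduce the locking pattern of the reworked generic_file_splice_write() above while inserting its own work (SUID stripping, cluster locks, and so on) around each feed step. A sketch of such a ->splice_write, with a hypothetical filesystem name and without the dirty-page accounting a real implementation would add:

#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>

static ssize_t myfs_splice_write(struct pipe_inode_info *pipe,
				 struct file *out, loff_t *ppos,
				 size_t len, unsigned int flags)
{
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;

	pipe_lock(pipe);

	splice_from_pipe_begin(&sd);
	do {
		ret = splice_from_pipe_next(pipe, &sd);
		if (ret <= 0)
			break;

		/* i_mutex is held only across the feed step, never across
		 * pipe_wait(), so the old ABBA ordering problem with the
		 * pipe lock cannot occur. */
		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
		ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
		mutex_unlock(&inode->i_mutex);
	} while (ret > 0);
	splice_from_pipe_end(pipe, &sd);

	pipe_unlock(pipe);

	if (sd.num_spliced)
		ret = sd.num_spliced;
	if (ret > 0)
		*ppos += ret;

	return ret;
}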
-2
kernel/power/swap.c
··· 64 64 struct bio *bio; 65 65 66 66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 67 - if (!bio) 68 - return -ENOMEM; 69 67 bio->bi_sector = page_off * (PAGE_SIZE >> 9); 70 68 bio->bi_bdev = resume_bdev; 71 69 bio->bi_end_io = end_swap_bio_read;
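
The swap.c hunk (and the matching gfs2/ext4/dio/block hunks in the shortlog) rely on bio_alloc() allocating from a mempool: with __GFP_WAIT in the gfp mask it may sleep but cannot return NULL, so the removed error paths were dead code. A small illustrative sketch under that assumption; the helper and its shape are hypothetical:

#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/mm.h>

static int submit_read_bio(struct block_device *bdev, sector_t sector,
			   struct page *page, bio_end_io_t *end_io)
{
	/* GFP_NOIO includes __GFP_WAIT, so this allocation comes from the
	 * bio mempool and cannot return NULL - no error check needed. */
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_sector = sector;
	bio->bi_bdev = bdev;
	bio->bi_end_io = end_io;

	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
		bio_put(bio);
		return -EIO;
	}

	submit_bio(READ, bio);
	return 0;
}

Allocations that cannot wait (GFP_ATOMIC and friends) can still fail, so callers in atomic context must keep their NULL checks.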