Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block

* 'for-linus' of git://git.kernel.dk/linux-2.6-block: (28 commits)
cfq-iosched: add close cooperator code
cfq-iosched: log responsible 'cfqq' in idle timer arm
cfq-iosched: tweak kick logic a bit more
cfq-iosched: no need to save interrupts in cfq_kick_queue()
brd: fix cacheflushing
brd: support barriers
swap: Remove code handling bio_alloc failure with __GFP_WAIT
gfs2: Remove code handling bio_alloc failure with __GFP_WAIT
ext4: Remove code handling bio_alloc failure with __GFP_WAIT
dio: Remove code handling bio_alloc failure with __GFP_WAIT
block: Remove code handling bio_alloc failure with __GFP_WAIT
bio: add documentation to bio_alloc()
splice: add helpers for locking pipe inode
splice: remove generic_file_splice_write_nolock()
ocfs2: fix i_mutex locking in ocfs2_splice_to_file()
splice: fix i_mutex locking in generic_splice_write()
splice: remove i_mutex locking in splice_from_pipe()
splice: split up __splice_from_pipe()
block: fix SG_IO to return a proper error value
cfq-iosched: don't delay queue kick for a merged request
...

+826 -534
+6 -13
Documentation/block/biodoc.txt
··· 1040 iii. Plugging the queue to batch requests in anticipation of opportunities for 1041 merge/sort optimizations 1042 1043 - This is just the same as in 2.4 so far, though per-device unplugging 1044 - support is anticipated for 2.5. Also with a priority-based i/o scheduler, 1045 - such decisions could be based on request priorities. 1046 - 1047 Plugging is an approach that the current i/o scheduling algorithm resorts to so 1048 that it collects up enough requests in the queue to be able to take 1049 advantage of the sorting/merging logic in the elevator. If the 1050 queue is empty when a request comes in, then it plugs the request queue 1051 - (sort of like plugging the bottom of a vessel to get fluid to build up) 1052 till it fills up with a few more requests, before starting to service 1053 the requests. This provides an opportunity to merge/sort the requests before 1054 passing them down to the device. There are various conditions when the queue is 1055 unplugged (to open up the flow again), either through a scheduled task or 1056 could be on demand. For example wait_on_buffer sets the unplugging going 1057 - (by running tq_disk) so the read gets satisfied soon. So in the read case, 1058 - the queue gets explicitly unplugged as part of waiting for completion, 1059 - in fact all queues get unplugged as a side-effect. 1060 1061 Aside: 1062 This is kind of controversial territory, as it's not clear if plugging is ··· 1064 balance between when to plug and when to open up. Also now that we have 1065 multi-page bios being queued in one shot, we may not need to wait to merge 1066 a big request from the broken up pieces coming by. 1067 - 1068 - Per-queue granularity unplugging (still a Todo) may help reduce some of the 1069 - concerns with just a single tq_disk flush approach. Something like 1070 - blk_kick_queue() to unplug a specific queue (right away ?) 1071 - or optionally, all queues, is in the plan. 1072 1073 4.4 I/O contexts 1074 I/O contexts provide a dynamically allocated per process data area. They may
··· 1040 iii. Plugging the queue to batch requests in anticipation of opportunities for 1041 merge/sort optimizations 1042 1043 Plugging is an approach that the current i/o scheduling algorithm resorts to so 1044 that it collects up enough requests in the queue to be able to take 1045 advantage of the sorting/merging logic in the elevator. If the 1046 queue is empty when a request comes in, then it plugs the request queue 1047 + (sort of like plugging the bath tub of a vessel to get fluid to build up) 1048 till it fills up with a few more requests, before starting to service 1049 the requests. This provides an opportunity to merge/sort the requests before 1050 passing them down to the device. There are various conditions when the queue is 1051 unplugged (to open up the flow again), either through a scheduled task or 1052 could be on demand. For example wait_on_buffer sets the unplugging going 1053 + through sync_buffer() running blk_run_address_space(mapping). Or the caller 1054 + can do it explicity through blk_unplug(bdev). So in the read case, 1055 + the queue gets explicitly unplugged as part of waiting for completion on that 1056 + buffer. For page driven IO, the address space ->sync_page() takes care of 1057 + doing the blk_run_address_space(). 1058 1059 Aside: 1060 This is kind of controversial territory, as it's not clear if plugging is ··· 1066 balance between when to plug and when to open up. Also now that we have 1067 multi-page bios being queued in one shot, we may not need to wait to merge 1068 a big request from the broken up pieces coming by. 1069 1070 4.4 I/O contexts 1071 I/O contexts provide a dynamically allocated per process data area. They may
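As a rough illustration of the two unplug paths the updated documentation describes — a minimal sketch, not part of this merge, assuming a 2.6.29-era block layer. kick_block_device() is a hypothetical helper; blk_run_address_space() and blk_unplug() are the interfaces the text names.

/*
 * Hedged sketch: force an unplug once a caller decides to wait on
 * outstanding I/O for a given block device.
 */
#include <linux/blkdev.h>
#include <linux/fs.h>

static void kick_block_device(struct block_device *bdev)
{
	/* Page-cache driven I/O: run the queue behind the mapping. */
	blk_run_address_space(bdev->bd_inode->i_mapping);

	/* With the queue in hand, the caller can also unplug it directly. */
	blk_unplug(bdev_get_queue(bdev));
}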
+57 -59
block/as-iosched.c
··· 17 #include <linux/rbtree.h> 18 #include <linux/interrupt.h> 19 20 - #define REQ_SYNC 1 21 - #define REQ_ASYNC 0 22 - 23 /* 24 * See Documentation/block/as-iosched.txt 25 */ ··· 90 struct list_head fifo_list[2]; 91 92 struct request *next_rq[2]; /* next in sort order */ 93 - sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */ 94 95 unsigned long exit_prob; /* probability a task will exit while 96 being waited on */ ··· 106 unsigned long last_check_fifo[2]; 107 int changed_batch; /* 1: waiting for old batch to end */ 108 int new_batch; /* 1: waiting on first read complete */ 109 - int batch_data_dir; /* current batch REQ_SYNC / REQ_ASYNC */ 110 int write_batch_count; /* max # of reqs in a write batch */ 111 int current_write_count; /* how many requests left this batch */ 112 int write_batch_idled; /* has the write batch gone idle? */ ··· 551 if (aic == NULL) 552 return; 553 554 - if (data_dir == REQ_SYNC) { 555 unsigned long in_flight = atomic_read(&aic->nr_queued) 556 + atomic_read(&aic->nr_dispatched); 557 spin_lock(&aic->lock); ··· 808 */ 809 static void update_write_batch(struct as_data *ad) 810 { 811 - unsigned long batch = ad->batch_expire[REQ_ASYNC]; 812 long write_time; 813 814 write_time = (jiffies - ad->current_batch_expires) + batch; ··· 852 kblockd_schedule_work(q, &ad->antic_work); 853 ad->changed_batch = 0; 854 855 - if (ad->batch_data_dir == REQ_SYNC) 856 ad->new_batch = 1; 857 } 858 WARN_ON(ad->nr_dispatched == 0); ··· 866 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { 867 update_write_batch(ad); 868 ad->current_batch_expires = jiffies + 869 - ad->batch_expire[REQ_SYNC]; 870 ad->new_batch = 0; 871 } 872 ··· 957 if (ad->changed_batch || ad->new_batch) 958 return 0; 959 960 - if (ad->batch_data_dir == REQ_SYNC) 961 /* TODO! add a check so a complete fifo gets written? */ 962 return time_after(jiffies, ad->current_batch_expires); 963 ··· 983 */ 984 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors; 985 986 - if (data_dir == REQ_SYNC) { 987 struct io_context *ioc = RQ_IOC(rq); 988 /* In case we have to anticipate after this */ 989 copy_io_context(&ad->io_context, &ioc); ··· 1022 static int as_dispatch_request(struct request_queue *q, int force) 1023 { 1024 struct as_data *ad = q->elevator->elevator_data; 1025 - const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]); 1026 - const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]); 1027 struct request *rq; 1028 1029 if (unlikely(force)) { 1030 /* 1031 * Forced dispatch, accounting is useless. Reset 1032 * accounting states and dump fifo_lists. Note that 1033 - * batch_data_dir is reset to REQ_SYNC to avoid 1034 * screwing write batch accounting as write batch 1035 * accounting occurs on W->R transition. 
1036 */ 1037 int dispatched = 0; 1038 1039 - ad->batch_data_dir = REQ_SYNC; 1040 ad->changed_batch = 0; 1041 ad->new_batch = 0; 1042 1043 - while (ad->next_rq[REQ_SYNC]) { 1044 - as_move_to_dispatch(ad, ad->next_rq[REQ_SYNC]); 1045 dispatched++; 1046 } 1047 - ad->last_check_fifo[REQ_SYNC] = jiffies; 1048 1049 - while (ad->next_rq[REQ_ASYNC]) { 1050 - as_move_to_dispatch(ad, ad->next_rq[REQ_ASYNC]); 1051 dispatched++; 1052 } 1053 - ad->last_check_fifo[REQ_ASYNC] = jiffies; 1054 1055 return dispatched; 1056 } 1057 1058 /* Signal that the write batch was uncontended, so we can't time it */ 1059 - if (ad->batch_data_dir == REQ_ASYNC && !reads) { 1060 if (ad->current_write_count == 0 || !writes) 1061 ad->write_batch_idled = 1; 1062 } ··· 1073 */ 1074 rq = ad->next_rq[ad->batch_data_dir]; 1075 1076 - if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) { 1077 - if (as_fifo_expired(ad, REQ_SYNC)) 1078 goto fifo_expired; 1079 1080 if (as_can_anticipate(ad, rq)) { ··· 1087 /* we have a "next request" */ 1088 if (reads && !writes) 1089 ad->current_batch_expires = 1090 - jiffies + ad->batch_expire[REQ_SYNC]; 1091 goto dispatch_request; 1092 } 1093 } ··· 1098 */ 1099 1100 if (reads) { 1101 - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_SYNC])); 1102 1103 - if (writes && ad->batch_data_dir == REQ_SYNC) 1104 /* 1105 * Last batch was a read, switch to writes 1106 */ 1107 goto dispatch_writes; 1108 1109 - if (ad->batch_data_dir == REQ_ASYNC) { 1110 WARN_ON(ad->new_batch); 1111 ad->changed_batch = 1; 1112 } 1113 - ad->batch_data_dir = REQ_SYNC; 1114 - rq = rq_entry_fifo(ad->fifo_list[REQ_SYNC].next); 1115 ad->last_check_fifo[ad->batch_data_dir] = jiffies; 1116 goto dispatch_request; 1117 } ··· 1122 1123 if (writes) { 1124 dispatch_writes: 1125 - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_ASYNC])); 1126 1127 - if (ad->batch_data_dir == REQ_SYNC) { 1128 ad->changed_batch = 1; 1129 1130 /* ··· 1134 */ 1135 ad->new_batch = 0; 1136 } 1137 - ad->batch_data_dir = REQ_ASYNC; 1138 ad->current_write_count = ad->write_batch_count; 1139 ad->write_batch_idled = 0; 1140 - rq = rq_entry_fifo(ad->fifo_list[REQ_ASYNC].next); 1141 - ad->last_check_fifo[REQ_ASYNC] = jiffies; 1142 goto dispatch_request; 1143 } 1144 ··· 1161 if (ad->nr_dispatched) 1162 return 0; 1163 1164 - if (ad->batch_data_dir == REQ_ASYNC) 1165 ad->current_batch_expires = jiffies + 1166 - ad->batch_expire[REQ_ASYNC]; 1167 else 1168 ad->new_batch = 1; 1169 ··· 1235 { 1236 struct as_data *ad = q->elevator->elevator_data; 1237 1238 - return list_empty(&ad->fifo_list[REQ_ASYNC]) 1239 - && list_empty(&ad->fifo_list[REQ_SYNC]); 1240 } 1241 1242 static int ··· 1343 del_timer_sync(&ad->antic_timer); 1344 cancel_work_sync(&ad->antic_work); 1345 1346 - BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); 1347 - BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); 1348 1349 put_io_context(ad->io_context); 1350 kfree(ad); ··· 1369 init_timer(&ad->antic_timer); 1370 INIT_WORK(&ad->antic_work, as_work_handler); 1371 1372 - INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]); 1373 - INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]); 1374 - ad->sort_list[REQ_SYNC] = RB_ROOT; 1375 - ad->sort_list[REQ_ASYNC] = RB_ROOT; 1376 - ad->fifo_expire[REQ_SYNC] = default_read_expire; 1377 - ad->fifo_expire[REQ_ASYNC] = default_write_expire; 1378 ad->antic_expire = default_antic_expire; 1379 - ad->batch_expire[REQ_SYNC] = default_read_batch_expire; 1380 - ad->batch_expire[REQ_ASYNC] = default_write_batch_expire; 1381 1382 - ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC]; 1383 - ad->write_batch_count = 
ad->batch_expire[REQ_ASYNC] / 10; 1384 if (ad->write_batch_count < 2) 1385 ad->write_batch_count = 2; 1386 ··· 1429 struct as_data *ad = e->elevator_data; \ 1430 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ 1431 } 1432 - SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]); 1433 - SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]); 1434 SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire); 1435 - SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]); 1436 - SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]); 1437 #undef SHOW_FUNCTION 1438 1439 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ ··· 1448 *(__PTR) = msecs_to_jiffies(*(__PTR)); \ 1449 return ret; \ 1450 } 1451 - STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); 1452 - STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX); 1453 STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX); 1454 STORE_FUNCTION(as_read_batch_expire_store, 1455 - &ad->batch_expire[REQ_SYNC], 0, INT_MAX); 1456 STORE_FUNCTION(as_write_batch_expire_store, 1457 - &ad->batch_expire[REQ_ASYNC], 0, INT_MAX); 1458 #undef STORE_FUNCTION 1459 1460 #define AS_ATTR(name) \
··· 17 #include <linux/rbtree.h> 18 #include <linux/interrupt.h> 19 20 /* 21 * See Documentation/block/as-iosched.txt 22 */ ··· 93 struct list_head fifo_list[2]; 94 95 struct request *next_rq[2]; /* next in sort order */ 96 + sector_t last_sector[2]; /* last SYNC & ASYNC sectors */ 97 98 unsigned long exit_prob; /* probability a task will exit while 99 being waited on */ ··· 109 unsigned long last_check_fifo[2]; 110 int changed_batch; /* 1: waiting for old batch to end */ 111 int new_batch; /* 1: waiting on first read complete */ 112 + int batch_data_dir; /* current batch SYNC / ASYNC */ 113 int write_batch_count; /* max # of reqs in a write batch */ 114 int current_write_count; /* how many requests left this batch */ 115 int write_batch_idled; /* has the write batch gone idle? */ ··· 554 if (aic == NULL) 555 return; 556 557 + if (data_dir == BLK_RW_SYNC) { 558 unsigned long in_flight = atomic_read(&aic->nr_queued) 559 + atomic_read(&aic->nr_dispatched); 560 spin_lock(&aic->lock); ··· 811 */ 812 static void update_write_batch(struct as_data *ad) 813 { 814 + unsigned long batch = ad->batch_expire[BLK_RW_ASYNC]; 815 long write_time; 816 817 write_time = (jiffies - ad->current_batch_expires) + batch; ··· 855 kblockd_schedule_work(q, &ad->antic_work); 856 ad->changed_batch = 0; 857 858 + if (ad->batch_data_dir == BLK_RW_SYNC) 859 ad->new_batch = 1; 860 } 861 WARN_ON(ad->nr_dispatched == 0); ··· 869 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { 870 update_write_batch(ad); 871 ad->current_batch_expires = jiffies + 872 + ad->batch_expire[BLK_RW_SYNC]; 873 ad->new_batch = 0; 874 } 875 ··· 960 if (ad->changed_batch || ad->new_batch) 961 return 0; 962 963 + if (ad->batch_data_dir == BLK_RW_SYNC) 964 /* TODO! add a check so a complete fifo gets written? */ 965 return time_after(jiffies, ad->current_batch_expires); 966 ··· 986 */ 987 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors; 988 989 + if (data_dir == BLK_RW_SYNC) { 990 struct io_context *ioc = RQ_IOC(rq); 991 /* In case we have to anticipate after this */ 992 copy_io_context(&ad->io_context, &ioc); ··· 1025 static int as_dispatch_request(struct request_queue *q, int force) 1026 { 1027 struct as_data *ad = q->elevator->elevator_data; 1028 + const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]); 1029 + const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]); 1030 struct request *rq; 1031 1032 if (unlikely(force)) { 1033 /* 1034 * Forced dispatch, accounting is useless. Reset 1035 * accounting states and dump fifo_lists. Note that 1036 + * batch_data_dir is reset to BLK_RW_SYNC to avoid 1037 * screwing write batch accounting as write batch 1038 * accounting occurs on W->R transition. 
1039 */ 1040 int dispatched = 0; 1041 1042 + ad->batch_data_dir = BLK_RW_SYNC; 1043 ad->changed_batch = 0; 1044 ad->new_batch = 0; 1045 1046 + while (ad->next_rq[BLK_RW_SYNC]) { 1047 + as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]); 1048 dispatched++; 1049 } 1050 + ad->last_check_fifo[BLK_RW_SYNC] = jiffies; 1051 1052 + while (ad->next_rq[BLK_RW_ASYNC]) { 1053 + as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]); 1054 dispatched++; 1055 } 1056 + ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; 1057 1058 return dispatched; 1059 } 1060 1061 /* Signal that the write batch was uncontended, so we can't time it */ 1062 + if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) { 1063 if (ad->current_write_count == 0 || !writes) 1064 ad->write_batch_idled = 1; 1065 } ··· 1076 */ 1077 rq = ad->next_rq[ad->batch_data_dir]; 1078 1079 + if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) { 1080 + if (as_fifo_expired(ad, BLK_RW_SYNC)) 1081 goto fifo_expired; 1082 1083 if (as_can_anticipate(ad, rq)) { ··· 1090 /* we have a "next request" */ 1091 if (reads && !writes) 1092 ad->current_batch_expires = 1093 + jiffies + ad->batch_expire[BLK_RW_SYNC]; 1094 goto dispatch_request; 1095 } 1096 } ··· 1101 */ 1102 1103 if (reads) { 1104 + BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC])); 1105 1106 + if (writes && ad->batch_data_dir == BLK_RW_SYNC) 1107 /* 1108 * Last batch was a read, switch to writes 1109 */ 1110 goto dispatch_writes; 1111 1112 + if (ad->batch_data_dir == BLK_RW_ASYNC) { 1113 WARN_ON(ad->new_batch); 1114 ad->changed_batch = 1; 1115 } 1116 + ad->batch_data_dir = BLK_RW_SYNC; 1117 + rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next); 1118 ad->last_check_fifo[ad->batch_data_dir] = jiffies; 1119 goto dispatch_request; 1120 } ··· 1125 1126 if (writes) { 1127 dispatch_writes: 1128 + BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC])); 1129 1130 + if (ad->batch_data_dir == BLK_RW_SYNC) { 1131 ad->changed_batch = 1; 1132 1133 /* ··· 1137 */ 1138 ad->new_batch = 0; 1139 } 1140 + ad->batch_data_dir = BLK_RW_ASYNC; 1141 ad->current_write_count = ad->write_batch_count; 1142 ad->write_batch_idled = 0; 1143 + rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next); 1144 + ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; 1145 goto dispatch_request; 1146 } 1147 ··· 1164 if (ad->nr_dispatched) 1165 return 0; 1166 1167 + if (ad->batch_data_dir == BLK_RW_ASYNC) 1168 ad->current_batch_expires = jiffies + 1169 + ad->batch_expire[BLK_RW_ASYNC]; 1170 else 1171 ad->new_batch = 1; 1172 ··· 1238 { 1239 struct as_data *ad = q->elevator->elevator_data; 1240 1241 + return list_empty(&ad->fifo_list[BLK_RW_ASYNC]) 1242 + && list_empty(&ad->fifo_list[BLK_RW_SYNC]); 1243 } 1244 1245 static int ··· 1346 del_timer_sync(&ad->antic_timer); 1347 cancel_work_sync(&ad->antic_work); 1348 1349 + BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC])); 1350 + BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC])); 1351 1352 put_io_context(ad->io_context); 1353 kfree(ad); ··· 1372 init_timer(&ad->antic_timer); 1373 INIT_WORK(&ad->antic_work, as_work_handler); 1374 1375 + INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]); 1376 + INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]); 1377 + ad->sort_list[BLK_RW_SYNC] = RB_ROOT; 1378 + ad->sort_list[BLK_RW_ASYNC] = RB_ROOT; 1379 + ad->fifo_expire[BLK_RW_SYNC] = default_read_expire; 1380 + ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire; 1381 ad->antic_expire = default_antic_expire; 1382 + ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire; 1383 + ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire; 
1384 1385 + ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC]; 1386 + ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10; 1387 if (ad->write_batch_count < 2) 1388 ad->write_batch_count = 2; 1389 ··· 1432 struct as_data *ad = e->elevator_data; \ 1433 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ 1434 } 1435 + SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]); 1436 + SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]); 1437 SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire); 1438 + SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]); 1439 + SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]); 1440 #undef SHOW_FUNCTION 1441 1442 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ ··· 1451 *(__PTR) = msecs_to_jiffies(*(__PTR)); \ 1452 return ret; \ 1453 } 1454 + STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX); 1455 + STORE_FUNCTION(as_write_expire_store, 1456 + &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX); 1457 STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX); 1458 STORE_FUNCTION(as_read_batch_expire_store, 1459 + &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX); 1460 STORE_FUNCTION(as_write_batch_expire_store, 1461 + &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX); 1462 #undef STORE_FUNCTION 1463 1464 #define AS_ATTR(name) \
-3
block/blk-barrier.c
··· 319 return -ENXIO; 320 321 bio = bio_alloc(GFP_KERNEL, 0); 322 - if (!bio) 323 - return -ENOMEM; 324 - 325 bio->bi_end_io = bio_end_empty_barrier; 326 bio->bi_private = &wait; 327 bio->bi_bdev = bdev;
··· 319 return -ENXIO; 320 321 bio = bio_alloc(GFP_KERNEL, 0); 322 bio->bi_end_io = bio_end_empty_barrier; 323 bio->bi_private = &wait; 324 bio->bi_bdev = bdev;
+2 -2
block/blk-sysfs.c
··· 209 ssize_t ret = queue_var_store(&stats, page, count); 210 211 spin_lock_irq(q->queue_lock); 212 - elv_quisce_start(q); 213 214 if (stats) 215 queue_flag_set(QUEUE_FLAG_IO_STAT, q); 216 else 217 queue_flag_clear(QUEUE_FLAG_IO_STAT, q); 218 219 - elv_quisce_end(q); 220 spin_unlock_irq(q->queue_lock); 221 222 return ret;
··· 209 ssize_t ret = queue_var_store(&stats, page, count); 210 211 spin_lock_irq(q->queue_lock); 212 + elv_quiesce_start(q); 213 214 if (stats) 215 queue_flag_set(QUEUE_FLAG_IO_STAT, q); 216 else 217 queue_flag_clear(QUEUE_FLAG_IO_STAT, q); 218 219 + elv_quiesce_end(q); 220 spin_unlock_irq(q->queue_lock); 221 222 return ret;
+2 -2
block/blk.h
··· 70 71 int blk_dev_init(void); 72 73 - void elv_quisce_start(struct request_queue *q); 74 - void elv_quisce_end(struct request_queue *q); 75 76 77 /*
··· 70 71 int blk_dev_init(void); 72 73 + void elv_quiesce_start(struct request_queue *q); 74 + void elv_quiesce_end(struct request_queue *q); 75 76 77 /*
+225 -45
block/cfq-iosched.c
··· 56 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 57 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 58 59 - #define ASYNC (0) 60 - #define SYNC (1) 61 - 62 #define sample_valid(samples) ((samples) > 80) 63 64 /* ··· 80 * rr list of queues with requests and the count of them 81 */ 82 struct cfq_rb_root service_tree; 83 unsigned int busy_queues; 84 /* 85 * Used to track any pending rt requests so we can pre-empt current ··· 152 struct rb_node rb_node; 153 /* service_tree key */ 154 unsigned long rb_key; 155 /* sorted list of pending requests */ 156 struct rb_root sort_list; 157 /* if fifo isn't expired, next request to serve */ ··· 192 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 193 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 194 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 195 }; 196 197 #define CFQ_CFQQ_FNS(name) \ ··· 219 CFQ_CFQQ_FNS(prio_changed); 220 CFQ_CFQQ_FNS(slice_new); 221 CFQ_CFQQ_FNS(sync); 222 #undef CFQ_CFQQ_FNS 223 224 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ ··· 428 return NULL; 429 } 430 431 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 432 { 433 if (root->left == n) 434 root->left = NULL; 435 - 436 - rb_erase(n, &root->rb); 437 - RB_CLEAR_NODE(n); 438 } 439 440 /* ··· 483 * requests waiting to be processed. It is sorted in the order that 484 * we will service the queues. 485 */ 486 - static void cfq_service_tree_add(struct cfq_data *cfqd, 487 - struct cfq_queue *cfqq, int add_front) 488 { 489 struct rb_node **p, *parent; 490 struct cfq_queue *__cfqq; ··· 557 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 558 } 559 560 /* 561 * Update cfqq's position in the service tree. 562 */ ··· 622 /* 623 * Resorting requires the cfqq to be on the RR list already. 624 */ 625 - if (cfq_cfqq_on_rr(cfqq)) 626 cfq_service_tree_add(cfqd, cfqq, 0); 627 } 628 629 /* ··· 656 657 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 658 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 659 660 BUG_ON(!cfqd->busy_queues); 661 cfqd->busy_queues--; ··· 687 { 688 struct cfq_queue *cfqq = RQ_CFQQ(rq); 689 struct cfq_data *cfqd = cfqq->cfqd; 690 - struct request *__alias; 691 692 cfqq->queued[rq_is_sync(rq)]++; 693 ··· 704 /* 705 * check if this request is a better next-serve candidate 706 */ 707 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 708 BUG_ON(!cfqq->next_rq); 709 } 710 ··· 925 /* 926 * Get and set a new active queue for service. 927 */ 928 - static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) 929 { 930 - struct cfq_queue *cfqq; 931 932 - cfqq = cfq_get_next_queue(cfqd); 933 __cfq_set_active_queue(cfqd, cfqq); 934 return cfqq; 935 } ··· 957 return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; 958 } 959 960 - static int cfq_close_cooperator(struct cfq_data *cfq_data, 961 - struct cfq_queue *cfqq) 962 { 963 /* 964 * We should notice if some of the queues are cooperating, eg 965 * working closely on the same area of the disk. In that case, 966 * we can group them together and don't waste time idling. 
967 */ 968 - return 0; 969 } 970 971 #define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) 972 ··· 1078 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1079 return; 1080 1081 - /* 1082 - * See if this prio level has a good candidate 1083 - */ 1084 - if (cfq_close_cooperator(cfqd, cfqq) && 1085 - (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2)) 1086 - return; 1087 - 1088 cfq_mark_cfqq_wait_request(cfqq); 1089 1090 /* ··· 1090 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); 1091 1092 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1093 - cfq_log(cfqd, "arm_idle: %lu", sl); 1094 } 1095 1096 /* ··· 1154 */ 1155 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 1156 { 1157 - struct cfq_queue *cfqq; 1158 1159 cfqq = cfqd->active_queue; 1160 if (!cfqq) ··· 1188 goto keep_queue; 1189 1190 /* 1191 * No requests pending. If the active queue still has requests in 1192 * flight or is idling for a new request, allow either of these 1193 * conditions to happen (or time out) before selecting a new queue. ··· 1211 expire: 1212 cfq_slice_expired(cfqd, 0); 1213 new_queue: 1214 - cfqq = cfq_set_active_queue(cfqd); 1215 keep_queue: 1216 return cfqq; 1217 } ··· 1494 if (ioc->ioc_data == cic) 1495 rcu_assign_pointer(ioc->ioc_data, NULL); 1496 1497 - if (cic->cfqq[ASYNC]) { 1498 - cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); 1499 - cic->cfqq[ASYNC] = NULL; 1500 } 1501 1502 - if (cic->cfqq[SYNC]) { 1503 - cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]); 1504 - cic->cfqq[SYNC] = NULL; 1505 } 1506 } 1507 ··· 1610 1611 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1612 1613 - cfqq = cic->cfqq[ASYNC]; 1614 if (cfqq) { 1615 struct cfq_queue *new_cfqq; 1616 - new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc, GFP_ATOMIC); 1617 if (new_cfqq) { 1618 - cic->cfqq[ASYNC] = new_cfqq; 1619 cfq_put_queue(cfqq); 1620 } 1621 } 1622 1623 - cfqq = cic->cfqq[SYNC]; 1624 if (cfqq) 1625 cfq_mark_cfqq_prio_changed(cfqq); 1626 ··· 1672 } 1673 1674 RB_CLEAR_NODE(&cfqq->rb_node); 1675 INIT_LIST_HEAD(&cfqq->fifo); 1676 1677 atomic_set(&cfqq->ref, 0); ··· 2068 * Remember that we saw a request from this process, but 2069 * don't start queuing just yet. Otherwise we risk seeing lots 2070 * of tiny requests, because we disrupt the normal plugging 2071 - * and merging. 2072 */ 2073 - if (cfq_cfqq_wait_request(cfqq)) 2074 cfq_mark_cfqq_must_dispatch(cfqq); 2075 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 2076 /* 2077 * not the active queue - expire current slice if it is ··· 2165 * or if we want to idle in case it has no pending requests. 2166 */ 2167 if (cfqd->active_queue == cfqq) { 2168 if (cfq_cfqq_slice_new(cfqq)) { 2169 cfq_set_prio_slice(cfqd, cfqq); 2170 cfq_clear_cfqq_slice_new(cfqq); 2171 } 2172 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 2173 cfq_slice_expired(cfqd, 1); 2174 - else if (sync && !rq_noidle(rq) && 2175 - RB_EMPTY_ROOT(&cfqq->sort_list)) { 2176 cfq_arm_slice_timer(cfqd); 2177 - } 2178 } 2179 2180 if (!cfqd->rq_in_driver) ··· 2243 if (!cic) 2244 return ELV_MQUEUE_MAY; 2245 2246 - cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC); 2247 if (cfqq) { 2248 cfq_init_prio_data(cfqq, cic->ioc); 2249 cfq_prio_boost(cfqq); ··· 2333 struct cfq_data *cfqd = 2334 container_of(work, struct cfq_data, unplug_work); 2335 struct request_queue *q = cfqd->queue; 2336 - unsigned long flags; 2337 2338 - spin_lock_irqsave(q->queue_lock, flags); 2339 blk_start_queueing(q); 2340 - spin_unlock_irqrestore(q->queue_lock, flags); 2341 } 2342 2343 /*
··· 56 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 57 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 58 59 #define sample_valid(samples) ((samples) > 80) 60 61 /* ··· 83 * rr list of queues with requests and the count of them 84 */ 85 struct cfq_rb_root service_tree; 86 + 87 + /* 88 + * Each priority tree is sorted by next_request position. These 89 + * trees are used when determining if two or more queues are 90 + * interleaving requests (see cfq_close_cooperator). 91 + */ 92 + struct rb_root prio_trees[CFQ_PRIO_LISTS]; 93 + 94 unsigned int busy_queues; 95 /* 96 * Used to track any pending rt requests so we can pre-empt current ··· 147 struct rb_node rb_node; 148 /* service_tree key */ 149 unsigned long rb_key; 150 + /* prio tree member */ 151 + struct rb_node p_node; 152 /* sorted list of pending requests */ 153 struct rb_root sort_list; 154 /* if fifo isn't expired, next request to serve */ ··· 185 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 186 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 187 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 188 + CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ 189 }; 190 191 #define CFQ_CFQQ_FNS(name) \ ··· 211 CFQ_CFQQ_FNS(prio_changed); 212 CFQ_CFQQ_FNS(slice_new); 213 CFQ_CFQQ_FNS(sync); 214 + CFQ_CFQQ_FNS(coop); 215 #undef CFQ_CFQQ_FNS 216 217 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ ··· 419 return NULL; 420 } 421 422 + static void rb_erase_init(struct rb_node *n, struct rb_root *root) 423 + { 424 + rb_erase(n, root); 425 + RB_CLEAR_NODE(n); 426 + } 427 + 428 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 429 { 430 if (root->left == n) 431 root->left = NULL; 432 + rb_erase_init(n, &root->rb); 433 } 434 435 /* ··· 470 * requests waiting to be processed. It is sorted in the order that 471 * we will service the queues. 472 */ 473 + static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 474 + int add_front) 475 { 476 struct rb_node **p, *parent; 477 struct cfq_queue *__cfqq; ··· 544 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 545 } 546 547 + static struct cfq_queue * 548 + cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector, 549 + struct rb_node **ret_parent, struct rb_node ***rb_link) 550 + { 551 + struct rb_root *root = &cfqd->prio_trees[ioprio]; 552 + struct rb_node **p, *parent; 553 + struct cfq_queue *cfqq = NULL; 554 + 555 + parent = NULL; 556 + p = &root->rb_node; 557 + while (*p) { 558 + struct rb_node **n; 559 + 560 + parent = *p; 561 + cfqq = rb_entry(parent, struct cfq_queue, p_node); 562 + 563 + /* 564 + * Sort strictly based on sector. Smallest to the left, 565 + * largest to the right. 
566 + */ 567 + if (sector > cfqq->next_rq->sector) 568 + n = &(*p)->rb_right; 569 + else if (sector < cfqq->next_rq->sector) 570 + n = &(*p)->rb_left; 571 + else 572 + break; 573 + p = n; 574 + } 575 + 576 + *ret_parent = parent; 577 + if (rb_link) 578 + *rb_link = p; 579 + return NULL; 580 + } 581 + 582 + static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 583 + { 584 + struct rb_root *root = &cfqd->prio_trees[cfqq->ioprio]; 585 + struct rb_node **p, *parent; 586 + struct cfq_queue *__cfqq; 587 + 588 + if (!RB_EMPTY_NODE(&cfqq->p_node)) 589 + rb_erase_init(&cfqq->p_node, root); 590 + 591 + if (cfq_class_idle(cfqq)) 592 + return; 593 + if (!cfqq->next_rq) 594 + return; 595 + 596 + __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->ioprio, cfqq->next_rq->sector, 597 + &parent, &p); 598 + BUG_ON(__cfqq); 599 + 600 + rb_link_node(&cfqq->p_node, parent, p); 601 + rb_insert_color(&cfqq->p_node, root); 602 + } 603 + 604 /* 605 * Update cfqq's position in the service tree. 606 */ ··· 552 /* 553 * Resorting requires the cfqq to be on the RR list already. 554 */ 555 + if (cfq_cfqq_on_rr(cfqq)) { 556 cfq_service_tree_add(cfqd, cfqq, 0); 557 + cfq_prio_tree_add(cfqd, cfqq); 558 + } 559 } 560 561 /* ··· 584 585 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 586 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 587 + if (!RB_EMPTY_NODE(&cfqq->p_node)) 588 + rb_erase_init(&cfqq->p_node, &cfqd->prio_trees[cfqq->ioprio]); 589 590 BUG_ON(!cfqd->busy_queues); 591 cfqd->busy_queues--; ··· 613 { 614 struct cfq_queue *cfqq = RQ_CFQQ(rq); 615 struct cfq_data *cfqd = cfqq->cfqd; 616 + struct request *__alias, *prev; 617 618 cfqq->queued[rq_is_sync(rq)]++; 619 ··· 630 /* 631 * check if this request is a better next-serve candidate 632 */ 633 + prev = cfqq->next_rq; 634 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 635 + 636 + /* 637 + * adjust priority tree position, if ->next_rq changes 638 + */ 639 + if (prev != cfqq->next_rq) 640 + cfq_prio_tree_add(cfqd, cfqq); 641 + 642 BUG_ON(!cfqq->next_rq); 643 } 644 ··· 843 /* 844 * Get and set a new active queue for service. 845 */ 846 + static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 847 + struct cfq_queue *cfqq) 848 { 849 + if (!cfqq) { 850 + cfqq = cfq_get_next_queue(cfqd); 851 + if (cfqq) 852 + cfq_clear_cfqq_coop(cfqq); 853 + } 854 855 __cfq_set_active_queue(cfqd, cfqq); 856 return cfqq; 857 } ··· 871 return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; 872 } 873 874 + static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 875 + struct cfq_queue *cur_cfqq) 876 { 877 + struct rb_root *root = &cfqd->prio_trees[cur_cfqq->ioprio]; 878 + struct rb_node *parent, *node; 879 + struct cfq_queue *__cfqq; 880 + sector_t sector = cfqd->last_position; 881 + 882 + if (RB_EMPTY_ROOT(root)) 883 + return NULL; 884 + 885 + /* 886 + * First, if we find a request starting at the end of the last 887 + * request, choose it. 888 + */ 889 + __cfqq = cfq_prio_tree_lookup(cfqd, cur_cfqq->ioprio, 890 + sector, &parent, NULL); 891 + if (__cfqq) 892 + return __cfqq; 893 + 894 + /* 895 + * If the exact sector wasn't found, the parent of the NULL leaf 896 + * will contain the closest sector. 
897 + */ 898 + __cfqq = rb_entry(parent, struct cfq_queue, p_node); 899 + if (cfq_rq_close(cfqd, __cfqq->next_rq)) 900 + return __cfqq; 901 + 902 + if (__cfqq->next_rq->sector < sector) 903 + node = rb_next(&__cfqq->p_node); 904 + else 905 + node = rb_prev(&__cfqq->p_node); 906 + if (!node) 907 + return NULL; 908 + 909 + __cfqq = rb_entry(node, struct cfq_queue, p_node); 910 + if (cfq_rq_close(cfqd, __cfqq->next_rq)) 911 + return __cfqq; 912 + 913 + return NULL; 914 + } 915 + 916 + /* 917 + * cfqd - obvious 918 + * cur_cfqq - passed in so that we don't decide that the current queue is 919 + * closely cooperating with itself. 920 + * 921 + * So, basically we're assuming that that cur_cfqq has dispatched at least 922 + * one request, and that cfqd->last_position reflects a position on the disk 923 + * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 924 + * assumption. 925 + */ 926 + static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 927 + struct cfq_queue *cur_cfqq, 928 + int probe) 929 + { 930 + struct cfq_queue *cfqq; 931 + 932 + /* 933 + * A valid cfq_io_context is necessary to compare requests against 934 + * the seek_mean of the current cfqq. 935 + */ 936 + if (!cfqd->active_cic) 937 + return NULL; 938 + 939 /* 940 * We should notice if some of the queues are cooperating, eg 941 * working closely on the same area of the disk. In that case, 942 * we can group them together and don't waste time idling. 943 */ 944 + cfqq = cfqq_close(cfqd, cur_cfqq); 945 + if (!cfqq) 946 + return NULL; 947 + 948 + if (cfq_cfqq_coop(cfqq)) 949 + return NULL; 950 + 951 + if (!probe) 952 + cfq_mark_cfqq_coop(cfqq); 953 + return cfqq; 954 } 955 + 956 957 #define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) 958 ··· 920 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 921 return; 922 923 cfq_mark_cfqq_wait_request(cfqq); 924 925 /* ··· 939 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); 940 941 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 942 + cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 943 } 944 945 /* ··· 1003 */ 1004 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 1005 { 1006 + struct cfq_queue *cfqq, *new_cfqq = NULL; 1007 1008 cfqq = cfqd->active_queue; 1009 if (!cfqq) ··· 1037 goto keep_queue; 1038 1039 /* 1040 + * If another queue has a request waiting within our mean seek 1041 + * distance, let it run. The expire code will check for close 1042 + * cooperators and put the close queue at the front of the service 1043 + * tree. 1044 + */ 1045 + new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); 1046 + if (new_cfqq) 1047 + goto expire; 1048 + 1049 + /* 1050 * No requests pending. If the active queue still has requests in 1051 * flight or is idling for a new request, allow either of these 1052 * conditions to happen (or time out) before selecting a new queue. 
··· 1050 expire: 1051 cfq_slice_expired(cfqd, 0); 1052 new_queue: 1053 + cfqq = cfq_set_active_queue(cfqd, new_cfqq); 1054 keep_queue: 1055 return cfqq; 1056 } ··· 1333 if (ioc->ioc_data == cic) 1334 rcu_assign_pointer(ioc->ioc_data, NULL); 1335 1336 + if (cic->cfqq[BLK_RW_ASYNC]) { 1337 + cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 1338 + cic->cfqq[BLK_RW_ASYNC] = NULL; 1339 } 1340 1341 + if (cic->cfqq[BLK_RW_SYNC]) { 1342 + cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 1343 + cic->cfqq[BLK_RW_SYNC] = NULL; 1344 } 1345 } 1346 ··· 1449 1450 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1451 1452 + cfqq = cic->cfqq[BLK_RW_ASYNC]; 1453 if (cfqq) { 1454 struct cfq_queue *new_cfqq; 1455 + new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 1456 + GFP_ATOMIC); 1457 if (new_cfqq) { 1458 + cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 1459 cfq_put_queue(cfqq); 1460 } 1461 } 1462 1463 + cfqq = cic->cfqq[BLK_RW_SYNC]; 1464 if (cfqq) 1465 cfq_mark_cfqq_prio_changed(cfqq); 1466 ··· 1510 } 1511 1512 RB_CLEAR_NODE(&cfqq->rb_node); 1513 + RB_CLEAR_NODE(&cfqq->p_node); 1514 INIT_LIST_HEAD(&cfqq->fifo); 1515 1516 atomic_set(&cfqq->ref, 0); ··· 1905 * Remember that we saw a request from this process, but 1906 * don't start queuing just yet. Otherwise we risk seeing lots 1907 * of tiny requests, because we disrupt the normal plugging 1908 + * and merging. If the request is already larger than a single 1909 + * page, let it rip immediately. For that case we assume that 1910 + * merging is already done. Ditto for a busy system that 1911 + * has other work pending, don't risk delaying until the 1912 + * idle timer unplug to continue working. 1913 */ 1914 + if (cfq_cfqq_wait_request(cfqq)) { 1915 + if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 1916 + cfqd->busy_queues > 1) { 1917 + del_timer(&cfqd->idle_slice_timer); 1918 + blk_start_queueing(cfqd->queue); 1919 + } 1920 cfq_mark_cfqq_must_dispatch(cfqq); 1921 + } 1922 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 1923 /* 1924 * not the active queue - expire current slice if it is ··· 1992 * or if we want to idle in case it has no pending requests. 1993 */ 1994 if (cfqd->active_queue == cfqq) { 1995 + const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 1996 + 1997 if (cfq_cfqq_slice_new(cfqq)) { 1998 cfq_set_prio_slice(cfqd, cfqq); 1999 cfq_clear_cfqq_slice_new(cfqq); 2000 } 2001 + /* 2002 + * If there are no requests waiting in this queue, and 2003 + * there are other queues ready to issue requests, AND 2004 + * those other queues are issuing requests within our 2005 + * mean seek distance, give them a chance to run instead 2006 + * of idling. 2007 + */ 2008 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 2009 cfq_slice_expired(cfqd, 1); 2010 + else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && 2011 + sync && !rq_noidle(rq)) 2012 cfq_arm_slice_timer(cfqd); 2013 } 2014 2015 if (!cfqd->rq_in_driver) ··· 2062 if (!cic) 2063 return ELV_MQUEUE_MAY; 2064 2065 + cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 2066 if (cfqq) { 2067 cfq_init_prio_data(cfqq, cic->ioc); 2068 cfq_prio_boost(cfqq); ··· 2152 struct cfq_data *cfqd = 2153 container_of(work, struct cfq_data, unplug_work); 2154 struct request_queue *q = cfqd->queue; 2155 2156 + spin_lock_irq(q->queue_lock); 2157 blk_start_queueing(q); 2158 + spin_unlock_irq(q->queue_lock); 2159 } 2160 2161 /*
+4 -4
block/elevator.c
··· 590 /* 591 * Call with queue lock held, interrupts disabled 592 */ 593 - void elv_quisce_start(struct request_queue *q) 594 { 595 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); 596 ··· 607 } 608 } 609 610 - void elv_quisce_end(struct request_queue *q) 611 { 612 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); 613 } ··· 1126 * Turn on BYPASS and drain all requests w/ elevator private data 1127 */ 1128 spin_lock_irq(q->queue_lock); 1129 - elv_quisce_start(q); 1130 1131 /* 1132 * Remember old elevator. ··· 1150 */ 1151 elevator_exit(old_elevator); 1152 spin_lock_irq(q->queue_lock); 1153 - elv_quisce_end(q); 1154 spin_unlock_irq(q->queue_lock); 1155 1156 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
··· 590 /* 591 * Call with queue lock held, interrupts disabled 592 */ 593 + void elv_quiesce_start(struct request_queue *q) 594 { 595 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); 596 ··· 607 } 608 } 609 610 + void elv_quiesce_end(struct request_queue *q) 611 { 612 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); 613 } ··· 1126 * Turn on BYPASS and drain all requests w/ elevator private data 1127 */ 1128 spin_lock_irq(q->queue_lock); 1129 + elv_quiesce_start(q); 1130 1131 /* 1132 * Remember old elevator. ··· 1150 */ 1151 elevator_exit(old_elevator); 1152 spin_lock_irq(q->queue_lock); 1153 + elv_quiesce_end(q); 1154 spin_unlock_irq(q->queue_lock); 1155 1156 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
-2
block/ioctl.c
··· 146 struct bio *bio; 147 148 bio = bio_alloc(GFP_KERNEL, 0); 149 - if (!bio) 150 - return -ENOMEM; 151 152 bio->bi_end_io = blk_ioc_discard_endio; 153 bio->bi_bdev = bdev;
··· 146 struct bio *bio; 147 148 bio = bio_alloc(GFP_KERNEL, 0); 149 150 bio->bi_end_io = blk_ioc_discard_endio; 151 bio->bi_bdev = bdev;
+4 -2
block/scsi_ioctl.c
··· 217 static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, 218 struct bio *bio) 219 { 220 - int ret = 0; 221 222 /* 223 * fill in all the output members ··· 242 ret = -EFAULT; 243 } 244 245 - blk_rq_unmap_user(bio); 246 blk_put_request(rq); 247 248 return ret;
··· 217 static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, 218 struct bio *bio) 219 { 220 + int r, ret = 0; 221 222 /* 223 * fill in all the output members ··· 242 ret = -EFAULT; 243 } 244 245 + r = blk_rq_unmap_user(bio); 246 + if (!ret) 247 + ret = r; 248 blk_put_request(rq); 249 250 return ret;
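The SG_IO change above reduces to a small error-propagation pattern: remember the first failure, but still run (and report) the unmap step. A minimal restatement under that reading; the wrapper name finish_sg_request() is hypothetical, blk_rq_unmap_user() and blk_put_request() are the real calls.

#include <linux/blkdev.h>

static int finish_sg_request(struct request *rq, struct bio *bio, int copy_err)
{
	int ret = copy_err;		/* e.g. -EFAULT from copying the sg_io_hdr */
	int r = blk_rq_unmap_user(bio);	/* unmap unconditionally */

	if (!ret)
		ret = r;	/* report the unmap error only if nothing failed earlier */
	blk_put_request(rq);
	return ret;
}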
+4 -1
drivers/block/brd.c
··· 275 if (rw == READ) { 276 copy_from_brd(mem + off, brd, sector, len); 277 flush_dcache_page(page); 278 - } else 279 copy_to_brd(brd, mem + off, sector, len); 280 kunmap_atomic(mem, KM_USER0); 281 282 out: ··· 438 if (!brd->brd_queue) 439 goto out_free_dev; 440 blk_queue_make_request(brd->brd_queue, brd_make_request); 441 blk_queue_max_sectors(brd->brd_queue, 1024); 442 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); 443
··· 275 if (rw == READ) { 276 copy_from_brd(mem + off, brd, sector, len); 277 flush_dcache_page(page); 278 + } else { 279 + flush_dcache_page(page); 280 copy_to_brd(brd, mem + off, sector, len); 281 + } 282 kunmap_atomic(mem, KM_USER0); 283 284 out: ··· 436 if (!brd->brd_queue) 437 goto out_free_dev; 438 blk_queue_make_request(brd->brd_queue, brd_make_request); 439 + blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL); 440 blk_queue_max_sectors(brd->brd_queue, 1024); 441 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); 442
-117
drivers/md/dm-bio-list.h
··· 1 - /* 2 - * Copyright (C) 2004 Red Hat UK Ltd. 3 - * 4 - * This file is released under the GPL. 5 - */ 6 - 7 - #ifndef DM_BIO_LIST_H 8 - #define DM_BIO_LIST_H 9 - 10 - #include <linux/bio.h> 11 - 12 - #ifdef CONFIG_BLOCK 13 - 14 - struct bio_list { 15 - struct bio *head; 16 - struct bio *tail; 17 - }; 18 - 19 - static inline int bio_list_empty(const struct bio_list *bl) 20 - { 21 - return bl->head == NULL; 22 - } 23 - 24 - static inline void bio_list_init(struct bio_list *bl) 25 - { 26 - bl->head = bl->tail = NULL; 27 - } 28 - 29 - #define bio_list_for_each(bio, bl) \ 30 - for (bio = (bl)->head; bio; bio = bio->bi_next) 31 - 32 - static inline unsigned bio_list_size(const struct bio_list *bl) 33 - { 34 - unsigned sz = 0; 35 - struct bio *bio; 36 - 37 - bio_list_for_each(bio, bl) 38 - sz++; 39 - 40 - return sz; 41 - } 42 - 43 - static inline void bio_list_add(struct bio_list *bl, struct bio *bio) 44 - { 45 - bio->bi_next = NULL; 46 - 47 - if (bl->tail) 48 - bl->tail->bi_next = bio; 49 - else 50 - bl->head = bio; 51 - 52 - bl->tail = bio; 53 - } 54 - 55 - static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) 56 - { 57 - bio->bi_next = bl->head; 58 - 59 - bl->head = bio; 60 - 61 - if (!bl->tail) 62 - bl->tail = bio; 63 - } 64 - 65 - static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) 66 - { 67 - if (!bl2->head) 68 - return; 69 - 70 - if (bl->tail) 71 - bl->tail->bi_next = bl2->head; 72 - else 73 - bl->head = bl2->head; 74 - 75 - bl->tail = bl2->tail; 76 - } 77 - 78 - static inline void bio_list_merge_head(struct bio_list *bl, 79 - struct bio_list *bl2) 80 - { 81 - if (!bl2->head) 82 - return; 83 - 84 - if (bl->head) 85 - bl2->tail->bi_next = bl->head; 86 - else 87 - bl->tail = bl2->tail; 88 - 89 - bl->head = bl2->head; 90 - } 91 - 92 - static inline struct bio *bio_list_pop(struct bio_list *bl) 93 - { 94 - struct bio *bio = bl->head; 95 - 96 - if (bio) { 97 - bl->head = bl->head->bi_next; 98 - if (!bl->head) 99 - bl->tail = NULL; 100 - 101 - bio->bi_next = NULL; 102 - } 103 - 104 - return bio; 105 - } 106 - 107 - static inline struct bio *bio_list_get(struct bio_list *bl) 108 - { 109 - struct bio *bio = bl->head; 110 - 111 - bl->head = bl->tail = NULL; 112 - 113 - return bio; 114 - } 115 - 116 - #endif /* CONFIG_BLOCK */ 117 - #endif
···
-2
drivers/md/dm-delay.c
··· 15 16 #include <linux/device-mapper.h> 17 18 - #include "dm-bio-list.h" 19 - 20 #define DM_MSG_PREFIX "delay" 21 22 struct delay_c {
··· 15 16 #include <linux/device-mapper.h> 17 18 #define DM_MSG_PREFIX "delay" 19 20 struct delay_c {
-1
drivers/md/dm-mpath.c
··· 8 #include <linux/device-mapper.h> 9 10 #include "dm-path-selector.h" 11 - #include "dm-bio-list.h" 12 #include "dm-bio-record.h" 13 #include "dm-uevent.h" 14
··· 8 #include <linux/device-mapper.h> 9 10 #include "dm-path-selector.h" 11 #include "dm-bio-record.h" 12 #include "dm-uevent.h" 13
-1
drivers/md/dm-raid1.c
··· 5 * This file is released under the GPL. 6 */ 7 8 - #include "dm-bio-list.h" 9 #include "dm-bio-record.h" 10 11 #include <linux/init.h>
··· 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-bio-record.h" 9 10 #include <linux/init.h>
-1
drivers/md/dm-region-hash.c
··· 14 #include <linux/vmalloc.h> 15 16 #include "dm.h" 17 - #include "dm-bio-list.h" 18 19 #define DM_MSG_PREFIX "region hash" 20
··· 14 #include <linux/vmalloc.h> 15 16 #include "dm.h" 17 18 #define DM_MSG_PREFIX "region hash" 19
-1
drivers/md/dm-snap.c
··· 22 #include <linux/workqueue.h> 23 24 #include "dm-exception-store.h" 25 - #include "dm-bio-list.h" 26 27 #define DM_MSG_PREFIX "snapshots" 28
··· 22 #include <linux/workqueue.h> 23 24 #include "dm-exception-store.h" 25 26 #define DM_MSG_PREFIX "snapshots" 27
-1
drivers/md/dm.c
··· 6 */ 7 8 #include "dm.h" 9 - #include "dm-bio-list.h" 10 #include "dm-uevent.h" 11 12 #include <linux/init.h>
··· 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h>
-1
drivers/md/raid1.c
··· 35 #include <linux/blkdev.h> 36 #include <linux/seq_file.h> 37 #include "md.h" 38 - #include "dm-bio-list.h" 39 #include "raid1.h" 40 #include "bitmap.h" 41
··· 35 #include <linux/blkdev.h> 36 #include <linux/seq_file.h> 37 #include "md.h" 38 #include "raid1.h" 39 #include "bitmap.h" 40
-1
drivers/md/raid10.c
··· 22 #include <linux/blkdev.h> 23 #include <linux/seq_file.h> 24 #include "md.h" 25 - #include "dm-bio-list.h" 26 #include "raid10.h" 27 #include "bitmap.h" 28
··· 22 #include <linux/blkdev.h> 23 #include <linux/seq_file.h> 24 #include "md.h" 25 #include "raid10.h" 26 #include "bitmap.h" 27
+18
fs/bio.c
··· 348 return NULL; 349 } 350 351 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) 352 { 353 struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
··· 348 return NULL; 349 } 350 351 + /** 352 + * bio_alloc - allocate a bio for I/O 353 + * @gfp_mask: the GFP_ mask given to the slab allocator 354 + * @nr_iovecs: number of iovecs to pre-allocate 355 + * 356 + * Description: 357 + * bio_alloc will allocate a bio and associated bio_vec array that can hold 358 + * at least @nr_iovecs entries. Allocations will be done from the 359 + * fs_bio_set. Also see @bio_alloc_bioset. 360 + * 361 + * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate 362 + * a bio. This is due to the mempool guarantees. To make this work, callers 363 + * must never allocate more than 1 bio at the time from this pool. Callers 364 + * that need to allocate more than 1 bio must always submit the previously 365 + * allocate bio for IO before attempting to allocate a new one. Failure to 366 + * do so can cause livelocks under memory pressure. 367 + * 368 + **/ 369 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) 370 { 371 struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
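A minimal sketch of the usage pattern the new bio_alloc() comment implies, assuming a __GFP_WAIT mask (GFP_NOIO here): the return value is not checked, and each bio is submitted before the next one is allocated so the single-bio mempool guarantee holds. write_range() and its sector arithmetic are hypothetical; bio_alloc() and submit_bio() are the real interfaces.

#include <linux/bio.h>
#include <linux/blkdev.h>

static void write_range(struct block_device *bdev, sector_t first, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct bio *bio = bio_alloc(GFP_NOIO, 1); /* cannot fail with __GFP_WAIT */

		bio->bi_bdev = bdev;
		bio->bi_sector = first + i * 8;	/* one 4K block per bio (assumed) */
		/* ... bio_add_page(), set bi_end_io/bi_private ... */
		submit_bio(WRITE, bio);	/* submit before allocating the next one */
	}
}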
+9 -2
fs/buffer.c
··· 547 return err; 548 } 549 550 - void do_thaw_all(unsigned long unused) 551 { 552 struct super_block *sb; 553 char b[BDEVNAME_SIZE]; ··· 567 goto restart; 568 } 569 spin_unlock(&sb_lock); 570 printk(KERN_WARNING "Emergency Thaw complete\n"); 571 } 572 ··· 578 */ 579 void emergency_thaw_all(void) 580 { 581 - pdflush_operation(do_thaw_all, 0); 582 } 583 584 /**
··· 547 return err; 548 } 549 550 + void do_thaw_all(struct work_struct *work) 551 { 552 struct super_block *sb; 553 char b[BDEVNAME_SIZE]; ··· 567 goto restart; 568 } 569 spin_unlock(&sb_lock); 570 + kfree(work); 571 printk(KERN_WARNING "Emergency Thaw complete\n"); 572 } 573 ··· 577 */ 578 void emergency_thaw_all(void) 579 { 580 + struct work_struct *work; 581 + 582 + work = kmalloc(sizeof(*work), GFP_ATOMIC); 583 + if (work) { 584 + INIT_WORK(work, do_thaw_all); 585 + schedule_work(work); 586 + } 587 } 588 589 /**
-2
fs/direct-io.c
··· 307 struct bio *bio; 308 309 bio = bio_alloc(GFP_KERNEL, nr_vecs); 310 - if (bio == NULL) 311 - return -ENOMEM; 312 313 bio->bi_bdev = bdev; 314 bio->bi_sector = first_sector;
··· 307 struct bio *bio; 308 309 bio = bio_alloc(GFP_KERNEL, nr_vecs); 310 311 bio->bi_bdev = bdev; 312 bio->bi_sector = first_sector;
-2
fs/ext4/extents.c
··· 2416 len = ee_len; 2417 2418 bio = bio_alloc(GFP_NOIO, len); 2419 - if (!bio) 2420 - return -ENOMEM; 2421 bio->bi_sector = ee_pblock; 2422 bio->bi_bdev = inode->i_sb->s_bdev; 2423
··· 2416 len = ee_len; 2417 2418 bio = bio_alloc(GFP_NOIO, len); 2419 bio->bi_sector = ee_pblock; 2420 bio->bi_bdev = inode->i_sb->s_bdev; 2421
-5
fs/gfs2/ops_fstype.c
··· 272 lock_page(page); 273 274 bio = bio_alloc(GFP_NOFS, 1); 275 - if (unlikely(!bio)) { 276 - __free_page(page); 277 - return -ENOBUFS; 278 - } 279 - 280 bio->bi_sector = sector * (sb->s_blocksize >> 9); 281 bio->bi_bdev = sb->s_bdev; 282 bio_add_page(bio, page, PAGE_SIZE, 0);
··· 272 lock_page(page); 273 274 bio = bio_alloc(GFP_NOFS, 1); 275 bio->bi_sector = sector * (sb->s_blocksize >> 9); 276 bio->bi_bdev = sb->s_bdev; 277 bio_add_page(bio, page, PAGE_SIZE, 0);
-36
fs/inode.c
··· 1470 spin_lock(&inode_lock); 1471 } 1472 1473 - /* 1474 - * We rarely want to lock two inodes that do not have a parent/child 1475 - * relationship (such as directory, child inode) simultaneously. The 1476 - * vast majority of file systems should be able to get along fine 1477 - * without this. Do not use these functions except as a last resort. 1478 - */ 1479 - void inode_double_lock(struct inode *inode1, struct inode *inode2) 1480 - { 1481 - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { 1482 - if (inode1) 1483 - mutex_lock(&inode1->i_mutex); 1484 - else if (inode2) 1485 - mutex_lock(&inode2->i_mutex); 1486 - return; 1487 - } 1488 - 1489 - if (inode1 < inode2) { 1490 - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); 1491 - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); 1492 - } else { 1493 - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1494 - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1495 - } 1496 - } 1497 - EXPORT_SYMBOL(inode_double_lock); 1498 - 1499 - void inode_double_unlock(struct inode *inode1, struct inode *inode2) 1500 - { 1501 - if (inode1) 1502 - mutex_unlock(&inode1->i_mutex); 1503 - 1504 - if (inode2 && inode2 != inode1) 1505 - mutex_unlock(&inode2->i_mutex); 1506 - } 1507 - EXPORT_SYMBOL(inode_double_unlock); 1508 - 1509 static __initdata unsigned long ihash_entries; 1510 static int __init set_ihash_entries(char *str) 1511 {
··· 1470 spin_lock(&inode_lock); 1471 } 1472 1473 static __initdata unsigned long ihash_entries; 1474 static int __init set_ihash_entries(char *str) 1475 {
+76 -22
fs/ocfs2/file.c
··· 1912 return written ? written : ret; 1913 } 1914 1915 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1916 struct file *out, 1917 loff_t *ppos, ··· 1935 unsigned int flags) 1936 { 1937 int ret; 1938 - struct inode *inode = out->f_path.dentry->d_inode; 1939 1940 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1941 (unsigned int)len, 1942 out->f_path.dentry->d_name.len, 1943 out->f_path.dentry->d_name.name); 1944 1945 - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 1946 - 1947 - ret = ocfs2_rw_lock(inode, 1); 1948 - if (ret < 0) { 1949 - mlog_errno(ret); 1950 - goto out; 1951 - } 1952 - 1953 - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 1954 - NULL); 1955 - if (ret < 0) { 1956 - mlog_errno(ret); 1957 - goto out_unlock; 1958 - } 1959 - 1960 if (pipe->inode) 1961 - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); 1962 - ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1963 if (pipe->inode) 1964 mutex_unlock(&pipe->inode->i_mutex); 1965 1966 - out_unlock: 1967 - ocfs2_rw_unlock(inode, 1); 1968 - out: 1969 - mutex_unlock(&inode->i_mutex); 1970 1971 mlog_exit(ret); 1972 return ret;
··· 1912 return written ? written : ret; 1913 } 1914 1915 + static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 1916 + struct file *out, 1917 + struct splice_desc *sd) 1918 + { 1919 + int ret; 1920 + 1921 + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 1922 + sd->total_len, 0, NULL); 1923 + if (ret < 0) { 1924 + mlog_errno(ret); 1925 + return ret; 1926 + } 1927 + 1928 + return splice_from_pipe_feed(pipe, sd, pipe_to_file); 1929 + } 1930 + 1931 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1932 struct file *out, 1933 loff_t *ppos, ··· 1919 unsigned int flags) 1920 { 1921 int ret; 1922 + struct address_space *mapping = out->f_mapping; 1923 + struct inode *inode = mapping->host; 1924 + struct splice_desc sd = { 1925 + .total_len = len, 1926 + .flags = flags, 1927 + .pos = *ppos, 1928 + .u.file = out, 1929 + }; 1930 1931 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1932 (unsigned int)len, 1933 out->f_path.dentry->d_name.len, 1934 out->f_path.dentry->d_name.name); 1935 1936 if (pipe->inode) 1937 + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 1938 + 1939 + splice_from_pipe_begin(&sd); 1940 + do { 1941 + ret = splice_from_pipe_next(pipe, &sd); 1942 + if (ret <= 0) 1943 + break; 1944 + 1945 + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 1946 + ret = ocfs2_rw_lock(inode, 1); 1947 + if (ret < 0) 1948 + mlog_errno(ret); 1949 + else { 1950 + ret = ocfs2_splice_to_file(pipe, out, &sd); 1951 + ocfs2_rw_unlock(inode, 1); 1952 + } 1953 + mutex_unlock(&inode->i_mutex); 1954 + } while (ret > 0); 1955 + splice_from_pipe_end(pipe, &sd); 1956 + 1957 if (pipe->inode) 1958 mutex_unlock(&pipe->inode->i_mutex); 1959 1960 + if (sd.num_spliced) 1961 + ret = sd.num_spliced; 1962 + 1963 + if (ret > 0) { 1964 + unsigned long nr_pages; 1965 + 1966 + *ppos += ret; 1967 + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1968 + 1969 + /* 1970 + * If file or inode is SYNC and we actually wrote some data, 1971 + * sync it. 1972 + */ 1973 + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 1974 + int err; 1975 + 1976 + mutex_lock(&inode->i_mutex); 1977 + err = ocfs2_rw_lock(inode, 1); 1978 + if (err < 0) { 1979 + mlog_errno(err); 1980 + } else { 1981 + err = generic_osync_inode(inode, mapping, 1982 + OSYNC_METADATA|OSYNC_DATA); 1983 + ocfs2_rw_unlock(inode, 1); 1984 + } 1985 + mutex_unlock(&inode->i_mutex); 1986 + 1987 + if (err) 1988 + ret = err; 1989 + } 1990 + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 1991 + } 1992 1993 mlog_exit(ret); 1994 return ret;
+38 -4
fs/pipe.c
··· 37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 38 */ 39 40 /* Drop the inode semaphore and wait for a pipe event, atomically */ 41 void pipe_wait(struct pipe_inode_info *pipe) 42 { ··· 83 * is considered a noninteractive wait: 84 */ 85 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); 86 - if (pipe->inode) 87 - mutex_unlock(&pipe->inode->i_mutex); 88 schedule(); 89 finish_wait(&pipe->wait, &wait); 90 - if (pipe->inode) 91 - mutex_lock(&pipe->inode->i_mutex); 92 } 93 94 static int
··· 37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 38 */ 39 40 + static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) 41 + { 42 + if (pipe->inode) 43 + mutex_lock_nested(&pipe->inode->i_mutex, subclass); 44 + } 45 + 46 + void pipe_lock(struct pipe_inode_info *pipe) 47 + { 48 + /* 49 + * pipe_lock() nests non-pipe inode locks (for writing to a file) 50 + */ 51 + pipe_lock_nested(pipe, I_MUTEX_PARENT); 52 + } 53 + EXPORT_SYMBOL(pipe_lock); 54 + 55 + void pipe_unlock(struct pipe_inode_info *pipe) 56 + { 57 + if (pipe->inode) 58 + mutex_unlock(&pipe->inode->i_mutex); 59 + } 60 + EXPORT_SYMBOL(pipe_unlock); 61 + 62 + void pipe_double_lock(struct pipe_inode_info *pipe1, 63 + struct pipe_inode_info *pipe2) 64 + { 65 + BUG_ON(pipe1 == pipe2); 66 + 67 + if (pipe1 < pipe2) { 68 + pipe_lock_nested(pipe1, I_MUTEX_PARENT); 69 + pipe_lock_nested(pipe2, I_MUTEX_CHILD); 70 + } else { 71 + pipe_lock_nested(pipe2, I_MUTEX_CHILD); 72 + pipe_lock_nested(pipe1, I_MUTEX_PARENT); 73 + } 74 + } 75 + 76 /* Drop the inode semaphore and wait for a pipe event, atomically */ 77 void pipe_wait(struct pipe_inode_info *pipe) 78 { ··· 47 * is considered a noninteractive wait: 48 */ 49 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); 50 + pipe_unlock(pipe); 51 schedule(); 52 finish_wait(&pipe->wait, &wait); 53 + pipe_lock(pipe); 54 } 55 56 static int
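The new pipe_double_lock() resolves the pipe-vs-pipe ABBA deadlock by always taking the lower-addressed pipe's mutex first. The same ordering trick applies anywhere two peers of the same lock class must be held together; a minimal userspace sketch of the idea with pthread mutexes (the helper names are illustrative only, not part of this patch):

	#include <pthread.h>

	/*
	 * Lock two mutexes in a fixed global order (by address), mirroring
	 * pipe_double_lock() above: two threads locking the same pair from
	 * opposite directions can never end up waiting on each other. As
	 * with the kernel helper, callers must not pass the same lock twice.
	 */
	static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if (a < b) {
			pthread_mutex_lock(a);
			pthread_mutex_lock(b);
		} else {
			pthread_mutex_lock(b);
			pthread_mutex_lock(a);
		}
	}

	static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		/* Unlock order does not matter for correctness. */
		pthread_mutex_unlock(a);
		pthread_mutex_unlock(b);
	}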
+196 -195
fs/splice.c
··· 182 do_wakeup = 0; 183 page_nr = 0; 184 185 - if (pipe->inode) 186 - mutex_lock(&pipe->inode->i_mutex); 187 188 for (;;) { 189 if (!pipe->readers) { ··· 244 pipe->waiting_writers--; 245 } 246 247 - if (pipe->inode) { 248 - mutex_unlock(&pipe->inode->i_mutex); 249 250 - if (do_wakeup) { 251 - smp_mb(); 252 - if (waitqueue_active(&pipe->wait)) 253 - wake_up_interruptible(&pipe->wait); 254 - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 255 - } 256 } 257 258 while (page_nr < spd_pages) ··· 552 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 553 * a new page in the output file page cache and fill/dirty that. 554 */ 555 - static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 556 - struct splice_desc *sd) 557 { 558 struct file *file = sd->u.file; 559 struct address_space *mapping = file->f_mapping; ··· 597 out: 598 return ret; 599 } 600 601 /** 602 * __splice_from_pipe - splice data from a pipe to given actor ··· 758 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 759 splice_actor *actor) 760 { 761 - int ret, do_wakeup, err; 762 763 - ret = 0; 764 - do_wakeup = 0; 765 766 - for (;;) { 767 - if (pipe->nrbufs) { 768 - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; 769 - const struct pipe_buf_operations *ops = buf->ops; 770 - 771 - sd->len = buf->len; 772 - if (sd->len > sd->total_len) 773 - sd->len = sd->total_len; 774 - 775 - err = actor(pipe, buf, sd); 776 - if (err <= 0) { 777 - if (!ret && err != -ENODATA) 778 - ret = err; 779 - 780 - break; 781 - } 782 - 783 - ret += err; 784 - buf->offset += err; 785 - buf->len -= err; 786 - 787 - sd->len -= err; 788 - sd->pos += err; 789 - sd->total_len -= err; 790 - if (sd->len) 791 - continue; 792 - 793 - if (!buf->len) { 794 - buf->ops = NULL; 795 - ops->release(pipe, buf); 796 - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 797 - pipe->nrbufs--; 798 - if (pipe->inode) 799 - do_wakeup = 1; 800 - } 801 - 802 - if (!sd->total_len) 803 - break; 804 - } 805 - 806 - if (pipe->nrbufs) 807 - continue; 808 - if (!pipe->writers) 809 - break; 810 - if (!pipe->waiting_writers) { 811 - if (ret) 812 - break; 813 - } 814 - 815 - if (sd->flags & SPLICE_F_NONBLOCK) { 816 - if (!ret) 817 - ret = -EAGAIN; 818 - break; 819 - } 820 - 821 - if (signal_pending(current)) { 822 - if (!ret) 823 - ret = -ERESTARTSYS; 824 - break; 825 - } 826 - 827 - if (do_wakeup) { 828 - smp_mb(); 829 - if (waitqueue_active(&pipe->wait)) 830 - wake_up_interruptible_sync(&pipe->wait); 831 - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 832 - do_wakeup = 0; 833 - } 834 - 835 - pipe_wait(pipe); 836 - } 837 - 838 - if (do_wakeup) { 839 - smp_mb(); 840 - if (waitqueue_active(&pipe->wait)) 841 - wake_up_interruptible(&pipe->wait); 842 - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 843 - } 844 - 845 - return ret; 846 } 847 EXPORT_SYMBOL(__splice_from_pipe); 848 ··· 782 * @actor: handler that splices the data 783 * 784 * Description: 785 - * See __splice_from_pipe. This function locks the input and output inodes, 786 * otherwise it's identical to __splice_from_pipe(). 787 * 788 */ ··· 791 splice_actor *actor) 792 { 793 ssize_t ret; 794 - struct inode *inode = out->f_mapping->host; 795 struct splice_desc sd = { 796 .total_len = len, 797 .flags = flags, ··· 798 .u.file = out, 799 }; 800 801 - /* 802 - * The actor worker might be calling ->write_begin and 803 - * ->write_end. Most of the time, these expect i_mutex to 804 - * be held. 
Since this may result in an ABBA deadlock with 805 - * pipe->inode, we have to order lock acquiry here. 806 - * 807 - * Outer lock must be inode->i_mutex, as pipe_wait() will 808 - * release and reacquire pipe->inode->i_mutex, AND inode must 809 - * never be a pipe. 810 - */ 811 - WARN_ON(S_ISFIFO(inode->i_mode)); 812 - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 813 - if (pipe->inode) 814 - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); 815 ret = __splice_from_pipe(pipe, &sd, actor); 816 - if (pipe->inode) 817 - mutex_unlock(&pipe->inode->i_mutex); 818 - mutex_unlock(&inode->i_mutex); 819 820 return ret; 821 } 822 - 823 - /** 824 - * generic_file_splice_write_nolock - generic_file_splice_write without mutexes 825 - * @pipe: pipe info 826 - * @out: file to write to 827 - * @ppos: position in @out 828 - * @len: number of bytes to splice 829 - * @flags: splice modifier flags 830 - * 831 - * Description: 832 - * Will either move or copy pages (determined by @flags options) from 833 - * the given pipe inode to the given file. The caller is responsible 834 - * for acquiring i_mutex on both inodes. 835 - * 836 - */ 837 - ssize_t 838 - generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, 839 - loff_t *ppos, size_t len, unsigned int flags) 840 - { 841 - struct address_space *mapping = out->f_mapping; 842 - struct inode *inode = mapping->host; 843 - struct splice_desc sd = { 844 - .total_len = len, 845 - .flags = flags, 846 - .pos = *ppos, 847 - .u.file = out, 848 - }; 849 - ssize_t ret; 850 - int err; 851 - 852 - err = file_remove_suid(out); 853 - if (unlikely(err)) 854 - return err; 855 - 856 - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); 857 - if (ret > 0) { 858 - unsigned long nr_pages; 859 - 860 - *ppos += ret; 861 - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 862 - 863 - /* 864 - * If file or inode is SYNC and we actually wrote some data, 865 - * sync it. 
866 - */ 867 - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 868 - err = generic_osync_inode(inode, mapping, 869 - OSYNC_METADATA|OSYNC_DATA); 870 - 871 - if (err) 872 - ret = err; 873 - } 874 - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 875 - } 876 - 877 - return ret; 878 - } 879 - 880 - EXPORT_SYMBOL(generic_file_splice_write_nolock); 881 882 /** 883 * generic_file_splice_write - splice data from a pipe to a file ··· 832 }; 833 ssize_t ret; 834 835 - WARN_ON(S_ISFIFO(inode->i_mode)); 836 - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 837 - ret = file_remove_suid(out); 838 - if (likely(!ret)) { 839 - if (pipe->inode) 840 - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); 841 - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); 842 - if (pipe->inode) 843 - mutex_unlock(&pipe->inode->i_mutex); 844 - } 845 - mutex_unlock(&inode->i_mutex); 846 if (ret > 0) { 847 unsigned long nr_pages; 848 ··· 1341 if (!pipe) 1342 return -EBADF; 1343 1344 - if (pipe->inode) 1345 - mutex_lock(&pipe->inode->i_mutex); 1346 1347 error = ret = 0; 1348 while (nr_segs) { ··· 1396 iov++; 1397 } 1398 1399 - if (pipe->inode) 1400 - mutex_unlock(&pipe->inode->i_mutex); 1401 1402 if (!ret) 1403 ret = error; ··· 1524 return 0; 1525 1526 ret = 0; 1527 - mutex_lock(&pipe->inode->i_mutex); 1528 1529 while (!pipe->nrbufs) { 1530 if (signal_pending(current)) { ··· 1542 pipe_wait(pipe); 1543 } 1544 1545 - mutex_unlock(&pipe->inode->i_mutex); 1546 return ret; 1547 } 1548 ··· 1562 return 0; 1563 1564 ret = 0; 1565 - mutex_lock(&pipe->inode->i_mutex); 1566 1567 while (pipe->nrbufs >= PIPE_BUFFERS) { 1568 if (!pipe->readers) { ··· 1583 pipe->waiting_writers--; 1584 } 1585 1586 - mutex_unlock(&pipe->inode->i_mutex); 1587 return ret; 1588 } 1589 ··· 1599 1600 /* 1601 * Potential ABBA deadlock, work around it by ordering lock 1602 - * grabbing by inode address. Otherwise two different processes 1603 * could deadlock (one doing tee from A -> B, the other from B -> A). 1604 */ 1605 - inode_double_lock(ipipe->inode, opipe->inode); 1606 1607 do { 1608 if (!opipe->readers) { ··· 1653 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1654 ret = -EAGAIN; 1655 1656 - inode_double_unlock(ipipe->inode, opipe->inode); 1657 1658 /* 1659 * If we put data in the output pipe, wakeup any potential readers.
··· 182 do_wakeup = 0; 183 page_nr = 0; 184 185 + pipe_lock(pipe); 186 187 for (;;) { 188 if (!pipe->readers) { ··· 245 pipe->waiting_writers--; 246 } 247 248 + pipe_unlock(pipe); 249 250 + if (do_wakeup) { 251 + smp_mb(); 252 + if (waitqueue_active(&pipe->wait)) 253 + wake_up_interruptible(&pipe->wait); 254 + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 255 } 256 257 while (page_nr < spd_pages) ··· 555 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 556 * a new page in the output file page cache and fill/dirty that. 557 */ 558 + int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 559 + struct splice_desc *sd) 560 { 561 struct file *file = sd->u.file; 562 struct address_space *mapping = file->f_mapping; ··· 600 out: 601 return ret; 602 } 603 + EXPORT_SYMBOL(pipe_to_file); 604 + 605 + static void wakeup_pipe_writers(struct pipe_inode_info *pipe) 606 + { 607 + smp_mb(); 608 + if (waitqueue_active(&pipe->wait)) 609 + wake_up_interruptible(&pipe->wait); 610 + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 611 + } 612 + 613 + /** 614 + * splice_from_pipe_feed - feed available data from a pipe to a file 615 + * @pipe: pipe to splice from 616 + * @sd: information to @actor 617 + * @actor: handler that splices the data 618 + * 619 + * Description: 620 + 621 + * This function loops over the pipe and calls @actor to do the 622 + * actual moving of a single struct pipe_buffer to the desired 623 + * destination. It returns when there's no more buffers left in 624 + * the pipe or if the requested number of bytes (@sd->total_len) 625 + * have been copied. It returns a positive number (one) if the 626 + * pipe needs to be filled with more data, zero if the required 627 + * number of bytes have been copied and -errno on error. 628 + * 629 + * This, together with splice_from_pipe_{begin,end,next}, may be 630 + * used to implement the functionality of __splice_from_pipe() when 631 + * locking is required around copying the pipe buffers to the 632 + * destination. 633 + */ 634 + int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, 635 + splice_actor *actor) 636 + { 637 + int ret; 638 + 639 + while (pipe->nrbufs) { 640 + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; 641 + const struct pipe_buf_operations *ops = buf->ops; 642 + 643 + sd->len = buf->len; 644 + if (sd->len > sd->total_len) 645 + sd->len = sd->total_len; 646 + 647 + ret = actor(pipe, buf, sd); 648 + if (ret <= 0) { 649 + if (ret == -ENODATA) 650 + ret = 0; 651 + return ret; 652 + } 653 + buf->offset += ret; 654 + buf->len -= ret; 655 + 656 + sd->num_spliced += ret; 657 + sd->len -= ret; 658 + sd->pos += ret; 659 + sd->total_len -= ret; 660 + 661 + if (!buf->len) { 662 + buf->ops = NULL; 663 + ops->release(pipe, buf); 664 + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 665 + pipe->nrbufs--; 666 + if (pipe->inode) 667 + sd->need_wakeup = true; 668 + } 669 + 670 + if (!sd->total_len) 671 + return 0; 672 + } 673 + 674 + return 1; 675 + } 676 + EXPORT_SYMBOL(splice_from_pipe_feed); 677 + 678 + /** 679 + * splice_from_pipe_next - wait for some data to splice from 680 + * @pipe: pipe to splice from 681 + * @sd: information about the splice operation 682 + * 683 + * Description: 684 + * This function will wait for some data and return a positive 685 + * value (one) if pipe buffers are available. It will return zero 686 + * or -errno if no more data needs to be spliced. 
687 + */ 688 + int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) 689 + { 690 + while (!pipe->nrbufs) { 691 + if (!pipe->writers) 692 + return 0; 693 + 694 + if (!pipe->waiting_writers && sd->num_spliced) 695 + return 0; 696 + 697 + if (sd->flags & SPLICE_F_NONBLOCK) 698 + return -EAGAIN; 699 + 700 + if (signal_pending(current)) 701 + return -ERESTARTSYS; 702 + 703 + if (sd->need_wakeup) { 704 + wakeup_pipe_writers(pipe); 705 + sd->need_wakeup = false; 706 + } 707 + 708 + pipe_wait(pipe); 709 + } 710 + 711 + return 1; 712 + } 713 + EXPORT_SYMBOL(splice_from_pipe_next); 714 + 715 + /** 716 + * splice_from_pipe_begin - start splicing from pipe 717 + * @pipe: pipe to splice from 718 + * 719 + * Description: 720 + * This function should be called before a loop containing 721 + * splice_from_pipe_next() and splice_from_pipe_feed() to 722 + * initialize the necessary fields of @sd. 723 + */ 724 + void splice_from_pipe_begin(struct splice_desc *sd) 725 + { 726 + sd->num_spliced = 0; 727 + sd->need_wakeup = false; 728 + } 729 + EXPORT_SYMBOL(splice_from_pipe_begin); 730 + 731 + /** 732 + * splice_from_pipe_end - finish splicing from pipe 733 + * @pipe: pipe to splice from 734 + * @sd: information about the splice operation 735 + * 736 + * Description: 737 + * This function will wake up pipe writers if necessary. It should 738 + * be called after a loop containing splice_from_pipe_next() and 739 + * splice_from_pipe_feed(). 740 + */ 741 + void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) 742 + { 743 + if (sd->need_wakeup) 744 + wakeup_pipe_writers(pipe); 745 + } 746 + EXPORT_SYMBOL(splice_from_pipe_end); 747 748 /** 749 * __splice_from_pipe - splice data from a pipe to given actor ··· 617 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 618 splice_actor *actor) 619 { 620 + int ret; 621 622 + splice_from_pipe_begin(sd); 623 + do { 624 + ret = splice_from_pipe_next(pipe, sd); 625 + if (ret > 0) 626 + ret = splice_from_pipe_feed(pipe, sd, actor); 627 + } while (ret > 0); 628 + splice_from_pipe_end(pipe, sd); 629 630 + return sd->num_spliced ? sd->num_spliced : ret; 631 } 632 EXPORT_SYMBOL(__splice_from_pipe); 633 ··· 715 * @actor: handler that splices the data 716 * 717 * Description: 718 + * See __splice_from_pipe. This function locks the pipe inode, 719 * otherwise it's identical to __splice_from_pipe(). 
720 * 721 */ ··· 724 splice_actor *actor) 725 { 726 ssize_t ret; 727 struct splice_desc sd = { 728 .total_len = len, 729 .flags = flags, ··· 732 .u.file = out, 733 }; 734 735 + pipe_lock(pipe); 736 ret = __splice_from_pipe(pipe, &sd, actor); 737 + pipe_unlock(pipe); 738 739 return ret; 740 } 741 742 /** 743 * generic_file_splice_write - splice data from a pipe to a file ··· 840 }; 841 ssize_t ret; 842 843 + pipe_lock(pipe); 844 + 845 + splice_from_pipe_begin(&sd); 846 + do { 847 + ret = splice_from_pipe_next(pipe, &sd); 848 + if (ret <= 0) 849 + break; 850 + 851 + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 852 + ret = file_remove_suid(out); 853 + if (!ret) 854 + ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); 855 + mutex_unlock(&inode->i_mutex); 856 + } while (ret > 0); 857 + splice_from_pipe_end(pipe, &sd); 858 + 859 + pipe_unlock(pipe); 860 + 861 + if (sd.num_spliced) 862 + ret = sd.num_spliced; 863 + 864 if (ret > 0) { 865 unsigned long nr_pages; 866 ··· 1339 if (!pipe) 1340 return -EBADF; 1341 1342 + pipe_lock(pipe); 1343 1344 error = ret = 0; 1345 while (nr_segs) { ··· 1395 iov++; 1396 } 1397 1398 + pipe_unlock(pipe); 1399 1400 if (!ret) 1401 ret = error; ··· 1524 return 0; 1525 1526 ret = 0; 1527 + pipe_lock(pipe); 1528 1529 while (!pipe->nrbufs) { 1530 if (signal_pending(current)) { ··· 1542 pipe_wait(pipe); 1543 } 1544 1545 + pipe_unlock(pipe); 1546 return ret; 1547 } 1548 ··· 1562 return 0; 1563 1564 ret = 0; 1565 + pipe_lock(pipe); 1566 1567 while (pipe->nrbufs >= PIPE_BUFFERS) { 1568 if (!pipe->readers) { ··· 1583 pipe->waiting_writers--; 1584 } 1585 1586 + pipe_unlock(pipe); 1587 return ret; 1588 } 1589 ··· 1599 1600 /* 1601 * Potential ABBA deadlock, work around it by ordering lock 1602 + * grabbing by pipe info address. Otherwise two different processes 1603 * could deadlock (one doing tee from A -> B, the other from B -> A). 1604 */ 1605 + pipe_double_lock(ipipe, opipe); 1606 1607 do { 1608 if (!opipe->readers) { ··· 1653 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1654 ret = -EAGAIN; 1655 1656 + pipe_unlock(ipipe); 1657 + pipe_unlock(opipe); 1658 1659 /* 1660 * If we put data in the output pipe, wakeup any potential readers.
+109
include/linux/bio.h
··· 504 return bio && bio->bi_io_vec != NULL; 505 } 506 507 #if defined(CONFIG_BLK_DEV_INTEGRITY) 508 509 #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
··· 504 return bio && bio->bi_io_vec != NULL; 505 } 506 507 + /* 508 + * BIO list managment for use by remapping drivers (e.g. DM or MD). 509 + * 510 + * A bio_list anchors a singly-linked list of bios chained through the bi_next 511 + * member of the bio. The bio_list also caches the last list member to allow 512 + * fast access to the tail. 513 + */ 514 + struct bio_list { 515 + struct bio *head; 516 + struct bio *tail; 517 + }; 518 + 519 + static inline int bio_list_empty(const struct bio_list *bl) 520 + { 521 + return bl->head == NULL; 522 + } 523 + 524 + static inline void bio_list_init(struct bio_list *bl) 525 + { 526 + bl->head = bl->tail = NULL; 527 + } 528 + 529 + #define bio_list_for_each(bio, bl) \ 530 + for (bio = (bl)->head; bio; bio = bio->bi_next) 531 + 532 + static inline unsigned bio_list_size(const struct bio_list *bl) 533 + { 534 + unsigned sz = 0; 535 + struct bio *bio; 536 + 537 + bio_list_for_each(bio, bl) 538 + sz++; 539 + 540 + return sz; 541 + } 542 + 543 + static inline void bio_list_add(struct bio_list *bl, struct bio *bio) 544 + { 545 + bio->bi_next = NULL; 546 + 547 + if (bl->tail) 548 + bl->tail->bi_next = bio; 549 + else 550 + bl->head = bio; 551 + 552 + bl->tail = bio; 553 + } 554 + 555 + static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) 556 + { 557 + bio->bi_next = bl->head; 558 + 559 + bl->head = bio; 560 + 561 + if (!bl->tail) 562 + bl->tail = bio; 563 + } 564 + 565 + static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) 566 + { 567 + if (!bl2->head) 568 + return; 569 + 570 + if (bl->tail) 571 + bl->tail->bi_next = bl2->head; 572 + else 573 + bl->head = bl2->head; 574 + 575 + bl->tail = bl2->tail; 576 + } 577 + 578 + static inline void bio_list_merge_head(struct bio_list *bl, 579 + struct bio_list *bl2) 580 + { 581 + if (!bl2->head) 582 + return; 583 + 584 + if (bl->head) 585 + bl2->tail->bi_next = bl->head; 586 + else 587 + bl->tail = bl2->tail; 588 + 589 + bl->head = bl2->head; 590 + } 591 + 592 + static inline struct bio *bio_list_pop(struct bio_list *bl) 593 + { 594 + struct bio *bio = bl->head; 595 + 596 + if (bio) { 597 + bl->head = bl->head->bi_next; 598 + if (!bl->head) 599 + bl->tail = NULL; 600 + 601 + bio->bi_next = NULL; 602 + } 603 + 604 + return bio; 605 + } 606 + 607 + static inline struct bio *bio_list_get(struct bio_list *bl) 608 + { 609 + struct bio *bio = bl->head; 610 + 611 + bl->head = bl->tail = NULL; 612 + 613 + return bio; 614 + } 615 + 616 #if defined(CONFIG_BLK_DEV_INTEGRITY) 617 618 #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
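The bio_list helpers are plain inline functions over the bi_next chain with a cached tail, so a remapping driver can defer bios under its own lock and drain them later without any extra allocation. A minimal sketch of that pattern, assuming a hypothetical driver structure and resubmit helper (my_dev and my_remap_and_submit are not real kernel symbols):

	struct my_dev {
		spinlock_t	lock;
		struct bio_list	deferred;
	};

	/* Queue a bio for later processing, e.g. from the make_request path. */
	static void my_defer_bio(struct my_dev *md, struct bio *bio)
	{
		spin_lock_irq(&md->lock);
		bio_list_add(&md->deferred, bio);
		spin_unlock_irq(&md->lock);
	}

	/* Drain the deferred list outside the lock and resubmit each bio. */
	static void my_flush_deferred(struct my_dev *md)
	{
		struct bio_list todo;
		struct bio *bio;

		bio_list_init(&todo);

		spin_lock_irq(&md->lock);
		bio_list_merge(&todo, &md->deferred);
		bio_list_init(&md->deferred);
		spin_unlock_irq(&md->lock);

		while ((bio = bio_list_pop(&todo)))
			my_remap_and_submit(md, bio);	/* hypothetical */
	}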
+59 -5
include/linux/fs.h
··· 87 */ 88 #define FMODE_NOCMTIME ((__force fmode_t)2048) 89 90 #define RW_MASK 1 91 #define RWA_MASK 2 92 #define READ 0 ··· 156 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 157 #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 158 #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) 159 #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) 160 #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) 161 ··· 796 I_MUTEX_XATTR, 797 I_MUTEX_QUOTA 798 }; 799 - 800 - extern void inode_double_lock(struct inode *inode1, struct inode *inode2); 801 - extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); 802 803 /* 804 * NOTE: in a 32bit arch with a preemptable kernel and ··· 2205 extern ssize_t generic_file_splice_read(struct file *, loff_t *, 2206 struct pipe_inode_info *, size_t, unsigned int); 2207 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, 2208 - struct file *, loff_t *, size_t, unsigned int); 2209 - extern ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *, 2210 struct file *, loff_t *, size_t, unsigned int); 2211 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, 2212 struct file *out, loff_t *, size_t len, unsigned int flags);
··· 87 */ 88 #define FMODE_NOCMTIME ((__force fmode_t)2048) 89 90 + /* 91 + * The below are the various read and write types that we support. Some of 92 + * them include behavioral modifiers that send information down to the 93 + * block layer and IO scheduler. Terminology: 94 + * 95 + * The block layer uses device plugging to defer IO a little bit, in 96 + * the hope that we will see more IO very shortly. This increases 97 + * coalescing of adjacent IO and thus reduces the number of IOs we 98 + * have to send to the device. It also allows for better queuing, 99 + * if the IO isn't mergeable. If the caller is going to be waiting 100 + * for the IO, then he must ensure that the device is unplugged so 101 + * that the IO is dispatched to the driver. 102 + * 103 + * All IO is handled async in Linux. This is fine for background 104 + * writes, but for reads or writes that someone waits for completion 105 + * on, we want to notify the block layer and IO scheduler so that they 106 + * know about it. That allows them to make better scheduling 107 + * decisions. So when the below references 'sync' and 'async', it 108 + * is referencing this priority hint. 109 + * 110 + * With that in mind, the available types are: 111 + * 112 + * READ A normal read operation. Device will be plugged. 113 + * READ_SYNC A synchronous read. Device is not plugged, caller can 114 + * immediately wait on this read without caring about 115 + * unplugging. 116 + * READA Used for read-ahead operations. Lower priority, and the 117 + * block layer could (in theory) choose to ignore this 118 + * request if it runs into resource problems. 119 + * WRITE A normal async write. Device will be plugged. 120 + * SWRITE Like WRITE, but a special case for ll_rw_block() that 121 + * tells it to lock the buffer first. Normally a buffer 122 + * must be locked before doing IO. 123 + * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down 124 + * the hint that someone will be waiting on this IO 125 + * shortly. The device must still be unplugged explicitly, 126 + * WRITE_SYNC_PLUG does not do this as we could be 127 + * submitting more writes before we actually wait on any 128 + * of them. 129 + * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device 130 + * immediately after submission. The write equivalent 131 + * of READ_SYNC. 132 + * WRITE_ODIRECT Special case write for O_DIRECT only. 133 + * SWRITE_SYNC 134 + * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. 135 + * See SWRITE. 136 + * WRITE_BARRIER Like WRITE, but tells the block layer that all 137 + * previously submitted writes must be safely on storage 138 + * before this one is started. Also guarantees that when 139 + * this write is complete, it itself is also safely on 140 + * storage. Prevents reordering of writes on both sides 141 + * of this IO. 142 + * 143 + */ 144 #define RW_MASK 1 145 #define RWA_MASK 2 146 #define READ 0 ··· 102 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 103 #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 104 #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) 105 + 106 + /* 107 + * These aren't really reads or writes, they pass down information about 108 + * parts of device that are now unused by the file system. 
109 + */ 110 #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) 111 #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) 112 ··· 737 I_MUTEX_XATTR, 738 I_MUTEX_QUOTA 739 }; 740 741 /* 742 * NOTE: in a 32bit arch with a preemptable kernel and ··· 2149 extern ssize_t generic_file_splice_read(struct file *, loff_t *, 2150 struct pipe_inode_info *, size_t, unsigned int); 2151 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, 2152 struct file *, loff_t *, size_t, unsigned int); 2153 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, 2154 struct file *out, loff_t *, size_t len, unsigned int flags);
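The comment spells out the plugging/sync contract: WRITE leaves the queue plugged so the elevator can keep merging, while WRITE_SYNC both passes the sync hint and unplugs because the caller is about to wait. A minimal buffer-head sketch of choosing between the two; write_one_bh() is a hypothetical helper, the buffer-head calls themselves are the standard primitives:

	/*
	 * Sketch: pick the write type based on whether the caller will wait.
	 * WRITE keeps the device plugged for merging; WRITE_SYNC hints the
	 * IO scheduler and unplugs so the wait below does not stall behind
	 * an idle, plugged queue.
	 */
	static void write_one_bh(struct buffer_head *bh, int will_wait)
	{
		lock_buffer(bh);
		get_bh(bh);
		bh->b_end_io = end_buffer_write_sync;
		submit_bh(will_wait ? WRITE_SYNC : WRITE, bh);

		if (will_wait)
			wait_on_buffer(bh);
	}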
+5
include/linux/pipe_fs_i.h
··· 134 memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ 135 #define PIPE_SIZE PAGE_SIZE 136 137 /* Drop the inode semaphore and wait for a pipe event, atomically */ 138 void pipe_wait(struct pipe_inode_info *pipe); 139
··· 134 memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ 135 #define PIPE_SIZE PAGE_SIZE 136 137 + /* Pipe lock and unlock operations */ 138 + void pipe_lock(struct pipe_inode_info *); 139 + void pipe_unlock(struct pipe_inode_info *); 140 + void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); 141 + 142 /* Drop the inode semaphore and wait for a pipe event, atomically */ 143 void pipe_wait(struct pipe_inode_info *pipe); 144
+12
include/linux/splice.h
··· 36 void *data; /* cookie */ 37 } u; 38 loff_t pos; /* file position */ 39 }; 40 41 struct partial_page { ··· 68 splice_actor *); 69 extern ssize_t __splice_from_pipe(struct pipe_inode_info *, 70 struct splice_desc *, splice_actor *); 71 extern ssize_t splice_to_pipe(struct pipe_inode_info *, 72 struct splice_pipe_desc *); 73 extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
··· 36 void *data; /* cookie */ 37 } u; 38 loff_t pos; /* file position */ 39 + size_t num_spliced; /* number of bytes already spliced */ 40 + bool need_wakeup; /* need to wake up writer */ 41 }; 42 43 struct partial_page { ··· 66 splice_actor *); 67 extern ssize_t __splice_from_pipe(struct pipe_inode_info *, 68 struct splice_desc *, splice_actor *); 69 + extern int splice_from_pipe_feed(struct pipe_inode_info *, struct splice_desc *, 70 + splice_actor *); 71 + extern int splice_from_pipe_next(struct pipe_inode_info *, 72 + struct splice_desc *); 73 + extern void splice_from_pipe_begin(struct splice_desc *); 74 + extern void splice_from_pipe_end(struct pipe_inode_info *, 75 + struct splice_desc *); 76 + extern int pipe_to_file(struct pipe_inode_info *, struct pipe_buffer *, 77 + struct splice_desc *); 78 + 79 extern ssize_t splice_to_pipe(struct pipe_inode_info *, 80 struct splice_pipe_desc *); 81 extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
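With splice_from_pipe_{begin,next,feed,end} and pipe_to_file exported, a filesystem that needs its own locking around the copy can open-code the loop instead of calling __splice_from_pipe(). A sketch of such a ->splice_write, modelled directly on the generic_file_splice_write() and ocfs2 loops earlier in this series (the myfs_ name is hypothetical; suid and O_SYNC handling are omitted):

	static ssize_t myfs_splice_write(struct pipe_inode_info *pipe,
					 struct file *out, loff_t *ppos,
					 size_t len, unsigned int flags)
	{
		struct inode *inode = out->f_mapping->host;
		struct splice_desc sd = {
			.total_len	= len,
			.flags		= flags,
			.pos		= *ppos,
			.u.file		= out,
		};
		ssize_t ret;

		pipe_lock(pipe);

		splice_from_pipe_begin(&sd);
		do {
			/* Wait for pipe data without holding the fs lock. */
			ret = splice_from_pipe_next(pipe, &sd);
			if (ret <= 0)
				break;

			/* Take fs locks only while buffers are actually copied. */
			mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
			ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
			mutex_unlock(&inode->i_mutex);
		} while (ret > 0);
		splice_from_pipe_end(pipe, &sd);

		pipe_unlock(pipe);

		if (sd.num_spliced)
			ret = sd.num_spliced;
		if (ret > 0)
			*ppos += ret;

		return ret;
	}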
-2
kernel/power/swap.c
··· 64 struct bio *bio; 65 66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 67 - if (!bio) 68 - return -ENOMEM; 69 bio->bi_sector = page_off * (PAGE_SIZE >> 9); 70 bio->bi_bdev = resume_bdev; 71 bio->bi_end_io = end_swap_bio_read;
··· 64 struct bio *bio; 65 66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 67 bio->bi_sector = page_off * (PAGE_SIZE >> 9); 68 bio->bi_bdev = resume_bdev; 69 bio->bi_end_io = end_swap_bio_read;
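The NULL check can go because bio_alloc() is backed by a mempool: with __GFP_WAIT in the mask the allocation sleeps until a bio becomes available instead of returning NULL. Callers end up using the bio directly, as in this reduced sketch (hypothetical helper, mirroring the hunk above):

	/*
	 * Reduced sketch of the post-patch pattern: no NULL check needed,
	 * since a __GFP_WAIT allocation from the bio mempool cannot fail.
	 */
	static struct bio *alloc_resume_bio(struct block_device *bdev,
					    pgoff_t page_off)
	{
		struct bio *bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);

		bio->bi_sector = page_off * (PAGE_SIZE >> 9);
		bio->bi_bdev = bdev;
		return bio;
	}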