Merge branch 'for-2.6.40/core' of git://git.kernel.dk/linux-2.6-block

+64

Documentation/ABI/testing/sysfs-block

··· 142 142 with the previous I/O request are enabled. When set to 2, 143 143 all merge tries are disabled. The default value is 0 - 144 144 which enables all types of merge tries. 145 + 146 + What: /sys/block/<disk>/discard_alignment 147 + Date: May 2011 148 + Contact: Martin K. Petersen <martin.petersen@oracle.com> 149 + Description: 150 + Devices that support discard functionality may 151 + internally allocate space in units that are bigger than 152 + the exported logical block size. The discard_alignment 153 + parameter indicates how many bytes the beginning of the 154 + device is offset from the internal allocation unit's 155 + natural alignment. 156 + 157 + What: /sys/block/<disk>/<partition>/discard_alignment 158 + Date: May 2011 159 + Contact: Martin K. Petersen <martin.petersen@oracle.com> 160 + Description: 161 + Devices that support discard functionality may 162 + internally allocate space in units that are bigger than 163 + the exported logical block size. The discard_alignment 164 + parameter indicates how many bytes the beginning of the 165 + partition is offset from the internal allocation unit's 166 + natural alignment. 167 + 168 + What: /sys/block/<disk>/queue/discard_granularity 169 + Date: May 2011 170 + Contact: Martin K. Petersen <martin.petersen@oracle.com> 171 + Description: 172 + Devices that support discard functionality may 173 + internally allocate space using units that are bigger 174 + than the logical block size. The discard_granularity 175 + parameter indicates the size of the internal allocation 176 + unit in bytes if reported by the device. Otherwise the 177 + discard_granularity will be set to match the device's 178 + physical block size. A discard_granularity of 0 means 179 + that the device does not support discard functionality. 180 + 181 + What: /sys/block/<disk>/queue/discard_max_bytes 182 + Date: May 2011 183 + Contact: Martin K. 
Petersen <martin.petersen@oracle.com> 184 + Description: 185 + Devices that support discard functionality may have 186 + internal limits on the number of bytes that can be 187 + trimmed or unmapped in a single operation. Some storage 188 + protocols also have inherent limits on the number of 189 + blocks that can be described in a single command. The 190 + discard_max_bytes parameter is set by the device driver 191 + to the maximum number of bytes that can be discarded in 192 + a single operation. Discard requests issued to the 193 + device must not exceed this limit. A discard_max_bytes 194 + value of 0 means that the device does not support 195 + discard functionality. 196 + 197 + What: /sys/block/<disk>/queue/discard_zeroes_data 198 + Date: May 2011 199 + Contact: Martin K. Petersen <martin.petersen@oracle.com> 200 + Description: 201 + Devices that support discard functionality may return 202 + stale or random data when a previously discarded block 203 + is read back. This can cause problems if the filesystem 204 + expects discarded blocks to be explicitly cleared. If a 205 + device reports that it deterministically returns zeroes 206 + when a discarded area is read the discard_zeroes_data 207 + parameter will be set to one. Otherwise it will be 0 and 208 + the result of reading a discarded area is undefined.

+162 -38

block/blk-cgroup.c

··· 385 385 386 386 spin_lock_irqsave(&blkg->stats_lock, flags); 387 387 blkg->stats.time += time; 388 + #ifdef CONFIG_DEBUG_BLK_CGROUP 388 389 blkg->stats.unaccounted_time += unaccounted_time; 390 + #endif 389 391 spin_unlock_irqrestore(&blkg->stats_lock, flags); 390 392 } 391 393 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); 392 394 395 + /* 396 + * should be called under rcu read lock or queue lock to make sure blkg pointer 397 + * is valid. 398 + */ 393 399 void blkiocg_update_dispatch_stats(struct blkio_group *blkg, 394 400 uint64_t bytes, bool direction, bool sync) 395 401 { 396 - struct blkio_group_stats *stats; 402 + struct blkio_group_stats_cpu *stats_cpu; 397 403 unsigned long flags; 398 404 399 - spin_lock_irqsave(&blkg->stats_lock, flags); 400 - stats = &blkg->stats; 401 - stats->sectors += bytes >> 9; 402 - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, 403 - sync); 404 - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, 405 - direction, sync); 406 - spin_unlock_irqrestore(&blkg->stats_lock, flags); 405 + /* 406 + * Disabling interrupts to provide mutual exclusion between two 407 + * writes on same cpu. It probably is not needed for 64bit. Not 408 + * optimizing that case yet. 409 + */ 410 + local_irq_save(flags); 411 + 412 + stats_cpu = this_cpu_ptr(blkg->stats_cpu); 413 + 414 + u64_stats_update_begin(&stats_cpu->syncp); 415 + stats_cpu->sectors += bytes >> 9; 416 + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], 417 + 1, direction, sync); 418 + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], 419 + bytes, direction, sync); 420 + u64_stats_update_end(&stats_cpu->syncp); 421 + local_irq_restore(flags); 407 422 } 408 423 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); 409 424 ··· 441 426 } 442 427 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); 443 428 429 + /* Merged stats are per cpu. 
*/ 444 430 void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, 445 431 bool sync) 446 432 { 433 + struct blkio_group_stats_cpu *stats_cpu; 447 434 unsigned long flags; 448 435 449 - spin_lock_irqsave(&blkg->stats_lock, flags); 450 - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, 451 - sync); 452 - spin_unlock_irqrestore(&blkg->stats_lock, flags); 436 + /* 437 + * Disabling interrupts to provide mutual exclusion between two 438 + * writes on same cpu. It probably is not needed for 64bit. Not 439 + * optimizing that case yet. 440 + */ 441 + local_irq_save(flags); 442 + 443 + stats_cpu = this_cpu_ptr(blkg->stats_cpu); 444 + 445 + u64_stats_update_begin(&stats_cpu->syncp); 446 + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, 447 + direction, sync); 448 + u64_stats_update_end(&stats_cpu->syncp); 449 + local_irq_restore(flags); 453 450 } 454 451 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); 452 + 453 + /* 454 + * This function allocates the per cpu stats for blkio_group. Should be called 455 + * from sleepable context as alloc_per_cpu() requires that. 456 + */ 457 + int blkio_alloc_blkg_stats(struct blkio_group *blkg) 458 + { 459 + /* Allocate memory for per cpu stats */ 460 + blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); 461 + if (!blkg->stats_cpu) 462 + return -ENOMEM; 463 + return 0; 464 + } 465 + EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); 455 466 456 467 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 457 468 struct blkio_group *blkg, void *key, dev_t dev, ··· 549 508 } 550 509 EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 551 510 511 + static void blkio_reset_stats_cpu(struct blkio_group *blkg) 512 + { 513 + struct blkio_group_stats_cpu *stats_cpu; 514 + int i, j, k; 515 + /* 516 + * Note: On 64 bit arch this should not be an issue. This has the 517 + * possibility of returning some inconsistent value on 32bit arch 518 + * as 64bit update on 32bit is non atomic. 
Taking care of this 519 + * corner case makes code very complicated, like sending IPIs to 520 + * cpus, taking care of stats of offline cpus etc. 521 + * 522 + * reset stats is anyway more of a debug feature and this sounds a 523 + * corner case. So I am not complicating the code yet until and 524 + * unless this becomes a real issue. 525 + */ 526 + for_each_possible_cpu(i) { 527 + stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); 528 + stats_cpu->sectors = 0; 529 + for(j = 0; j < BLKIO_STAT_CPU_NR; j++) 530 + for (k = 0; k < BLKIO_STAT_TOTAL; k++) 531 + stats_cpu->stat_arr_cpu[j][k] = 0; 532 + } 533 + } 534 + 552 535 static int 553 536 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 554 537 { ··· 617 552 } 618 553 #endif 619 554 spin_unlock(&blkg->stats_lock); 555 + 556 + /* Reset Per cpu stats which don't take blkg->stats_lock */ 557 + blkio_reset_stats_cpu(blkg); 620 558 } 559 + 621 560 spin_unlock_irq(&blkcg->lock); 622 561 return 0; 623 562 } ··· 667 598 return val; 668 599 } 669 600 601 + 602 + static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, 603 + enum stat_type_cpu type, enum stat_sub_type sub_type) 604 + { 605 + int cpu; 606 + struct blkio_group_stats_cpu *stats_cpu; 607 + u64 val = 0, tval; 608 + 609 + for_each_possible_cpu(cpu) { 610 + unsigned int start; 611 + stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); 612 + 613 + do { 614 + start = u64_stats_fetch_begin(&stats_cpu->syncp); 615 + if (type == BLKIO_STAT_CPU_SECTORS) 616 + tval = stats_cpu->sectors; 617 + else 618 + tval = stats_cpu->stat_arr_cpu[type][sub_type]; 619 + } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); 620 + 621 + val += tval; 622 + } 623 + 624 + return val; 625 + } 626 + 627 + static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, 628 + struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) 629 + { 630 + uint64_t disk_total, val; 631 + char key_str[MAX_KEY_LEN]; 632 + enum stat_sub_type sub_type; 633 + 634 + if (type == 
BLKIO_STAT_CPU_SECTORS) { 635 + val = blkio_read_stat_cpu(blkg, type, 0); 636 + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); 637 + } 638 + 639 + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; 640 + sub_type++) { 641 + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); 642 + val = blkio_read_stat_cpu(blkg, type, sub_type); 643 + cb->fill(cb, key_str, val); 644 + } 645 + 646 + disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + 647 + blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); 648 + 649 + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); 650 + cb->fill(cb, key_str, disk_total); 651 + return disk_total; 652 + } 653 + 670 654 /* This should be called with blkg->stats_lock held */ 671 655 static uint64_t blkio_get_stat(struct blkio_group *blkg, 672 656 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) ··· 731 609 if (type == BLKIO_STAT_TIME) 732 610 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 733 611 blkg->stats.time, cb, dev); 734 - if (type == BLKIO_STAT_SECTORS) 735 - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 736 - blkg->stats.sectors, cb, dev); 737 612 #ifdef CONFIG_DEBUG_BLK_CGROUP 738 613 if (type == BLKIO_STAT_UNACCOUNTED_TIME) 739 614 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, ··· 1194 1075 } 1195 1076 1196 1077 static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, 1197 - struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, 1198 - bool show_total) 1078 + struct cftype *cft, struct cgroup_map_cb *cb, 1079 + enum stat_type type, bool show_total, bool pcpu) 1199 1080 { 1200 1081 struct blkio_group *blkg; 1201 1082 struct hlist_node *n; ··· 1206 1087 if (blkg->dev) { 1207 1088 if (!cftype_blkg_same_policy(cft, blkg)) 1208 1089 continue; 1209 - spin_lock_irq(&blkg->stats_lock); 1210 - cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, 1211 - type); 1212 - spin_unlock_irq(&blkg->stats_lock); 1090 + if (pcpu) 1091 + cgroup_total += 
blkio_get_stat_cpu(blkg, cb, 1092 + blkg->dev, type); 1093 + else { 1094 + spin_lock_irq(&blkg->stats_lock); 1095 + cgroup_total += blkio_get_stat(blkg, cb, 1096 + blkg->dev, type); 1097 + spin_unlock_irq(&blkg->stats_lock); 1098 + } 1213 1099 } 1214 1100 } 1215 1101 if (show_total) ··· 1238 1114 switch(name) { 1239 1115 case BLKIO_PROP_time: 1240 1116 return blkio_read_blkg_stats(blkcg, cft, cb, 1241 - BLKIO_STAT_TIME, 0); 1117 + BLKIO_STAT_TIME, 0, 0); 1242 1118 case BLKIO_PROP_sectors: 1243 1119 return blkio_read_blkg_stats(blkcg, cft, cb, 1244 - BLKIO_STAT_SECTORS, 0); 1120 + BLKIO_STAT_CPU_SECTORS, 0, 1); 1245 1121 case BLKIO_PROP_io_service_bytes: 1246 1122 return blkio_read_blkg_stats(blkcg, cft, cb, 1247 - BLKIO_STAT_SERVICE_BYTES, 1); 1123 + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); 1248 1124 case BLKIO_PROP_io_serviced: 1249 1125 return blkio_read_blkg_stats(blkcg, cft, cb, 1250 - BLKIO_STAT_SERVICED, 1); 1126 + BLKIO_STAT_CPU_SERVICED, 1, 1); 1251 1127 case BLKIO_PROP_io_service_time: 1252 1128 return blkio_read_blkg_stats(blkcg, cft, cb, 1253 - BLKIO_STAT_SERVICE_TIME, 1); 1129 + BLKIO_STAT_SERVICE_TIME, 1, 0); 1254 1130 case BLKIO_PROP_io_wait_time: 1255 1131 return blkio_read_blkg_stats(blkcg, cft, cb, 1256 - BLKIO_STAT_WAIT_TIME, 1); 1132 + BLKIO_STAT_WAIT_TIME, 1, 0); 1257 1133 case BLKIO_PROP_io_merged: 1258 1134 return blkio_read_blkg_stats(blkcg, cft, cb, 1259 - BLKIO_STAT_MERGED, 1); 1135 + BLKIO_STAT_CPU_MERGED, 1, 1); 1260 1136 case BLKIO_PROP_io_queued: 1261 1137 return blkio_read_blkg_stats(blkcg, cft, cb, 1262 - BLKIO_STAT_QUEUED, 1); 1138 + BLKIO_STAT_QUEUED, 1, 0); 1263 1139 #ifdef CONFIG_DEBUG_BLK_CGROUP 1264 1140 case BLKIO_PROP_unaccounted_time: 1265 1141 return blkio_read_blkg_stats(blkcg, cft, cb, 1266 - BLKIO_STAT_UNACCOUNTED_TIME, 0); 1142 + BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); 1267 1143 case BLKIO_PROP_dequeue: 1268 1144 return blkio_read_blkg_stats(blkcg, cft, cb, 1269 - BLKIO_STAT_DEQUEUE, 0); 1145 + BLKIO_STAT_DEQUEUE, 0, 0); 1270 
1146 case BLKIO_PROP_avg_queue_size: 1271 1147 return blkio_read_blkg_stats(blkcg, cft, cb, 1272 - BLKIO_STAT_AVG_QUEUE_SIZE, 0); 1148 + BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); 1273 1149 case BLKIO_PROP_group_wait_time: 1274 1150 return blkio_read_blkg_stats(blkcg, cft, cb, 1275 - BLKIO_STAT_GROUP_WAIT_TIME, 0); 1151 + BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); 1276 1152 case BLKIO_PROP_idle_time: 1277 1153 return blkio_read_blkg_stats(blkcg, cft, cb, 1278 - BLKIO_STAT_IDLE_TIME, 0); 1154 + BLKIO_STAT_IDLE_TIME, 0, 0); 1279 1155 case BLKIO_PROP_empty_time: 1280 1156 return blkio_read_blkg_stats(blkcg, cft, cb, 1281 - BLKIO_STAT_EMPTY_TIME, 0); 1157 + BLKIO_STAT_EMPTY_TIME, 0, 0); 1282 1158 #endif 1283 1159 default: 1284 1160 BUG(); ··· 1288 1164 switch(name){ 1289 1165 case BLKIO_THROTL_io_service_bytes: 1290 1166 return blkio_read_blkg_stats(blkcg, cft, cb, 1291 - BLKIO_STAT_SERVICE_BYTES, 1); 1167 + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); 1292 1168 case BLKIO_THROTL_io_serviced: 1293 1169 return blkio_read_blkg_stats(blkcg, cft, cb, 1294 - BLKIO_STAT_SERVICED, 1); 1170 + BLKIO_STAT_CPU_SERVICED, 1, 1); 1295 1171 default: 1296 1172 BUG(); 1297 1173 }

+29 -11

block/blk-cgroup.h

··· 14 14 */ 15 15 16 16 #include <linux/cgroup.h> 17 + #include <linux/u64_stats_sync.h> 17 18 18 19 enum blkio_policy_id { 19 20 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ ··· 37 36 * request completion for IOs doen by this cgroup. This may not be 38 37 * accurate when NCQ is turned on. */ 39 38 BLKIO_STAT_SERVICE_TIME = 0, 40 - /* Total bytes transferred */ 41 - BLKIO_STAT_SERVICE_BYTES, 42 - /* Total IOs serviced, post merge */ 43 - BLKIO_STAT_SERVICED, 44 39 /* Total time spent waiting in scheduler queue in ns */ 45 40 BLKIO_STAT_WAIT_TIME, 46 - /* Number of IOs merged */ 47 - BLKIO_STAT_MERGED, 48 41 /* Number of IOs queued up */ 49 42 BLKIO_STAT_QUEUED, 50 43 /* All the single valued stats go below this */ 51 44 BLKIO_STAT_TIME, 52 - BLKIO_STAT_SECTORS, 45 + #ifdef CONFIG_DEBUG_BLK_CGROUP 53 46 /* Time not charged to this cgroup */ 54 47 BLKIO_STAT_UNACCOUNTED_TIME, 55 - #ifdef CONFIG_DEBUG_BLK_CGROUP 56 48 BLKIO_STAT_AVG_QUEUE_SIZE, 57 49 BLKIO_STAT_IDLE_TIME, 58 50 BLKIO_STAT_EMPTY_TIME, 59 51 BLKIO_STAT_GROUP_WAIT_TIME, 60 52 BLKIO_STAT_DEQUEUE 61 53 #endif 54 + }; 55 + 56 + /* Per cpu stats */ 57 + enum stat_type_cpu { 58 + BLKIO_STAT_CPU_SECTORS, 59 + /* Total bytes transferred */ 60 + BLKIO_STAT_CPU_SERVICE_BYTES, 61 + /* Total IOs serviced, post merge */ 62 + BLKIO_STAT_CPU_SERVICED, 63 + /* Number of IOs merged */ 64 + BLKIO_STAT_CPU_MERGED, 65 + BLKIO_STAT_CPU_NR 62 66 }; 63 67 64 68 enum stat_sub_type { ··· 122 116 struct blkio_group_stats { 123 117 /* total disk time and nr sectors dispatched by this group */ 124 118 uint64_t time; 125 - uint64_t sectors; 126 - /* Time not charged to this cgroup */ 127 - uint64_t unaccounted_time; 128 119 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; 129 120 #ifdef CONFIG_DEBUG_BLK_CGROUP 121 + /* Time not charged to this cgroup */ 122 + uint64_t unaccounted_time; 123 + 130 124 /* Sum of number of IOs queued across all samples */ 131 125 uint64_t avg_queue_size_sum; 132 126 /* Count 
of samples taken for average */ ··· 151 145 #endif 152 146 }; 153 147 148 + /* Per cpu blkio group stats */ 149 + struct blkio_group_stats_cpu { 150 + uint64_t sectors; 151 + uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; 152 + struct u64_stats_sync syncp; 153 + }; 154 + 154 155 struct blkio_group { 155 156 /* An rcu protected unique identifier for the group */ 156 157 void *key; ··· 173 160 /* Need to serialize the stats in the case of reset/update */ 174 161 spinlock_t stats_lock; 175 162 struct blkio_group_stats stats; 163 + /* Per cpu stats pointer */ 164 + struct blkio_group_stats_cpu __percpu *stats_cpu; 176 165 }; 177 166 178 167 struct blkio_policy_node { ··· 310 295 extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 311 296 struct blkio_group *blkg, void *key, dev_t dev, 312 297 enum blkio_policy_id plid); 298 + extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); 313 299 extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 314 300 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 315 301 void *key); ··· 337 321 static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 338 322 struct blkio_group *blkg, void *key, dev_t dev, 339 323 enum blkio_policy_id plid) {} 324 + 325 + static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } 340 326 341 327 static inline int 342 328 blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }

+4 -28

block/blk-core.c

··· 569 569 570 570 static inline void blk_free_request(struct request_queue *q, struct request *rq) 571 571 { 572 - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); 573 - 574 572 if (rq->cmd_flags & REQ_ELVPRIV) 575 573 elv_put_request(q, rq); 576 574 mempool_free(rq, q->rq.rq_pool); ··· 1108 1110 { 1109 1111 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1110 1112 1111 - /* 1112 - * Debug stuff, kill later 1113 - */ 1114 - if (!rq_mergeable(req)) { 1115 - blk_dump_rq_flags(req, "back"); 1116 - return false; 1117 - } 1118 - 1119 1113 if (!ll_back_merge_fn(q, req, bio)) 1120 1114 return false; 1121 1115 ··· 1122 1132 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1123 1133 1124 1134 drive_stat_acct(req, 0); 1135 + elv_bio_merged(q, req, bio); 1125 1136 return true; 1126 1137 } 1127 1138 ··· 1131 1140 { 1132 1141 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1133 1142 sector_t sector; 1134 - 1135 - /* 1136 - * Debug stuff, kill later 1137 - */ 1138 - if (!rq_mergeable(req)) { 1139 - blk_dump_rq_flags(req, "front"); 1140 - return false; 1141 - } 1142 1143 1143 1144 if (!ll_front_merge_fn(q, req, bio)) 1144 1145 return false; ··· 1156 1173 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1157 1174 1158 1175 drive_stat_acct(req, 0); 1176 + elv_bio_merged(q, req, bio); 1159 1177 return true; 1160 1178 } 1161 1179 ··· 1242 1258 1243 1259 el_ret = elv_merge(q, &req, bio); 1244 1260 if (el_ret == ELEVATOR_BACK_MERGE) { 1245 - BUG_ON(req->cmd_flags & REQ_ON_PLUG); 1246 1261 if (bio_attempt_back_merge(q, req, bio)) { 1247 1262 if (!attempt_back_merge(q, req)) 1248 1263 elv_merged_request(q, req, el_ret); 1249 1264 goto out_unlock; 1250 1265 } 1251 1266 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 1252 - BUG_ON(req->cmd_flags & REQ_ON_PLUG); 1253 1267 if (bio_attempt_front_merge(q, req, bio)) { 1254 1268 if (!attempt_front_merge(q, req)) 1255 1269 elv_merged_request(q, req, el_ret); ··· 1302 1320 if (__rq->q != q) 1303 1321 plug->should_sort = 1; 1304 1322 } 1305 - /* 1306 
- * Debug flag, kill later 1307 - */ 1308 - req->cmd_flags |= REQ_ON_PLUG; 1309 1323 list_add_tail(&req->queuelist, &plug->list); 1310 1324 drive_stat_acct(req, 1); 1311 1325 } else { ··· 1528 1550 goto end_io; 1529 1551 } 1530 1552 1531 - blk_throtl_bio(q, &bio); 1553 + if (blk_throtl_bio(q, &bio)) 1554 + goto end_io; 1532 1555 1533 1556 /* 1534 1557 * If bio = NULL, bio has been throttled and will be submitted ··· 2727 2748 while (!list_empty(&list)) { 2728 2749 rq = list_entry_rq(list.next); 2729 2750 list_del_init(&rq->queuelist); 2730 - BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG)); 2731 2751 BUG_ON(!rq->q); 2732 2752 if (rq->q != q) { 2733 2753 /* ··· 2738 2760 depth = 0; 2739 2761 spin_lock(q->queue_lock); 2740 2762 } 2741 - rq->cmd_flags &= ~REQ_ON_PLUG; 2742 - 2743 2763 /* 2744 2764 * rq is already accounted, so use raw insert 2745 2765 */

+1 -1

block/blk-exec.c

··· 56 56 spin_lock_irq(q->queue_lock); 57 57 __elv_add_request(q, rq, where); 58 58 __blk_run_queue(q); 59 - /* the queue is stopped so it won't be plugged+unplugged */ 59 + /* the queue is stopped so it won't be run */ 60 60 if (rq->cmd_type == REQ_TYPE_PM_RESUME) 61 61 q->request_fn(q); 62 62 spin_unlock_irq(q->queue_lock);

+11 -5

block/blk-flush.c

··· 212 212 } 213 213 214 214 /* 215 - * Moving a request silently to empty queue_head may stall the 216 - * queue. Kick the queue in those cases. This function is called 217 - * from request completion path and calling directly into 218 - * request_fn may confuse the driver. Always use kblockd. 215 + * Kick the queue to avoid stall for two cases: 216 + * 1. Moving a request silently to empty queue_head may stall the 217 + * queue. 218 + * 2. When flush request is running in non-queueable queue, the 219 + * queue is held. Restart the queue after flush request is finished 220 + * to avoid stall. 221 + * This function is called from request completion path and calling 222 + * directly into request_fn may confuse the driver. Always use 223 + * kblockd. 219 224 */ 220 - if (queued) 225 + if (queued || q->flush_queue_delayed) 221 226 blk_run_queue_async(q); 227 + q->flush_queue_delayed = 0; 222 228 } 223 229 224 230 /**

+3

block/blk-ioc.c

··· 96 96 INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); 97 97 INIT_HLIST_HEAD(&ret->cic_list); 98 98 ret->ioc_data = NULL; 99 + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 100 + ret->cgroup_changed = 0; 101 + #endif 99 102 } 100 103 101 104 return ret;

+26 -56

block/blk-lib.c

··· 9 9 10 10 #include "blk.h" 11 11 12 - static void blkdev_discard_end_io(struct bio *bio, int err) 12 + struct bio_batch { 13 + atomic_t done; 14 + unsigned long flags; 15 + struct completion *wait; 16 + }; 17 + 18 + static void bio_batch_end_io(struct bio *bio, int err) 13 19 { 14 - if (err) { 15 - if (err == -EOPNOTSUPP) 16 - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 17 - clear_bit(BIO_UPTODATE, &bio->bi_flags); 18 - } 20 + struct bio_batch *bb = bio->bi_private; 19 21 20 - if (bio->bi_private) 21 - complete(bio->bi_private); 22 - 22 + if (err && (err != -EOPNOTSUPP)) 23 + clear_bit(BIO_UPTODATE, &bb->flags); 24 + if (atomic_dec_and_test(&bb->done)) 25 + complete(bb->wait); 23 26 bio_put(bio); 24 27 } 25 28 ··· 44 41 struct request_queue *q = bdev_get_queue(bdev); 45 42 int type = REQ_WRITE | REQ_DISCARD; 46 43 unsigned int max_discard_sectors; 44 + struct bio_batch bb; 47 45 struct bio *bio; 48 46 int ret = 0; 49 47 ··· 71 67 type |= REQ_SECURE; 72 68 } 73 69 74 - while (nr_sects && !ret) { 70 + atomic_set(&bb.done, 1); 71 + bb.flags = 1 << BIO_UPTODATE; 72 + bb.wait = &wait; 73 + 74 + while (nr_sects) { 75 75 bio = bio_alloc(gfp_mask, 1); 76 76 if (!bio) { 77 77 ret = -ENOMEM; ··· 83 75 } 84 76 85 77 bio->bi_sector = sector; 86 - bio->bi_end_io = blkdev_discard_end_io; 78 + bio->bi_end_io = bio_batch_end_io; 87 79 bio->bi_bdev = bdev; 88 - bio->bi_private = &wait; 80 + bio->bi_private = &bb; 89 81 90 82 if (nr_sects > max_discard_sectors) { 91 83 bio->bi_size = max_discard_sectors << 9; ··· 96 88 nr_sects = 0; 97 89 } 98 90 99 - bio_get(bio); 91 + atomic_inc(&bb.done); 100 92 submit_bio(type, bio); 93 + } 101 94 95 + /* Wait for bios in-flight */ 96 + if (!atomic_dec_and_test(&bb.done)) 102 97 wait_for_completion(&wait); 103 98 104 - if (bio_flagged(bio, BIO_EOPNOTSUPP)) 105 - ret = -EOPNOTSUPP; 106 - else if (!bio_flagged(bio, BIO_UPTODATE)) 107 - ret = -EIO; 108 - bio_put(bio); 109 - } 99 + if (!test_bit(BIO_UPTODATE, &bb.flags)) 100 + ret = -EIO; 110 101 
111 102 return ret; 112 103 } 113 104 EXPORT_SYMBOL(blkdev_issue_discard); 114 - 115 - struct bio_batch 116 - { 117 - atomic_t done; 118 - unsigned long flags; 119 - struct completion *wait; 120 - }; 121 - 122 - static void bio_batch_end_io(struct bio *bio, int err) 123 - { 124 - struct bio_batch *bb = bio->bi_private; 125 - 126 - if (err) { 127 - if (err == -EOPNOTSUPP) 128 - set_bit(BIO_EOPNOTSUPP, &bb->flags); 129 - else 130 - clear_bit(BIO_UPTODATE, &bb->flags); 131 - } 132 - if (bb) 133 - if (atomic_dec_and_test(&bb->done)) 134 - complete(bb->wait); 135 - bio_put(bio); 136 - } 137 105 138 106 /** 139 107 * blkdev_issue_zeroout - generate number of zero filed write bios ··· 135 151 bb.flags = 1 << BIO_UPTODATE; 136 152 bb.wait = &wait; 137 153 138 - submit: 139 154 ret = 0; 140 155 while (nr_sects != 0) { 141 156 bio = bio_alloc(gfp_mask, ··· 151 168 152 169 while (nr_sects != 0) { 153 170 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); 154 - if (sz == 0) 155 - /* bio has maximum size possible */ 156 - break; 157 171 ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); 158 172 nr_sects -= ret >> 9; 159 173 sector += ret >> 9; ··· 170 190 /* One of bios in the batch was completed with error.*/ 171 191 ret = -EIO; 172 192 173 - if (ret) 174 - goto out; 175 - 176 - if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { 177 - ret = -EOPNOTSUPP; 178 - goto out; 179 - } 180 - if (nr_sects != 0) 181 - goto submit; 182 - out: 183 193 return ret; 184 194 } 185 195 EXPORT_SYMBOL(blkdev_issue_zeroout);

+8 -1

block/blk-settings.c

··· 120 120 lim->discard_granularity = 0; 121 121 lim->discard_alignment = 0; 122 122 lim->discard_misaligned = 0; 123 - lim->discard_zeroes_data = -1; 123 + lim->discard_zeroes_data = 1; 124 124 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 125 125 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 126 126 lim->alignment_offset = 0; ··· 166 166 167 167 blk_set_default_limits(&q->limits); 168 168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); 169 + q->limits.discard_zeroes_data = 0; 169 170 170 171 /* 171 172 * by default assume old behaviour and bounce for any highmem page ··· 790 789 q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); 791 790 } 792 791 EXPORT_SYMBOL_GPL(blk_queue_flush); 792 + 793 + void blk_queue_flush_queueable(struct request_queue *q, bool queueable) 794 + { 795 + q->flush_not_queueable = !queueable; 796 + } 797 + EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); 793 798 794 799 static int __init blk_settings_init(void) 795 800 {

+2 -1

block/blk-sysfs.c

··· 152 152 153 153 static ssize_t queue_discard_max_show(struct request_queue *q, char *page) 154 154 { 155 - return queue_var_show(q->limits.max_discard_sectors << 9, page); 155 + return sprintf(page, "%llu\n", 156 + (unsigned long long)q->limits.max_discard_sectors << 9); 156 157 } 157 158 158 159 static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)

+240 -79

block/blk-throttle.c

··· 78 78 79 79 /* Some throttle limits got updated for the group */ 80 80 int limits_changed; 81 + 82 + struct rcu_head rcu_head; 81 83 }; 82 84 83 85 struct throtl_data ··· 90 88 /* service tree for active throtl groups */ 91 89 struct throtl_rb_root tg_service_tree; 92 90 93 - struct throtl_grp root_tg; 91 + struct throtl_grp *root_tg; 94 92 struct request_queue *queue; 95 93 96 94 /* Total Number of queued bios on READ and WRITE lists */ ··· 153 151 return tg; 154 152 } 155 153 154 + static void throtl_free_tg(struct rcu_head *head) 155 + { 156 + struct throtl_grp *tg; 157 + 158 + tg = container_of(head, struct throtl_grp, rcu_head); 159 + free_percpu(tg->blkg.stats_cpu); 160 + kfree(tg); 161 + } 162 + 156 163 static void throtl_put_tg(struct throtl_grp *tg) 157 164 { 158 165 BUG_ON(atomic_read(&tg->ref) <= 0); 159 166 if (!atomic_dec_and_test(&tg->ref)) 160 167 return; 161 - kfree(tg); 168 + 169 + /* 170 + * A group is freed in rcu manner. But having an rcu lock does not 171 + * mean that one can access all the fields of blkg and assume these 172 + * are valid. For example, don't try to follow throtl_data and 173 + * request queue links. 174 + * 175 + * Having a reference to blkg under an rcu allows acess to only 176 + * values local to groups like group stats and group rate limits 177 + */ 178 + call_rcu(&tg->rcu_head, throtl_free_tg); 162 179 } 163 180 164 - static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, 165 - struct blkio_cgroup *blkcg) 181 + static void throtl_init_group(struct throtl_grp *tg) 166 182 { 167 - struct throtl_grp *tg = NULL; 168 - void *key = td; 169 - struct backing_dev_info *bdi = &td->queue->backing_dev_info; 170 - unsigned int major, minor; 171 - 172 - /* 173 - * TODO: Speed up blkiocg_lookup_group() by maintaining a radix 174 - * tree of blkg (instead of traversing through hash list all 175 - * the time. 176 - */ 177 - 178 - /* 179 - * This is the common case when there are no blkio cgroups. 
180 - * Avoid lookup in this case 181 - */ 182 - if (blkcg == &blkio_root_cgroup) 183 - tg = &td->root_tg; 184 - else 185 - tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); 186 - 187 - /* Fill in device details for root group */ 188 - if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 189 - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 190 - tg->blkg.dev = MKDEV(major, minor); 191 - goto done; 192 - } 193 - 194 - if (tg) 195 - goto done; 196 - 197 - tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); 198 - if (!tg) 199 - goto done; 200 - 201 183 INIT_HLIST_NODE(&tg->tg_node); 202 184 RB_CLEAR_NODE(&tg->rb_node); 203 185 bio_list_init(&tg->bio_lists[0]); 204 186 bio_list_init(&tg->bio_lists[1]); 205 - td->limits_changed = false; 187 + tg->limits_changed = false; 188 + 189 + /* Practically unlimited BW */ 190 + tg->bps[0] = tg->bps[1] = -1; 191 + tg->iops[0] = tg->iops[1] = -1; 206 192 207 193 /* 208 194 * Take the initial reference that will be released on destroy ··· 199 209 * exit or cgroup deletion path depending on who is exiting first. 
200 210 */ 201 211 atomic_set(&tg->ref, 1); 212 + } 213 + 214 + /* Should be called with rcu read lock held (needed for blkcg) */ 215 + static void 216 + throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) 217 + { 218 + hlist_add_head(&tg->tg_node, &td->tg_list); 219 + td->nr_undestroyed_grps++; 220 + } 221 + 222 + static void 223 + __throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) 224 + { 225 + struct backing_dev_info *bdi = &td->queue->backing_dev_info; 226 + unsigned int major, minor; 227 + 228 + if (!tg || tg->blkg.dev) 229 + return; 230 + 231 + /* 232 + * Fill in device details for a group which might not have been 233 + * filled at group creation time as queue was being instantiated 234 + * and driver had not attached a device yet 235 + */ 236 + if (bdi->dev && dev_name(bdi->dev)) { 237 + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 238 + tg->blkg.dev = MKDEV(major, minor); 239 + } 240 + } 241 + 242 + /* 243 + * Should be called without queue lock held. Here queue lock will be 244 + * taken rarely. 
It will be taken only once during life time of a group 245 + * if need be 246 + */ 247 + static void 248 + throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) 249 + { 250 + if (!tg || tg->blkg.dev) 251 + return; 252 + 253 + spin_lock_irq(td->queue->queue_lock); 254 + __throtl_tg_fill_dev_details(td, tg); 255 + spin_unlock_irq(td->queue->queue_lock); 256 + } 257 + 258 + static void throtl_init_add_tg_lists(struct throtl_data *td, 259 + struct throtl_grp *tg, struct blkio_cgroup *blkcg) 260 + { 261 + __throtl_tg_fill_dev_details(td, tg); 202 262 203 263 /* Add group onto cgroup list */ 204 - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 205 264 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, 206 - MKDEV(major, minor), BLKIO_POLICY_THROTL); 265 + tg->blkg.dev, BLKIO_POLICY_THROTL); 207 266 208 267 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); 209 268 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); 210 269 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); 211 270 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); 212 271 213 - hlist_add_head(&tg->tg_node, &td->tg_list); 214 - td->nr_undestroyed_grps++; 215 - done: 272 + throtl_add_group_to_td_list(td, tg); 273 + } 274 + 275 + /* Should be called without queue lock and outside of rcu period */ 276 + static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) 277 + { 278 + struct throtl_grp *tg = NULL; 279 + int ret; 280 + 281 + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); 282 + if (!tg) 283 + return NULL; 284 + 285 + ret = blkio_alloc_blkg_stats(&tg->blkg); 286 + 287 + if (ret) { 288 + kfree(tg); 289 + return NULL; 290 + } 291 + 292 + throtl_init_group(tg); 216 293 return tg; 217 294 } 218 295 219 - static struct throtl_grp * throtl_get_tg(struct throtl_data *td) 296 + static struct 297 + throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) 220 298 { 221 299 struct throtl_grp *tg = NULL; 300 + void 
*key = td; 301 + 302 + /* 303 + * This is the common case when there are no blkio cgroups. 304 + * Avoid lookup in this case 305 + */ 306 + if (blkcg == &blkio_root_cgroup) 307 + tg = td->root_tg; 308 + else 309 + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); 310 + 311 + __throtl_tg_fill_dev_details(td, tg); 312 + return tg; 313 + } 314 + 315 + /* 316 + * This function returns with queue lock unlocked in case of error, like 317 + * request queue is no more 318 + */ 319 + static struct throtl_grp * throtl_get_tg(struct throtl_data *td) 320 + { 321 + struct throtl_grp *tg = NULL, *__tg = NULL; 222 322 struct blkio_cgroup *blkcg; 323 + struct request_queue *q = td->queue; 223 324 224 325 rcu_read_lock(); 225 326 blkcg = task_blkio_cgroup(current); 226 - tg = throtl_find_alloc_tg(td, blkcg); 227 - if (!tg) 228 - tg = &td->root_tg; 327 + tg = throtl_find_tg(td, blkcg); 328 + if (tg) { 329 + rcu_read_unlock(); 330 + return tg; 331 + } 332 + 333 + /* 334 + * Need to allocate a group. Allocation of group also needs allocation 335 + * of per cpu stats which in-turn takes a mutex() and can block. Hence 336 + * we need to drop rcu lock and queue_lock before we call alloc 337 + * 338 + * Take the request queue reference to make sure queue does not 339 + * go away once we return from allocation. 340 + */ 341 + blk_get_queue(q); 342 + rcu_read_unlock(); 343 + spin_unlock_irq(q->queue_lock); 344 + 345 + tg = throtl_alloc_tg(td); 346 + /* 347 + * We might have slept in group allocation. Make sure queue is not 348 + * dead 349 + */ 350 + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 351 + blk_put_queue(q); 352 + if (tg) 353 + kfree(tg); 354 + 355 + return ERR_PTR(-ENODEV); 356 + } 357 + blk_put_queue(q); 358 + 359 + /* Group allocated and queue is still alive. take the lock */ 360 + spin_lock_irq(q->queue_lock); 361 + 362 + /* 363 + * Initialize the new group. After sleeping, read the blkcg again. 
364 + */ 365 + rcu_read_lock(); 366 + blkcg = task_blkio_cgroup(current); 367 + 368 + /* 369 + * If some other thread already allocated the group while we were 370 + * not holding queue lock, free up the group 371 + */ 372 + __tg = throtl_find_tg(td, blkcg); 373 + 374 + if (__tg) { 375 + kfree(tg); 376 + rcu_read_unlock(); 377 + return __tg; 378 + } 379 + 380 + /* Group allocation failed. Account the IO to root group */ 381 + if (!tg) { 382 + tg = td->root_tg; 383 + return tg; 384 + } 385 + 386 + throtl_init_add_tg_lists(td, tg, blkcg); 229 387 rcu_read_unlock(); 230 388 return tg; 231 389 } ··· 682 544 return 0; 683 545 } 684 546 547 + static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { 548 + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) 549 + return 1; 550 + return 0; 551 + } 552 + 685 553 /* 686 554 * Returns whether one can dispatch a bio or not. Also returns approx number 687 555 * of jiffies to wait before this bio is with-in IO rate and can be dispatched ··· 752 608 tg->bytes_disp[rw] += bio->bi_size; 753 609 tg->io_disp[rw]++; 754 610 755 - /* 756 - * TODO: This will take blkg->stats_lock. Figure out a way 757 - * to avoid this cost. 758 - */ 759 611 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); 760 612 } 761 613 ··· 1129 989 struct throtl_grp *tg; 1130 990 struct bio *bio = *biop; 1131 991 bool rw = bio_data_dir(bio), update_disptime = true; 992 + struct blkio_cgroup *blkcg; 1132 993 1133 994 if (bio->bi_rw & REQ_THROTTLED) { 1134 995 bio->bi_rw &= ~REQ_THROTTLED; 1135 996 return 0; 1136 997 } 1137 998 999 + /* 1000 + * A throtl_grp pointer retrieved under rcu can be used to access 1001 + * basic fields like stats and io rates. If a group has no rules, 1002 + * just update the dispatch stats in lockless manner and return. 
1003 + */ 1004 + 1005 + rcu_read_lock(); 1006 + blkcg = task_blkio_cgroup(current); 1007 + tg = throtl_find_tg(td, blkcg); 1008 + if (tg) { 1009 + throtl_tg_fill_dev_details(td, tg); 1010 + 1011 + if (tg_no_rule_group(tg, rw)) { 1012 + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, 1013 + rw, bio->bi_rw & REQ_SYNC); 1014 + rcu_read_unlock(); 1015 + return 0; 1016 + } 1017 + } 1018 + rcu_read_unlock(); 1019 + 1020 + /* 1021 + * Either group has not been allocated yet or it is not an unlimited 1022 + * IO group 1023 + */ 1024 + 1138 1025 spin_lock_irq(q->queue_lock); 1139 1026 tg = throtl_get_tg(td); 1027 + 1028 + if (IS_ERR(tg)) { 1029 + if (PTR_ERR(tg) == -ENODEV) { 1030 + /* 1031 + * Queue is gone. No queue lock held here. 1032 + */ 1033 + return -ENODEV; 1034 + } 1035 + } 1140 1036 1141 1037 if (tg->nr_queued[rw]) { 1142 1038 /* ··· 1236 1060 INIT_HLIST_HEAD(&td->tg_list); 1237 1061 td->tg_service_tree = THROTL_RB_ROOT; 1238 1062 td->limits_changed = false; 1239 - 1240 - /* Init root group */ 1241 - tg = &td->root_tg; 1242 - INIT_HLIST_NODE(&tg->tg_node); 1243 - RB_CLEAR_NODE(&tg->rb_node); 1244 - bio_list_init(&tg->bio_lists[0]); 1245 - bio_list_init(&tg->bio_lists[1]); 1246 - 1247 - /* Practically unlimited BW */ 1248 - tg->bps[0] = tg->bps[1] = -1; 1249 - tg->iops[0] = tg->iops[1] = -1; 1250 - td->limits_changed = false; 1251 - 1252 - /* 1253 - * Set root group reference to 2. One reference will be dropped when 1254 - * all groups on tg_list are being deleted during queue exit. Other 1255 - * reference will remain there as we don't want to delete this group 1256 - * as it is statically allocated and gets destroyed when throtl_data 1257 - * goes away. 1258 - */ 1259 - atomic_set(&tg->ref, 2); 1260 - hlist_add_head(&tg->tg_node, &td->tg_list); 1261 - td->nr_undestroyed_grps++; 1262 - 1263 1063 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); 1264 1064 1065 + /* alloc and Init root group. 
*/ 1066 + td->queue = q; 1067 + tg = throtl_alloc_tg(td); 1068 + 1069 + if (!tg) { 1070 + kfree(td); 1071 + return -ENOMEM; 1072 + } 1073 + 1074 + td->root_tg = tg; 1075 + 1265 1076 rcu_read_lock(); 1266 - blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, 1267 - 0, BLKIO_POLICY_THROTL); 1077 + throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); 1268 1078 rcu_read_unlock(); 1269 1079 1270 1080 /* Attach throtl data to request queue */ 1271 - td->queue = q; 1272 1081 q->td = td; 1273 1082 return 0; 1274 1083 }

+22 -1

block/blk.h

··· 62 62 return rq; 63 63 } 64 64 65 - if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 65 + /* 66 + * Flush request is running and flush request isn't queueable 67 + * in the drive, we can hold the queue till flush request is 68 + * finished. Even if we don't do this, driver can't dispatch next 69 + * requests and will requeue them. And this can improve 70 + * throughput too. For example, we have request flush1, write1, 71 + * flush2. flush1 is dispatched, then queue is held, write1 72 + * isn't inserted to queue. After flush1 is finished, flush2 73 + * will be dispatched. Since disk cache is already clean, 74 + * flush2 will be finished very soon, so looks like flush2 is 75 + * folded to flush1. 76 + * Since the queue is held, a flag is set to indicate the queue 77 + * should be restarted later. Please see flush_end_io() for 78 + * details. 79 + */ 80 + if (q->flush_pending_idx != q->flush_running_idx && 81 + !queue_flush_queueable(q)) { 82 + q->flush_queue_delayed = 1; 83 + return NULL; 84 + } 85 + if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || 86 + !q->elevator->ops->elevator_dispatch_fn(q, 0)) 66 87 return NULL; 67 88 } 68 89 }

+169 -63

block/cfq-iosched.c

··· 300 300 301 301 /* List of cfq groups being managed on this device*/ 302 302 struct hlist_head cfqg_list; 303 - struct rcu_head rcu; 303 + 304 + /* Number of groups which are on blkcg->blkg_list */ 305 + unsigned int nr_blkcg_linked_grps; 304 306 }; 305 307 306 308 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); ··· 667 665 if (rq2 == NULL) 668 666 return rq1; 669 667 670 - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 671 - return rq1; 672 - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 673 - return rq2; 674 - if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) 675 - return rq1; 676 - else if ((rq2->cmd_flags & REQ_META) && 677 - !(rq1->cmd_flags & REQ_META)) 678 - return rq2; 668 + if (rq_is_sync(rq1) != rq_is_sync(rq2)) 669 + return rq_is_sync(rq1) ? rq1 : rq2; 670 + 671 + if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) 672 + return rq1->cmd_flags & REQ_META ? rq1 : rq2; 679 673 680 674 s1 = blk_rq_pos(rq1); 681 675 s2 = blk_rq_pos(rq2); ··· 1012 1014 cfqg->needs_update = true; 1013 1015 } 1014 1016 1015 - static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, 1016 - struct blkio_cgroup *blkcg, int create) 1017 + static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, 1018 + struct cfq_group *cfqg, struct blkio_cgroup *blkcg) 1017 1019 { 1018 - struct cfq_group *cfqg = NULL; 1019 - void *key = cfqd; 1020 - int i, j; 1021 - struct cfq_rb_root *st; 1022 1020 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 1023 1021 unsigned int major, minor; 1024 1022 1025 - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 1026 - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 1023 + /* 1024 + * Add group onto cgroup list. It might happen that bdi->dev is 1025 + * not initialized yet. Initialize this new group without major 1026 + * and minor info and this info will be filled in once a new thread 1027 + * comes for IO. 
1028 + */ 1029 + if (bdi->dev) { 1027 1030 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1028 - cfqg->blkg.dev = MKDEV(major, minor); 1029 - goto done; 1030 - } 1031 - if (cfqg || !create) 1032 - goto done; 1031 + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, 1032 + (void *)cfqd, MKDEV(major, minor)); 1033 + } else 1034 + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, 1035 + (void *)cfqd, 0); 1036 + 1037 + cfqd->nr_blkcg_linked_grps++; 1038 + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); 1039 + 1040 + /* Add group on cfqd list */ 1041 + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 1042 + } 1043 + 1044 + /* 1045 + * Should be called from sleepable context. No request queue lock as per 1046 + * cpu stats are allocated dynamically and alloc_percpu needs to be called 1047 + * from sleepable context. 1048 + */ 1049 + static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) 1050 + { 1051 + struct cfq_group *cfqg = NULL; 1052 + int i, j, ret; 1053 + struct cfq_rb_root *st; 1033 1054 1034 1055 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 1035 1056 if (!cfqg) 1036 - goto done; 1057 + return NULL; 1037 1058 1038 1059 for_each_cfqg_st(cfqg, i, j, st) 1039 1060 *st = CFQ_RB_ROOT; ··· 1066 1049 */ 1067 1050 cfqg->ref = 1; 1068 1051 1052 + ret = blkio_alloc_blkg_stats(&cfqg->blkg); 1053 + if (ret) { 1054 + kfree(cfqg); 1055 + return NULL; 1056 + } 1057 + 1058 + return cfqg; 1059 + } 1060 + 1061 + static struct cfq_group * 1062 + cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) 1063 + { 1064 + struct cfq_group *cfqg = NULL; 1065 + void *key = cfqd; 1066 + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 1067 + unsigned int major, minor; 1068 + 1069 1069 /* 1070 - * Add group onto cgroup list. It might happen that bdi->dev is 1071 - * not initialized yet. Initialize this new group without major 1072 - * and minor info and this info will be filled in once a new thread 1073 - * comes for IO. 
See code above. 1070 + * This is the common case when there are no blkio cgroups. 1071 + * Avoid lookup in this case 1074 1072 */ 1075 - if (bdi->dev) { 1073 + if (blkcg == &blkio_root_cgroup) 1074 + cfqg = &cfqd->root_group; 1075 + else 1076 + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 1077 + 1078 + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 1076 1079 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1077 - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 1078 - MKDEV(major, minor)); 1079 - } else 1080 - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 1081 - 0); 1080 + cfqg->blkg.dev = MKDEV(major, minor); 1081 + } 1082 1082 1083 - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); 1084 - 1085 - /* Add group on cfqd list */ 1086 - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 1087 - 1088 - done: 1089 1083 return cfqg; 1090 1084 } 1091 1085 1092 1086 /* 1093 - * Search for the cfq group current task belongs to. If create = 1, then also 1094 - * create the cfq group if it does not exist. request_queue lock must be held. 1087 + * Search for the cfq group current task belongs to. request_queue lock must 1088 + * be held. 1095 1089 */ 1096 - static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1090 + static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) 1097 1091 { 1098 1092 struct blkio_cgroup *blkcg; 1099 - struct cfq_group *cfqg = NULL; 1093 + struct cfq_group *cfqg = NULL, *__cfqg = NULL; 1094 + struct request_queue *q = cfqd->queue; 1100 1095 1101 1096 rcu_read_lock(); 1102 1097 blkcg = task_blkio_cgroup(current); 1103 - cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create); 1104 - if (!cfqg && create) 1098 + cfqg = cfq_find_cfqg(cfqd, blkcg); 1099 + if (cfqg) { 1100 + rcu_read_unlock(); 1101 + return cfqg; 1102 + } 1103 + 1104 + /* 1105 + * Need to allocate a group. 
Allocation of group also needs allocation 1106 + * of per cpu stats which in-turn takes a mutex() and can block. Hence 1107 + * we need to drop rcu lock and queue_lock before we call alloc. 1108 + * 1109 + * Not taking any queue reference here and assuming that queue is 1110 + * around by the time we return. CFQ queue allocation code does 1111 + * the same. It might be racy though. 1112 + */ 1113 + 1114 + rcu_read_unlock(); 1115 + spin_unlock_irq(q->queue_lock); 1116 + 1117 + cfqg = cfq_alloc_cfqg(cfqd); 1118 + 1119 + spin_lock_irq(q->queue_lock); 1120 + 1121 + rcu_read_lock(); 1122 + blkcg = task_blkio_cgroup(current); 1123 + 1124 + /* 1125 + * If some other thread already allocated the group while we were 1126 + * not holding queue lock, free up the group 1127 + */ 1128 + __cfqg = cfq_find_cfqg(cfqd, blkcg); 1129 + 1130 + if (__cfqg) { 1131 + kfree(cfqg); 1132 + rcu_read_unlock(); 1133 + return __cfqg; 1134 + } 1135 + 1136 + if (!cfqg) 1105 1137 cfqg = &cfqd->root_group; 1138 + 1139 + cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); 1106 1140 rcu_read_unlock(); 1107 1141 return cfqg; 1108 1142 } ··· 1186 1118 return; 1187 1119 for_each_cfqg_st(cfqg, i, j, st) 1188 1120 BUG_ON(!RB_EMPTY_ROOT(&st->rb)); 1121 + free_percpu(cfqg->blkg.stats_cpu); 1189 1122 kfree(cfqg); 1190 1123 } 1191 1124 ··· 1245 1176 } 1246 1177 1247 1178 #else /* GROUP_IOSCHED */ 1248 - static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1179 + static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) 1249 1180 { 1250 1181 return &cfqd->root_group; 1251 1182 } ··· 1279 1210 struct cfq_rb_root *service_tree; 1280 1211 int left; 1281 1212 int new_cfqq = 1; 1282 - int group_changed = 0; 1283 1213 1284 1214 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1285 1215 cfqq_type(cfqq)); ··· 1349 1281 rb_link_node(&cfqq->rb_node, parent, p); 1350 1282 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 1351 1283 service_tree->count++; 1352 - if ((add_front || !new_cfqq) && 
!group_changed) 1284 + if (add_front || !new_cfqq) 1353 1285 return; 1354 1286 cfq_group_notify_queue_add(cfqd, cfqq->cfqg); 1355 1287 } ··· 2097 2029 2098 2030 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 2099 2031 2100 - return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 2032 + return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); 2101 2033 } 2102 2034 2103 2035 /* ··· 2979 2911 struct cfq_group *cfqg; 2980 2912 2981 2913 retry: 2982 - cfqg = cfq_get_cfqg(cfqd, 1); 2914 + cfqg = cfq_get_cfqg(cfqd); 2983 2915 cic = cfq_cic_lookup(cfqd, ioc); 2984 2916 /* cic always exists here */ 2985 2917 cfqq = cic_to_cfqq(cic, is_sync); ··· 3883 3815 cfq_put_queue(cfqd->async_idle_cfqq); 3884 3816 } 3885 3817 3886 - static void cfq_cfqd_free(struct rcu_head *head) 3887 - { 3888 - kfree(container_of(head, struct cfq_data, rcu)); 3889 - } 3890 - 3891 3818 static void cfq_exit_queue(struct elevator_queue *e) 3892 3819 { 3893 3820 struct cfq_data *cfqd = e->elevator_data; 3894 3821 struct request_queue *q = cfqd->queue; 3822 + bool wait = false; 3895 3823 3896 3824 cfq_shutdown_timer_wq(cfqd); 3897 3825 ··· 3906 3842 3907 3843 cfq_put_async_queues(cfqd); 3908 3844 cfq_release_cfq_groups(cfqd); 3909 - cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3845 + 3846 + /* 3847 + * If there are groups which we could not unlink from blkcg list, 3848 + * wait for a rcu period for them to be freed. 3849 + */ 3850 + if (cfqd->nr_blkcg_linked_grps) 3851 + wait = true; 3910 3852 3911 3853 spin_unlock_irq(q->queue_lock); 3912 3854 ··· 3922 3852 ida_remove(&cic_index_ida, cfqd->cic_index); 3923 3853 spin_unlock(&cic_index_lock); 3924 3854 3925 - /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3926 - call_rcu(&cfqd->rcu, cfq_cfqd_free); 3855 + /* 3856 + * Wait for cfqg->blkg->key accessors to exit their grace periods. 3857 + * Do this wait only if there are other unlinked groups out 3858 + * there. 
This can happen if cgroup deletion path claimed the 3859 + * responsibility of cleaning up a group before queue cleanup code 3860 + * gets to the group. 3861 + * 3862 + * Do not call synchronize_rcu() unconditionally as there are drivers 3863 + * which create/delete request queue hundreds of times during scan/boot 3864 + * and synchronize_rcu() can take significant time and slow down boot. 3865 + */ 3866 + if (wait) 3867 + synchronize_rcu(); 3868 + 3869 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 3870 + /* Free up per cpu stats for root group */ 3871 + free_percpu(cfqd->root_group.blkg.stats_cpu); 3872 + #endif 3873 + kfree(cfqd); 3927 3874 } 3928 3875 3929 3876 static int cfq_alloc_cic_index(void) ··· 3973 3886 return NULL; 3974 3887 3975 3888 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3976 - if (!cfqd) 3889 + if (!cfqd) { 3890 + spin_lock(&cic_index_lock); 3891 + ida_remove(&cic_index_ida, i); 3892 + spin_unlock(&cic_index_lock); 3977 3893 return NULL; 3894 + } 3978 3895 3979 3896 /* 3980 3897 * Don't need take queue_lock in the routine, since we are ··· 4000 3909 4001 3910 #ifdef CONFIG_CFQ_GROUP_IOSCHED 4002 3911 /* 4003 - * Take a reference to root group which we never drop. This is just 4004 - * to make sure that cfq_put_cfqg() does not try to kfree root group 3912 + * Set root group reference to 2. One reference will be dropped when 3913 + * all groups on cfqd->cfqg_list are being deleted during queue exit. 3914 + * Other reference will remain there as we don't want to delete this 3915 + * group as it is statically allocated and gets destroyed when 3916 + * cfq_data goes away. 
4005 3917 */ 4006 - cfqg->ref = 1; 3918 + cfqg->ref = 2; 3919 + 3920 + if (blkio_alloc_blkg_stats(&cfqg->blkg)) { 3921 + kfree(cfqg); 3922 + kfree(cfqd); 3923 + return NULL; 3924 + } 3925 + 4007 3926 rcu_read_lock(); 3927 + 4008 3928 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 4009 3929 (void *)cfqd, 0); 4010 3930 rcu_read_unlock(); 3931 + cfqd->nr_blkcg_linked_grps++; 3932 + 3933 + /* Add group on cfqd->cfqg_list */ 3934 + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 4011 3935 #endif 4012 3936 /* 4013 3937 * Not strictly needed (since RB_ROOT just clears the node and we

+1 -10

block/elevator.c

··· 155 155 156 156 e = elevator_find(name); 157 157 if (!e) { 158 - char elv[ELV_NAME_MAX + strlen("-iosched")]; 159 - 160 158 spin_unlock(&elv_list_lock); 161 - 162 - snprintf(elv, sizeof(elv), "%s-iosched", name); 163 - 164 - request_module("%s", elv); 159 + request_module("%s-iosched", name); 165 160 spin_lock(&elv_list_lock); 166 161 e = elevator_find(name); 167 162 } ··· 416 421 struct list_head *entry; 417 422 int stop_flags; 418 423 419 - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); 420 - 421 424 if (q->last_merge == rq) 422 425 q->last_merge = NULL; 423 426 ··· 653 660 trace_block_rq_insert(q, rq); 654 661 655 662 rq->q = q; 656 - 657 - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); 658 663 659 664 if (rq->cmd_flags & REQ_SOFTBARRIER) { 660 665 /* barriers are scheduling boundary, update end_sector */

+7 -6

drivers/ata/libata-scsi.c

··· 1089 1089 static int ata_scsi_dev_config(struct scsi_device *sdev, 1090 1090 struct ata_device *dev) 1091 1091 { 1092 + struct request_queue *q = sdev->request_queue; 1093 + 1092 1094 if (!ata_id_has_unload(dev->id)) 1093 1095 dev->flags |= ATA_DFLAG_NO_UNLOAD; 1094 1096 1095 1097 /* configure max sectors */ 1096 - blk_queue_max_hw_sectors(sdev->request_queue, dev->max_sectors); 1098 + blk_queue_max_hw_sectors(q, dev->max_sectors); 1097 1099 1098 1100 if (dev->class == ATA_DEV_ATAPI) { 1099 - struct request_queue *q = sdev->request_queue; 1100 1101 void *buf; 1101 1102 1102 1103 sdev->sector_size = ATA_SECT_SIZE; 1103 1104 1104 1105 /* set DMA padding */ 1105 - blk_queue_update_dma_pad(sdev->request_queue, 1106 - ATA_DMA_PAD_SZ - 1); 1106 + blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1); 1107 1107 1108 1108 /* configure draining */ 1109 1109 buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); ··· 1131 1131 "sector_size=%u > PAGE_SIZE, PIO may malfunction\n", 1132 1132 sdev->sector_size); 1133 1133 1134 - blk_queue_update_dma_alignment(sdev->request_queue, 1135 - sdev->sector_size - 1); 1134 + blk_queue_update_dma_alignment(q, sdev->sector_size - 1); 1136 1135 1137 1136 if (dev->flags & ATA_DFLAG_AN) 1138 1137 set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events); ··· 1143 1144 depth = min(ATA_MAX_QUEUE - 1, depth); 1144 1145 scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth); 1145 1146 } 1147 + 1148 + blk_queue_flush_queueable(q, false); 1146 1149 1147 1150 dev->sdev = sdev; 1148 1151 return 0;

+2

drivers/block/paride/pcd.c

··· 320 320 disk->first_minor = unit; 321 321 strcpy(disk->disk_name, cd->name); /* umm... */ 322 322 disk->fops = &pcd_bdops; 323 + disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; 324 + disk->events = DISK_EVENT_MEDIA_CHANGE; 323 325 } 324 326 } 325 327

+3 -1

drivers/cdrom/viocd.c

··· 625 625 blk_queue_max_hw_sectors(q, 4096 / 512); 626 626 gendisk->queue = q; 627 627 gendisk->fops = &viocd_fops; 628 - gendisk->flags = GENHD_FL_CD|GENHD_FL_REMOVABLE; 628 + gendisk->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE | 629 + GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; 630 + gendisk->events = DISK_EVENT_MEDIA_CHANGE; 629 631 set_capacity(gendisk, 0); 630 632 gendisk->private_data = d; 631 633 d->viocd_disk = gendisk;

+2 -1

drivers/ide/ide-cd.c

··· 1781 1781 1782 1782 ide_cd_read_toc(drive, &sense); 1783 1783 g->fops = &idecd_ops; 1784 - g->flags |= GENHD_FL_REMOVABLE; 1784 + g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; 1785 + g->events = DISK_EVENT_MEDIA_CHANGE; 1785 1786 add_disk(g); 1786 1787 return 0; 1787 1788

+1 -1

drivers/scsi/sr.c

··· 636 636 disk->first_minor = minor; 637 637 sprintf(disk->disk_name, "sr%d", minor); 638 638 disk->fops = &sr_bdops; 639 - disk->flags = GENHD_FL_CD; 639 + disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; 640 640 disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; 641 641 642 642 blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT);

+10 -7

fs/block_dev.c

··· 1238 1238 res = __blkdev_get(bdev, mode, 0); 1239 1239 1240 1240 if (whole) { 1241 + struct gendisk *disk = whole->bd_disk; 1242 + 1241 1243 /* finish claiming */ 1242 1244 mutex_lock(&bdev->bd_mutex); 1243 1245 spin_lock(&bdev_lock); ··· 1266 1264 spin_unlock(&bdev_lock); 1267 1265 1268 1266 /* 1269 - * Block event polling for write claims. Any write 1270 - * holder makes the write_holder state stick until all 1271 - * are released. This is good enough and tracking 1272 - * individual writeable reference is too fragile given 1273 - * the way @mode is used in blkdev_get/put(). 1267 + * Block event polling for write claims if requested. Any 1268 + * write holder makes the write_holder state stick until 1269 + * all are released. This is good enough and tracking 1270 + * individual writeable reference is too fragile given the 1271 + * way @mode is used in blkdev_get/put(). 1274 1272 */ 1275 - if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { 1273 + if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) && 1274 + !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { 1276 1275 bdev->bd_write_holder = true; 1277 - disk_block_events(bdev->bd_disk); 1276 + disk_block_events(disk); 1278 1277 } 1279 1278 1280 1279 mutex_unlock(&bdev->bd_mutex);

+5 -3

fs/partitions/check.c

··· 255 255 struct device_attribute *attr, char *buf) 256 256 { 257 257 struct hd_struct *p = dev_to_part(dev); 258 - return sprintf(buf, "%u\n", p->discard_alignment); 258 + struct gendisk *disk = dev_to_disk(dev); 259 + 260 + return sprintf(buf, "%u\n", 261 + queue_limit_discard_alignment(&disk->queue->limits, 262 + p->start_sect)); 259 263 } 260 264 261 265 ssize_t part_stat_show(struct device *dev, ··· 453 449 p->start_sect = start; 454 450 p->alignment_offset = 455 451 queue_limit_alignment_offset(&disk->queue->limits, start); 456 - p->discard_alignment = 457 - queue_limit_discard_alignment(&disk->queue->limits, start); 458 452 p->nr_sects = len; 459 453 p->partno = partno; 460 454 p->policy = get_disk_ro(disk);

-2

include/linux/blk_types.h

··· 151 151 __REQ_IO_STAT, /* account I/O stat */ 152 152 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 153 153 __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ 154 - __REQ_ON_PLUG, /* on plug list */ 155 154 __REQ_NR_BITS, /* stops here */ 156 155 }; 157 156 ··· 191 192 #define REQ_IO_STAT (1 << __REQ_IO_STAT) 192 193 #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) 193 194 #define REQ_SECURE (1 << __REQ_SECURE) 194 - #define REQ_ON_PLUG (1 << __REQ_ON_PLUG) 195 195 196 196 #endif /* __LINUX_BLK_TYPES_H */

+13 -2

include/linux/blkdev.h

··· 257 257 unsigned char misaligned; 258 258 unsigned char discard_misaligned; 259 259 unsigned char cluster; 260 - signed char discard_zeroes_data; 260 + unsigned char discard_zeroes_data; 261 261 }; 262 262 263 263 struct request_queue ··· 364 364 * for flush operations 365 365 */ 366 366 unsigned int flush_flags; 367 + unsigned int flush_not_queueable:1; 368 + unsigned int flush_queue_delayed:1; 367 369 unsigned int flush_pending_idx:1; 368 370 unsigned int flush_running_idx:1; 369 371 unsigned long flush_pending_since; ··· 845 843 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); 846 844 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); 847 845 extern void blk_queue_flush(struct request_queue *q, unsigned int flush); 846 + extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); 848 847 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); 849 848 850 849 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); ··· 1069 1066 { 1070 1067 unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); 1071 1068 1069 + if (!lim->max_discard_sectors) 1070 + return 0; 1071 + 1072 1072 return (lim->discard_granularity + lim->discard_alignment - alignment) 1073 1073 & (lim->discard_granularity - 1); 1074 1074 } 1075 1075 1076 1076 static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) 1077 1077 { 1078 - if (q->limits.discard_zeroes_data == 1) 1078 + if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) 1079 1079 return 1; 1080 1080 1081 1081 return 0; ··· 1115 1109 static inline unsigned int block_size(struct block_device *bdev) 1116 1110 { 1117 1111 return bdev->bd_block_size; 1112 + } 1113 + 1114 + static inline bool queue_flush_queueable(struct request_queue *q) 1115 + { 1116 + return !q->flush_not_queueable; 1118 1117 } 1119 1118 1120 1119 typedef struct {struct page *v;} 
Sector;

+1 -1

include/linux/genhd.h

··· 100 100 sector_t start_sect; 101 101 sector_t nr_sects; 102 102 sector_t alignment_offset; 103 - unsigned int discard_alignment; 104 103 struct device __dev; 105 104 struct kobject *holder_dir; 106 105 int policy, partno; ··· 126 127 #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 127 128 #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ 128 129 #define GENHD_FL_NATIVE_CAPACITY 128 130 + #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 129 131 130 132 enum { 131 133 DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */

+2 -2

mm/backing-dev.c

··· 63 63 unsigned long background_thresh; 64 64 unsigned long dirty_thresh; 65 65 unsigned long bdi_thresh; 66 - unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; 66 + unsigned long nr_dirty, nr_io, nr_more_io; 67 67 struct inode *inode; 68 68 69 - nr_wb = nr_dirty = nr_io = nr_more_io = 0; 69 + nr_dirty = nr_io = nr_more_io = 0; 70 70 spin_lock(&inode_wb_list_lock); 71 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 72 72 nr_dirty++;