/*
 * Functions related to barrier IO handling
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>

#include "blk.h"

/**
 * blk_queue_ordered - set up the queue's ordered write handling
 * @q: the request queue
 * @ordered: one of QUEUE_ORDERED_*
 * @prepare_flush_fn: rq setup helper for cache flush ordered writes
 *
 * Description:
 *   For journalled file systems, doing ordered writes on a commit
 *   block instead of explicitly doing wait_on_buffer (which is bad
 *   for performance) can be a big win. Block drivers that support this
 *   feature should call this function to indicate so.
 *
 **/
int blk_queue_ordered(struct request_queue *q, unsigned ordered,
                      prepare_flush_fn *prepare_flush_fn)
{
        if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH |
                                             QUEUE_ORDERED_DO_POSTFLUSH))) {
                printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__);
                return -EINVAL;
        }

        if (ordered != QUEUE_ORDERED_NONE &&
            ordered != QUEUE_ORDERED_DRAIN &&
            ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
            ordered != QUEUE_ORDERED_DRAIN_FUA &&
            ordered != QUEUE_ORDERED_TAG &&
            ordered != QUEUE_ORDERED_TAG_FLUSH &&
            ordered != QUEUE_ORDERED_TAG_FUA) {
                printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
                return -EINVAL;
        }

        q->ordered = ordered;
        q->next_ordered = ordered;
        q->prepare_flush_fn = prepare_flush_fn;

        return 0;
}
EXPORT_SYMBOL(blk_queue_ordered);
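
/*
 * Illustrative sketch (not part of the original file): a driver for a
 * device with a write-back cache would typically register drain+flush
 * ordering from its probe path.  The names my_queue, my_prepare_flush(),
 * MY_FLUSH_OPCODE and MY_FLUSH_TIMEOUT below are hypothetical; a real
 * prepare_flush_fn fills @rq with whatever cache-flush command the
 * hardware understands (compare sd_prepare_flush() in drivers/scsi/sd.c):
 *
 *      static void my_prepare_flush(struct request_queue *q, struct request *rq)
 *      {
 *              rq->cmd_type = REQ_TYPE_BLOCK_PC;
 *              rq->timeout = MY_FLUSH_TIMEOUT;
 *              rq->cmd[0] = MY_FLUSH_OPCODE;   // e.g. SYNCHRONIZE CACHE
 *              rq->cmd_len = 10;
 *      }
 *
 *      if (blk_queue_ordered(my_queue, QUEUE_ORDERED_DRAIN_FLUSH,
 *                            my_prepare_flush))
 *              // -EINVAL: fall back to QUEUE_ORDERED_NONE or fail probe
 */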

/*
 * Cache flushing for ordered writes handling
 */
unsigned blk_ordered_cur_seq(struct request_queue *q)
{
        if (!q->ordseq)
                return 0;
        return 1 << ffz(q->ordseq);
}

unsigned blk_ordered_req_seq(struct request *rq)
{
        struct request_queue *q = rq->q;

        BUG_ON(q->ordseq == 0);

        if (rq == &q->pre_flush_rq)
                return QUEUE_ORDSEQ_PREFLUSH;
        if (rq == &q->bar_rq)
                return QUEUE_ORDSEQ_BAR;
        if (rq == &q->post_flush_rq)
                return QUEUE_ORDSEQ_POSTFLUSH;

        /*
         * !fs requests don't need to follow barrier ordering.  Always
         * put them at the front.  This fixes the following deadlock.
         *
         * http://thread.gmane.org/gmane.linux.kernel/537473
         */
        if (!blk_fs_request(rq))
                return QUEUE_ORDSEQ_DRAIN;

        if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
            (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
                return QUEUE_ORDSEQ_DRAIN;
        else
                return QUEUE_ORDSEQ_DONE;
}

bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
{
        struct request *rq;

        if (error && !q->orderr)
                q->orderr = error;

        BUG_ON(q->ordseq & seq);
        q->ordseq |= seq;

        if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
                return false;

        /*
         * Okay, sequence complete.
         */
        q->ordseq = 0;
        rq = q->orig_bar_rq;
        __blk_end_request_all(rq, q->orderr);
        return true;
}

static void pre_flush_end_io(struct request *rq, int error)
{
        elv_completed_request(rq->q, rq);
        blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
}

static void bar_end_io(struct request *rq, int error)
{
        elv_completed_request(rq->q, rq);
        blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
}

static void post_flush_end_io(struct request *rq, int error)
{
        elv_completed_request(rq->q, rq);
        blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
}

static void queue_flush(struct request_queue *q, unsigned which)
{
        struct request *rq;
        rq_end_io_fn *end_io;

        if (which == QUEUE_ORDERED_DO_PREFLUSH) {
                rq = &q->pre_flush_rq;
                end_io = pre_flush_end_io;
        } else {
                rq = &q->post_flush_rq;
                end_io = post_flush_end_io;
        }

        blk_rq_init(q, rq);
        rq->cmd_flags = REQ_HARDBARRIER;
        rq->rq_disk = q->bar_rq.rq_disk;
        rq->end_io = end_io;
        q->prepare_flush_fn(q, rq);

        elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
}

static inline bool start_ordered(struct request_queue *q, struct request **rqp)
{
        struct request *rq = *rqp;
        unsigned skip = 0;

        q->orderr = 0;
        q->ordered = q->next_ordered;
        q->ordseq |= QUEUE_ORDSEQ_STARTED;

        /*
         * For an empty barrier, there's no actual BAR request, which
         * in turn makes POSTFLUSH unnecessary.  Mask them off.
         */
        if (!blk_rq_sectors(rq)) {
                q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
                                QUEUE_ORDERED_DO_POSTFLUSH);
                /*
                 * Empty barrier on a write-through device w/ ordered
                 * tag has no command to issue and without any command
                 * to issue, ordering by tag can't be used.  Drain
                 * instead.
                 */
                if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
                    !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
                        q->ordered &= ~QUEUE_ORDERED_BY_TAG;
                        q->ordered |= QUEUE_ORDERED_BY_DRAIN;
                }
        }

        /* stash away the original request */
        blk_dequeue_request(rq);
        q->orig_bar_rq = rq;
        rq = NULL;

        /*
         * Queue ordered sequence.  As we stack them at the head, we
         * need to queue in reverse order.  Note that we rely on the
         * fact that no fs request uses ELEVATOR_INSERT_FRONT, and thus
         * no fs request gets in between the ordered sequence.
         */
        if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
                queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
                rq = &q->post_flush_rq;
        } else
                skip |= QUEUE_ORDSEQ_POSTFLUSH;

        if (q->ordered & QUEUE_ORDERED_DO_BAR) {
                rq = &q->bar_rq;

                /* initialize proxy request and queue it */
                blk_rq_init(q, rq);
                if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
                        rq->cmd_flags |= REQ_RW;
                if (q->ordered & QUEUE_ORDERED_DO_FUA)
                        rq->cmd_flags |= REQ_FUA;
                init_request_from_bio(rq, q->orig_bar_rq->bio);
                rq->end_io = bar_end_io;

                elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
        } else
                skip |= QUEUE_ORDSEQ_BAR;

        if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
                queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
                rq = &q->pre_flush_rq;
        } else
                skip |= QUEUE_ORDSEQ_PREFLUSH;

        if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
                rq = NULL;
        else
                skip |= QUEUE_ORDSEQ_DRAIN;

        *rqp = rq;

        /*
         * Complete skipped sequences.  If whole sequence is complete,
         * return false to tell elevator that this request is gone.
         */
        return !blk_ordered_complete_seq(q, skip, 0);
}
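
/*
 * Worked example (illustrative comment, not part of the original file),
 * assuming the v2.6.32 QUEUE_ORDSEQ_* bit order STARTED < DRAIN <
 * PREFLUSH < BAR < POSTFLUSH < DONE: for QUEUE_ORDERED_DRAIN_FLUSH and a
 * non-empty barrier, the three ELEVATOR_INSERT_FRONT insertions above
 * (done in reverse) leave the dispatch queue head as
 *
 *      pre_flush_rq -> bar_rq -> post_flush_rq -> <previously queued rqs>
 *
 * and blk_ordered_cur_seq() then steps ordseq through DRAIN -> PREFLUSH
 * -> BAR -> POSTFLUSH as each proxy request completes.  For an empty
 * barrier on the same queue, DO_BAR and DO_POSTFLUSH are masked off
 * above, so only pre_flush_rq is actually issued.
 */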

bool blk_do_ordered(struct request_queue *q, struct request **rqp)
{
        struct request *rq = *rqp;
        const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);

        if (!q->ordseq) {
                if (!is_barrier)
                        return true;

                if (q->next_ordered != QUEUE_ORDERED_NONE)
                        return start_ordered(q, rqp);
                else {
                        /*
                         * Queue ordering not supported.  Terminate
                         * with prejudice.
                         */
                        blk_dequeue_request(rq);
                        __blk_end_request_all(rq, -EOPNOTSUPP);
                        *rqp = NULL;
                        return false;
                }
        }

        /*
         * Ordered sequence in progress
         */

        /* Special requests are not subject to ordering rules. */
        if (!blk_fs_request(rq) &&
            rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
                return true;

        if (q->ordered & QUEUE_ORDERED_BY_TAG) {
                /* Ordered by tag.  Blocking the next barrier is enough. */
                if (is_barrier && rq != &q->bar_rq)
                        *rqp = NULL;
        } else {
                /* Ordered by draining.  Wait for turn. */
                WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
                if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
                        *rqp = NULL;
        }

        return true;
}

static void bio_end_empty_barrier(struct bio *bio, int err)
{
        if (err) {
                if (err == -EOPNOTSUPP)
                        set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
                clear_bit(BIO_UPTODATE, &bio->bi_flags);
        }

        complete(bio->bi_private);
}

/**
 * blkdev_issue_flush - queue a flush
 * @bdev: blockdev to issue flush for
 * @error_sector: optional place to store the sector at which the flush failed
 *
 * Description:
 *    Issue a flush for the block device in question. Callers may supply
 *    room for storing the error offset of a failed flush, if they wish.
 */
int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
{
        DECLARE_COMPLETION_ONSTACK(wait);
        struct request_queue *q;
        struct bio *bio;
        int ret;

        if (bdev->bd_disk == NULL)
                return -ENXIO;

        q = bdev_get_queue(bdev);
        if (!q)
                return -ENXIO;

        bio = bio_alloc(GFP_KERNEL, 0);
        bio->bi_end_io = bio_end_empty_barrier;
        bio->bi_private = &wait;
        bio->bi_bdev = bdev;
        submit_bio(WRITE_BARRIER, bio);

        wait_for_completion(&wait);

        /*
         * The driver must store the error location in ->bi_sector, if
         * it supports it. For non-stacked drivers, this should be copied
         * from blk_rq_pos(rq).
         */
        if (error_sector)
                *error_sector = bio->bi_sector;

        ret = 0;
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
                ret = -EOPNOTSUPP;
        else if (!bio_flagged(bio, BIO_UPTODATE))
                ret = -EIO;

        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
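
/*
 * Illustrative sketch (not part of the original file): a filesystem
 * ->fsync() path that wants data on stable storage would typically
 * finish with a call like the one below, where sb is a hypothetical
 * super_block pointer; -EOPNOTSUPP from devices without a cache-flush
 * command is usually ignored:
 *
 *      int err = blkdev_issue_flush(sb->s_bdev, NULL);
 *
 *      if (err && err != -EOPNOTSUPP)
 *              return err;
 */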

static void blkdev_discard_end_io(struct bio *bio, int err)
{
        if (err) {
                if (err == -EOPNOTSUPP)
                        set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
                clear_bit(BIO_UPTODATE, &bio->bi_flags);
        }

        if (bio->bi_private)
                complete(bio->bi_private);
        __free_page(bio_page(bio));

        bio_put(bio);
}

/**
 * blkdev_issue_discard - queue a discard
 * @bdev: blockdev to issue discard for
 * @sector: start sector
 * @nr_sects: number of sectors to discard
 * @gfp_mask: memory allocation flags (for bio_alloc)
 * @flags: DISCARD_FL_* flags to control behaviour
 *
 * Description:
 *    Issue a discard request for the sectors in question.
 */
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                         sector_t nr_sects, gfp_t gfp_mask, int flags)
{
        DECLARE_COMPLETION_ONSTACK(wait);
        struct request_queue *q = bdev_get_queue(bdev);
        int type = flags & DISCARD_FL_BARRIER ?
                DISCARD_BARRIER : DISCARD_NOBARRIER;
        struct bio *bio;
        struct page *page;
        int ret = 0;

        if (!q)
                return -ENXIO;

        if (!blk_queue_discard(q))
                return -EOPNOTSUPP;

        while (nr_sects && !ret) {
                unsigned int sector_size = q->limits.logical_block_size;
                unsigned int max_discard_sectors =
                        min(q->limits.max_discard_sectors, UINT_MAX >> 9);

                bio = bio_alloc(gfp_mask, 1);
                if (!bio)
                        goto out;
                bio->bi_sector = sector;
                bio->bi_end_io = blkdev_discard_end_io;
                bio->bi_bdev = bdev;
                if (flags & DISCARD_FL_WAIT)
                        bio->bi_private = &wait;

                /*
                 * Add a zeroed one-sector payload as that's what
                 * our current implementations need. If we ever need
                 * more, the interface will need revisiting.
                 */
                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!page)
                        goto out_free_bio;
                if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
                        goto out_free_page;

                /*
                 * And override the bio size - the way discard works, we
                 * touch many more blocks on disk than the actual payload
                 * length.
                 */
                if (nr_sects > max_discard_sectors) {
                        bio->bi_size = max_discard_sectors << 9;
                        nr_sects -= max_discard_sectors;
                        sector += max_discard_sectors;
                } else {
                        bio->bi_size = nr_sects << 9;
                        nr_sects = 0;
                }

                bio_get(bio);
                submit_bio(type, bio);

                if (flags & DISCARD_FL_WAIT)
                        wait_for_completion(&wait);

                if (bio_flagged(bio, BIO_EOPNOTSUPP))
                        ret = -EOPNOTSUPP;
                else if (!bio_flagged(bio, BIO_UPTODATE))
                        ret = -EIO;
                bio_put(bio);
        }
        return ret;
out_free_page:
        __free_page(page);
out_free_bio:
        bio_put(bio);
out:
        return -ENOMEM;
}
EXPORT_SYMBOL(blkdev_issue_discard);
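
/*
 * Illustrative sketch (not part of the original file): a filesystem
 * freeing an extent it no longer needs could hand it back to the device
 * as below, waiting for completion and ordering the discard as a
 * barrier.  start_sect and len_sects are hypothetical sector values:
 *
 *      int err = blkdev_issue_discard(sb->s_bdev, start_sect, len_sects,
 *                                     GFP_NOFS,
 *                                     DISCARD_FL_WAIT | DISCARD_FL_BARRIER);
 *
 *      if (err == -EOPNOTSUPP)
 *              // device has no discard support; usually safe to ignore
 */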