Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 77b2555b52a894a2e39a42e43d993df875c46a6a 3747 lines 97 kB view raw
1/* 2 * linux/drivers/block/ll_rw_blk.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 6 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 7 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 8 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 9 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 10 */ 11 12/* 13 * This handles all read/write requests to block devices 14 */ 15#include <linux/config.h> 16#include <linux/kernel.h> 17#include <linux/module.h> 18#include <linux/backing-dev.h> 19#include <linux/bio.h> 20#include <linux/blkdev.h> 21#include <linux/highmem.h> 22#include <linux/mm.h> 23#include <linux/kernel_stat.h> 24#include <linux/string.h> 25#include <linux/init.h> 26#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 27#include <linux/completion.h> 28#include <linux/slab.h> 29#include <linux/swap.h> 30#include <linux/writeback.h> 31#include <linux/blkdev.h> 32 33/* 34 * for max sense size 35 */ 36#include <scsi/scsi_cmnd.h> 37 38static void blk_unplug_work(void *data); 39static void blk_unplug_timeout(unsigned long data); 40static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); 41 42/* 43 * For the allocated request tables 44 */ 45static kmem_cache_t *request_cachep; 46 47/* 48 * For queue allocation 49 */ 50static kmem_cache_t *requestq_cachep; 51 52/* 53 * For io context allocations 54 */ 55static kmem_cache_t *iocontext_cachep; 56 57static wait_queue_head_t congestion_wqh[2] = { 58 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 59 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 60 }; 61 62/* 63 * Controlling structure to kblockd 64 */ 65static struct workqueue_struct *kblockd_workqueue; 66 67unsigned long blk_max_low_pfn, blk_max_pfn; 68 69EXPORT_SYMBOL(blk_max_low_pfn); 70EXPORT_SYMBOL(blk_max_pfn); 71 72/* Amount of time in which a process may batch requests */ 73#define BLK_BATCH_TIME (HZ/50UL) 74 75/* Number of requests a "batching" process may submit */ 76#define BLK_BATCH_REQ 32 77 78/* 79 * Return the threshold (number of used requests) at which the queue is 80 * considered to be congested. It include a little hysteresis to keep the 81 * context switch rate down. 82 */ 83static inline int queue_congestion_on_threshold(struct request_queue *q) 84{ 85 return q->nr_congestion_on; 86} 87 88/* 89 * The threshold at which a queue is considered to be uncongested 90 */ 91static inline int queue_congestion_off_threshold(struct request_queue *q) 92{ 93 return q->nr_congestion_off; 94} 95 96static void blk_queue_congestion_threshold(struct request_queue *q) 97{ 98 int nr; 99 100 nr = q->nr_requests - (q->nr_requests / 8) + 1; 101 if (nr > q->nr_requests) 102 nr = q->nr_requests; 103 q->nr_congestion_on = nr; 104 105 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 106 if (nr < 1) 107 nr = 1; 108 q->nr_congestion_off = nr; 109} 110 111/* 112 * A queue has just exitted congestion. Note this in the global counter of 113 * congested queues, and wake up anyone who was waiting for requests to be 114 * put back. 115 */ 116static void clear_queue_congested(request_queue_t *q, int rw) 117{ 118 enum bdi_state bit; 119 wait_queue_head_t *wqh = &congestion_wqh[rw]; 120 121 bit = (rw == WRITE) ? 
BDI_write_congested : BDI_read_congested; 122 clear_bit(bit, &q->backing_dev_info.state); 123 smp_mb__after_clear_bit(); 124 if (waitqueue_active(wqh)) 125 wake_up(wqh); 126} 127 128/* 129 * A queue has just entered congestion. Flag that in the queue's VM-visible 130 * state flags and increment the global gounter of congested queues. 131 */ 132static void set_queue_congested(request_queue_t *q, int rw) 133{ 134 enum bdi_state bit; 135 136 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; 137 set_bit(bit, &q->backing_dev_info.state); 138} 139 140/** 141 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 142 * @bdev: device 143 * 144 * Locates the passed device's request queue and returns the address of its 145 * backing_dev_info 146 * 147 * Will return NULL if the request queue cannot be located. 148 */ 149struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 150{ 151 struct backing_dev_info *ret = NULL; 152 request_queue_t *q = bdev_get_queue(bdev); 153 154 if (q) 155 ret = &q->backing_dev_info; 156 return ret; 157} 158 159EXPORT_SYMBOL(blk_get_backing_dev_info); 160 161void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data) 162{ 163 q->activity_fn = fn; 164 q->activity_data = data; 165} 166 167EXPORT_SYMBOL(blk_queue_activity_fn); 168 169/** 170 * blk_queue_prep_rq - set a prepare_request function for queue 171 * @q: queue 172 * @pfn: prepare_request function 173 * 174 * It's possible for a queue to register a prepare_request callback which 175 * is invoked before the request is handed to the request_fn. The goal of 176 * the function is to prepare a request for I/O, it can be used to build a 177 * cdb from the request data for instance. 178 * 179 */ 180void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn) 181{ 182 q->prep_rq_fn = pfn; 183} 184 185EXPORT_SYMBOL(blk_queue_prep_rq); 186 187/** 188 * blk_queue_merge_bvec - set a merge_bvec function for queue 189 * @q: queue 190 * @mbfn: merge_bvec_fn 191 * 192 * Usually queues have static limitations on the max sectors or segments that 193 * we can put in a request. Stacking drivers may have some settings that 194 * are dynamic, and thus we have to query the queue whether it is ok to 195 * add a new bio_vec to a bio at a given offset or not. If the block device 196 * has such limitations, it needs to register a merge_bvec_fn to control 197 * the size of bio's sent to it. Note that a block device *must* allow a 198 * single page to be added to an empty bio. The block device driver may want 199 * to use the bio_split() function to deal with these bio's. By default 200 * no merge_bvec_fn is defined for a queue, and only the fixed limits are 201 * honored. 202 */ 203void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn) 204{ 205 q->merge_bvec_fn = mbfn; 206} 207 208EXPORT_SYMBOL(blk_queue_merge_bvec); 209 210/** 211 * blk_queue_make_request - define an alternate make_request function for a device 212 * @q: the request queue for the device to be affected 213 * @mfn: the alternate make_request function 214 * 215 * Description: 216 * The normal way for &struct bios to be passed to a device 217 * driver is for them to be collected into requests on a request 218 * queue, and then to allow the device driver to select requests 219 * off that queue when it is ready. This works well for many block 220 * devices. 
However some block devices (typically virtual devices 221 * such as md or lvm) do not benefit from the processing on the 222 * request queue, and are served best by having the requests passed 223 * directly to them. This can be achieved by providing a function 224 * to blk_queue_make_request(). 225 * 226 * Caveat: 227 * The driver that does this *must* be able to deal appropriately 228 * with buffers in "highmemory". This can be accomplished by either calling 229 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 230 * blk_queue_bounce() to create a buffer in normal memory. 231 **/ 232void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) 233{ 234 /* 235 * set defaults 236 */ 237 q->nr_requests = BLKDEV_MAX_RQ; 238 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 239 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 240 q->make_request_fn = mfn; 241 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 242 q->backing_dev_info.state = 0; 243 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 244 blk_queue_max_sectors(q, MAX_SECTORS); 245 blk_queue_hardsect_size(q, 512); 246 blk_queue_dma_alignment(q, 511); 247 blk_queue_congestion_threshold(q); 248 q->nr_batching = BLK_BATCH_REQ; 249 250 q->unplug_thresh = 4; /* hmm */ 251 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 252 if (q->unplug_delay == 0) 253 q->unplug_delay = 1; 254 255 INIT_WORK(&q->unplug_work, blk_unplug_work, q); 256 257 q->unplug_timer.function = blk_unplug_timeout; 258 q->unplug_timer.data = (unsigned long)q; 259 260 /* 261 * by default assume old behaviour and bounce for any highmem page 262 */ 263 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 264 265 blk_queue_activity_fn(q, NULL, NULL); 266 267 INIT_LIST_HEAD(&q->drain_list); 268} 269 270EXPORT_SYMBOL(blk_queue_make_request); 271 272static inline void rq_init(request_queue_t *q, struct request *rq) 273{ 274 INIT_LIST_HEAD(&rq->queuelist); 275 276 rq->errors = 0; 277 rq->rq_status = RQ_ACTIVE; 278 rq->bio = rq->biotail = NULL; 279 rq->ioprio = 0; 280 rq->buffer = NULL; 281 rq->ref_count = 1; 282 rq->q = q; 283 rq->waiting = NULL; 284 rq->special = NULL; 285 rq->data_len = 0; 286 rq->data = NULL; 287 rq->nr_phys_segments = 0; 288 rq->sense = NULL; 289 rq->end_io = NULL; 290 rq->end_io_data = NULL; 291} 292 293/** 294 * blk_queue_ordered - does this queue support ordered writes 295 * @q: the request queue 296 * @flag: see below 297 * 298 * Description: 299 * For journalled file systems, doing ordered writes on a commit 300 * block instead of explicitly doing wait_on_buffer (which is bad 301 * for performance) can be a big win. Block drivers supporting this 302 * feature should call this function and indicate so. 
303 * 304 **/ 305void blk_queue_ordered(request_queue_t *q, int flag) 306{ 307 switch (flag) { 308 case QUEUE_ORDERED_NONE: 309 if (q->flush_rq) 310 kmem_cache_free(request_cachep, q->flush_rq); 311 q->flush_rq = NULL; 312 q->ordered = flag; 313 break; 314 case QUEUE_ORDERED_TAG: 315 q->ordered = flag; 316 break; 317 case QUEUE_ORDERED_FLUSH: 318 q->ordered = flag; 319 if (!q->flush_rq) 320 q->flush_rq = kmem_cache_alloc(request_cachep, 321 GFP_KERNEL); 322 break; 323 default: 324 printk("blk_queue_ordered: bad value %d\n", flag); 325 break; 326 } 327} 328 329EXPORT_SYMBOL(blk_queue_ordered); 330 331/** 332 * blk_queue_issue_flush_fn - set function for issuing a flush 333 * @q: the request queue 334 * @iff: the function to be called issuing the flush 335 * 336 * Description: 337 * If a driver supports issuing a flush command, the support is notified 338 * to the block layer by defining it through this call. 339 * 340 **/ 341void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff) 342{ 343 q->issue_flush_fn = iff; 344} 345 346EXPORT_SYMBOL(blk_queue_issue_flush_fn); 347 348/* 349 * Cache flushing for ordered writes handling 350 */ 351static void blk_pre_flush_end_io(struct request *flush_rq) 352{ 353 struct request *rq = flush_rq->end_io_data; 354 request_queue_t *q = rq->q; 355 356 rq->flags |= REQ_BAR_PREFLUSH; 357 358 if (!flush_rq->errors) 359 elv_requeue_request(q, rq); 360 else { 361 q->end_flush_fn(q, flush_rq); 362 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 363 q->request_fn(q); 364 } 365} 366 367static void blk_post_flush_end_io(struct request *flush_rq) 368{ 369 struct request *rq = flush_rq->end_io_data; 370 request_queue_t *q = rq->q; 371 372 rq->flags |= REQ_BAR_POSTFLUSH; 373 374 q->end_flush_fn(q, flush_rq); 375 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 376 q->request_fn(q); 377} 378 379struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq) 380{ 381 struct request *flush_rq = q->flush_rq; 382 383 BUG_ON(!blk_barrier_rq(rq)); 384 385 if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags)) 386 return NULL; 387 388 rq_init(q, flush_rq); 389 flush_rq->elevator_private = NULL; 390 flush_rq->flags = REQ_BAR_FLUSH; 391 flush_rq->rq_disk = rq->rq_disk; 392 flush_rq->rl = NULL; 393 394 /* 395 * prepare_flush returns 0 if no flush is needed, just mark both 396 * pre and post flush as done in that case 397 */ 398 if (!q->prepare_flush_fn(q, flush_rq)) { 399 rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH; 400 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 401 return rq; 402 } 403 404 /* 405 * some drivers dequeue requests right away, some only after io 406 * completion. make sure the request is dequeued. 
407 */ 408 if (!list_empty(&rq->queuelist)) 409 blkdev_dequeue_request(rq); 410 411 elv_deactivate_request(q, rq); 412 413 flush_rq->end_io_data = rq; 414 flush_rq->end_io = blk_pre_flush_end_io; 415 416 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 417 return flush_rq; 418} 419 420static void blk_start_post_flush(request_queue_t *q, struct request *rq) 421{ 422 struct request *flush_rq = q->flush_rq; 423 424 BUG_ON(!blk_barrier_rq(rq)); 425 426 rq_init(q, flush_rq); 427 flush_rq->elevator_private = NULL; 428 flush_rq->flags = REQ_BAR_FLUSH; 429 flush_rq->rq_disk = rq->rq_disk; 430 flush_rq->rl = NULL; 431 432 if (q->prepare_flush_fn(q, flush_rq)) { 433 flush_rq->end_io_data = rq; 434 flush_rq->end_io = blk_post_flush_end_io; 435 436 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 437 q->request_fn(q); 438 } 439} 440 441static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq, 442 int sectors) 443{ 444 if (sectors > rq->nr_sectors) 445 sectors = rq->nr_sectors; 446 447 rq->nr_sectors -= sectors; 448 return rq->nr_sectors; 449} 450 451static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq, 452 int sectors, int queue_locked) 453{ 454 if (q->ordered != QUEUE_ORDERED_FLUSH) 455 return 0; 456 if (!blk_fs_request(rq) || !blk_barrier_rq(rq)) 457 return 0; 458 if (blk_barrier_postflush(rq)) 459 return 0; 460 461 if (!blk_check_end_barrier(q, rq, sectors)) { 462 unsigned long flags = 0; 463 464 if (!queue_locked) 465 spin_lock_irqsave(q->queue_lock, flags); 466 467 blk_start_post_flush(q, rq); 468 469 if (!queue_locked) 470 spin_unlock_irqrestore(q->queue_lock, flags); 471 } 472 473 return 1; 474} 475 476/** 477 * blk_complete_barrier_rq - complete possible barrier request 478 * @q: the request queue for the device 479 * @rq: the request 480 * @sectors: number of sectors to complete 481 * 482 * Description: 483 * Used in driver end_io handling to determine whether to postpone 484 * completion of a barrier request until a post flush has been done. This 485 * is the unlocked variant, used if the caller doesn't already hold the 486 * queue lock. 487 **/ 488int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors) 489{ 490 return __blk_complete_barrier_rq(q, rq, sectors, 0); 491} 492EXPORT_SYMBOL(blk_complete_barrier_rq); 493 494/** 495 * blk_complete_barrier_rq_locked - complete possible barrier request 496 * @q: the request queue for the device 497 * @rq: the request 498 * @sectors: number of sectors to complete 499 * 500 * Description: 501 * See blk_complete_barrier_rq(). This variant must be used if the caller 502 * holds the queue lock. 503 **/ 504int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq, 505 int sectors) 506{ 507 return __blk_complete_barrier_rq(q, rq, sectors, 1); 508} 509EXPORT_SYMBOL(blk_complete_barrier_rq_locked); 510 511/** 512 * blk_queue_bounce_limit - set bounce buffer limit for queue 513 * @q: the request queue for the device 514 * @dma_addr: bus address limit 515 * 516 * Description: 517 * Different hardware can have different requirements as to what pages 518 * it can do I/O directly to. A low level driver can call 519 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 520 * buffers for doing I/O to pages residing above @page. By default 521 * the block layer sets this to the highest numbered "low" memory page. 
522 **/ 523void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) 524{ 525 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; 526 527 /* 528 * set appropriate bounce gfp mask -- unfortunately we don't have a 529 * full 4GB zone, so we have to resort to low memory for any bounces. 530 * ISA has its own < 16MB zone. 531 */ 532 if (bounce_pfn < blk_max_low_pfn) { 533 BUG_ON(dma_addr < BLK_BOUNCE_ISA); 534 init_emergency_isa_pool(); 535 q->bounce_gfp = GFP_NOIO | GFP_DMA; 536 } else 537 q->bounce_gfp = GFP_NOIO; 538 539 q->bounce_pfn = bounce_pfn; 540} 541 542EXPORT_SYMBOL(blk_queue_bounce_limit); 543 544/** 545 * blk_queue_max_sectors - set max sectors for a request for this queue 546 * @q: the request queue for the device 547 * @max_sectors: max sectors in the usual 512b unit 548 * 549 * Description: 550 * Enables a low level driver to set an upper limit on the size of 551 * received requests. 552 **/ 553void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors) 554{ 555 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 556 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 557 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); 558 } 559 560 q->max_sectors = q->max_hw_sectors = max_sectors; 561} 562 563EXPORT_SYMBOL(blk_queue_max_sectors); 564 565/** 566 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 567 * @q: the request queue for the device 568 * @max_segments: max number of segments 569 * 570 * Description: 571 * Enables a low level driver to set an upper limit on the number of 572 * physical data segments in a request. This would be the largest sized 573 * scatter list the driver could handle. 574 **/ 575void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments) 576{ 577 if (!max_segments) { 578 max_segments = 1; 579 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 580 } 581 582 q->max_phys_segments = max_segments; 583} 584 585EXPORT_SYMBOL(blk_queue_max_phys_segments); 586 587/** 588 * blk_queue_max_hw_segments - set max hw segments for a request for this queue 589 * @q: the request queue for the device 590 * @max_segments: max number of segments 591 * 592 * Description: 593 * Enables a low level driver to set an upper limit on the number of 594 * hw data segments in a request. This would be the largest number of 595 * address/length pairs the host adapter can actually give as once 596 * to the device. 
597 **/ 598void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments) 599{ 600 if (!max_segments) { 601 max_segments = 1; 602 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 603 } 604 605 q->max_hw_segments = max_segments; 606} 607 608EXPORT_SYMBOL(blk_queue_max_hw_segments); 609 610/** 611 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 612 * @q: the request queue for the device 613 * @max_size: max size of segment in bytes 614 * 615 * Description: 616 * Enables a low level driver to set an upper limit on the size of a 617 * coalesced segment 618 **/ 619void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size) 620{ 621 if (max_size < PAGE_CACHE_SIZE) { 622 max_size = PAGE_CACHE_SIZE; 623 printk("%s: set to minimum %d\n", __FUNCTION__, max_size); 624 } 625 626 q->max_segment_size = max_size; 627} 628 629EXPORT_SYMBOL(blk_queue_max_segment_size); 630 631/** 632 * blk_queue_hardsect_size - set hardware sector size for the queue 633 * @q: the request queue for the device 634 * @size: the hardware sector size, in bytes 635 * 636 * Description: 637 * This should typically be set to the lowest possible sector size 638 * that the hardware can operate on (possible without reverting to 639 * even internal read-modify-write operations). Usually the default 640 * of 512 covers most hardware. 641 **/ 642void blk_queue_hardsect_size(request_queue_t *q, unsigned short size) 643{ 644 q->hardsect_size = size; 645} 646 647EXPORT_SYMBOL(blk_queue_hardsect_size); 648 649/* 650 * Returns the minimum that is _not_ zero, unless both are zero. 651 */ 652#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 653 654/** 655 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 656 * @t: the stacking driver (top) 657 * @b: the underlying device (bottom) 658 **/ 659void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b) 660{ 661 /* zero is "infinity" */ 662 t->max_sectors = t->max_hw_sectors = 663 min_not_zero(t->max_sectors,b->max_sectors); 664 665 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); 666 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 667 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 668 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 669} 670 671EXPORT_SYMBOL(blk_queue_stack_limits); 672 673/** 674 * blk_queue_segment_boundary - set boundary rules for segment merging 675 * @q: the request queue for the device 676 * @mask: the memory boundary mask 677 **/ 678void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask) 679{ 680 if (mask < PAGE_CACHE_SIZE - 1) { 681 mask = PAGE_CACHE_SIZE - 1; 682 printk("%s: set to minimum %lx\n", __FUNCTION__, mask); 683 } 684 685 q->seg_boundary_mask = mask; 686} 687 688EXPORT_SYMBOL(blk_queue_segment_boundary); 689 690/** 691 * blk_queue_dma_alignment - set dma length and memory alignment 692 * @q: the request queue for the device 693 * @mask: alignment mask 694 * 695 * description: 696 * set required memory and length aligment for direct dma transactions. 697 * this is used when buiding direct io requests for the queue. 
698 * 699 **/ 700void blk_queue_dma_alignment(request_queue_t *q, int mask) 701{ 702 q->dma_alignment = mask; 703} 704 705EXPORT_SYMBOL(blk_queue_dma_alignment); 706 707/** 708 * blk_queue_find_tag - find a request by its tag and queue 709 * 710 * @q: The request queue for the device 711 * @tag: The tag of the request 712 * 713 * Notes: 714 * Should be used when a device returns a tag and you want to match 715 * it with a request. 716 * 717 * no locks need be held. 718 **/ 719struct request *blk_queue_find_tag(request_queue_t *q, int tag) 720{ 721 struct blk_queue_tag *bqt = q->queue_tags; 722 723 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) 724 return NULL; 725 726 return bqt->tag_index[tag]; 727} 728 729EXPORT_SYMBOL(blk_queue_find_tag); 730 731/** 732 * __blk_queue_free_tags - release tag maintenance info 733 * @q: the request queue for the device 734 * 735 * Notes: 736 * blk_cleanup_queue() will take care of calling this function, if tagging 737 * has been used. So there's no need to call this directly. 738 **/ 739static void __blk_queue_free_tags(request_queue_t *q) 740{ 741 struct blk_queue_tag *bqt = q->queue_tags; 742 743 if (!bqt) 744 return; 745 746 if (atomic_dec_and_test(&bqt->refcnt)) { 747 BUG_ON(bqt->busy); 748 BUG_ON(!list_empty(&bqt->busy_list)); 749 750 kfree(bqt->tag_index); 751 bqt->tag_index = NULL; 752 753 kfree(bqt->tag_map); 754 bqt->tag_map = NULL; 755 756 kfree(bqt); 757 } 758 759 q->queue_tags = NULL; 760 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); 761} 762 763/** 764 * blk_queue_free_tags - release tag maintenance info 765 * @q: the request queue for the device 766 * 767 * Notes: 768 * This is used to disabled tagged queuing to a device, yet leave 769 * queue in function. 770 **/ 771void blk_queue_free_tags(request_queue_t *q) 772{ 773 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 774} 775 776EXPORT_SYMBOL(blk_queue_free_tags); 777 778static int 779init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) 780{ 781 struct request **tag_index; 782 unsigned long *tag_map; 783 int nr_ulongs; 784 785 if (depth > q->nr_requests * 2) { 786 depth = q->nr_requests * 2; 787 printk(KERN_ERR "%s: adjusted depth to %d\n", 788 __FUNCTION__, depth); 789 } 790 791 tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); 792 if (!tag_index) 793 goto fail; 794 795 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 796 tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 797 if (!tag_map) 798 goto fail; 799 800 memset(tag_index, 0, depth * sizeof(struct request *)); 801 memset(tag_map, 0, nr_ulongs * sizeof(unsigned long)); 802 tags->real_max_depth = depth; 803 tags->max_depth = depth; 804 tags->tag_index = tag_index; 805 tags->tag_map = tag_map; 806 807 return 0; 808fail: 809 kfree(tag_index); 810 return -ENOMEM; 811} 812 813/** 814 * blk_queue_init_tags - initialize the queue tag info 815 * @q: the request queue for the device 816 * @depth: the maximum queue depth supported 817 * @tags: the tag to use 818 **/ 819int blk_queue_init_tags(request_queue_t *q, int depth, 820 struct blk_queue_tag *tags) 821{ 822 int rc; 823 824 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 825 826 if (!tags && !q->queue_tags) { 827 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); 828 if (!tags) 829 goto fail; 830 831 if (init_tag_map(q, tags, depth)) 832 goto fail; 833 834 INIT_LIST_HEAD(&tags->busy_list); 835 tags->busy = 0; 836 atomic_set(&tags->refcnt, 1); 837 } else if (q->queue_tags) { 838 if ((rc = 
blk_queue_resize_tags(q, depth))) 839 return rc; 840 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 841 return 0; 842 } else 843 atomic_inc(&tags->refcnt); 844 845 /* 846 * assign it, all done 847 */ 848 q->queue_tags = tags; 849 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); 850 return 0; 851fail: 852 kfree(tags); 853 return -ENOMEM; 854} 855 856EXPORT_SYMBOL(blk_queue_init_tags); 857 858/** 859 * blk_queue_resize_tags - change the queueing depth 860 * @q: the request queue for the device 861 * @new_depth: the new max command queueing depth 862 * 863 * Notes: 864 * Must be called with the queue lock held. 865 **/ 866int blk_queue_resize_tags(request_queue_t *q, int new_depth) 867{ 868 struct blk_queue_tag *bqt = q->queue_tags; 869 struct request **tag_index; 870 unsigned long *tag_map; 871 int max_depth, nr_ulongs; 872 873 if (!bqt) 874 return -ENXIO; 875 876 /* 877 * if we already have large enough real_max_depth. just 878 * adjust max_depth. *NOTE* as requests with tag value 879 * between new_depth and real_max_depth can be in-flight, tag 880 * map can not be shrunk blindly here. 881 */ 882 if (new_depth <= bqt->real_max_depth) { 883 bqt->max_depth = new_depth; 884 return 0; 885 } 886 887 /* 888 * save the old state info, so we can copy it back 889 */ 890 tag_index = bqt->tag_index; 891 tag_map = bqt->tag_map; 892 max_depth = bqt->real_max_depth; 893 894 if (init_tag_map(q, bqt, new_depth)) 895 return -ENOMEM; 896 897 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); 898 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; 899 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); 900 901 kfree(tag_index); 902 kfree(tag_map); 903 return 0; 904} 905 906EXPORT_SYMBOL(blk_queue_resize_tags); 907 908/** 909 * blk_queue_end_tag - end tag operations for a request 910 * @q: the request queue for the device 911 * @rq: the request that has completed 912 * 913 * Description: 914 * Typically called when end_that_request_first() returns 0, meaning 915 * all transfers have been done for a request. It's important to call 916 * this function before end_that_request_last(), as that will put the 917 * request back on the free list thus corrupting the internal tag list. 918 * 919 * Notes: 920 * queue lock must be held. 921 **/ 922void blk_queue_end_tag(request_queue_t *q, struct request *rq) 923{ 924 struct blk_queue_tag *bqt = q->queue_tags; 925 int tag = rq->tag; 926 927 BUG_ON(tag == -1); 928 929 if (unlikely(tag >= bqt->real_max_depth)) 930 /* 931 * This can happen after tag depth has been reduced. 932 * FIXME: how about a warning or info message here? 933 */ 934 return; 935 936 if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { 937 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", 938 __FUNCTION__, tag); 939 return; 940 } 941 942 list_del_init(&rq->queuelist); 943 rq->flags &= ~REQ_QUEUED; 944 rq->tag = -1; 945 946 if (unlikely(bqt->tag_index[tag] == NULL)) 947 printk(KERN_ERR "%s: tag %d is missing\n", 948 __FUNCTION__, tag); 949 950 bqt->tag_index[tag] = NULL; 951 bqt->busy--; 952} 953 954EXPORT_SYMBOL(blk_queue_end_tag); 955 956/** 957 * blk_queue_start_tag - find a free tag and assign it 958 * @q: the request queue for the device 959 * @rq: the block request that needs tagging 960 * 961 * Description: 962 * This can either be used as a stand-alone helper, or possibly be 963 * assigned as the queue &prep_rq_fn (in which case &struct request 964 * automagically gets a tag assigned). 
Note that this function 965 * assumes that any type of request can be queued! if this is not 966 * true for your device, you must check the request type before 967 * calling this function. The request will also be removed from 968 * the request queue, so it's the drivers responsibility to readd 969 * it if it should need to be restarted for some reason. 970 * 971 * Notes: 972 * queue lock must be held. 973 **/ 974int blk_queue_start_tag(request_queue_t *q, struct request *rq) 975{ 976 struct blk_queue_tag *bqt = q->queue_tags; 977 int tag; 978 979 if (unlikely((rq->flags & REQ_QUEUED))) { 980 printk(KERN_ERR 981 "%s: request %p for device [%s] already tagged %d", 982 __FUNCTION__, rq, 983 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 984 BUG(); 985 } 986 987 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 988 if (tag >= bqt->max_depth) 989 return 1; 990 991 __set_bit(tag, bqt->tag_map); 992 993 rq->flags |= REQ_QUEUED; 994 rq->tag = tag; 995 bqt->tag_index[tag] = rq; 996 blkdev_dequeue_request(rq); 997 list_add(&rq->queuelist, &bqt->busy_list); 998 bqt->busy++; 999 return 0; 1000} 1001 1002EXPORT_SYMBOL(blk_queue_start_tag); 1003 1004/** 1005 * blk_queue_invalidate_tags - invalidate all pending tags 1006 * @q: the request queue for the device 1007 * 1008 * Description: 1009 * Hardware conditions may dictate a need to stop all pending requests. 1010 * In this case, we will safely clear the block side of the tag queue and 1011 * readd all requests to the request queue in the right order. 1012 * 1013 * Notes: 1014 * queue lock must be held. 1015 **/ 1016void blk_queue_invalidate_tags(request_queue_t *q) 1017{ 1018 struct blk_queue_tag *bqt = q->queue_tags; 1019 struct list_head *tmp, *n; 1020 struct request *rq; 1021 1022 list_for_each_safe(tmp, n, &bqt->busy_list) { 1023 rq = list_entry_rq(tmp); 1024 1025 if (rq->tag == -1) { 1026 printk(KERN_ERR 1027 "%s: bad tag found on list\n", __FUNCTION__); 1028 list_del_init(&rq->queuelist); 1029 rq->flags &= ~REQ_QUEUED; 1030 } else 1031 blk_queue_end_tag(q, rq); 1032 1033 rq->flags &= ~REQ_STARTED; 1034 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1035 } 1036} 1037 1038EXPORT_SYMBOL(blk_queue_invalidate_tags); 1039 1040static char *rq_flags[] = { 1041 "REQ_RW", 1042 "REQ_FAILFAST", 1043 "REQ_SOFTBARRIER", 1044 "REQ_HARDBARRIER", 1045 "REQ_CMD", 1046 "REQ_NOMERGE", 1047 "REQ_STARTED", 1048 "REQ_DONTPREP", 1049 "REQ_QUEUED", 1050 "REQ_PC", 1051 "REQ_BLOCK_PC", 1052 "REQ_SENSE", 1053 "REQ_FAILED", 1054 "REQ_QUIET", 1055 "REQ_SPECIAL", 1056 "REQ_DRIVE_CMD", 1057 "REQ_DRIVE_TASK", 1058 "REQ_DRIVE_TASKFILE", 1059 "REQ_PREEMPT", 1060 "REQ_PM_SUSPEND", 1061 "REQ_PM_RESUME", 1062 "REQ_PM_SHUTDOWN", 1063}; 1064 1065void blk_dump_rq_flags(struct request *rq, char *msg) 1066{ 1067 int bit; 1068 1069 printk("%s: dev %s: flags = ", msg, 1070 rq->rq_disk ? 
rq->rq_disk->disk_name : "?"); 1071 bit = 0; 1072 do { 1073 if (rq->flags & (1 << bit)) 1074 printk("%s ", rq_flags[bit]); 1075 bit++; 1076 } while (bit < __REQ_NR_BITS); 1077 1078 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, 1079 rq->nr_sectors, 1080 rq->current_nr_sectors); 1081 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); 1082 1083 if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) { 1084 printk("cdb: "); 1085 for (bit = 0; bit < sizeof(rq->cmd); bit++) 1086 printk("%02x ", rq->cmd[bit]); 1087 printk("\n"); 1088 } 1089} 1090 1091EXPORT_SYMBOL(blk_dump_rq_flags); 1092 1093void blk_recount_segments(request_queue_t *q, struct bio *bio) 1094{ 1095 struct bio_vec *bv, *bvprv = NULL; 1096 int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster; 1097 int high, highprv = 1; 1098 1099 if (unlikely(!bio->bi_io_vec)) 1100 return; 1101 1102 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1103 hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0; 1104 bio_for_each_segment(bv, bio, i) { 1105 /* 1106 * the trick here is making sure that a high page is never 1107 * considered part of another segment, since that might 1108 * change with the bounce page. 1109 */ 1110 high = page_to_pfn(bv->bv_page) >= q->bounce_pfn; 1111 if (high || highprv) 1112 goto new_hw_segment; 1113 if (cluster) { 1114 if (seg_size + bv->bv_len > q->max_segment_size) 1115 goto new_segment; 1116 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) 1117 goto new_segment; 1118 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 1119 goto new_segment; 1120 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1121 goto new_hw_segment; 1122 1123 seg_size += bv->bv_len; 1124 hw_seg_size += bv->bv_len; 1125 bvprv = bv; 1126 continue; 1127 } 1128new_segment: 1129 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && 1130 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) { 1131 hw_seg_size += bv->bv_len; 1132 } else { 1133new_hw_segment: 1134 if (hw_seg_size > bio->bi_hw_front_size) 1135 bio->bi_hw_front_size = hw_seg_size; 1136 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; 1137 nr_hw_segs++; 1138 } 1139 1140 nr_phys_segs++; 1141 bvprv = bv; 1142 seg_size = bv->bv_len; 1143 highprv = high; 1144 } 1145 if (hw_seg_size > bio->bi_hw_back_size) 1146 bio->bi_hw_back_size = hw_seg_size; 1147 if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size) 1148 bio->bi_hw_front_size = hw_seg_size; 1149 bio->bi_phys_segments = nr_phys_segs; 1150 bio->bi_hw_segments = nr_hw_segs; 1151 bio->bi_flags |= (1 << BIO_SEG_VALID); 1152} 1153 1154 1155static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio, 1156 struct bio *nxt) 1157{ 1158 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) 1159 return 0; 1160 1161 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) 1162 return 0; 1163 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1164 return 0; 1165 1166 /* 1167 * bio and nxt are contigous in memory, check if the queue allows 1168 * these two to be merged into one 1169 */ 1170 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 1171 return 1; 1172 1173 return 0; 1174} 1175 1176static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio, 1177 struct bio *nxt) 1178{ 1179 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1180 blk_recount_segments(q, bio); 1181 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) 1182 blk_recount_segments(q, nxt); 1183 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || 1184 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_front_size + 
bio->bi_hw_back_size)) 1185 return 0; 1186 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1187 return 0; 1188 1189 return 1; 1190} 1191 1192/* 1193 * map a request to scatterlist, return number of sg entries setup. Caller 1194 * must make sure sg can hold rq->nr_phys_segments entries 1195 */ 1196int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg) 1197{ 1198 struct bio_vec *bvec, *bvprv; 1199 struct bio *bio; 1200 int nsegs, i, cluster; 1201 1202 nsegs = 0; 1203 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1204 1205 /* 1206 * for each bio in rq 1207 */ 1208 bvprv = NULL; 1209 rq_for_each_bio(bio, rq) { 1210 /* 1211 * for each segment in bio 1212 */ 1213 bio_for_each_segment(bvec, bio, i) { 1214 int nbytes = bvec->bv_len; 1215 1216 if (bvprv && cluster) { 1217 if (sg[nsegs - 1].length + nbytes > q->max_segment_size) 1218 goto new_segment; 1219 1220 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 1221 goto new_segment; 1222 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 1223 goto new_segment; 1224 1225 sg[nsegs - 1].length += nbytes; 1226 } else { 1227new_segment: 1228 memset(&sg[nsegs],0,sizeof(struct scatterlist)); 1229 sg[nsegs].page = bvec->bv_page; 1230 sg[nsegs].length = nbytes; 1231 sg[nsegs].offset = bvec->bv_offset; 1232 1233 nsegs++; 1234 } 1235 bvprv = bvec; 1236 } /* segments in bio */ 1237 } /* bios in rq */ 1238 1239 return nsegs; 1240} 1241 1242EXPORT_SYMBOL(blk_rq_map_sg); 1243 1244/* 1245 * the standard queue merge functions, can be overridden with device 1246 * specific ones if so desired 1247 */ 1248 1249static inline int ll_new_mergeable(request_queue_t *q, 1250 struct request *req, 1251 struct bio *bio) 1252{ 1253 int nr_phys_segs = bio_phys_segments(q, bio); 1254 1255 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1256 req->flags |= REQ_NOMERGE; 1257 if (req == q->last_merge) 1258 q->last_merge = NULL; 1259 return 0; 1260 } 1261 1262 /* 1263 * A hw segment is just getting larger, bump just the phys 1264 * counter. 1265 */ 1266 req->nr_phys_segments += nr_phys_segs; 1267 return 1; 1268} 1269 1270static inline int ll_new_hw_segment(request_queue_t *q, 1271 struct request *req, 1272 struct bio *bio) 1273{ 1274 int nr_hw_segs = bio_hw_segments(q, bio); 1275 int nr_phys_segs = bio_phys_segments(q, bio); 1276 1277 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 1278 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1279 req->flags |= REQ_NOMERGE; 1280 if (req == q->last_merge) 1281 q->last_merge = NULL; 1282 return 0; 1283 } 1284 1285 /* 1286 * This will form the start of a new hw segment. Bump both 1287 * counters. 
1288 */ 1289 req->nr_hw_segments += nr_hw_segs; 1290 req->nr_phys_segments += nr_phys_segs; 1291 return 1; 1292} 1293 1294static int ll_back_merge_fn(request_queue_t *q, struct request *req, 1295 struct bio *bio) 1296{ 1297 int len; 1298 1299 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { 1300 req->flags |= REQ_NOMERGE; 1301 if (req == q->last_merge) 1302 q->last_merge = NULL; 1303 return 0; 1304 } 1305 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) 1306 blk_recount_segments(q, req->biotail); 1307 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1308 blk_recount_segments(q, bio); 1309 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; 1310 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && 1311 !BIOVEC_VIRT_OVERSIZE(len)) { 1312 int mergeable = ll_new_mergeable(q, req, bio); 1313 1314 if (mergeable) { 1315 if (req->nr_hw_segments == 1) 1316 req->bio->bi_hw_front_size = len; 1317 if (bio->bi_hw_segments == 1) 1318 bio->bi_hw_back_size = len; 1319 } 1320 return mergeable; 1321 } 1322 1323 return ll_new_hw_segment(q, req, bio); 1324} 1325 1326static int ll_front_merge_fn(request_queue_t *q, struct request *req, 1327 struct bio *bio) 1328{ 1329 int len; 1330 1331 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { 1332 req->flags |= REQ_NOMERGE; 1333 if (req == q->last_merge) 1334 q->last_merge = NULL; 1335 return 0; 1336 } 1337 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; 1338 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1339 blk_recount_segments(q, bio); 1340 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) 1341 blk_recount_segments(q, req->bio); 1342 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && 1343 !BIOVEC_VIRT_OVERSIZE(len)) { 1344 int mergeable = ll_new_mergeable(q, req, bio); 1345 1346 if (mergeable) { 1347 if (bio->bi_hw_segments == 1) 1348 bio->bi_hw_front_size = len; 1349 if (req->nr_hw_segments == 1) 1350 req->biotail->bi_hw_back_size = len; 1351 } 1352 return mergeable; 1353 } 1354 1355 return ll_new_hw_segment(q, req, bio); 1356} 1357 1358static int ll_merge_requests_fn(request_queue_t *q, struct request *req, 1359 struct request *next) 1360{ 1361 int total_phys_segments; 1362 int total_hw_segments; 1363 1364 /* 1365 * First check if the either of the requests are re-queued 1366 * requests. Can't merge them if they are. 1367 */ 1368 if (req->special || next->special) 1369 return 0; 1370 1371 /* 1372 * Will it become too large? 1373 */ 1374 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) 1375 return 0; 1376 1377 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 1378 if (blk_phys_contig_segment(q, req->biotail, next->bio)) 1379 total_phys_segments--; 1380 1381 if (total_phys_segments > q->max_phys_segments) 1382 return 0; 1383 1384 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 1385 if (blk_hw_contig_segment(q, req->biotail, next->bio)) { 1386 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; 1387 /* 1388 * propagate the combined length to the end of the requests 1389 */ 1390 if (req->nr_hw_segments == 1) 1391 req->bio->bi_hw_front_size = len; 1392 if (next->nr_hw_segments == 1) 1393 next->biotail->bi_hw_back_size = len; 1394 total_hw_segments--; 1395 } 1396 1397 if (total_hw_segments > q->max_hw_segments) 1398 return 0; 1399 1400 /* Merge is OK... 
*/ 1401 req->nr_phys_segments = total_phys_segments; 1402 req->nr_hw_segments = total_hw_segments; 1403 return 1; 1404} 1405 1406/* 1407 * "plug" the device if there are no outstanding requests: this will 1408 * force the transfer to start only after we have put all the requests 1409 * on the list. 1410 * 1411 * This is called with interrupts off and no requests on the queue and 1412 * with the queue lock held. 1413 */ 1414void blk_plug_device(request_queue_t *q) 1415{ 1416 WARN_ON(!irqs_disabled()); 1417 1418 /* 1419 * don't plug a stopped queue, it must be paired with blk_start_queue() 1420 * which will restart the queueing 1421 */ 1422 if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) 1423 return; 1424 1425 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1426 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1427} 1428 1429EXPORT_SYMBOL(blk_plug_device); 1430 1431/* 1432 * remove the queue from the plugged list, if present. called with 1433 * queue lock held and interrupts disabled. 1434 */ 1435int blk_remove_plug(request_queue_t *q) 1436{ 1437 WARN_ON(!irqs_disabled()); 1438 1439 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1440 return 0; 1441 1442 del_timer(&q->unplug_timer); 1443 return 1; 1444} 1445 1446EXPORT_SYMBOL(blk_remove_plug); 1447 1448/* 1449 * remove the plug and let it rip.. 1450 */ 1451void __generic_unplug_device(request_queue_t *q) 1452{ 1453 if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))) 1454 return; 1455 1456 if (!blk_remove_plug(q)) 1457 return; 1458 1459 q->request_fn(q); 1460} 1461EXPORT_SYMBOL(__generic_unplug_device); 1462 1463/** 1464 * generic_unplug_device - fire a request queue 1465 * @q: The &request_queue_t in question 1466 * 1467 * Description: 1468 * Linux uses plugging to build bigger requests queues before letting 1469 * the device have at them. If a queue is plugged, the I/O scheduler 1470 * is still adding and merging requests on the queue. Once the queue 1471 * gets unplugged, the request_fn defined for the queue is invoked and 1472 * transfers started. 1473 **/ 1474void generic_unplug_device(request_queue_t *q) 1475{ 1476 spin_lock_irq(q->queue_lock); 1477 __generic_unplug_device(q); 1478 spin_unlock_irq(q->queue_lock); 1479} 1480EXPORT_SYMBOL(generic_unplug_device); 1481 1482static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 1483 struct page *page) 1484{ 1485 request_queue_t *q = bdi->unplug_io_data; 1486 1487 /* 1488 * devices don't necessarily have an ->unplug_fn defined 1489 */ 1490 if (q->unplug_fn) 1491 q->unplug_fn(q); 1492} 1493 1494static void blk_unplug_work(void *data) 1495{ 1496 request_queue_t *q = data; 1497 1498 q->unplug_fn(q); 1499} 1500 1501static void blk_unplug_timeout(unsigned long data) 1502{ 1503 request_queue_t *q = (request_queue_t *)data; 1504 1505 kblockd_schedule_work(&q->unplug_work); 1506} 1507 1508/** 1509 * blk_start_queue - restart a previously stopped queue 1510 * @q: The &request_queue_t in question 1511 * 1512 * Description: 1513 * blk_start_queue() will clear the stop flag on the queue, and call 1514 * the request_fn for the queue if it was in a stopped state when 1515 * entered. Also see blk_stop_queue(). Queue lock must be held. 
1516 **/ 1517void blk_start_queue(request_queue_t *q) 1518{ 1519 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1520 1521 /* 1522 * one level of recursion is ok and is much faster than kicking 1523 * the unplug handling 1524 */ 1525 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1526 q->request_fn(q); 1527 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1528 } else { 1529 blk_plug_device(q); 1530 kblockd_schedule_work(&q->unplug_work); 1531 } 1532} 1533 1534EXPORT_SYMBOL(blk_start_queue); 1535 1536/** 1537 * blk_stop_queue - stop a queue 1538 * @q: The &request_queue_t in question 1539 * 1540 * Description: 1541 * The Linux block layer assumes that a block driver will consume all 1542 * entries on the request queue when the request_fn strategy is called. 1543 * Often this will not happen, because of hardware limitations (queue 1544 * depth settings). If a device driver gets a 'queue full' response, 1545 * or if it simply chooses not to queue more I/O at one point, it can 1546 * call this function to prevent the request_fn from being called until 1547 * the driver has signalled it's ready to go again. This happens by calling 1548 * blk_start_queue() to restart queue operations. Queue lock must be held. 1549 **/ 1550void blk_stop_queue(request_queue_t *q) 1551{ 1552 blk_remove_plug(q); 1553 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1554} 1555EXPORT_SYMBOL(blk_stop_queue); 1556 1557/** 1558 * blk_sync_queue - cancel any pending callbacks on a queue 1559 * @q: the queue 1560 * 1561 * Description: 1562 * The block layer may perform asynchronous callback activity 1563 * on a queue, such as calling the unplug function after a timeout. 1564 * A block device may call blk_sync_queue to ensure that any 1565 * such activity is cancelled, thus allowing it to release resources 1566 * the the callbacks might use. The caller must already have made sure 1567 * that its ->make_request_fn will not re-add plugging prior to calling 1568 * this function. 1569 * 1570 */ 1571void blk_sync_queue(struct request_queue *q) 1572{ 1573 del_timer_sync(&q->unplug_timer); 1574 kblockd_flush(); 1575} 1576EXPORT_SYMBOL(blk_sync_queue); 1577 1578/** 1579 * blk_run_queue - run a single device queue 1580 * @q: The queue to run 1581 */ 1582void blk_run_queue(struct request_queue *q) 1583{ 1584 unsigned long flags; 1585 1586 spin_lock_irqsave(q->queue_lock, flags); 1587 blk_remove_plug(q); 1588 if (!elv_queue_empty(q)) 1589 q->request_fn(q); 1590 spin_unlock_irqrestore(q->queue_lock, flags); 1591} 1592EXPORT_SYMBOL(blk_run_queue); 1593 1594/** 1595 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed 1596 * @q: the request queue to be released 1597 * 1598 * Description: 1599 * blk_cleanup_queue is the pair to blk_init_queue() or 1600 * blk_queue_make_request(). It should be called when a request queue is 1601 * being released; typically when a block device is being de-registered. 1602 * Currently, its primary task it to free all the &struct request 1603 * structures that were allocated to the queue and the queue itself. 1604 * 1605 * Caveat: 1606 * Hopefully the low level driver will have finished any 1607 * outstanding requests first... 
1608 **/ 1609void blk_cleanup_queue(request_queue_t * q) 1610{ 1611 struct request_list *rl = &q->rq; 1612 1613 if (!atomic_dec_and_test(&q->refcnt)) 1614 return; 1615 1616 if (q->elevator) 1617 elevator_exit(q->elevator); 1618 1619 blk_sync_queue(q); 1620 1621 if (rl->rq_pool) 1622 mempool_destroy(rl->rq_pool); 1623 1624 if (q->queue_tags) 1625 __blk_queue_free_tags(q); 1626 1627 blk_queue_ordered(q, QUEUE_ORDERED_NONE); 1628 1629 kmem_cache_free(requestq_cachep, q); 1630} 1631 1632EXPORT_SYMBOL(blk_cleanup_queue); 1633 1634static int blk_init_free_list(request_queue_t *q) 1635{ 1636 struct request_list *rl = &q->rq; 1637 1638 rl->count[READ] = rl->count[WRITE] = 0; 1639 rl->starved[READ] = rl->starved[WRITE] = 0; 1640 init_waitqueue_head(&rl->wait[READ]); 1641 init_waitqueue_head(&rl->wait[WRITE]); 1642 init_waitqueue_head(&rl->drain); 1643 1644 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 1645 mempool_free_slab, request_cachep, q->node); 1646 1647 if (!rl->rq_pool) 1648 return -ENOMEM; 1649 1650 return 0; 1651} 1652 1653static int __make_request(request_queue_t *, struct bio *); 1654 1655request_queue_t *blk_alloc_queue(int gfp_mask) 1656{ 1657 return blk_alloc_queue_node(gfp_mask, -1); 1658} 1659EXPORT_SYMBOL(blk_alloc_queue); 1660 1661request_queue_t *blk_alloc_queue_node(int gfp_mask, int node_id) 1662{ 1663 request_queue_t *q; 1664 1665 q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id); 1666 if (!q) 1667 return NULL; 1668 1669 memset(q, 0, sizeof(*q)); 1670 init_timer(&q->unplug_timer); 1671 atomic_set(&q->refcnt, 1); 1672 1673 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1674 q->backing_dev_info.unplug_io_data = q; 1675 1676 return q; 1677} 1678EXPORT_SYMBOL(blk_alloc_queue_node); 1679 1680/** 1681 * blk_init_queue - prepare a request queue for use with a block device 1682 * @rfn: The function to be called to process requests that have been 1683 * placed on the queue. 1684 * @lock: Request queue spin lock 1685 * 1686 * Description: 1687 * If a block device wishes to use the standard request handling procedures, 1688 * which sorts requests and coalesces adjacent requests, then it must 1689 * call blk_init_queue(). The function @rfn will be called when there 1690 * are requests on the queue that need to be processed. If the device 1691 * supports plugging, then @rfn may not be called immediately when requests 1692 * are available on the queue, but may be called at some time later instead. 1693 * Plugged queues are generally unplugged when a buffer belonging to one 1694 * of the requests on the queue is needed, or due to memory pressure. 1695 * 1696 * @rfn is not required, or even expected, to remove all requests off the 1697 * queue, but only as many as it can handle at a time. If it does leave 1698 * requests on the queue, it is responsible for arranging that the requests 1699 * get dealt with eventually. 1700 * 1701 * The queue spin lock must be held while manipulating the requests on the 1702 * request queue. 1703 * 1704 * Function returns a pointer to the initialized request queue, or NULL if 1705 * it didn't succeed. 1706 * 1707 * Note: 1708 * blk_init_queue() must be paired with a blk_cleanup_queue() call 1709 * when the block device is deactivated (such as at module unload). 
1710 **/ 1711 1712request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 1713{ 1714 return blk_init_queue_node(rfn, lock, -1); 1715} 1716EXPORT_SYMBOL(blk_init_queue); 1717 1718request_queue_t * 1719blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1720{ 1721 request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 1722 1723 if (!q) 1724 return NULL; 1725 1726 q->node = node_id; 1727 if (blk_init_free_list(q)) 1728 goto out_init; 1729 1730 /* 1731 * if caller didn't supply a lock, they get per-queue locking with 1732 * our embedded lock 1733 */ 1734 if (!lock) { 1735 spin_lock_init(&q->__queue_lock); 1736 lock = &q->__queue_lock; 1737 } 1738 1739 q->request_fn = rfn; 1740 q->back_merge_fn = ll_back_merge_fn; 1741 q->front_merge_fn = ll_front_merge_fn; 1742 q->merge_requests_fn = ll_merge_requests_fn; 1743 q->prep_rq_fn = NULL; 1744 q->unplug_fn = generic_unplug_device; 1745 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 1746 q->queue_lock = lock; 1747 1748 blk_queue_segment_boundary(q, 0xffffffff); 1749 1750 blk_queue_make_request(q, __make_request); 1751 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 1752 1753 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 1754 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 1755 1756 /* 1757 * all done 1758 */ 1759 if (!elevator_init(q, NULL)) { 1760 blk_queue_congestion_threshold(q); 1761 return q; 1762 } 1763 1764 blk_cleanup_queue(q); 1765out_init: 1766 kmem_cache_free(requestq_cachep, q); 1767 return NULL; 1768} 1769EXPORT_SYMBOL(blk_init_queue_node); 1770 1771int blk_get_queue(request_queue_t *q) 1772{ 1773 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1774 atomic_inc(&q->refcnt); 1775 return 0; 1776 } 1777 1778 return 1; 1779} 1780 1781EXPORT_SYMBOL(blk_get_queue); 1782 1783static inline void blk_free_request(request_queue_t *q, struct request *rq) 1784{ 1785 elv_put_request(q, rq); 1786 mempool_free(rq, q->rq.rq_pool); 1787} 1788 1789static inline struct request * 1790blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask) 1791{ 1792 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 1793 1794 if (!rq) 1795 return NULL; 1796 1797 /* 1798 * first three bits are identical in rq->flags and bio->bi_rw, 1799 * see bio.h and blkdev.h 1800 */ 1801 rq->flags = rw; 1802 1803 if (!elv_set_request(q, rq, bio, gfp_mask)) 1804 return rq; 1805 1806 mempool_free(rq, q->rq.rq_pool); 1807 return NULL; 1808} 1809 1810/* 1811 * ioc_batching returns true if the ioc is a valid batching request and 1812 * should be given priority access to a request. 1813 */ 1814static inline int ioc_batching(request_queue_t *q, struct io_context *ioc) 1815{ 1816 if (!ioc) 1817 return 0; 1818 1819 /* 1820 * Make sure the process is able to allocate at least 1 request 1821 * even if the batch times out, otherwise we could theoretically 1822 * lose wakeups. 1823 */ 1824 return ioc->nr_batch_requests == q->nr_batching || 1825 (ioc->nr_batch_requests > 0 1826 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 1827} 1828 1829/* 1830 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 1831 * will cause the process to be a "batcher" on all queues in the system. This 1832 * is the behaviour we want though - once it gets a wakeup it should be given 1833 * a nice run. 
1834 */ 1835static void ioc_set_batching(request_queue_t *q, struct io_context *ioc) 1836{ 1837 if (!ioc || ioc_batching(q, ioc)) 1838 return; 1839 1840 ioc->nr_batch_requests = q->nr_batching; 1841 ioc->last_waited = jiffies; 1842} 1843 1844static void __freed_request(request_queue_t *q, int rw) 1845{ 1846 struct request_list *rl = &q->rq; 1847 1848 if (rl->count[rw] < queue_congestion_off_threshold(q)) 1849 clear_queue_congested(q, rw); 1850 1851 if (rl->count[rw] + 1 <= q->nr_requests) { 1852 if (waitqueue_active(&rl->wait[rw])) 1853 wake_up(&rl->wait[rw]); 1854 1855 blk_clear_queue_full(q, rw); 1856 } 1857} 1858 1859/* 1860 * A request has just been released. Account for it, update the full and 1861 * congestion status, wake up any waiters. Called under q->queue_lock. 1862 */ 1863static void freed_request(request_queue_t *q, int rw) 1864{ 1865 struct request_list *rl = &q->rq; 1866 1867 rl->count[rw]--; 1868 1869 __freed_request(q, rw); 1870 1871 if (unlikely(rl->starved[rw ^ 1])) 1872 __freed_request(q, rw ^ 1); 1873 1874 if (!rl->count[READ] && !rl->count[WRITE]) { 1875 smp_mb(); 1876 if (unlikely(waitqueue_active(&rl->drain))) 1877 wake_up(&rl->drain); 1878 } 1879} 1880 1881#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 1882/* 1883 * Get a free request, queue_lock must be held. 1884 * Returns NULL on failure, with queue_lock held. 1885 * Returns !NULL on success, with queue_lock *not held*. 1886 */ 1887static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, 1888 int gfp_mask) 1889{ 1890 struct request *rq = NULL; 1891 struct request_list *rl = &q->rq; 1892 struct io_context *ioc = current_io_context(GFP_ATOMIC); 1893 1894 if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) 1895 goto out; 1896 1897 if (rl->count[rw]+1 >= q->nr_requests) { 1898 /* 1899 * The queue will fill after this allocation, so set it as 1900 * full, and mark this process as "batching". This process 1901 * will be allowed to complete a batch of requests, others 1902 * will be blocked. 1903 */ 1904 if (!blk_queue_full(q, rw)) { 1905 ioc_set_batching(q, ioc); 1906 blk_set_queue_full(q, rw); 1907 } 1908 } 1909 1910 switch (elv_may_queue(q, rw, bio)) { 1911 case ELV_MQUEUE_NO: 1912 goto rq_starved; 1913 case ELV_MQUEUE_MAY: 1914 break; 1915 case ELV_MQUEUE_MUST: 1916 goto get_rq; 1917 } 1918 1919 if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) { 1920 /* 1921 * The queue is full and the allocating process is not a 1922 * "batcher", and not exempted by the IO scheduler 1923 */ 1924 goto out; 1925 } 1926 1927get_rq: 1928 /* 1929 * Only allow batching queuers to allocate up to 50% over the defined 1930 * limit of requests, otherwise we could have thousands of requests 1931 * allocated with any setting of ->nr_requests 1932 */ 1933 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 1934 goto out; 1935 1936 rl->count[rw]++; 1937 rl->starved[rw] = 0; 1938 if (rl->count[rw] >= queue_congestion_on_threshold(q)) 1939 set_queue_congested(q, rw); 1940 spin_unlock_irq(q->queue_lock); 1941 1942 rq = blk_alloc_request(q, rw, bio, gfp_mask); 1943 if (!rq) { 1944 /* 1945 * Allocation failed presumably due to memory. Undo anything 1946 * we might have messed up. 1947 * 1948 * Allocating task should really be put onto the front of the 1949 * wait queue, but this is pretty rare. 
1950 */ 1951 spin_lock_irq(q->queue_lock); 1952 freed_request(q, rw); 1953 1954 /* 1955 * in the very unlikely event that allocation failed and no 1956 * requests for this direction was pending, mark us starved 1957 * so that freeing of a request in the other direction will 1958 * notice us. another possible fix would be to split the 1959 * rq mempool into READ and WRITE 1960 */ 1961rq_starved: 1962 if (unlikely(rl->count[rw] == 0)) 1963 rl->starved[rw] = 1; 1964 1965 goto out; 1966 } 1967 1968 if (ioc_batching(q, ioc)) 1969 ioc->nr_batch_requests--; 1970 1971 rq_init(q, rq); 1972 rq->rl = rl; 1973out: 1974 return rq; 1975} 1976 1977/* 1978 * No available requests for this queue, unplug the device and wait for some 1979 * requests to become available. 1980 * 1981 * Called with q->queue_lock held, and returns with it unlocked. 1982 */ 1983static struct request *get_request_wait(request_queue_t *q, int rw, 1984 struct bio *bio) 1985{ 1986 struct request *rq; 1987 1988 rq = get_request(q, rw, bio, GFP_NOIO); 1989 while (!rq) { 1990 DEFINE_WAIT(wait); 1991 struct request_list *rl = &q->rq; 1992 1993 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 1994 TASK_UNINTERRUPTIBLE); 1995 1996 rq = get_request(q, rw, bio, GFP_NOIO); 1997 1998 if (!rq) { 1999 struct io_context *ioc; 2000 2001 __generic_unplug_device(q); 2002 spin_unlock_irq(q->queue_lock); 2003 io_schedule(); 2004 2005 /* 2006 * After sleeping, we become a "batching" process and 2007 * will be able to allocate at least one request, and 2008 * up to a big batch of them for a small period time. 2009 * See ioc_batching, ioc_set_batching 2010 */ 2011 ioc = current_io_context(GFP_NOIO); 2012 ioc_set_batching(q, ioc); 2013 2014 spin_lock_irq(q->queue_lock); 2015 } 2016 finish_wait(&rl->wait[rw], &wait); 2017 } 2018 2019 return rq; 2020} 2021 2022struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) 2023{ 2024 struct request *rq; 2025 2026 BUG_ON(rw != READ && rw != WRITE); 2027 2028 spin_lock_irq(q->queue_lock); 2029 if (gfp_mask & __GFP_WAIT) { 2030 rq = get_request_wait(q, rw, NULL); 2031 } else { 2032 rq = get_request(q, rw, NULL, gfp_mask); 2033 if (!rq) 2034 spin_unlock_irq(q->queue_lock); 2035 } 2036 /* q->queue_lock is unlocked at this point */ 2037 2038 return rq; 2039} 2040EXPORT_SYMBOL(blk_get_request); 2041 2042/** 2043 * blk_requeue_request - put a request back on queue 2044 * @q: request queue where request should be inserted 2045 * @rq: request to be inserted 2046 * 2047 * Description: 2048 * Drivers often keep queueing requests until the hardware cannot accept 2049 * more, when that condition happens we need to put the request back 2050 * on the queue. Must be called with queue lock held. 2051 */ 2052void blk_requeue_request(request_queue_t *q, struct request *rq) 2053{ 2054 if (blk_rq_tagged(rq)) 2055 blk_queue_end_tag(q, rq); 2056 2057 elv_requeue_request(q, rq); 2058} 2059 2060EXPORT_SYMBOL(blk_requeue_request); 2061 2062/** 2063 * blk_insert_request - insert a special request in to a request queue 2064 * @q: request queue where request should be inserted 2065 * @rq: request to be inserted 2066 * @at_head: insert request at head or tail of queue 2067 * @data: private data 2068 * 2069 * Description: 2070 * Many block devices need to execute commands asynchronously, so they don't 2071 * block the whole kernel from preemption during request execution. 
This is 2072 * accomplished normally by inserting aritficial requests tagged as 2073 * REQ_SPECIAL in to the corresponding request queue, and letting them be 2074 * scheduled for actual execution by the request queue. 2075 * 2076 * We have the option of inserting the head or the tail of the queue. 2077 * Typically we use the tail for new ioctls and so forth. We use the head 2078 * of the queue for things like a QUEUE_FULL message from a device, or a 2079 * host that is unable to accept a particular command. 2080 */ 2081void blk_insert_request(request_queue_t *q, struct request *rq, 2082 int at_head, void *data) 2083{ 2084 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2085 unsigned long flags; 2086 2087 /* 2088 * tell I/O scheduler that this isn't a regular read/write (ie it 2089 * must not attempt merges on this) and that it acts as a soft 2090 * barrier 2091 */ 2092 rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER; 2093 2094 rq->special = data; 2095 2096 spin_lock_irqsave(q->queue_lock, flags); 2097 2098 /* 2099 * If command is tagged, release the tag 2100 */ 2101 if (blk_rq_tagged(rq)) 2102 blk_queue_end_tag(q, rq); 2103 2104 drive_stat_acct(rq, rq->nr_sectors, 1); 2105 __elv_add_request(q, rq, where, 0); 2106 2107 if (blk_queue_plugged(q)) 2108 __generic_unplug_device(q); 2109 else 2110 q->request_fn(q); 2111 spin_unlock_irqrestore(q->queue_lock, flags); 2112} 2113 2114EXPORT_SYMBOL(blk_insert_request); 2115 2116/** 2117 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 2118 * @q: request queue where request should be inserted 2119 * @rq: request structure to fill 2120 * @ubuf: the user buffer 2121 * @len: length of user data 2122 * 2123 * Description: 2124 * Data will be mapped directly for zero copy io, if possible. Otherwise 2125 * a kernel bounce buffer is used. 2126 * 2127 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2128 * still in process context. 2129 * 2130 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2131 * before being submitted to the device, as pages mapped may be out of 2132 * reach. It's the callers responsibility to make sure this happens. The 2133 * original bio must be passed back in to blk_rq_unmap_user() for proper 2134 * unmapping. 2135 */ 2136int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf, 2137 unsigned int len) 2138{ 2139 unsigned long uaddr; 2140 struct bio *bio; 2141 int reading; 2142 2143 if (len > (q->max_sectors << 9)) 2144 return -EINVAL; 2145 if (!len || !ubuf) 2146 return -EINVAL; 2147 2148 reading = rq_data_dir(rq) == READ; 2149 2150 /* 2151 * if alignment requirement is satisfied, map in user pages for 2152 * direct dma. 
else, set up kernel bounce buffers 2153 */ 2154 uaddr = (unsigned long) ubuf; 2155 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) 2156 bio = bio_map_user(q, NULL, uaddr, len, reading); 2157 else 2158 bio = bio_copy_user(q, uaddr, len, reading); 2159 2160 if (!IS_ERR(bio)) { 2161 rq->bio = rq->biotail = bio; 2162 blk_rq_bio_prep(q, rq, bio); 2163 2164 rq->buffer = rq->data = NULL; 2165 rq->data_len = len; 2166 return 0; 2167 } 2168 2169 /* 2170 * bio is the err-ptr 2171 */ 2172 return PTR_ERR(bio); 2173} 2174 2175EXPORT_SYMBOL(blk_rq_map_user); 2176 2177/** 2178 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 2179 * @q: request queue where request should be inserted 2180 * @rq: request to map data to 2181 * @iov: pointer to the iovec 2182 * @iov_count: number of elements in the iovec 2183 * 2184 * Description: 2185 * Data will be mapped directly for zero copy io, if possible. Otherwise 2186 * a kernel bounce buffer is used. 2187 * 2188 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2189 * still in process context. 2190 * 2191 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2192 * before being submitted to the device, as pages mapped may be out of 2193 * reach. It's the callers responsibility to make sure this happens. The 2194 * original bio must be passed back in to blk_rq_unmap_user() for proper 2195 * unmapping. 2196 */ 2197int blk_rq_map_user_iov(request_queue_t *q, struct request *rq, 2198 struct sg_iovec *iov, int iov_count) 2199{ 2200 struct bio *bio; 2201 2202 if (!iov || iov_count <= 0) 2203 return -EINVAL; 2204 2205 /* we don't allow misaligned data like bio_map_user() does. If the 2206 * user is using sg, they're expected to know the alignment constraints 2207 * and respect them accordingly */ 2208 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); 2209 if (IS_ERR(bio)) 2210 return PTR_ERR(bio); 2211 2212 rq->bio = rq->biotail = bio; 2213 blk_rq_bio_prep(q, rq, bio); 2214 rq->buffer = rq->data = NULL; 2215 rq->data_len = bio->bi_size; 2216 return 0; 2217} 2218 2219EXPORT_SYMBOL(blk_rq_map_user_iov); 2220 2221/** 2222 * blk_rq_unmap_user - unmap a request with user data 2223 * @bio: bio to be unmapped 2224 * @ulen: length of user buffer 2225 * 2226 * Description: 2227 * Unmap a bio previously mapped by blk_rq_map_user(). 
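 *
 * Example (editorial sketch; q, rq, disk, ubuf and len are the caller's own
 * variables): map the user buffer, execute the request, then unmap the
 * original bio.
 *
 *	if (blk_rq_map_user(q, rq, ubuf, len))
 *		goto fail;
 *	bio = rq->bio;
 *	blk_execute_rq(q, disk, rq, 0);
 *	blk_rq_unmap_user(bio, len);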
2228 */ 2229int blk_rq_unmap_user(struct bio *bio, unsigned int ulen) 2230{ 2231 int ret = 0; 2232 2233 if (bio) { 2234 if (bio_flagged(bio, BIO_USER_MAPPED)) 2235 bio_unmap_user(bio); 2236 else 2237 ret = bio_uncopy_user(bio); 2238 } 2239 2240 return 0; 2241} 2242 2243EXPORT_SYMBOL(blk_rq_unmap_user); 2244 2245/** 2246 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 2247 * @q: request queue where request should be inserted 2248 * @rq: request to fill 2249 * @kbuf: the kernel buffer 2250 * @len: length of user data 2251 * @gfp_mask: memory allocation flags 2252 */ 2253int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf, 2254 unsigned int len, unsigned int gfp_mask) 2255{ 2256 struct bio *bio; 2257 2258 if (len > (q->max_sectors << 9)) 2259 return -EINVAL; 2260 if (!len || !kbuf) 2261 return -EINVAL; 2262 2263 bio = bio_map_kern(q, kbuf, len, gfp_mask); 2264 if (IS_ERR(bio)) 2265 return PTR_ERR(bio); 2266 2267 if (rq_data_dir(rq) == WRITE) 2268 bio->bi_rw |= (1 << BIO_RW); 2269 2270 rq->bio = rq->biotail = bio; 2271 blk_rq_bio_prep(q, rq, bio); 2272 2273 rq->buffer = rq->data = NULL; 2274 rq->data_len = len; 2275 return 0; 2276} 2277 2278EXPORT_SYMBOL(blk_rq_map_kern); 2279 2280/** 2281 * blk_execute_rq_nowait - insert a request into queue for execution 2282 * @q: queue to insert the request in 2283 * @bd_disk: matching gendisk 2284 * @rq: request to insert 2285 * @at_head: insert request at head or tail of queue 2286 * @done: I/O completion handler 2287 * 2288 * Description: 2289 * Insert a fully prepared request at the back of the io scheduler queue 2290 * for execution. Don't wait for completion. 2291 */ 2292void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk, 2293 struct request *rq, int at_head, 2294 void (*done)(struct request *)) 2295{ 2296 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2297 2298 rq->rq_disk = bd_disk; 2299 rq->flags |= REQ_NOMERGE; 2300 rq->end_io = done; 2301 elv_add_request(q, rq, where, 1); 2302 generic_unplug_device(q); 2303} 2304 2305/** 2306 * blk_execute_rq - insert a request into queue for execution 2307 * @q: queue to insert the request in 2308 * @bd_disk: matching gendisk 2309 * @rq: request to insert 2310 * @at_head: insert request at head or tail of queue 2311 * 2312 * Description: 2313 * Insert a fully prepared request at the back of the io scheduler queue 2314 * for execution and wait for completion. 2315 */ 2316int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk, 2317 struct request *rq, int at_head) 2318{ 2319 DECLARE_COMPLETION(wait); 2320 char sense[SCSI_SENSE_BUFFERSIZE]; 2321 int err = 0; 2322 2323 /* 2324 * we need an extra reference to the request, so we can look at 2325 * it after io completion 2326 */ 2327 rq->ref_count++; 2328 2329 if (!rq->sense) { 2330 memset(sense, 0, sizeof(sense)); 2331 rq->sense = sense; 2332 rq->sense_len = 0; 2333 } 2334 2335 rq->waiting = &wait; 2336 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 2337 wait_for_completion(&wait); 2338 rq->waiting = NULL; 2339 2340 if (rq->errors) 2341 err = -EIO; 2342 2343 return err; 2344} 2345 2346EXPORT_SYMBOL(blk_execute_rq); 2347 2348/** 2349 * blkdev_issue_flush - queue a flush 2350 * @bdev: blockdev to issue flush for 2351 * @error_sector: error sector 2352 * 2353 * Description: 2354 * Issue a flush for the block device in question. Caller can supply 2355 * room for storing the error offset in case of a flush error, if they 2356 * wish to. 
Caller must run wait_for_completion() on its own. 2357 */ 2358int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 2359{ 2360 request_queue_t *q; 2361 2362 if (bdev->bd_disk == NULL) 2363 return -ENXIO; 2364 2365 q = bdev_get_queue(bdev); 2366 if (!q) 2367 return -ENXIO; 2368 if (!q->issue_flush_fn) 2369 return -EOPNOTSUPP; 2370 2371 return q->issue_flush_fn(q, bdev->bd_disk, error_sector); 2372} 2373 2374EXPORT_SYMBOL(blkdev_issue_flush); 2375 2376/** 2377 * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices 2378 * @q: device queue 2379 * @disk: gendisk 2380 * @error_sector: error offset 2381 * 2382 * Description: 2383 * Devices understanding the SCSI command set, can use this function as 2384 * a helper for issuing a cache flush. Note: driver is required to store 2385 * the error offset (in case of error flushing) in ->sector of struct 2386 * request. 2387 */ 2388int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk, 2389 sector_t *error_sector) 2390{ 2391 struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT); 2392 int ret; 2393 2394 rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER; 2395 rq->sector = 0; 2396 memset(rq->cmd, 0, sizeof(rq->cmd)); 2397 rq->cmd[0] = 0x35; 2398 rq->cmd_len = 12; 2399 rq->data = NULL; 2400 rq->data_len = 0; 2401 rq->timeout = 60 * HZ; 2402 2403 ret = blk_execute_rq(q, disk, rq, 0); 2404 2405 if (ret && error_sector) 2406 *error_sector = rq->sector; 2407 2408 blk_put_request(rq); 2409 return ret; 2410} 2411 2412EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn); 2413 2414static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) 2415{ 2416 int rw = rq_data_dir(rq); 2417 2418 if (!blk_fs_request(rq) || !rq->rq_disk) 2419 return; 2420 2421 if (rw == READ) { 2422 __disk_stat_add(rq->rq_disk, read_sectors, nr_sectors); 2423 if (!new_io) 2424 __disk_stat_inc(rq->rq_disk, read_merges); 2425 } else if (rw == WRITE) { 2426 __disk_stat_add(rq->rq_disk, write_sectors, nr_sectors); 2427 if (!new_io) 2428 __disk_stat_inc(rq->rq_disk, write_merges); 2429 } 2430 if (new_io) { 2431 disk_round_stats(rq->rq_disk); 2432 rq->rq_disk->in_flight++; 2433 } 2434} 2435 2436/* 2437 * add-request adds a request to the linked list. 2438 * queue lock is held and interrupts disabled, as we muck with the 2439 * request queue list. 2440 */ 2441static inline void add_request(request_queue_t * q, struct request * req) 2442{ 2443 drive_stat_acct(req, req->nr_sectors, 1); 2444 2445 if (q->activity_fn) 2446 q->activity_fn(q->activity_data, rq_data_dir(req)); 2447 2448 /* 2449 * elevator indicated where it wants this request to be 2450 * inserted at elevator_merge time 2451 */ 2452 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 2453} 2454 2455/* 2456 * disk_round_stats() - Round off the performance stats on a struct 2457 * disk_stats. 2458 * 2459 * The average IO queue length and utilisation statistics are maintained 2460 * by observing the current state of the queue length and the amount of 2461 * time it has been in this state for. 2462 * 2463 * Normally, that accounting is done on IO completion, but that can result 2464 * in more than a second's worth of IO being accounted for within any one 2465 * second, leading to >100% utilisation. To deal with that, we call this 2466 * function to do a round-off before returning the results when reading 2467 * /proc/diskstats. This accounts immediately for all queue usage up to 2468 * the current jiffies and restarts the counters again. 
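 *
 * Worked example (editorial): if 3 requests have been in flight since the
 * last call 10 jiffies ago, this call adds 3 * 10 = 30 to time_in_queue and
 * 10 to io_ticks, then restarts both intervals at the current jiffy.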
2469 */ 2470void disk_round_stats(struct gendisk *disk) 2471{ 2472 unsigned long now = jiffies; 2473 2474 __disk_stat_add(disk, time_in_queue, 2475 disk->in_flight * (now - disk->stamp)); 2476 disk->stamp = now; 2477 2478 if (disk->in_flight) 2479 __disk_stat_add(disk, io_ticks, (now - disk->stamp_idle)); 2480 disk->stamp_idle = now; 2481} 2482 2483/* 2484 * queue lock must be held 2485 */ 2486static void __blk_put_request(request_queue_t *q, struct request *req) 2487{ 2488 struct request_list *rl = req->rl; 2489 2490 if (unlikely(!q)) 2491 return; 2492 if (unlikely(--req->ref_count)) 2493 return; 2494 2495 req->rq_status = RQ_INACTIVE; 2496 req->rl = NULL; 2497 2498 /* 2499 * Request may not have originated from ll_rw_blk. if not, 2500 * it didn't come out of our reserved rq pools 2501 */ 2502 if (rl) { 2503 int rw = rq_data_dir(req); 2504 2505 elv_completed_request(q, req); 2506 2507 BUG_ON(!list_empty(&req->queuelist)); 2508 2509 blk_free_request(q, req); 2510 freed_request(q, rw); 2511 } 2512} 2513 2514void blk_put_request(struct request *req) 2515{ 2516 /* 2517 * if req->rl isn't set, this request didnt originate from the 2518 * block layer, so it's safe to just disregard it 2519 */ 2520 if (req->rl) { 2521 unsigned long flags; 2522 request_queue_t *q = req->q; 2523 2524 spin_lock_irqsave(q->queue_lock, flags); 2525 __blk_put_request(q, req); 2526 spin_unlock_irqrestore(q->queue_lock, flags); 2527 } 2528} 2529 2530EXPORT_SYMBOL(blk_put_request); 2531 2532/** 2533 * blk_end_sync_rq - executes a completion event on a request 2534 * @rq: request to complete 2535 */ 2536void blk_end_sync_rq(struct request *rq) 2537{ 2538 struct completion *waiting = rq->waiting; 2539 2540 rq->waiting = NULL; 2541 __blk_put_request(rq->q, rq); 2542 2543 /* 2544 * complete last, if this is a stack request the process (and thus 2545 * the rq pointer) could be invalid right after this complete() 2546 */ 2547 complete(waiting); 2548} 2549EXPORT_SYMBOL(blk_end_sync_rq); 2550 2551/** 2552 * blk_congestion_wait - wait for a queue to become uncongested 2553 * @rw: READ or WRITE 2554 * @timeout: timeout in jiffies 2555 * 2556 * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion. 2557 * If no queues are congested then just wait for the next request to be 2558 * returned. 2559 */ 2560long blk_congestion_wait(int rw, long timeout) 2561{ 2562 long ret; 2563 DEFINE_WAIT(wait); 2564 wait_queue_head_t *wqh = &congestion_wqh[rw]; 2565 2566 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 2567 ret = io_schedule_timeout(timeout); 2568 finish_wait(wqh, &wait); 2569 return ret; 2570} 2571 2572EXPORT_SYMBOL(blk_congestion_wait); 2573 2574/* 2575 * Has to be called with the request spinlock acquired 2576 */ 2577static int attempt_merge(request_queue_t *q, struct request *req, 2578 struct request *next) 2579{ 2580 if (!rq_mergeable(req) || !rq_mergeable(next)) 2581 return 0; 2582 2583 /* 2584 * not contigious 2585 */ 2586 if (req->sector + req->nr_sectors != next->sector) 2587 return 0; 2588 2589 if (rq_data_dir(req) != rq_data_dir(next) 2590 || req->rq_disk != next->rq_disk 2591 || next->waiting || next->special) 2592 return 0; 2593 2594 /* 2595 * If we are allowed to merge, then append bio list 2596 * from next to rq and release next. merge_requests_fn 2597 * will have updated segment counts, update sector 2598 * counts here. 2599 */ 2600 if (!q->merge_requests_fn(q, req, next)) 2601 return 0; 2602 2603 /* 2604 * At this point we have either done a back merge 2605 * or front merge. 
We need the smaller start_time of 2606 * the merged requests to be the current request 2607 * for accounting purposes. 2608 */ 2609 if (time_after(req->start_time, next->start_time)) 2610 req->start_time = next->start_time; 2611 2612 req->biotail->bi_next = next->bio; 2613 req->biotail = next->biotail; 2614 2615 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; 2616 2617 elv_merge_requests(q, req, next); 2618 2619 if (req->rq_disk) { 2620 disk_round_stats(req->rq_disk); 2621 req->rq_disk->in_flight--; 2622 } 2623 2624 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 2625 2626 __blk_put_request(q, next); 2627 return 1; 2628} 2629 2630static inline int attempt_back_merge(request_queue_t *q, struct request *rq) 2631{ 2632 struct request *next = elv_latter_request(q, rq); 2633 2634 if (next) 2635 return attempt_merge(q, rq, next); 2636 2637 return 0; 2638} 2639 2640static inline int attempt_front_merge(request_queue_t *q, struct request *rq) 2641{ 2642 struct request *prev = elv_former_request(q, rq); 2643 2644 if (prev) 2645 return attempt_merge(q, prev, rq); 2646 2647 return 0; 2648} 2649 2650/** 2651 * blk_attempt_remerge - attempt to remerge active head with next request 2652 * @q: The &request_queue_t belonging to the device 2653 * @rq: The head request (usually) 2654 * 2655 * Description: 2656 * For head-active devices, the queue can easily be unplugged so quickly 2657 * that proper merging is not done on the front request. This may hurt 2658 * performance greatly for some devices. The block layer cannot safely 2659 * do merging on that first request for these queues, but the driver can 2660 * call this function and make it happen any way. Only the driver knows 2661 * when it is safe to do so. 2662 **/ 2663void blk_attempt_remerge(request_queue_t *q, struct request *rq) 2664{ 2665 unsigned long flags; 2666 2667 spin_lock_irqsave(q->queue_lock, flags); 2668 attempt_back_merge(q, rq); 2669 spin_unlock_irqrestore(q->queue_lock, flags); 2670} 2671 2672EXPORT_SYMBOL(blk_attempt_remerge); 2673 2674static int __make_request(request_queue_t *q, struct bio *bio) 2675{ 2676 struct request *req; 2677 int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; 2678 unsigned short prio; 2679 sector_t sector; 2680 2681 sector = bio->bi_sector; 2682 nr_sectors = bio_sectors(bio); 2683 cur_nr_sectors = bio_cur_sectors(bio); 2684 prio = bio_prio(bio); 2685 2686 rw = bio_data_dir(bio); 2687 sync = bio_sync(bio); 2688 2689 /* 2690 * low level driver can indicate that it wants pages above a 2691 * certain limit bounced to low memory (ie for highmem, or even 2692 * ISA dma in theory) 2693 */ 2694 blk_queue_bounce(q, &bio); 2695 2696 spin_lock_prefetch(q->queue_lock); 2697 2698 barrier = bio_barrier(bio); 2699 if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) { 2700 err = -EOPNOTSUPP; 2701 goto end_io; 2702 } 2703 2704 spin_lock_irq(q->queue_lock); 2705 2706 if (unlikely(barrier) || elv_queue_empty(q)) 2707 goto get_rq; 2708 2709 el_ret = elv_merge(q, &req, bio); 2710 switch (el_ret) { 2711 case ELEVATOR_BACK_MERGE: 2712 BUG_ON(!rq_mergeable(req)); 2713 2714 if (!q->back_merge_fn(q, req, bio)) 2715 break; 2716 2717 req->biotail->bi_next = bio; 2718 req->biotail = bio; 2719 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2720 req->ioprio = ioprio_best(req->ioprio, prio); 2721 drive_stat_acct(req, nr_sectors, 0); 2722 if (!attempt_back_merge(q, req)) 2723 elv_merged_request(q, req); 2724 goto out; 2725 2726 case ELEVATOR_FRONT_MERGE: 2727 BUG_ON(!rq_mergeable(req)); 2728 
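			/*
			 * Front merge: the new bio ends where this request
			 * begins, so hook it in as the new head and pull the
			 * request's start sector and buffer pointer back to it.
			 */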
2729 if (!q->front_merge_fn(q, req, bio)) 2730 break; 2731 2732 bio->bi_next = req->bio; 2733 req->bio = bio; 2734 2735 /* 2736 * may not be valid. if the low level driver said 2737 * it didn't need a bounce buffer then it better 2738 * not touch req->buffer either... 2739 */ 2740 req->buffer = bio_data(bio); 2741 req->current_nr_sectors = cur_nr_sectors; 2742 req->hard_cur_sectors = cur_nr_sectors; 2743 req->sector = req->hard_sector = sector; 2744 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2745 req->ioprio = ioprio_best(req->ioprio, prio); 2746 drive_stat_acct(req, nr_sectors, 0); 2747 if (!attempt_front_merge(q, req)) 2748 elv_merged_request(q, req); 2749 goto out; 2750 2751 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 2752 default: 2753 ; 2754 } 2755 2756get_rq: 2757 /* 2758 * Grab a free request. This is might sleep but can not fail. 2759 * Returns with the queue unlocked. 2760 */ 2761 req = get_request_wait(q, rw, bio); 2762 2763 /* 2764 * After dropping the lock and possibly sleeping here, our request 2765 * may now be mergeable after it had proven unmergeable (above). 2766 * We don't worry about that case for efficiency. It won't happen 2767 * often, and the elevators are able to handle it. 2768 */ 2769 2770 req->flags |= REQ_CMD; 2771 2772 /* 2773 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 2774 */ 2775 if (bio_rw_ahead(bio) || bio_failfast(bio)) 2776 req->flags |= REQ_FAILFAST; 2777 2778 /* 2779 * REQ_BARRIER implies no merging, but lets make it explicit 2780 */ 2781 if (unlikely(barrier)) 2782 req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 2783 2784 req->errors = 0; 2785 req->hard_sector = req->sector = sector; 2786 req->hard_nr_sectors = req->nr_sectors = nr_sectors; 2787 req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors; 2788 req->nr_phys_segments = bio_phys_segments(q, bio); 2789 req->nr_hw_segments = bio_hw_segments(q, bio); 2790 req->buffer = bio_data(bio); /* see ->buffer comment above */ 2791 req->waiting = NULL; 2792 req->bio = req->biotail = bio; 2793 req->ioprio = prio; 2794 req->rq_disk = bio->bi_bdev->bd_disk; 2795 req->start_time = jiffies; 2796 2797 spin_lock_irq(q->queue_lock); 2798 if (elv_queue_empty(q)) 2799 blk_plug_device(q); 2800 add_request(q, req); 2801out: 2802 if (sync) 2803 __generic_unplug_device(q); 2804 2805 spin_unlock_irq(q->queue_lock); 2806 return 0; 2807 2808end_io: 2809 bio_endio(bio, nr_sectors << 9, err); 2810 return 0; 2811} 2812 2813/* 2814 * If bio->bi_dev is a partition, remap the location 2815 */ 2816static inline void blk_partition_remap(struct bio *bio) 2817{ 2818 struct block_device *bdev = bio->bi_bdev; 2819 2820 if (bdev != bdev->bd_contains) { 2821 struct hd_struct *p = bdev->bd_part; 2822 2823 switch (bio_data_dir(bio)) { 2824 case READ: 2825 p->read_sectors += bio_sectors(bio); 2826 p->reads++; 2827 break; 2828 case WRITE: 2829 p->write_sectors += bio_sectors(bio); 2830 p->writes++; 2831 break; 2832 } 2833 bio->bi_sector += p->start_sect; 2834 bio->bi_bdev = bdev->bd_contains; 2835 } 2836} 2837 2838void blk_finish_queue_drain(request_queue_t *q) 2839{ 2840 struct request_list *rl = &q->rq; 2841 struct request *rq; 2842 int requeued = 0; 2843 2844 spin_lock_irq(q->queue_lock); 2845 clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); 2846 2847 while (!list_empty(&q->drain_list)) { 2848 rq = list_entry_rq(q->drain_list.next); 2849 2850 list_del_init(&rq->queuelist); 2851 elv_requeue_request(q, rq); 2852 requeued++; 2853 } 2854 2855 if (requeued) 2856 q->request_fn(q); 2857 2858 
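	/*
	 * Drain mode is off and all deferred requests have been requeued;
	 * wake up anyone blocked on request allocation or on the drain.
	 */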
spin_unlock_irq(q->queue_lock); 2859 2860 wake_up(&rl->wait[0]); 2861 wake_up(&rl->wait[1]); 2862 wake_up(&rl->drain); 2863} 2864 2865static int wait_drain(request_queue_t *q, struct request_list *rl, int dispatch) 2866{ 2867 int wait = rl->count[READ] + rl->count[WRITE]; 2868 2869 if (dispatch) 2870 wait += !list_empty(&q->queue_head); 2871 2872 return wait; 2873} 2874 2875/* 2876 * We rely on the fact that only requests allocated through blk_alloc_request() 2877 * have io scheduler private data structures associated with them. Any other 2878 * type of request (allocated on stack or through kmalloc()) should not go 2879 * to the io scheduler core, but be attached to the queue head instead. 2880 */ 2881void blk_wait_queue_drained(request_queue_t *q, int wait_dispatch) 2882{ 2883 struct request_list *rl = &q->rq; 2884 DEFINE_WAIT(wait); 2885 2886 spin_lock_irq(q->queue_lock); 2887 set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); 2888 2889 while (wait_drain(q, rl, wait_dispatch)) { 2890 prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE); 2891 2892 if (wait_drain(q, rl, wait_dispatch)) { 2893 __generic_unplug_device(q); 2894 spin_unlock_irq(q->queue_lock); 2895 io_schedule(); 2896 spin_lock_irq(q->queue_lock); 2897 } 2898 2899 finish_wait(&rl->drain, &wait); 2900 } 2901 2902 spin_unlock_irq(q->queue_lock); 2903} 2904 2905/* 2906 * block waiting for the io scheduler being started again. 2907 */ 2908static inline void block_wait_queue_running(request_queue_t *q) 2909{ 2910 DEFINE_WAIT(wait); 2911 2912 while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) { 2913 struct request_list *rl = &q->rq; 2914 2915 prepare_to_wait_exclusive(&rl->drain, &wait, 2916 TASK_UNINTERRUPTIBLE); 2917 2918 /* 2919 * re-check the condition. avoids using prepare_to_wait() 2920 * in the fast path (queue is running) 2921 */ 2922 if (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) 2923 io_schedule(); 2924 2925 finish_wait(&rl->drain, &wait); 2926 } 2927} 2928 2929static void handle_bad_sector(struct bio *bio) 2930{ 2931 char b[BDEVNAME_SIZE]; 2932 2933 printk(KERN_INFO "attempt to access beyond end of device\n"); 2934 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 2935 bdevname(bio->bi_bdev, b), 2936 bio->bi_rw, 2937 (unsigned long long)bio->bi_sector + bio_sectors(bio), 2938 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 2939 2940 set_bit(BIO_EOF, &bio->bi_flags); 2941} 2942 2943/** 2944 * generic_make_request: hand a buffer to its device driver for I/O 2945 * @bio: The bio describing the location in memory and on the device. 2946 * 2947 * generic_make_request() is used to make I/O requests of block 2948 * devices. It is passed a &struct bio, which describes the I/O that needs 2949 * to be done. 2950 * 2951 * generic_make_request() does not return any status. The 2952 * success/failure status of the request, along with notification of 2953 * completion, is delivered asynchronously through the bio->bi_end_io 2954 * function described (one day) else where. 2955 * 2956 * The caller of generic_make_request must make sure that bi_io_vec 2957 * are set to describe the memory buffer, and that bi_dev and bi_sector are 2958 * set to describe the device address, and the 2959 * bi_end_io and optionally bi_private are set to describe how 2960 * completion notification should be signaled. 2961 * 2962 * generic_make_request and the drivers it calls may use bi_next if this 2963 * bio happens to be merged with someone else, and may change bi_dev and 2964 * bi_sector for remaps as it sees fit. 
So the values of these fields 2965 * should NOT be depended on after the call to generic_make_request. 2966 */ 2967void generic_make_request(struct bio *bio) 2968{ 2969 request_queue_t *q; 2970 sector_t maxsector; 2971 int ret, nr_sectors = bio_sectors(bio); 2972 2973 might_sleep(); 2974 /* Test device or partition size, when known. */ 2975 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 2976 if (maxsector) { 2977 sector_t sector = bio->bi_sector; 2978 2979 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 2980 /* 2981 * This may well happen - the kernel calls bread() 2982 * without checking the size of the device, e.g., when 2983 * mounting a device. 2984 */ 2985 handle_bad_sector(bio); 2986 goto end_io; 2987 } 2988 } 2989 2990 /* 2991 * Resolve the mapping until finished. (drivers are 2992 * still free to implement/resolve their own stacking 2993 * by explicitly returning 0) 2994 * 2995 * NOTE: we don't repeat the blk_size check for each new device. 2996 * Stacking drivers are expected to know what they are doing. 2997 */ 2998 do { 2999 char b[BDEVNAME_SIZE]; 3000 3001 q = bdev_get_queue(bio->bi_bdev); 3002 if (!q) { 3003 printk(KERN_ERR 3004 "generic_make_request: Trying to access " 3005 "nonexistent block-device %s (%Lu)\n", 3006 bdevname(bio->bi_bdev, b), 3007 (long long) bio->bi_sector); 3008end_io: 3009 bio_endio(bio, bio->bi_size, -EIO); 3010 break; 3011 } 3012 3013 if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) { 3014 printk("bio too big device %s (%u > %u)\n", 3015 bdevname(bio->bi_bdev, b), 3016 bio_sectors(bio), 3017 q->max_hw_sectors); 3018 goto end_io; 3019 } 3020 3021 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 3022 goto end_io; 3023 3024 block_wait_queue_running(q); 3025 3026 /* 3027 * If this device has partitions, remap block n 3028 * of partition p to block n+start(p) of the disk. 3029 */ 3030 blk_partition_remap(bio); 3031 3032 ret = q->make_request_fn(q, bio); 3033 } while (ret); 3034} 3035 3036EXPORT_SYMBOL(generic_make_request); 3037 3038/** 3039 * submit_bio: submit a bio to the block device layer for I/O 3040 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 3041 * @bio: The &struct bio which describes the I/O 3042 * 3043 * submit_bio() is very similar in purpose to generic_make_request(), and 3044 * uses that function to do most of the work. Both are fairly rough 3045 * interfaces, @bio must be presetup and ready for I/O. 3046 * 3047 */ 3048void submit_bio(int rw, struct bio *bio) 3049{ 3050 int count = bio_sectors(bio); 3051 3052 BIO_BUG_ON(!bio->bi_size); 3053 BIO_BUG_ON(!bio->bi_io_vec); 3054 bio->bi_rw |= rw; 3055 if (rw & WRITE) 3056 mod_page_state(pgpgout, count); 3057 else 3058 mod_page_state(pgpgin, count); 3059 3060 if (unlikely(block_dump)) { 3061 char b[BDEVNAME_SIZE]; 3062 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 3063 current->comm, current->pid, 3064 (rw & WRITE) ? "WRITE" : "READ", 3065 (unsigned long long)bio->bi_sector, 3066 bdevname(bio->bi_bdev,b)); 3067 } 3068 3069 generic_make_request(bio); 3070} 3071 3072EXPORT_SYMBOL(submit_bio); 3073 3074static void blk_recalc_rq_segments(struct request *rq) 3075{ 3076 struct bio *bio, *prevbio = NULL; 3077 int nr_phys_segs, nr_hw_segs; 3078 unsigned int phys_size, hw_size; 3079 request_queue_t *q = rq->q; 3080 3081 if (!rq->bio) 3082 return; 3083 3084 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 3085 rq_for_each_bio(bio, rq) { 3086 /* Force bio hw/phys segs to be recalculated. 
*/ 3087 bio->bi_flags &= ~(1 << BIO_SEG_VALID); 3088 3089 nr_phys_segs += bio_phys_segments(q, bio); 3090 nr_hw_segs += bio_hw_segments(q, bio); 3091 if (prevbio) { 3092 int pseg = phys_size + prevbio->bi_size + bio->bi_size; 3093 int hseg = hw_size + prevbio->bi_size + bio->bi_size; 3094 3095 if (blk_phys_contig_segment(q, prevbio, bio) && 3096 pseg <= q->max_segment_size) { 3097 nr_phys_segs--; 3098 phys_size += prevbio->bi_size + bio->bi_size; 3099 } else 3100 phys_size = 0; 3101 3102 if (blk_hw_contig_segment(q, prevbio, bio) && 3103 hseg <= q->max_segment_size) { 3104 nr_hw_segs--; 3105 hw_size += prevbio->bi_size + bio->bi_size; 3106 } else 3107 hw_size = 0; 3108 } 3109 prevbio = bio; 3110 } 3111 3112 rq->nr_phys_segments = nr_phys_segs; 3113 rq->nr_hw_segments = nr_hw_segs; 3114} 3115 3116static void blk_recalc_rq_sectors(struct request *rq, int nsect) 3117{ 3118 if (blk_fs_request(rq)) { 3119 rq->hard_sector += nsect; 3120 rq->hard_nr_sectors -= nsect; 3121 3122 /* 3123 * Move the I/O submission pointers ahead if required. 3124 */ 3125 if ((rq->nr_sectors >= rq->hard_nr_sectors) && 3126 (rq->sector <= rq->hard_sector)) { 3127 rq->sector = rq->hard_sector; 3128 rq->nr_sectors = rq->hard_nr_sectors; 3129 rq->hard_cur_sectors = bio_cur_sectors(rq->bio); 3130 rq->current_nr_sectors = rq->hard_cur_sectors; 3131 rq->buffer = bio_data(rq->bio); 3132 } 3133 3134 /* 3135 * if total number of sectors is less than the first segment 3136 * size, something has gone terribly wrong 3137 */ 3138 if (rq->nr_sectors < rq->current_nr_sectors) { 3139 printk("blk: request botched\n"); 3140 rq->nr_sectors = rq->current_nr_sectors; 3141 } 3142 } 3143} 3144 3145static int __end_that_request_first(struct request *req, int uptodate, 3146 int nr_bytes) 3147{ 3148 int total_bytes, bio_nbytes, error, next_idx = 0; 3149 struct bio *bio; 3150 3151 /* 3152 * extend uptodate bool to allow < 0 value to be direct io error 3153 */ 3154 error = 0; 3155 if (end_io_error(uptodate)) 3156 error = !uptodate ? -EIO : uptodate; 3157 3158 /* 3159 * for a REQ_BLOCK_PC request, we want to carry any eventual 3160 * sense key with us all the way through 3161 */ 3162 if (!blk_pc_request(req)) 3163 req->errors = 0; 3164 3165 if (!uptodate) { 3166 if (blk_fs_request(req) && !(req->flags & REQ_QUIET)) 3167 printk("end_request: I/O error, dev %s, sector %llu\n", 3168 req->rq_disk ? 
req->rq_disk->disk_name : "?", 3169 (unsigned long long)req->sector); 3170 } 3171 3172 total_bytes = bio_nbytes = 0; 3173 while ((bio = req->bio) != NULL) { 3174 int nbytes; 3175 3176 if (nr_bytes >= bio->bi_size) { 3177 req->bio = bio->bi_next; 3178 nbytes = bio->bi_size; 3179 bio_endio(bio, nbytes, error); 3180 next_idx = 0; 3181 bio_nbytes = 0; 3182 } else { 3183 int idx = bio->bi_idx + next_idx; 3184 3185 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 3186 blk_dump_rq_flags(req, "__end_that"); 3187 printk("%s: bio idx %d >= vcnt %d\n", 3188 __FUNCTION__, 3189 bio->bi_idx, bio->bi_vcnt); 3190 break; 3191 } 3192 3193 nbytes = bio_iovec_idx(bio, idx)->bv_len; 3194 BIO_BUG_ON(nbytes > bio->bi_size); 3195 3196 /* 3197 * not a complete bvec done 3198 */ 3199 if (unlikely(nbytes > nr_bytes)) { 3200 bio_nbytes += nr_bytes; 3201 total_bytes += nr_bytes; 3202 break; 3203 } 3204 3205 /* 3206 * advance to the next vector 3207 */ 3208 next_idx++; 3209 bio_nbytes += nbytes; 3210 } 3211 3212 total_bytes += nbytes; 3213 nr_bytes -= nbytes; 3214 3215 if ((bio = req->bio)) { 3216 /* 3217 * end more in this run, or just return 'not-done' 3218 */ 3219 if (unlikely(nr_bytes <= 0)) 3220 break; 3221 } 3222 } 3223 3224 /* 3225 * completely done 3226 */ 3227 if (!req->bio) 3228 return 0; 3229 3230 /* 3231 * if the request wasn't completed, update state 3232 */ 3233 if (bio_nbytes) { 3234 bio_endio(bio, bio_nbytes, error); 3235 bio->bi_idx += next_idx; 3236 bio_iovec(bio)->bv_offset += nr_bytes; 3237 bio_iovec(bio)->bv_len -= nr_bytes; 3238 } 3239 3240 blk_recalc_rq_sectors(req, total_bytes >> 9); 3241 blk_recalc_rq_segments(req); 3242 return 1; 3243} 3244 3245/** 3246 * end_that_request_first - end I/O on a request 3247 * @req: the request being processed 3248 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3249 * @nr_sectors: number of sectors to end I/O on 3250 * 3251 * Description: 3252 * Ends I/O on a number of sectors attached to @req, and sets it up 3253 * for the next range of segments (if any) in the cluster. 3254 * 3255 * Return: 3256 * 0 - we are done with this request, call end_that_request_last() 3257 * 1 - still buffers pending for this request 3258 **/ 3259int end_that_request_first(struct request *req, int uptodate, int nr_sectors) 3260{ 3261 return __end_that_request_first(req, uptodate, nr_sectors << 9); 3262} 3263 3264EXPORT_SYMBOL(end_that_request_first); 3265 3266/** 3267 * end_that_request_chunk - end I/O on a request 3268 * @req: the request being processed 3269 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3270 * @nr_bytes: number of bytes to complete 3271 * 3272 * Description: 3273 * Ends I/O on a number of bytes attached to @req, and sets it up 3274 * for the next range of segments (if any). Like end_that_request_first(), 3275 * but deals with bytes instead of sectors. 
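 *
 * Example (editorial sketch): a driver that has transferred 'bytes' of the
 * current request would typically do
 *
 *	if (!end_that_request_chunk(rq, 1, bytes)) {
 *		blkdev_dequeue_request(rq);
 *		end_that_request_last(rq);
 *	}
 *
 * calling end_that_request_last() with the queue lock held, as noted below.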
3276 * 3277 * Return: 3278 * 0 - we are done with this request, call end_that_request_last() 3279 * 1 - still buffers pending for this request 3280 **/ 3281int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) 3282{ 3283 return __end_that_request_first(req, uptodate, nr_bytes); 3284} 3285 3286EXPORT_SYMBOL(end_that_request_chunk); 3287 3288/* 3289 * queue lock must be held 3290 */ 3291void end_that_request_last(struct request *req) 3292{ 3293 struct gendisk *disk = req->rq_disk; 3294 3295 if (unlikely(laptop_mode) && blk_fs_request(req)) 3296 laptop_io_completion(); 3297 3298 if (disk && blk_fs_request(req)) { 3299 unsigned long duration = jiffies - req->start_time; 3300 switch (rq_data_dir(req)) { 3301 case WRITE: 3302 __disk_stat_inc(disk, writes); 3303 __disk_stat_add(disk, write_ticks, duration); 3304 break; 3305 case READ: 3306 __disk_stat_inc(disk, reads); 3307 __disk_stat_add(disk, read_ticks, duration); 3308 break; 3309 } 3310 disk_round_stats(disk); 3311 disk->in_flight--; 3312 } 3313 if (req->end_io) 3314 req->end_io(req); 3315 else 3316 __blk_put_request(req->q, req); 3317} 3318 3319EXPORT_SYMBOL(end_that_request_last); 3320 3321void end_request(struct request *req, int uptodate) 3322{ 3323 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { 3324 add_disk_randomness(req->rq_disk); 3325 blkdev_dequeue_request(req); 3326 end_that_request_last(req); 3327 } 3328} 3329 3330EXPORT_SYMBOL(end_request); 3331 3332void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio) 3333{ 3334 /* first three bits are identical in rq->flags and bio->bi_rw */ 3335 rq->flags |= (bio->bi_rw & 7); 3336 3337 rq->nr_phys_segments = bio_phys_segments(q, bio); 3338 rq->nr_hw_segments = bio_hw_segments(q, bio); 3339 rq->current_nr_sectors = bio_cur_sectors(bio); 3340 rq->hard_cur_sectors = rq->current_nr_sectors; 3341 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 3342 rq->buffer = bio_data(bio); 3343 3344 rq->bio = rq->biotail = bio; 3345} 3346 3347EXPORT_SYMBOL(blk_rq_bio_prep); 3348 3349int kblockd_schedule_work(struct work_struct *work) 3350{ 3351 return queue_work(kblockd_workqueue, work); 3352} 3353 3354EXPORT_SYMBOL(kblockd_schedule_work); 3355 3356void kblockd_flush(void) 3357{ 3358 flush_workqueue(kblockd_workqueue); 3359} 3360EXPORT_SYMBOL(kblockd_flush); 3361 3362int __init blk_dev_init(void) 3363{ 3364 kblockd_workqueue = create_workqueue("kblockd"); 3365 if (!kblockd_workqueue) 3366 panic("Failed to create kblockd\n"); 3367 3368 request_cachep = kmem_cache_create("blkdev_requests", 3369 sizeof(struct request), 0, SLAB_PANIC, NULL, NULL); 3370 3371 requestq_cachep = kmem_cache_create("blkdev_queue", 3372 sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL); 3373 3374 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3375 sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL); 3376 3377 blk_max_low_pfn = max_low_pfn; 3378 blk_max_pfn = max_pfn; 3379 3380 return 0; 3381} 3382 3383/* 3384 * IO Context helper functions 3385 */ 3386void put_io_context(struct io_context *ioc) 3387{ 3388 if (ioc == NULL) 3389 return; 3390 3391 BUG_ON(atomic_read(&ioc->refcount) == 0); 3392 3393 if (atomic_dec_and_test(&ioc->refcount)) { 3394 if (ioc->aic && ioc->aic->dtor) 3395 ioc->aic->dtor(ioc->aic); 3396 if (ioc->cic && ioc->cic->dtor) 3397 ioc->cic->dtor(ioc->cic); 3398 3399 kmem_cache_free(iocontext_cachep, ioc); 3400 } 3401} 3402EXPORT_SYMBOL(put_io_context); 3403 3404/* Called by the exitting task */ 3405void exit_io_context(void) 3406{ 3407 
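	/*
	 * Detach the io_context from the exiting task under task_lock()
	 * with interrupts disabled, run the per-elevator exit hooks, then
	 * drop the task's reference.
	 */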
unsigned long flags; 3408 struct io_context *ioc; 3409 3410 local_irq_save(flags); 3411 task_lock(current); 3412 ioc = current->io_context; 3413 current->io_context = NULL; 3414 ioc->task = NULL; 3415 task_unlock(current); 3416 local_irq_restore(flags); 3417 3418 if (ioc->aic && ioc->aic->exit) 3419 ioc->aic->exit(ioc->aic); 3420 if (ioc->cic && ioc->cic->exit) 3421 ioc->cic->exit(ioc->cic); 3422 3423 put_io_context(ioc); 3424} 3425 3426/* 3427 * If the current task has no IO context then create one and initialise it. 3428 * Otherwise, return its existing IO context. 3429 * 3430 * This returned IO context doesn't have a specifically elevated refcount, 3431 * but since the current task itself holds a reference, the context can be 3432 * used in general code, so long as it stays within `current` context. 3433 */ 3434struct io_context *current_io_context(int gfp_flags) 3435{ 3436 struct task_struct *tsk = current; 3437 struct io_context *ret; 3438 3439 ret = tsk->io_context; 3440 if (likely(ret)) 3441 return ret; 3442 3443 ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); 3444 if (ret) { 3445 atomic_set(&ret->refcount, 1); 3446 ret->task = current; 3447 ret->set_ioprio = NULL; 3448 ret->last_waited = jiffies; /* doesn't matter... */ 3449 ret->nr_batch_requests = 0; /* because this is 0 */ 3450 ret->aic = NULL; 3451 ret->cic = NULL; 3452 tsk->io_context = ret; 3453 } 3454 3455 return ret; 3456} 3457EXPORT_SYMBOL(current_io_context); 3458 3459/* 3460 * If the current task has no IO context then create one and initialise it. 3461 * If it does have a context, take a ref on it. 3462 * 3463 * This is always called in the context of the task which submitted the I/O. 3464 */ 3465struct io_context *get_io_context(int gfp_flags) 3466{ 3467 struct io_context *ret; 3468 ret = current_io_context(gfp_flags); 3469 if (likely(ret)) 3470 atomic_inc(&ret->refcount); 3471 return ret; 3472} 3473EXPORT_SYMBOL(get_io_context); 3474 3475void copy_io_context(struct io_context **pdst, struct io_context **psrc) 3476{ 3477 struct io_context *src = *psrc; 3478 struct io_context *dst = *pdst; 3479 3480 if (src) { 3481 BUG_ON(atomic_read(&src->refcount) == 0); 3482 atomic_inc(&src->refcount); 3483 put_io_context(dst); 3484 *pdst = src; 3485 } 3486} 3487EXPORT_SYMBOL(copy_io_context); 3488 3489void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) 3490{ 3491 struct io_context *temp; 3492 temp = *ioc1; 3493 *ioc1 = *ioc2; 3494 *ioc2 = temp; 3495} 3496EXPORT_SYMBOL(swap_io_context); 3497 3498/* 3499 * sysfs parts below 3500 */ 3501struct queue_sysfs_entry { 3502 struct attribute attr; 3503 ssize_t (*show)(struct request_queue *, char *); 3504 ssize_t (*store)(struct request_queue *, const char *, size_t); 3505}; 3506 3507static ssize_t 3508queue_var_show(unsigned int var, char *page) 3509{ 3510 return sprintf(page, "%d\n", var); 3511} 3512 3513static ssize_t 3514queue_var_store(unsigned long *var, const char *page, size_t count) 3515{ 3516 char *p = (char *) page; 3517 3518 *var = simple_strtoul(p, &p, 10); 3519 return count; 3520} 3521 3522static ssize_t queue_requests_show(struct request_queue *q, char *page) 3523{ 3524 return queue_var_show(q->nr_requests, (page)); 3525} 3526 3527static ssize_t 3528queue_requests_store(struct request_queue *q, const char *page, size_t count) 3529{ 3530 struct request_list *rl = &q->rq; 3531 3532 int ret = queue_var_store(&q->nr_requests, page, count); 3533 if (q->nr_requests < BLKDEV_MIN_RQ) 3534 q->nr_requests = BLKDEV_MIN_RQ; 3535 blk_queue_congestion_threshold(q); 
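	/*
	 * nr_requests has changed: re-evaluate congestion and "full" state
	 * against the new limit and wake any waiters that now fit.
	 */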
3536 3537 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 3538 set_queue_congested(q, READ); 3539 else if (rl->count[READ] < queue_congestion_off_threshold(q)) 3540 clear_queue_congested(q, READ); 3541 3542 if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) 3543 set_queue_congested(q, WRITE); 3544 else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) 3545 clear_queue_congested(q, WRITE); 3546 3547 if (rl->count[READ] >= q->nr_requests) { 3548 blk_set_queue_full(q, READ); 3549 } else if (rl->count[READ]+1 <= q->nr_requests) { 3550 blk_clear_queue_full(q, READ); 3551 wake_up(&rl->wait[READ]); 3552 } 3553 3554 if (rl->count[WRITE] >= q->nr_requests) { 3555 blk_set_queue_full(q, WRITE); 3556 } else if (rl->count[WRITE]+1 <= q->nr_requests) { 3557 blk_clear_queue_full(q, WRITE); 3558 wake_up(&rl->wait[WRITE]); 3559 } 3560 return ret; 3561} 3562 3563static ssize_t queue_ra_show(struct request_queue *q, char *page) 3564{ 3565 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3566 3567 return queue_var_show(ra_kb, (page)); 3568} 3569 3570static ssize_t 3571queue_ra_store(struct request_queue *q, const char *page, size_t count) 3572{ 3573 unsigned long ra_kb; 3574 ssize_t ret = queue_var_store(&ra_kb, page, count); 3575 3576 spin_lock_irq(q->queue_lock); 3577 if (ra_kb > (q->max_sectors >> 1)) 3578 ra_kb = (q->max_sectors >> 1); 3579 3580 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 3581 spin_unlock_irq(q->queue_lock); 3582 3583 return ret; 3584} 3585 3586static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) 3587{ 3588 int max_sectors_kb = q->max_sectors >> 1; 3589 3590 return queue_var_show(max_sectors_kb, (page)); 3591} 3592 3593static ssize_t 3594queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 3595{ 3596 unsigned long max_sectors_kb, 3597 max_hw_sectors_kb = q->max_hw_sectors >> 1, 3598 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 3599 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 3600 int ra_kb; 3601 3602 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 3603 return -EINVAL; 3604 /* 3605 * Take the queue lock to update the readahead and max_sectors 3606 * values synchronously: 3607 */ 3608 spin_lock_irq(q->queue_lock); 3609 /* 3610 * Trim readahead window as well, if necessary: 3611 */ 3612 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3613 if (ra_kb > max_sectors_kb) 3614 q->backing_dev_info.ra_pages = 3615 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); 3616 3617 q->max_sectors = max_sectors_kb << 1; 3618 spin_unlock_irq(q->queue_lock); 3619 3620 return ret; 3621} 3622 3623static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) 3624{ 3625 int max_hw_sectors_kb = q->max_hw_sectors >> 1; 3626 3627 return queue_var_show(max_hw_sectors_kb, (page)); 3628} 3629 3630 3631static struct queue_sysfs_entry queue_requests_entry = { 3632 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 3633 .show = queue_requests_show, 3634 .store = queue_requests_store, 3635}; 3636 3637static struct queue_sysfs_entry queue_ra_entry = { 3638 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, 3639 .show = queue_ra_show, 3640 .store = queue_ra_store, 3641}; 3642 3643static struct queue_sysfs_entry queue_max_sectors_entry = { 3644 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, 3645 .show = queue_max_sectors_show, 3646 .store = queue_max_sectors_store, 3647}; 3648 3649static struct queue_sysfs_entry 
queue_max_hw_sectors_entry = { 3650 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, 3651 .show = queue_max_hw_sectors_show, 3652}; 3653 3654static struct queue_sysfs_entry queue_iosched_entry = { 3655 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 3656 .show = elv_iosched_show, 3657 .store = elv_iosched_store, 3658}; 3659 3660static struct attribute *default_attrs[] = { 3661 &queue_requests_entry.attr, 3662 &queue_ra_entry.attr, 3663 &queue_max_hw_sectors_entry.attr, 3664 &queue_max_sectors_entry.attr, 3665 &queue_iosched_entry.attr, 3666 NULL, 3667}; 3668 3669#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 3670 3671static ssize_t 3672queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3673{ 3674 struct queue_sysfs_entry *entry = to_queue(attr); 3675 struct request_queue *q; 3676 3677 q = container_of(kobj, struct request_queue, kobj); 3678 if (!entry->show) 3679 return -EIO; 3680 3681 return entry->show(q, page); 3682} 3683 3684static ssize_t 3685queue_attr_store(struct kobject *kobj, struct attribute *attr, 3686 const char *page, size_t length) 3687{ 3688 struct queue_sysfs_entry *entry = to_queue(attr); 3689 struct request_queue *q; 3690 3691 q = container_of(kobj, struct request_queue, kobj); 3692 if (!entry->store) 3693 return -EIO; 3694 3695 return entry->store(q, page, length); 3696} 3697 3698static struct sysfs_ops queue_sysfs_ops = { 3699 .show = queue_attr_show, 3700 .store = queue_attr_store, 3701}; 3702 3703static struct kobj_type queue_ktype = { 3704 .sysfs_ops = &queue_sysfs_ops, 3705 .default_attrs = default_attrs, 3706}; 3707 3708int blk_register_queue(struct gendisk *disk) 3709{ 3710 int ret; 3711 3712 request_queue_t *q = disk->queue; 3713 3714 if (!q || !q->request_fn) 3715 return -ENXIO; 3716 3717 q->kobj.parent = kobject_get(&disk->kobj); 3718 if (!q->kobj.parent) 3719 return -EBUSY; 3720 3721 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); 3722 q->kobj.ktype = &queue_ktype; 3723 3724 ret = kobject_register(&q->kobj); 3725 if (ret < 0) 3726 return ret; 3727 3728 ret = elv_register_queue(q); 3729 if (ret) { 3730 kobject_unregister(&q->kobj); 3731 return ret; 3732 } 3733 3734 return 0; 3735} 3736 3737void blk_unregister_queue(struct gendisk *disk) 3738{ 3739 request_queue_t *q = disk->queue; 3740 3741 if (q && q->request_fn) { 3742 elv_unregister_queue(q); 3743 3744 kobject_unregister(&q->kobj); 3745 kobject_put(&disk->kobj); 3746 } 3747}
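
/*
 * Editorial appendix (not part of ll_rw_blk.c): a minimal sketch of how a
 * simple driver ties the exports above together. All "my_" identifiers are
 * hypothetical, error handling is omitted, and elv_next_request() comes from
 * the elevator core rather than this file.
 *
 *	static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;
 *	static request_queue_t *my_queue;
 *
 *	static void my_request_fn(request_queue_t *q)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = elv_next_request(q)) != NULL) {
 *			my_transfer(rq->sector, rq->current_nr_sectors,
 *				    rq->buffer);
 *			end_request(rq, 1);
 *		}
 *	}
 *
 *	my_queue = blk_init_queue(my_request_fn, &my_lock);
 *
 * end_request() completes the current chunk; once the whole request is done
 * it dequeues the request and calls end_that_request_last() itself.
 */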