Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at 2185200cd2a910ca7f4e3fa0370c6ed8a2bdc49c (3610 lines, 94 kB)
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>

/*
 * for max sense size
 */
#include <scsi/scsi_cmnd.h>

static void blk_unplug_work(void *data);
static void blk_unplug_timeout(unsigned long data);
static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);

/*
 * For the allocated request tables
 */
static kmem_cache_t *request_cachep;

/*
 * For queue allocation
 */
static kmem_cache_t *requestq_cachep;

/*
 * For io context allocations
 */
static kmem_cache_t *iocontext_cachep;

static wait_queue_head_t congestion_wqh[2] = {
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
	};

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

unsigned long blk_max_low_pfn, blk_max_pfn;

EXPORT_SYMBOL(blk_max_low_pfn);
EXPORT_SYMBOL(blk_max_pfn);

/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME	(HZ/50UL)

/* Number of requests a "batching" process may submit */
#define BLK_BATCH_REQ	32

/*
 * Return the threshold (number of used requests) at which the queue is
 * considered to be congested.  It includes a little hysteresis to keep the
 * context switch rate down.
 */
static inline int queue_congestion_on_threshold(struct request_queue *q)
{
	return q->nr_congestion_on;
}

/*
 * The threshold at which a queue is considered to be uncongested
 */
static inline int queue_congestion_off_threshold(struct request_queue *q)
{
	return q->nr_congestion_off;
}

static void blk_queue_congestion_threshold(struct request_queue *q)
{
	int nr;

	nr = q->nr_requests - (q->nr_requests / 8) + 1;
	if (nr > q->nr_requests)
		nr = q->nr_requests;
	q->nr_congestion_on = nr;

	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
	if (nr < 1)
		nr = 1;
	q->nr_congestion_off = nr;
}

/*
 * A queue has just exited congestion.  Note this in the global counter of
 * congested queues, and wake up anyone who was waiting for requests to be
 * put back.
 */
static void clear_queue_congested(request_queue_t *q, int rw)
{
	enum bdi_state bit;
	wait_queue_head_t *wqh = &congestion_wqh[rw];

	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
	clear_bit(bit, &q->backing_dev_info.state);
	smp_mb__after_clear_bit();
	if (waitqueue_active(wqh))
		wake_up(wqh);
}

/*
 * A queue has just entered congestion.  Flag that in the queue's VM-visible
 * state flags and increment the global counter of congested queues.
 */
static void set_queue_congested(request_queue_t *q, int rw)
{
	enum bdi_state bit;

	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
	set_bit(bit, &q->backing_dev_info.state);
}

/**
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
 * @bdev:	device
 *
 * Locates the passed device's request queue and returns the address of its
 * backing_dev_info
 *
 * Will return NULL if the request queue cannot be located.
 */
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
	struct backing_dev_info *ret = NULL;
	request_queue_t *q = bdev_get_queue(bdev);

	if (q)
		ret = &q->backing_dev_info;
	return ret;
}

EXPORT_SYMBOL(blk_get_backing_dev_info);

void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
{
	q->activity_fn = fn;
	q->activity_data = data;
}

EXPORT_SYMBOL(blk_queue_activity_fn);

/**
 * blk_queue_prep_rq - set a prepare_request function for queue
 * @q:		queue
 * @pfn:	prepare_request function
 *
 * It's possible for a queue to register a prepare_request callback which
 * is invoked before the request is handed to the request_fn. The goal of
 * the function is to prepare a request for I/O, it can be used to build a
 * cdb from the request data for instance.
 *
 */
void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
{
	q->prep_rq_fn = pfn;
}

EXPORT_SYMBOL(blk_queue_prep_rq);

/**
 * blk_queue_merge_bvec - set a merge_bvec function for queue
 * @q:		queue
 * @mbfn:	merge_bvec_fn
 *
 * Usually queues have static limitations on the max sectors or segments that
 * we can put in a request. Stacking drivers may have some settings that
 * are dynamic, and thus we have to query the queue whether it is ok to
 * add a new bio_vec to a bio at a given offset or not. If the block device
 * has such limitations, it needs to register a merge_bvec_fn to control
 * the size of bio's sent to it. Note that a block device *must* allow a
 * single page to be added to an empty bio. The block device driver may want
 * to use the bio_split() function to deal with these bio's. By default
 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
 * honored.
 */
void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn)
{
	q->merge_bvec_fn = mbfn;
}

EXPORT_SYMBOL(blk_queue_merge_bvec);

/**
 * blk_queue_make_request - define an alternate make_request function for a device
 * @q:  the request queue for the device to be affected
 * @mfn: the alternate make_request function
 *
 * Description:
 *    The normal way for &struct bios to be passed to a device
 *    driver is for them to be collected into requests on a request
 *    queue, and then to allow the device driver to select requests
 *    off that queue when it is ready.  This works well for many block
 *    devices.
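
/*
 * Editor's note: a minimal sketch (not part of this file) of the usage
 * described in the comment above -- a virtual/stacking driver installing
 * its own make_request function instead of using a request queue.  The
 * names my_dev, backing_bdev and data_offset are hypothetical.
 */
#if 0	/* illustrative only */
static int my_make_request(request_queue_t *q, struct bio *bio)
{
	struct my_dev *dev = q->queuedata;

	/* remap the bio onto the backing device and pass it straight down */
	bio->bi_bdev = dev->backing_bdev;
	bio->bi_sector += dev->data_offset;
	generic_make_request(bio);
	return 0;
}

static void my_setup_queue(struct my_dev *dev)
{
	/* error handling omitted */
	dev->queue = blk_alloc_queue(GFP_KERNEL);
	dev->queue->queuedata = dev;
	blk_queue_make_request(dev->queue, my_make_request);
}
#endif
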
However some block devices (typically virtual devices 219 * such as md or lvm) do not benefit from the processing on the 220 * request queue, and are served best by having the requests passed 221 * directly to them. This can be achieved by providing a function 222 * to blk_queue_make_request(). 223 * 224 * Caveat: 225 * The driver that does this *must* be able to deal appropriately 226 * with buffers in "highmemory". This can be accomplished by either calling 227 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 228 * blk_queue_bounce() to create a buffer in normal memory. 229 **/ 230void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) 231{ 232 /* 233 * set defaults 234 */ 235 q->nr_requests = BLKDEV_MAX_RQ; 236 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 237 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 238 q->make_request_fn = mfn; 239 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 240 q->backing_dev_info.state = 0; 241 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 242 blk_queue_max_sectors(q, MAX_SECTORS); 243 blk_queue_hardsect_size(q, 512); 244 blk_queue_dma_alignment(q, 511); 245 blk_queue_congestion_threshold(q); 246 q->nr_batching = BLK_BATCH_REQ; 247 248 q->unplug_thresh = 4; /* hmm */ 249 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 250 if (q->unplug_delay == 0) 251 q->unplug_delay = 1; 252 253 INIT_WORK(&q->unplug_work, blk_unplug_work, q); 254 255 q->unplug_timer.function = blk_unplug_timeout; 256 q->unplug_timer.data = (unsigned long)q; 257 258 /* 259 * by default assume old behaviour and bounce for any highmem page 260 */ 261 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 262 263 blk_queue_activity_fn(q, NULL, NULL); 264} 265 266EXPORT_SYMBOL(blk_queue_make_request); 267 268static inline void rq_init(request_queue_t *q, struct request *rq) 269{ 270 INIT_LIST_HEAD(&rq->queuelist); 271 272 rq->errors = 0; 273 rq->rq_status = RQ_ACTIVE; 274 rq->bio = rq->biotail = NULL; 275 rq->ioprio = 0; 276 rq->buffer = NULL; 277 rq->ref_count = 1; 278 rq->q = q; 279 rq->waiting = NULL; 280 rq->special = NULL; 281 rq->data_len = 0; 282 rq->data = NULL; 283 rq->nr_phys_segments = 0; 284 rq->sense = NULL; 285 rq->end_io = NULL; 286 rq->end_io_data = NULL; 287} 288 289/** 290 * blk_queue_ordered - does this queue support ordered writes 291 * @q: the request queue 292 * @flag: see below 293 * 294 * Description: 295 * For journalled file systems, doing ordered writes on a commit 296 * block instead of explicitly doing wait_on_buffer (which is bad 297 * for performance) can be a big win. Block drivers supporting this 298 * feature should call this function and indicate so. 
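
/*
 * Editor's note: a hypothetical sketch of a driver advertising flush-based
 * ordered-write support at initialisation time, per the description above.
 * my_issue_flush is an assumed driver callback.
 */
#if 0	/* illustrative only */
static void my_enable_ordered(request_queue_t *q)
{
	blk_queue_ordered(q, QUEUE_ORDERED_FLUSH);
	blk_queue_issue_flush_fn(q, my_issue_flush);
}
#endif
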
299 * 300 **/ 301void blk_queue_ordered(request_queue_t *q, int flag) 302{ 303 switch (flag) { 304 case QUEUE_ORDERED_NONE: 305 if (q->flush_rq) 306 kmem_cache_free(request_cachep, q->flush_rq); 307 q->flush_rq = NULL; 308 q->ordered = flag; 309 break; 310 case QUEUE_ORDERED_TAG: 311 q->ordered = flag; 312 break; 313 case QUEUE_ORDERED_FLUSH: 314 q->ordered = flag; 315 if (!q->flush_rq) 316 q->flush_rq = kmem_cache_alloc(request_cachep, 317 GFP_KERNEL); 318 break; 319 default: 320 printk("blk_queue_ordered: bad value %d\n", flag); 321 break; 322 } 323} 324 325EXPORT_SYMBOL(blk_queue_ordered); 326 327/** 328 * blk_queue_issue_flush_fn - set function for issuing a flush 329 * @q: the request queue 330 * @iff: the function to be called issuing the flush 331 * 332 * Description: 333 * If a driver supports issuing a flush command, the support is notified 334 * to the block layer by defining it through this call. 335 * 336 **/ 337void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff) 338{ 339 q->issue_flush_fn = iff; 340} 341 342EXPORT_SYMBOL(blk_queue_issue_flush_fn); 343 344/* 345 * Cache flushing for ordered writes handling 346 */ 347static void blk_pre_flush_end_io(struct request *flush_rq) 348{ 349 struct request *rq = flush_rq->end_io_data; 350 request_queue_t *q = rq->q; 351 352 elv_completed_request(q, flush_rq); 353 354 rq->flags |= REQ_BAR_PREFLUSH; 355 356 if (!flush_rq->errors) 357 elv_requeue_request(q, rq); 358 else { 359 q->end_flush_fn(q, flush_rq); 360 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 361 q->request_fn(q); 362 } 363} 364 365static void blk_post_flush_end_io(struct request *flush_rq) 366{ 367 struct request *rq = flush_rq->end_io_data; 368 request_queue_t *q = rq->q; 369 370 elv_completed_request(q, flush_rq); 371 372 rq->flags |= REQ_BAR_POSTFLUSH; 373 374 q->end_flush_fn(q, flush_rq); 375 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 376 q->request_fn(q); 377} 378 379struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq) 380{ 381 struct request *flush_rq = q->flush_rq; 382 383 BUG_ON(!blk_barrier_rq(rq)); 384 385 if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags)) 386 return NULL; 387 388 rq_init(q, flush_rq); 389 flush_rq->elevator_private = NULL; 390 flush_rq->flags = REQ_BAR_FLUSH; 391 flush_rq->rq_disk = rq->rq_disk; 392 flush_rq->rl = NULL; 393 394 /* 395 * prepare_flush returns 0 if no flush is needed, just mark both 396 * pre and post flush as done in that case 397 */ 398 if (!q->prepare_flush_fn(q, flush_rq)) { 399 rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH; 400 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 401 return rq; 402 } 403 404 /* 405 * some drivers dequeue requests right away, some only after io 406 * completion. make sure the request is dequeued. 
407 */ 408 if (!list_empty(&rq->queuelist)) 409 blkdev_dequeue_request(rq); 410 411 flush_rq->end_io_data = rq; 412 flush_rq->end_io = blk_pre_flush_end_io; 413 414 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 415 return flush_rq; 416} 417 418static void blk_start_post_flush(request_queue_t *q, struct request *rq) 419{ 420 struct request *flush_rq = q->flush_rq; 421 422 BUG_ON(!blk_barrier_rq(rq)); 423 424 rq_init(q, flush_rq); 425 flush_rq->elevator_private = NULL; 426 flush_rq->flags = REQ_BAR_FLUSH; 427 flush_rq->rq_disk = rq->rq_disk; 428 flush_rq->rl = NULL; 429 430 if (q->prepare_flush_fn(q, flush_rq)) { 431 flush_rq->end_io_data = rq; 432 flush_rq->end_io = blk_post_flush_end_io; 433 434 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 435 q->request_fn(q); 436 } 437} 438 439static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq, 440 int sectors) 441{ 442 if (sectors > rq->nr_sectors) 443 sectors = rq->nr_sectors; 444 445 rq->nr_sectors -= sectors; 446 return rq->nr_sectors; 447} 448 449static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq, 450 int sectors, int queue_locked) 451{ 452 if (q->ordered != QUEUE_ORDERED_FLUSH) 453 return 0; 454 if (!blk_fs_request(rq) || !blk_barrier_rq(rq)) 455 return 0; 456 if (blk_barrier_postflush(rq)) 457 return 0; 458 459 if (!blk_check_end_barrier(q, rq, sectors)) { 460 unsigned long flags = 0; 461 462 if (!queue_locked) 463 spin_lock_irqsave(q->queue_lock, flags); 464 465 blk_start_post_flush(q, rq); 466 467 if (!queue_locked) 468 spin_unlock_irqrestore(q->queue_lock, flags); 469 } 470 471 return 1; 472} 473 474/** 475 * blk_complete_barrier_rq - complete possible barrier request 476 * @q: the request queue for the device 477 * @rq: the request 478 * @sectors: number of sectors to complete 479 * 480 * Description: 481 * Used in driver end_io handling to determine whether to postpone 482 * completion of a barrier request until a post flush has been done. This 483 * is the unlocked variant, used if the caller doesn't already hold the 484 * queue lock. 485 **/ 486int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors) 487{ 488 return __blk_complete_barrier_rq(q, rq, sectors, 0); 489} 490EXPORT_SYMBOL(blk_complete_barrier_rq); 491 492/** 493 * blk_complete_barrier_rq_locked - complete possible barrier request 494 * @q: the request queue for the device 495 * @rq: the request 496 * @sectors: number of sectors to complete 497 * 498 * Description: 499 * See blk_complete_barrier_rq(). This variant must be used if the caller 500 * holds the queue lock. 501 **/ 502int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq, 503 int sectors) 504{ 505 return __blk_complete_barrier_rq(q, rq, sectors, 1); 506} 507EXPORT_SYMBOL(blk_complete_barrier_rq_locked); 508 509/** 510 * blk_queue_bounce_limit - set bounce buffer limit for queue 511 * @q: the request queue for the device 512 * @dma_addr: bus address limit 513 * 514 * Description: 515 * Different hardware can have different requirements as to what pages 516 * it can do I/O directly to. A low level driver can call 517 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 518 * buffers for doing I/O to pages residing above @page. By default 519 * the block layer sets this to the highest numbered "low" memory page. 
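
/*
 * Editor's note: a hypothetical example of a driver combining the limit
 * helpers in this file, for a controller that can only DMA below 4GB and
 * takes at most 64kB and 32 segments per request.  The values are
 * illustrative, not recommendations.
 */
#if 0	/* illustrative only */
static void my_set_limits(request_queue_t *q)
{
	blk_queue_bounce_limit(q, 0xffffffffULL);	/* bounce pages above 4GB */
	blk_queue_max_sectors(q, 128);			/* 128 * 512b = 64kB */
	blk_queue_max_phys_segments(q, 32);
	blk_queue_max_hw_segments(q, 32);
	blk_queue_max_segment_size(q, 65536);
	blk_queue_hardsect_size(q, 512);
}
#endif
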
520 **/ 521void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) 522{ 523 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; 524 525 /* 526 * set appropriate bounce gfp mask -- unfortunately we don't have a 527 * full 4GB zone, so we have to resort to low memory for any bounces. 528 * ISA has its own < 16MB zone. 529 */ 530 if (bounce_pfn < blk_max_low_pfn) { 531 BUG_ON(dma_addr < BLK_BOUNCE_ISA); 532 init_emergency_isa_pool(); 533 q->bounce_gfp = GFP_NOIO | GFP_DMA; 534 } else 535 q->bounce_gfp = GFP_NOIO; 536 537 q->bounce_pfn = bounce_pfn; 538} 539 540EXPORT_SYMBOL(blk_queue_bounce_limit); 541 542/** 543 * blk_queue_max_sectors - set max sectors for a request for this queue 544 * @q: the request queue for the device 545 * @max_sectors: max sectors in the usual 512b unit 546 * 547 * Description: 548 * Enables a low level driver to set an upper limit on the size of 549 * received requests. 550 **/ 551void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors) 552{ 553 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 554 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 555 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); 556 } 557 558 q->max_sectors = q->max_hw_sectors = max_sectors; 559} 560 561EXPORT_SYMBOL(blk_queue_max_sectors); 562 563/** 564 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 565 * @q: the request queue for the device 566 * @max_segments: max number of segments 567 * 568 * Description: 569 * Enables a low level driver to set an upper limit on the number of 570 * physical data segments in a request. This would be the largest sized 571 * scatter list the driver could handle. 572 **/ 573void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments) 574{ 575 if (!max_segments) { 576 max_segments = 1; 577 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 578 } 579 580 q->max_phys_segments = max_segments; 581} 582 583EXPORT_SYMBOL(blk_queue_max_phys_segments); 584 585/** 586 * blk_queue_max_hw_segments - set max hw segments for a request for this queue 587 * @q: the request queue for the device 588 * @max_segments: max number of segments 589 * 590 * Description: 591 * Enables a low level driver to set an upper limit on the number of 592 * hw data segments in a request. This would be the largest number of 593 * address/length pairs the host adapter can actually give as once 594 * to the device. 
595 **/ 596void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments) 597{ 598 if (!max_segments) { 599 max_segments = 1; 600 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 601 } 602 603 q->max_hw_segments = max_segments; 604} 605 606EXPORT_SYMBOL(blk_queue_max_hw_segments); 607 608/** 609 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 610 * @q: the request queue for the device 611 * @max_size: max size of segment in bytes 612 * 613 * Description: 614 * Enables a low level driver to set an upper limit on the size of a 615 * coalesced segment 616 **/ 617void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size) 618{ 619 if (max_size < PAGE_CACHE_SIZE) { 620 max_size = PAGE_CACHE_SIZE; 621 printk("%s: set to minimum %d\n", __FUNCTION__, max_size); 622 } 623 624 q->max_segment_size = max_size; 625} 626 627EXPORT_SYMBOL(blk_queue_max_segment_size); 628 629/** 630 * blk_queue_hardsect_size - set hardware sector size for the queue 631 * @q: the request queue for the device 632 * @size: the hardware sector size, in bytes 633 * 634 * Description: 635 * This should typically be set to the lowest possible sector size 636 * that the hardware can operate on (possible without reverting to 637 * even internal read-modify-write operations). Usually the default 638 * of 512 covers most hardware. 639 **/ 640void blk_queue_hardsect_size(request_queue_t *q, unsigned short size) 641{ 642 q->hardsect_size = size; 643} 644 645EXPORT_SYMBOL(blk_queue_hardsect_size); 646 647/* 648 * Returns the minimum that is _not_ zero, unless both are zero. 649 */ 650#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 651 652/** 653 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 654 * @t: the stacking driver (top) 655 * @b: the underlying device (bottom) 656 **/ 657void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b) 658{ 659 /* zero is "infinity" */ 660 t->max_sectors = t->max_hw_sectors = 661 min_not_zero(t->max_sectors,b->max_sectors); 662 663 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); 664 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 665 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 666 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 667} 668 669EXPORT_SYMBOL(blk_queue_stack_limits); 670 671/** 672 * blk_queue_segment_boundary - set boundary rules for segment merging 673 * @q: the request queue for the device 674 * @mask: the memory boundary mask 675 **/ 676void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask) 677{ 678 if (mask < PAGE_CACHE_SIZE - 1) { 679 mask = PAGE_CACHE_SIZE - 1; 680 printk("%s: set to minimum %lx\n", __FUNCTION__, mask); 681 } 682 683 q->seg_boundary_mask = mask; 684} 685 686EXPORT_SYMBOL(blk_queue_segment_boundary); 687 688/** 689 * blk_queue_dma_alignment - set dma length and memory alignment 690 * @q: the request queue for the device 691 * @mask: alignment mask 692 * 693 * description: 694 * set required memory and length aligment for direct dma transactions. 695 * this is used when buiding direct io requests for the queue. 
696 * 697 **/ 698void blk_queue_dma_alignment(request_queue_t *q, int mask) 699{ 700 q->dma_alignment = mask; 701} 702 703EXPORT_SYMBOL(blk_queue_dma_alignment); 704 705/** 706 * blk_queue_find_tag - find a request by its tag and queue 707 * @q: The request queue for the device 708 * @tag: The tag of the request 709 * 710 * Notes: 711 * Should be used when a device returns a tag and you want to match 712 * it with a request. 713 * 714 * no locks need be held. 715 **/ 716struct request *blk_queue_find_tag(request_queue_t *q, int tag) 717{ 718 struct blk_queue_tag *bqt = q->queue_tags; 719 720 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) 721 return NULL; 722 723 return bqt->tag_index[tag]; 724} 725 726EXPORT_SYMBOL(blk_queue_find_tag); 727 728/** 729 * __blk_queue_free_tags - release tag maintenance info 730 * @q: the request queue for the device 731 * 732 * Notes: 733 * blk_cleanup_queue() will take care of calling this function, if tagging 734 * has been used. So there's no need to call this directly. 735 **/ 736static void __blk_queue_free_tags(request_queue_t *q) 737{ 738 struct blk_queue_tag *bqt = q->queue_tags; 739 740 if (!bqt) 741 return; 742 743 if (atomic_dec_and_test(&bqt->refcnt)) { 744 BUG_ON(bqt->busy); 745 BUG_ON(!list_empty(&bqt->busy_list)); 746 747 kfree(bqt->tag_index); 748 bqt->tag_index = NULL; 749 750 kfree(bqt->tag_map); 751 bqt->tag_map = NULL; 752 753 kfree(bqt); 754 } 755 756 q->queue_tags = NULL; 757 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); 758} 759 760/** 761 * blk_queue_free_tags - release tag maintenance info 762 * @q: the request queue for the device 763 * 764 * Notes: 765 * This is used to disabled tagged queuing to a device, yet leave 766 * queue in function. 767 **/ 768void blk_queue_free_tags(request_queue_t *q) 769{ 770 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 771} 772 773EXPORT_SYMBOL(blk_queue_free_tags); 774 775static int 776init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) 777{ 778 struct request **tag_index; 779 unsigned long *tag_map; 780 int nr_ulongs; 781 782 if (depth > q->nr_requests * 2) { 783 depth = q->nr_requests * 2; 784 printk(KERN_ERR "%s: adjusted depth to %d\n", 785 __FUNCTION__, depth); 786 } 787 788 tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); 789 if (!tag_index) 790 goto fail; 791 792 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 793 tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 794 if (!tag_map) 795 goto fail; 796 797 memset(tag_index, 0, depth * sizeof(struct request *)); 798 memset(tag_map, 0, nr_ulongs * sizeof(unsigned long)); 799 tags->real_max_depth = depth; 800 tags->max_depth = depth; 801 tags->tag_index = tag_index; 802 tags->tag_map = tag_map; 803 804 return 0; 805fail: 806 kfree(tag_index); 807 return -ENOMEM; 808} 809 810/** 811 * blk_queue_init_tags - initialize the queue tag info 812 * @q: the request queue for the device 813 * @depth: the maximum queue depth supported 814 * @tags: the tag to use 815 **/ 816int blk_queue_init_tags(request_queue_t *q, int depth, 817 struct blk_queue_tag *tags) 818{ 819 int rc; 820 821 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 822 823 if (!tags && !q->queue_tags) { 824 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); 825 if (!tags) 826 goto fail; 827 828 if (init_tag_map(q, tags, depth)) 829 goto fail; 830 831 INIT_LIST_HEAD(&tags->busy_list); 832 tags->busy = 0; 833 atomic_set(&tags->refcnt, 1); 834 } else if (q->queue_tags) { 835 if ((rc = blk_queue_resize_tags(q, 
depth))) 836 return rc; 837 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 838 return 0; 839 } else 840 atomic_inc(&tags->refcnt); 841 842 /* 843 * assign it, all done 844 */ 845 q->queue_tags = tags; 846 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); 847 return 0; 848fail: 849 kfree(tags); 850 return -ENOMEM; 851} 852 853EXPORT_SYMBOL(blk_queue_init_tags); 854 855/** 856 * blk_queue_resize_tags - change the queueing depth 857 * @q: the request queue for the device 858 * @new_depth: the new max command queueing depth 859 * 860 * Notes: 861 * Must be called with the queue lock held. 862 **/ 863int blk_queue_resize_tags(request_queue_t *q, int new_depth) 864{ 865 struct blk_queue_tag *bqt = q->queue_tags; 866 struct request **tag_index; 867 unsigned long *tag_map; 868 int max_depth, nr_ulongs; 869 870 if (!bqt) 871 return -ENXIO; 872 873 /* 874 * if we already have large enough real_max_depth. just 875 * adjust max_depth. *NOTE* as requests with tag value 876 * between new_depth and real_max_depth can be in-flight, tag 877 * map can not be shrunk blindly here. 878 */ 879 if (new_depth <= bqt->real_max_depth) { 880 bqt->max_depth = new_depth; 881 return 0; 882 } 883 884 /* 885 * save the old state info, so we can copy it back 886 */ 887 tag_index = bqt->tag_index; 888 tag_map = bqt->tag_map; 889 max_depth = bqt->real_max_depth; 890 891 if (init_tag_map(q, bqt, new_depth)) 892 return -ENOMEM; 893 894 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); 895 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; 896 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); 897 898 kfree(tag_index); 899 kfree(tag_map); 900 return 0; 901} 902 903EXPORT_SYMBOL(blk_queue_resize_tags); 904 905/** 906 * blk_queue_end_tag - end tag operations for a request 907 * @q: the request queue for the device 908 * @rq: the request that has completed 909 * 910 * Description: 911 * Typically called when end_that_request_first() returns 0, meaning 912 * all transfers have been done for a request. It's important to call 913 * this function before end_that_request_last(), as that will put the 914 * request back on the free list thus corrupting the internal tag list. 915 * 916 * Notes: 917 * queue lock must be held. 918 **/ 919void blk_queue_end_tag(request_queue_t *q, struct request *rq) 920{ 921 struct blk_queue_tag *bqt = q->queue_tags; 922 int tag = rq->tag; 923 924 BUG_ON(tag == -1); 925 926 if (unlikely(tag >= bqt->real_max_depth)) 927 /* 928 * This can happen after tag depth has been reduced. 929 * FIXME: how about a warning or info message here? 930 */ 931 return; 932 933 if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { 934 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", 935 __FUNCTION__, tag); 936 return; 937 } 938 939 list_del_init(&rq->queuelist); 940 rq->flags &= ~REQ_QUEUED; 941 rq->tag = -1; 942 943 if (unlikely(bqt->tag_index[tag] == NULL)) 944 printk(KERN_ERR "%s: tag %d is missing\n", 945 __FUNCTION__, tag); 946 947 bqt->tag_index[tag] = NULL; 948 bqt->busy--; 949} 950 951EXPORT_SYMBOL(blk_queue_end_tag); 952 953/** 954 * blk_queue_start_tag - find a free tag and assign it 955 * @q: the request queue for the device 956 * @rq: the block request that needs tagging 957 * 958 * Description: 959 * This can either be used as a stand-alone helper, or possibly be 960 * assigned as the queue &prep_rq_fn (in which case &struct request 961 * automagically gets a tag assigned). Note that this function 962 * assumes that any type of request can be queued! 
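
/*
 * Editor's note: a hypothetical sketch of the tag helpers in use -- set up
 * a tag map at init time, tag each request as it is dispatched and release
 * the tag on completion.  my_hw_submit and the depth of 64 are assumptions.
 */
#if 0	/* illustrative only */
static int my_init_tags(request_queue_t *q)
{
	return blk_queue_init_tags(q, 64, NULL);
}

static void my_tagged_request_fn(request_queue_t *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		/* assigns rq->tag and dequeues the request */
		if (blk_queue_start_tag(q, rq))
			break;			/* no free tag, retry later */
		my_hw_submit(rq, rq->tag);
	}
}

/* completion path, called with the queue lock held */
static void my_finish_tagged(request_queue_t *q, struct request *rq)
{
	blk_queue_end_tag(q, rq);
	/* then complete the request as usual */
}
#endif
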
if this is not 963 * true for your device, you must check the request type before 964 * calling this function. The request will also be removed from 965 * the request queue, so it's the drivers responsibility to readd 966 * it if it should need to be restarted for some reason. 967 * 968 * Notes: 969 * queue lock must be held. 970 **/ 971int blk_queue_start_tag(request_queue_t *q, struct request *rq) 972{ 973 struct blk_queue_tag *bqt = q->queue_tags; 974 int tag; 975 976 if (unlikely((rq->flags & REQ_QUEUED))) { 977 printk(KERN_ERR 978 "%s: request %p for device [%s] already tagged %d", 979 __FUNCTION__, rq, 980 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 981 BUG(); 982 } 983 984 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 985 if (tag >= bqt->max_depth) 986 return 1; 987 988 __set_bit(tag, bqt->tag_map); 989 990 rq->flags |= REQ_QUEUED; 991 rq->tag = tag; 992 bqt->tag_index[tag] = rq; 993 blkdev_dequeue_request(rq); 994 list_add(&rq->queuelist, &bqt->busy_list); 995 bqt->busy++; 996 return 0; 997} 998 999EXPORT_SYMBOL(blk_queue_start_tag); 1000 1001/** 1002 * blk_queue_invalidate_tags - invalidate all pending tags 1003 * @q: the request queue for the device 1004 * 1005 * Description: 1006 * Hardware conditions may dictate a need to stop all pending requests. 1007 * In this case, we will safely clear the block side of the tag queue and 1008 * readd all requests to the request queue in the right order. 1009 * 1010 * Notes: 1011 * queue lock must be held. 1012 **/ 1013void blk_queue_invalidate_tags(request_queue_t *q) 1014{ 1015 struct blk_queue_tag *bqt = q->queue_tags; 1016 struct list_head *tmp, *n; 1017 struct request *rq; 1018 1019 list_for_each_safe(tmp, n, &bqt->busy_list) { 1020 rq = list_entry_rq(tmp); 1021 1022 if (rq->tag == -1) { 1023 printk(KERN_ERR 1024 "%s: bad tag found on list\n", __FUNCTION__); 1025 list_del_init(&rq->queuelist); 1026 rq->flags &= ~REQ_QUEUED; 1027 } else 1028 blk_queue_end_tag(q, rq); 1029 1030 rq->flags &= ~REQ_STARTED; 1031 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1032 } 1033} 1034 1035EXPORT_SYMBOL(blk_queue_invalidate_tags); 1036 1037static char *rq_flags[] = { 1038 "REQ_RW", 1039 "REQ_FAILFAST", 1040 "REQ_SORTED", 1041 "REQ_SOFTBARRIER", 1042 "REQ_HARDBARRIER", 1043 "REQ_CMD", 1044 "REQ_NOMERGE", 1045 "REQ_STARTED", 1046 "REQ_DONTPREP", 1047 "REQ_QUEUED", 1048 "REQ_ELVPRIV", 1049 "REQ_PC", 1050 "REQ_BLOCK_PC", 1051 "REQ_SENSE", 1052 "REQ_FAILED", 1053 "REQ_QUIET", 1054 "REQ_SPECIAL", 1055 "REQ_DRIVE_CMD", 1056 "REQ_DRIVE_TASK", 1057 "REQ_DRIVE_TASKFILE", 1058 "REQ_PREEMPT", 1059 "REQ_PM_SUSPEND", 1060 "REQ_PM_RESUME", 1061 "REQ_PM_SHUTDOWN", 1062}; 1063 1064void blk_dump_rq_flags(struct request *rq, char *msg) 1065{ 1066 int bit; 1067 1068 printk("%s: dev %s: flags = ", msg, 1069 rq->rq_disk ? 
rq->rq_disk->disk_name : "?"); 1070 bit = 0; 1071 do { 1072 if (rq->flags & (1 << bit)) 1073 printk("%s ", rq_flags[bit]); 1074 bit++; 1075 } while (bit < __REQ_NR_BITS); 1076 1077 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, 1078 rq->nr_sectors, 1079 rq->current_nr_sectors); 1080 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); 1081 1082 if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) { 1083 printk("cdb: "); 1084 for (bit = 0; bit < sizeof(rq->cmd); bit++) 1085 printk("%02x ", rq->cmd[bit]); 1086 printk("\n"); 1087 } 1088} 1089 1090EXPORT_SYMBOL(blk_dump_rq_flags); 1091 1092void blk_recount_segments(request_queue_t *q, struct bio *bio) 1093{ 1094 struct bio_vec *bv, *bvprv = NULL; 1095 int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster; 1096 int high, highprv = 1; 1097 1098 if (unlikely(!bio->bi_io_vec)) 1099 return; 1100 1101 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1102 hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0; 1103 bio_for_each_segment(bv, bio, i) { 1104 /* 1105 * the trick here is making sure that a high page is never 1106 * considered part of another segment, since that might 1107 * change with the bounce page. 1108 */ 1109 high = page_to_pfn(bv->bv_page) >= q->bounce_pfn; 1110 if (high || highprv) 1111 goto new_hw_segment; 1112 if (cluster) { 1113 if (seg_size + bv->bv_len > q->max_segment_size) 1114 goto new_segment; 1115 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) 1116 goto new_segment; 1117 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 1118 goto new_segment; 1119 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1120 goto new_hw_segment; 1121 1122 seg_size += bv->bv_len; 1123 hw_seg_size += bv->bv_len; 1124 bvprv = bv; 1125 continue; 1126 } 1127new_segment: 1128 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && 1129 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) { 1130 hw_seg_size += bv->bv_len; 1131 } else { 1132new_hw_segment: 1133 if (hw_seg_size > bio->bi_hw_front_size) 1134 bio->bi_hw_front_size = hw_seg_size; 1135 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; 1136 nr_hw_segs++; 1137 } 1138 1139 nr_phys_segs++; 1140 bvprv = bv; 1141 seg_size = bv->bv_len; 1142 highprv = high; 1143 } 1144 if (hw_seg_size > bio->bi_hw_back_size) 1145 bio->bi_hw_back_size = hw_seg_size; 1146 if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size) 1147 bio->bi_hw_front_size = hw_seg_size; 1148 bio->bi_phys_segments = nr_phys_segs; 1149 bio->bi_hw_segments = nr_hw_segs; 1150 bio->bi_flags |= (1 << BIO_SEG_VALID); 1151} 1152 1153 1154static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio, 1155 struct bio *nxt) 1156{ 1157 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) 1158 return 0; 1159 1160 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) 1161 return 0; 1162 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1163 return 0; 1164 1165 /* 1166 * bio and nxt are contigous in memory, check if the queue allows 1167 * these two to be merged into one 1168 */ 1169 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 1170 return 1; 1171 1172 return 0; 1173} 1174 1175static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio, 1176 struct bio *nxt) 1177{ 1178 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1179 blk_recount_segments(q, bio); 1180 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) 1181 blk_recount_segments(q, nxt); 1182 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || 1183 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_front_size + 
bio->bi_hw_back_size)) 1184 return 0; 1185 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1186 return 0; 1187 1188 return 1; 1189} 1190 1191/* 1192 * map a request to scatterlist, return number of sg entries setup. Caller 1193 * must make sure sg can hold rq->nr_phys_segments entries 1194 */ 1195int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg) 1196{ 1197 struct bio_vec *bvec, *bvprv; 1198 struct bio *bio; 1199 int nsegs, i, cluster; 1200 1201 nsegs = 0; 1202 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1203 1204 /* 1205 * for each bio in rq 1206 */ 1207 bvprv = NULL; 1208 rq_for_each_bio(bio, rq) { 1209 /* 1210 * for each segment in bio 1211 */ 1212 bio_for_each_segment(bvec, bio, i) { 1213 int nbytes = bvec->bv_len; 1214 1215 if (bvprv && cluster) { 1216 if (sg[nsegs - 1].length + nbytes > q->max_segment_size) 1217 goto new_segment; 1218 1219 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 1220 goto new_segment; 1221 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 1222 goto new_segment; 1223 1224 sg[nsegs - 1].length += nbytes; 1225 } else { 1226new_segment: 1227 memset(&sg[nsegs],0,sizeof(struct scatterlist)); 1228 sg[nsegs].page = bvec->bv_page; 1229 sg[nsegs].length = nbytes; 1230 sg[nsegs].offset = bvec->bv_offset; 1231 1232 nsegs++; 1233 } 1234 bvprv = bvec; 1235 } /* segments in bio */ 1236 } /* bios in rq */ 1237 1238 return nsegs; 1239} 1240 1241EXPORT_SYMBOL(blk_rq_map_sg); 1242 1243/* 1244 * the standard queue merge functions, can be overridden with device 1245 * specific ones if so desired 1246 */ 1247 1248static inline int ll_new_mergeable(request_queue_t *q, 1249 struct request *req, 1250 struct bio *bio) 1251{ 1252 int nr_phys_segs = bio_phys_segments(q, bio); 1253 1254 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1255 req->flags |= REQ_NOMERGE; 1256 if (req == q->last_merge) 1257 q->last_merge = NULL; 1258 return 0; 1259 } 1260 1261 /* 1262 * A hw segment is just getting larger, bump just the phys 1263 * counter. 1264 */ 1265 req->nr_phys_segments += nr_phys_segs; 1266 return 1; 1267} 1268 1269static inline int ll_new_hw_segment(request_queue_t *q, 1270 struct request *req, 1271 struct bio *bio) 1272{ 1273 int nr_hw_segs = bio_hw_segments(q, bio); 1274 int nr_phys_segs = bio_phys_segments(q, bio); 1275 1276 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 1277 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1278 req->flags |= REQ_NOMERGE; 1279 if (req == q->last_merge) 1280 q->last_merge = NULL; 1281 return 0; 1282 } 1283 1284 /* 1285 * This will form the start of a new hw segment. Bump both 1286 * counters. 
1287 */ 1288 req->nr_hw_segments += nr_hw_segs; 1289 req->nr_phys_segments += nr_phys_segs; 1290 return 1; 1291} 1292 1293static int ll_back_merge_fn(request_queue_t *q, struct request *req, 1294 struct bio *bio) 1295{ 1296 int len; 1297 1298 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { 1299 req->flags |= REQ_NOMERGE; 1300 if (req == q->last_merge) 1301 q->last_merge = NULL; 1302 return 0; 1303 } 1304 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) 1305 blk_recount_segments(q, req->biotail); 1306 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1307 blk_recount_segments(q, bio); 1308 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; 1309 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && 1310 !BIOVEC_VIRT_OVERSIZE(len)) { 1311 int mergeable = ll_new_mergeable(q, req, bio); 1312 1313 if (mergeable) { 1314 if (req->nr_hw_segments == 1) 1315 req->bio->bi_hw_front_size = len; 1316 if (bio->bi_hw_segments == 1) 1317 bio->bi_hw_back_size = len; 1318 } 1319 return mergeable; 1320 } 1321 1322 return ll_new_hw_segment(q, req, bio); 1323} 1324 1325static int ll_front_merge_fn(request_queue_t *q, struct request *req, 1326 struct bio *bio) 1327{ 1328 int len; 1329 1330 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { 1331 req->flags |= REQ_NOMERGE; 1332 if (req == q->last_merge) 1333 q->last_merge = NULL; 1334 return 0; 1335 } 1336 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; 1337 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1338 blk_recount_segments(q, bio); 1339 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) 1340 blk_recount_segments(q, req->bio); 1341 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && 1342 !BIOVEC_VIRT_OVERSIZE(len)) { 1343 int mergeable = ll_new_mergeable(q, req, bio); 1344 1345 if (mergeable) { 1346 if (bio->bi_hw_segments == 1) 1347 bio->bi_hw_front_size = len; 1348 if (req->nr_hw_segments == 1) 1349 req->biotail->bi_hw_back_size = len; 1350 } 1351 return mergeable; 1352 } 1353 1354 return ll_new_hw_segment(q, req, bio); 1355} 1356 1357static int ll_merge_requests_fn(request_queue_t *q, struct request *req, 1358 struct request *next) 1359{ 1360 int total_phys_segments; 1361 int total_hw_segments; 1362 1363 /* 1364 * First check if the either of the requests are re-queued 1365 * requests. Can't merge them if they are. 1366 */ 1367 if (req->special || next->special) 1368 return 0; 1369 1370 /* 1371 * Will it become too large? 1372 */ 1373 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) 1374 return 0; 1375 1376 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 1377 if (blk_phys_contig_segment(q, req->biotail, next->bio)) 1378 total_phys_segments--; 1379 1380 if (total_phys_segments > q->max_phys_segments) 1381 return 0; 1382 1383 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 1384 if (blk_hw_contig_segment(q, req->biotail, next->bio)) { 1385 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; 1386 /* 1387 * propagate the combined length to the end of the requests 1388 */ 1389 if (req->nr_hw_segments == 1) 1390 req->bio->bi_hw_front_size = len; 1391 if (next->nr_hw_segments == 1) 1392 next->biotail->bi_hw_back_size = len; 1393 total_hw_segments--; 1394 } 1395 1396 if (total_hw_segments > q->max_hw_segments) 1397 return 0; 1398 1399 /* Merge is OK... 
*/ 1400 req->nr_phys_segments = total_phys_segments; 1401 req->nr_hw_segments = total_hw_segments; 1402 return 1; 1403} 1404 1405/* 1406 * "plug" the device if there are no outstanding requests: this will 1407 * force the transfer to start only after we have put all the requests 1408 * on the list. 1409 * 1410 * This is called with interrupts off and no requests on the queue and 1411 * with the queue lock held. 1412 */ 1413void blk_plug_device(request_queue_t *q) 1414{ 1415 WARN_ON(!irqs_disabled()); 1416 1417 /* 1418 * don't plug a stopped queue, it must be paired with blk_start_queue() 1419 * which will restart the queueing 1420 */ 1421 if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) 1422 return; 1423 1424 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1425 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1426} 1427 1428EXPORT_SYMBOL(blk_plug_device); 1429 1430/* 1431 * remove the queue from the plugged list, if present. called with 1432 * queue lock held and interrupts disabled. 1433 */ 1434int blk_remove_plug(request_queue_t *q) 1435{ 1436 WARN_ON(!irqs_disabled()); 1437 1438 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1439 return 0; 1440 1441 del_timer(&q->unplug_timer); 1442 return 1; 1443} 1444 1445EXPORT_SYMBOL(blk_remove_plug); 1446 1447/* 1448 * remove the plug and let it rip.. 1449 */ 1450void __generic_unplug_device(request_queue_t *q) 1451{ 1452 if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))) 1453 return; 1454 1455 if (!blk_remove_plug(q)) 1456 return; 1457 1458 q->request_fn(q); 1459} 1460EXPORT_SYMBOL(__generic_unplug_device); 1461 1462/** 1463 * generic_unplug_device - fire a request queue 1464 * @q: The &request_queue_t in question 1465 * 1466 * Description: 1467 * Linux uses plugging to build bigger requests queues before letting 1468 * the device have at them. If a queue is plugged, the I/O scheduler 1469 * is still adding and merging requests on the queue. Once the queue 1470 * gets unplugged, the request_fn defined for the queue is invoked and 1471 * transfers started. 1472 **/ 1473void generic_unplug_device(request_queue_t *q) 1474{ 1475 spin_lock_irq(q->queue_lock); 1476 __generic_unplug_device(q); 1477 spin_unlock_irq(q->queue_lock); 1478} 1479EXPORT_SYMBOL(generic_unplug_device); 1480 1481static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 1482 struct page *page) 1483{ 1484 request_queue_t *q = bdi->unplug_io_data; 1485 1486 /* 1487 * devices don't necessarily have an ->unplug_fn defined 1488 */ 1489 if (q->unplug_fn) 1490 q->unplug_fn(q); 1491} 1492 1493static void blk_unplug_work(void *data) 1494{ 1495 request_queue_t *q = data; 1496 1497 q->unplug_fn(q); 1498} 1499 1500static void blk_unplug_timeout(unsigned long data) 1501{ 1502 request_queue_t *q = (request_queue_t *)data; 1503 1504 kblockd_schedule_work(&q->unplug_work); 1505} 1506 1507/** 1508 * blk_start_queue - restart a previously stopped queue 1509 * @q: The &request_queue_t in question 1510 * 1511 * Description: 1512 * blk_start_queue() will clear the stop flag on the queue, and call 1513 * the request_fn for the queue if it was in a stopped state when 1514 * entered. Also see blk_stop_queue(). Queue lock must be held. 
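
/*
 * Editor's note: a hypothetical sketch of the stop/start pattern described
 * above -- stop dispatching when the hardware reports it is full, restart
 * from the completion interrupt.  struct my_dev, my_hw_full and
 * my_hw_submit are assumptions.
 */
#if 0	/* illustrative only */
static void my_request_fn(request_queue_t *q)
{
	struct my_dev *dev = q->queuedata;
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		if (my_hw_full(dev)) {
			blk_stop_queue(q);	/* queue lock already held here */
			break;
		}
		blkdev_dequeue_request(rq);
		my_hw_submit(dev, rq);
	}
}

static void my_completion_irq(struct my_dev *dev)
{
	unsigned long flags;

	spin_lock_irqsave(dev->queue->queue_lock, flags);
	blk_start_queue(dev->queue);		/* room again, kick dispatch */
	spin_unlock_irqrestore(dev->queue->queue_lock, flags);
}
#endif
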
1515 **/ 1516void blk_start_queue(request_queue_t *q) 1517{ 1518 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1519 1520 /* 1521 * one level of recursion is ok and is much faster than kicking 1522 * the unplug handling 1523 */ 1524 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1525 q->request_fn(q); 1526 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1527 } else { 1528 blk_plug_device(q); 1529 kblockd_schedule_work(&q->unplug_work); 1530 } 1531} 1532 1533EXPORT_SYMBOL(blk_start_queue); 1534 1535/** 1536 * blk_stop_queue - stop a queue 1537 * @q: The &request_queue_t in question 1538 * 1539 * Description: 1540 * The Linux block layer assumes that a block driver will consume all 1541 * entries on the request queue when the request_fn strategy is called. 1542 * Often this will not happen, because of hardware limitations (queue 1543 * depth settings). If a device driver gets a 'queue full' response, 1544 * or if it simply chooses not to queue more I/O at one point, it can 1545 * call this function to prevent the request_fn from being called until 1546 * the driver has signalled it's ready to go again. This happens by calling 1547 * blk_start_queue() to restart queue operations. Queue lock must be held. 1548 **/ 1549void blk_stop_queue(request_queue_t *q) 1550{ 1551 blk_remove_plug(q); 1552 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1553} 1554EXPORT_SYMBOL(blk_stop_queue); 1555 1556/** 1557 * blk_sync_queue - cancel any pending callbacks on a queue 1558 * @q: the queue 1559 * 1560 * Description: 1561 * The block layer may perform asynchronous callback activity 1562 * on a queue, such as calling the unplug function after a timeout. 1563 * A block device may call blk_sync_queue to ensure that any 1564 * such activity is cancelled, thus allowing it to release resources 1565 * the the callbacks might use. The caller must already have made sure 1566 * that its ->make_request_fn will not re-add plugging prior to calling 1567 * this function. 1568 * 1569 */ 1570void blk_sync_queue(struct request_queue *q) 1571{ 1572 del_timer_sync(&q->unplug_timer); 1573 kblockd_flush(); 1574} 1575EXPORT_SYMBOL(blk_sync_queue); 1576 1577/** 1578 * blk_run_queue - run a single device queue 1579 * @q: The queue to run 1580 */ 1581void blk_run_queue(struct request_queue *q) 1582{ 1583 unsigned long flags; 1584 1585 spin_lock_irqsave(q->queue_lock, flags); 1586 blk_remove_plug(q); 1587 if (!elv_queue_empty(q)) 1588 q->request_fn(q); 1589 spin_unlock_irqrestore(q->queue_lock, flags); 1590} 1591EXPORT_SYMBOL(blk_run_queue); 1592 1593/** 1594 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed 1595 * @q: the request queue to be released 1596 * 1597 * Description: 1598 * blk_cleanup_queue is the pair to blk_init_queue() or 1599 * blk_queue_make_request(). It should be called when a request queue is 1600 * being released; typically when a block device is being de-registered. 1601 * Currently, its primary task it to free all the &struct request 1602 * structures that were allocated to the queue and the queue itself. 1603 * 1604 * Caveat: 1605 * Hopefully the low level driver will have finished any 1606 * outstanding requests first... 
1607 **/ 1608void blk_cleanup_queue(request_queue_t * q) 1609{ 1610 struct request_list *rl = &q->rq; 1611 1612 if (!atomic_dec_and_test(&q->refcnt)) 1613 return; 1614 1615 if (q->elevator) 1616 elevator_exit(q->elevator); 1617 1618 blk_sync_queue(q); 1619 1620 if (rl->rq_pool) 1621 mempool_destroy(rl->rq_pool); 1622 1623 if (q->queue_tags) 1624 __blk_queue_free_tags(q); 1625 1626 blk_queue_ordered(q, QUEUE_ORDERED_NONE); 1627 1628 kmem_cache_free(requestq_cachep, q); 1629} 1630 1631EXPORT_SYMBOL(blk_cleanup_queue); 1632 1633static int blk_init_free_list(request_queue_t *q) 1634{ 1635 struct request_list *rl = &q->rq; 1636 1637 rl->count[READ] = rl->count[WRITE] = 0; 1638 rl->starved[READ] = rl->starved[WRITE] = 0; 1639 rl->elvpriv = 0; 1640 init_waitqueue_head(&rl->wait[READ]); 1641 init_waitqueue_head(&rl->wait[WRITE]); 1642 1643 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 1644 mempool_free_slab, request_cachep, q->node); 1645 1646 if (!rl->rq_pool) 1647 return -ENOMEM; 1648 1649 return 0; 1650} 1651 1652static int __make_request(request_queue_t *, struct bio *); 1653 1654request_queue_t *blk_alloc_queue(gfp_t gfp_mask) 1655{ 1656 return blk_alloc_queue_node(gfp_mask, -1); 1657} 1658EXPORT_SYMBOL(blk_alloc_queue); 1659 1660request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 1661{ 1662 request_queue_t *q; 1663 1664 q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id); 1665 if (!q) 1666 return NULL; 1667 1668 memset(q, 0, sizeof(*q)); 1669 init_timer(&q->unplug_timer); 1670 atomic_set(&q->refcnt, 1); 1671 1672 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1673 q->backing_dev_info.unplug_io_data = q; 1674 1675 return q; 1676} 1677EXPORT_SYMBOL(blk_alloc_queue_node); 1678 1679/** 1680 * blk_init_queue - prepare a request queue for use with a block device 1681 * @rfn: The function to be called to process requests that have been 1682 * placed on the queue. 1683 * @lock: Request queue spin lock 1684 * 1685 * Description: 1686 * If a block device wishes to use the standard request handling procedures, 1687 * which sorts requests and coalesces adjacent requests, then it must 1688 * call blk_init_queue(). The function @rfn will be called when there 1689 * are requests on the queue that need to be processed. If the device 1690 * supports plugging, then @rfn may not be called immediately when requests 1691 * are available on the queue, but may be called at some time later instead. 1692 * Plugged queues are generally unplugged when a buffer belonging to one 1693 * of the requests on the queue is needed, or due to memory pressure. 1694 * 1695 * @rfn is not required, or even expected, to remove all requests off the 1696 * queue, but only as many as it can handle at a time. If it does leave 1697 * requests on the queue, it is responsible for arranging that the requests 1698 * get dealt with eventually. 1699 * 1700 * The queue spin lock must be held while manipulating the requests on the 1701 * request queue. 1702 * 1703 * Function returns a pointer to the initialized request queue, or NULL if 1704 * it didn't succeed. 1705 * 1706 * Note: 1707 * blk_init_queue() must be paired with a blk_cleanup_queue() call 1708 * when the block device is deactivated (such as at module unload). 
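
/*
 * Editor's note: a hypothetical sketch of the init/teardown pairing noted
 * above for a request_fn based driver.  my_request_fn is assumed to be
 * defined elsewhere; error handling is trimmed.
 */
#if 0	/* illustrative only */
static DEFINE_SPINLOCK(my_lock);
static request_queue_t *my_queue;

static int __init my_driver_init(void)
{
	my_queue = blk_init_queue(my_request_fn, &my_lock);
	if (!my_queue)
		return -ENOMEM;
	blk_queue_max_sectors(my_queue, 128);
	return 0;
}

static void __exit my_driver_exit(void)
{
	blk_cleanup_queue(my_queue);
}
#endif
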
1709 **/ 1710 1711request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 1712{ 1713 return blk_init_queue_node(rfn, lock, -1); 1714} 1715EXPORT_SYMBOL(blk_init_queue); 1716 1717request_queue_t * 1718blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1719{ 1720 request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 1721 1722 if (!q) 1723 return NULL; 1724 1725 q->node = node_id; 1726 if (blk_init_free_list(q)) 1727 goto out_init; 1728 1729 /* 1730 * if caller didn't supply a lock, they get per-queue locking with 1731 * our embedded lock 1732 */ 1733 if (!lock) { 1734 spin_lock_init(&q->__queue_lock); 1735 lock = &q->__queue_lock; 1736 } 1737 1738 q->request_fn = rfn; 1739 q->back_merge_fn = ll_back_merge_fn; 1740 q->front_merge_fn = ll_front_merge_fn; 1741 q->merge_requests_fn = ll_merge_requests_fn; 1742 q->prep_rq_fn = NULL; 1743 q->unplug_fn = generic_unplug_device; 1744 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 1745 q->queue_lock = lock; 1746 1747 blk_queue_segment_boundary(q, 0xffffffff); 1748 1749 blk_queue_make_request(q, __make_request); 1750 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 1751 1752 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 1753 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 1754 1755 /* 1756 * all done 1757 */ 1758 if (!elevator_init(q, NULL)) { 1759 blk_queue_congestion_threshold(q); 1760 return q; 1761 } 1762 1763 blk_cleanup_queue(q); 1764out_init: 1765 kmem_cache_free(requestq_cachep, q); 1766 return NULL; 1767} 1768EXPORT_SYMBOL(blk_init_queue_node); 1769 1770int blk_get_queue(request_queue_t *q) 1771{ 1772 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1773 atomic_inc(&q->refcnt); 1774 return 0; 1775 } 1776 1777 return 1; 1778} 1779 1780EXPORT_SYMBOL(blk_get_queue); 1781 1782static inline void blk_free_request(request_queue_t *q, struct request *rq) 1783{ 1784 if (rq->flags & REQ_ELVPRIV) 1785 elv_put_request(q, rq); 1786 mempool_free(rq, q->rq.rq_pool); 1787} 1788 1789static inline struct request * 1790blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, 1791 int priv, gfp_t gfp_mask) 1792{ 1793 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 1794 1795 if (!rq) 1796 return NULL; 1797 1798 /* 1799 * first three bits are identical in rq->flags and bio->bi_rw, 1800 * see bio.h and blkdev.h 1801 */ 1802 rq->flags = rw; 1803 1804 if (priv) { 1805 if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) { 1806 mempool_free(rq, q->rq.rq_pool); 1807 return NULL; 1808 } 1809 rq->flags |= REQ_ELVPRIV; 1810 } 1811 1812 return rq; 1813} 1814 1815/* 1816 * ioc_batching returns true if the ioc is a valid batching request and 1817 * should be given priority access to a request. 1818 */ 1819static inline int ioc_batching(request_queue_t *q, struct io_context *ioc) 1820{ 1821 if (!ioc) 1822 return 0; 1823 1824 /* 1825 * Make sure the process is able to allocate at least 1 request 1826 * even if the batch times out, otherwise we could theoretically 1827 * lose wakeups. 1828 */ 1829 return ioc->nr_batch_requests == q->nr_batching || 1830 (ioc->nr_batch_requests > 0 1831 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 1832} 1833 1834/* 1835 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 1836 * will cause the process to be a "batcher" on all queues in the system. This 1837 * is the behaviour we want though - once it gets a wakeup it should be given 1838 * a nice run. 
1839 */ 1840static void ioc_set_batching(request_queue_t *q, struct io_context *ioc) 1841{ 1842 if (!ioc || ioc_batching(q, ioc)) 1843 return; 1844 1845 ioc->nr_batch_requests = q->nr_batching; 1846 ioc->last_waited = jiffies; 1847} 1848 1849static void __freed_request(request_queue_t *q, int rw) 1850{ 1851 struct request_list *rl = &q->rq; 1852 1853 if (rl->count[rw] < queue_congestion_off_threshold(q)) 1854 clear_queue_congested(q, rw); 1855 1856 if (rl->count[rw] + 1 <= q->nr_requests) { 1857 if (waitqueue_active(&rl->wait[rw])) 1858 wake_up(&rl->wait[rw]); 1859 1860 blk_clear_queue_full(q, rw); 1861 } 1862} 1863 1864/* 1865 * A request has just been released. Account for it, update the full and 1866 * congestion status, wake up any waiters. Called under q->queue_lock. 1867 */ 1868static void freed_request(request_queue_t *q, int rw, int priv) 1869{ 1870 struct request_list *rl = &q->rq; 1871 1872 rl->count[rw]--; 1873 if (priv) 1874 rl->elvpriv--; 1875 1876 __freed_request(q, rw); 1877 1878 if (unlikely(rl->starved[rw ^ 1])) 1879 __freed_request(q, rw ^ 1); 1880} 1881 1882#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 1883/* 1884 * Get a free request, queue_lock must be held. 1885 * Returns NULL on failure, with queue_lock held. 1886 * Returns !NULL on success, with queue_lock *not held*. 1887 */ 1888static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, 1889 gfp_t gfp_mask) 1890{ 1891 struct request *rq = NULL; 1892 struct request_list *rl = &q->rq; 1893 struct io_context *ioc = current_io_context(GFP_ATOMIC); 1894 int priv; 1895 1896 if (rl->count[rw]+1 >= q->nr_requests) { 1897 /* 1898 * The queue will fill after this allocation, so set it as 1899 * full, and mark this process as "batching". This process 1900 * will be allowed to complete a batch of requests, others 1901 * will be blocked. 1902 */ 1903 if (!blk_queue_full(q, rw)) { 1904 ioc_set_batching(q, ioc); 1905 blk_set_queue_full(q, rw); 1906 } 1907 } 1908 1909 switch (elv_may_queue(q, rw, bio)) { 1910 case ELV_MQUEUE_NO: 1911 goto rq_starved; 1912 case ELV_MQUEUE_MAY: 1913 break; 1914 case ELV_MQUEUE_MUST: 1915 goto get_rq; 1916 } 1917 1918 if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) { 1919 /* 1920 * The queue is full and the allocating process is not a 1921 * "batcher", and not exempted by the IO scheduler 1922 */ 1923 goto out; 1924 } 1925 1926get_rq: 1927 /* 1928 * Only allow batching queuers to allocate up to 50% over the defined 1929 * limit of requests, otherwise we could have thousands of requests 1930 * allocated with any setting of ->nr_requests 1931 */ 1932 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 1933 goto out; 1934 1935 rl->count[rw]++; 1936 rl->starved[rw] = 0; 1937 if (rl->count[rw] >= queue_congestion_on_threshold(q)) 1938 set_queue_congested(q, rw); 1939 1940 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1941 if (priv) 1942 rl->elvpriv++; 1943 1944 spin_unlock_irq(q->queue_lock); 1945 1946 rq = blk_alloc_request(q, rw, bio, priv, gfp_mask); 1947 if (!rq) { 1948 /* 1949 * Allocation failed presumably due to memory. Undo anything 1950 * we might have messed up. 1951 * 1952 * Allocating task should really be put onto the front of the 1953 * wait queue, but this is pretty rare. 
1954 */ 1955 spin_lock_irq(q->queue_lock); 1956 freed_request(q, rw, priv); 1957 1958 /* 1959 * in the very unlikely event that allocation failed and no 1960 * requests for this direction was pending, mark us starved 1961 * so that freeing of a request in the other direction will 1962 * notice us. another possible fix would be to split the 1963 * rq mempool into READ and WRITE 1964 */ 1965rq_starved: 1966 if (unlikely(rl->count[rw] == 0)) 1967 rl->starved[rw] = 1; 1968 1969 goto out; 1970 } 1971 1972 if (ioc_batching(q, ioc)) 1973 ioc->nr_batch_requests--; 1974 1975 rq_init(q, rq); 1976 rq->rl = rl; 1977out: 1978 return rq; 1979} 1980 1981/* 1982 * No available requests for this queue, unplug the device and wait for some 1983 * requests to become available. 1984 * 1985 * Called with q->queue_lock held, and returns with it unlocked. 1986 */ 1987static struct request *get_request_wait(request_queue_t *q, int rw, 1988 struct bio *bio) 1989{ 1990 struct request *rq; 1991 1992 rq = get_request(q, rw, bio, GFP_NOIO); 1993 while (!rq) { 1994 DEFINE_WAIT(wait); 1995 struct request_list *rl = &q->rq; 1996 1997 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 1998 TASK_UNINTERRUPTIBLE); 1999 2000 rq = get_request(q, rw, bio, GFP_NOIO); 2001 2002 if (!rq) { 2003 struct io_context *ioc; 2004 2005 __generic_unplug_device(q); 2006 spin_unlock_irq(q->queue_lock); 2007 io_schedule(); 2008 2009 /* 2010 * After sleeping, we become a "batching" process and 2011 * will be able to allocate at least one request, and 2012 * up to a big batch of them for a small period time. 2013 * See ioc_batching, ioc_set_batching 2014 */ 2015 ioc = current_io_context(GFP_NOIO); 2016 ioc_set_batching(q, ioc); 2017 2018 spin_lock_irq(q->queue_lock); 2019 } 2020 finish_wait(&rl->wait[rw], &wait); 2021 } 2022 2023 return rq; 2024} 2025 2026struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask) 2027{ 2028 struct request *rq; 2029 2030 BUG_ON(rw != READ && rw != WRITE); 2031 2032 spin_lock_irq(q->queue_lock); 2033 if (gfp_mask & __GFP_WAIT) { 2034 rq = get_request_wait(q, rw, NULL); 2035 } else { 2036 rq = get_request(q, rw, NULL, gfp_mask); 2037 if (!rq) 2038 spin_unlock_irq(q->queue_lock); 2039 } 2040 /* q->queue_lock is unlocked at this point */ 2041 2042 return rq; 2043} 2044EXPORT_SYMBOL(blk_get_request); 2045 2046/** 2047 * blk_requeue_request - put a request back on queue 2048 * @q: request queue where request should be inserted 2049 * @rq: request to be inserted 2050 * 2051 * Description: 2052 * Drivers often keep queueing requests until the hardware cannot accept 2053 * more, when that condition happens we need to put the request back 2054 * on the queue. Must be called with queue lock held. 2055 */ 2056void blk_requeue_request(request_queue_t *q, struct request *rq) 2057{ 2058 if (blk_rq_tagged(rq)) 2059 blk_queue_end_tag(q, rq); 2060 2061 elv_requeue_request(q, rq); 2062} 2063 2064EXPORT_SYMBOL(blk_requeue_request); 2065 2066/** 2067 * blk_insert_request - insert a special request in to a request queue 2068 * @q: request queue where request should be inserted 2069 * @rq: request to be inserted 2070 * @at_head: insert request at head or tail of queue 2071 * @data: private data 2072 * 2073 * Description: 2074 * Many block devices need to execute commands asynchronously, so they don't 2075 * block the whole kernel from preemption during request execution. 
This is 2076 * accomplished normally by inserting artificial requests tagged as 2077 * REQ_SPECIAL into the corresponding request queue, and letting them be 2078 * scheduled for actual execution by the request queue. 2079 * 2080 * We have the option of inserting at the head or the tail of the queue. 2081 * Typically we use the tail for new ioctls and so forth. We use the head 2082 * of the queue for things like a QUEUE_FULL message from a device, or a 2083 * host that is unable to accept a particular command. 2084 */ 2085void blk_insert_request(request_queue_t *q, struct request *rq, 2086 int at_head, void *data) 2087{ 2088 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2089 unsigned long flags; 2090 2091 /* 2092 * tell I/O scheduler that this isn't a regular read/write (ie it 2093 * must not attempt merges on this) and that it acts as a soft 2094 * barrier 2095 */ 2096 rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER; 2097 2098 rq->special = data; 2099 2100 spin_lock_irqsave(q->queue_lock, flags); 2101 2102 /* 2103 * If command is tagged, release the tag 2104 */ 2105 if (blk_rq_tagged(rq)) 2106 blk_queue_end_tag(q, rq); 2107 2108 drive_stat_acct(rq, rq->nr_sectors, 1); 2109 __elv_add_request(q, rq, where, 0); 2110 2111 if (blk_queue_plugged(q)) 2112 __generic_unplug_device(q); 2113 else 2114 q->request_fn(q); 2115 spin_unlock_irqrestore(q->queue_lock, flags); 2116} 2117 2118EXPORT_SYMBOL(blk_insert_request); 2119 2120/** 2121 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 2122 * @q: request queue where request should be inserted 2123 * @rq: request structure to fill 2124 * @ubuf: the user buffer 2125 * @len: length of user data 2126 * 2127 * Description: 2128 * Data will be mapped directly for zero copy io, if possible. Otherwise 2129 * a kernel bounce buffer is used. 2130 * 2131 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2132 * still in process context. 2133 * 2134 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2135 * before being submitted to the device, as pages mapped may be out of 2136 * reach. It's the caller's responsibility to make sure this happens. The 2137 * original bio must be passed back into blk_rq_unmap_user() for proper 2138 * unmapping. 2139 */ 2140int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf, 2141 unsigned int len) 2142{ 2143 unsigned long uaddr; 2144 struct bio *bio; 2145 int reading; 2146 2147 if (len > (q->max_sectors << 9)) 2148 return -EINVAL; 2149 if (!len || !ubuf) 2150 return -EINVAL; 2151 2152 reading = rq_data_dir(rq) == READ; 2153 2154 /* 2155 * if alignment requirement is satisfied, map in user pages for 2156 * direct dma.
else, set up kernel bounce buffers 2157 */ 2158 uaddr = (unsigned long) ubuf; 2159 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) 2160 bio = bio_map_user(q, NULL, uaddr, len, reading); 2161 else 2162 bio = bio_copy_user(q, uaddr, len, reading); 2163 2164 if (!IS_ERR(bio)) { 2165 rq->bio = rq->biotail = bio; 2166 blk_rq_bio_prep(q, rq, bio); 2167 2168 rq->buffer = rq->data = NULL; 2169 rq->data_len = len; 2170 return 0; 2171 } 2172 2173 /* 2174 * bio is the err-ptr 2175 */ 2176 return PTR_ERR(bio); 2177} 2178 2179EXPORT_SYMBOL(blk_rq_map_user); 2180 2181/** 2182 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 2183 * @q: request queue where request should be inserted 2184 * @rq: request to map data to 2185 * @iov: pointer to the iovec 2186 * @iov_count: number of elements in the iovec 2187 * 2188 * Description: 2189 * Data will be mapped directly for zero copy io, if possible. Otherwise 2190 * a kernel bounce buffer is used. 2191 * 2192 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2193 * still in process context. 2194 * 2195 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2196 * before being submitted to the device, as pages mapped may be out of 2197 * reach. It's the callers responsibility to make sure this happens. The 2198 * original bio must be passed back in to blk_rq_unmap_user() for proper 2199 * unmapping. 2200 */ 2201int blk_rq_map_user_iov(request_queue_t *q, struct request *rq, 2202 struct sg_iovec *iov, int iov_count) 2203{ 2204 struct bio *bio; 2205 2206 if (!iov || iov_count <= 0) 2207 return -EINVAL; 2208 2209 /* we don't allow misaligned data like bio_map_user() does. If the 2210 * user is using sg, they're expected to know the alignment constraints 2211 * and respect them accordingly */ 2212 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); 2213 if (IS_ERR(bio)) 2214 return PTR_ERR(bio); 2215 2216 rq->bio = rq->biotail = bio; 2217 blk_rq_bio_prep(q, rq, bio); 2218 rq->buffer = rq->data = NULL; 2219 rq->data_len = bio->bi_size; 2220 return 0; 2221} 2222 2223EXPORT_SYMBOL(blk_rq_map_user_iov); 2224 2225/** 2226 * blk_rq_unmap_user - unmap a request with user data 2227 * @bio: bio to be unmapped 2228 * @ulen: length of user buffer 2229 * 2230 * Description: 2231 * Unmap a bio previously mapped by blk_rq_map_user(). 
2232 */ 2233int blk_rq_unmap_user(struct bio *bio, unsigned int ulen) 2234{ 2235 int ret = 0; 2236 2237 if (bio) { 2238 if (bio_flagged(bio, BIO_USER_MAPPED)) 2239 bio_unmap_user(bio); 2240 else 2241 ret = bio_uncopy_user(bio); 2242 } 2243 2244 return ret; 2245} 2246 2247EXPORT_SYMBOL(blk_rq_unmap_user); 2248 2249/** 2250 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 2251 * @q: request queue where request should be inserted 2252 * @rq: request to fill 2253 * @kbuf: the kernel buffer 2254 * @len: length of kernel data 2255 * @gfp_mask: memory allocation flags 2256 */ 2257int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf, 2258 unsigned int len, gfp_t gfp_mask) 2259{ 2260 struct bio *bio; 2261 2262 if (len > (q->max_sectors << 9)) 2263 return -EINVAL; 2264 if (!len || !kbuf) 2265 return -EINVAL; 2266 2267 bio = bio_map_kern(q, kbuf, len, gfp_mask); 2268 if (IS_ERR(bio)) 2269 return PTR_ERR(bio); 2270 2271 if (rq_data_dir(rq) == WRITE) 2272 bio->bi_rw |= (1 << BIO_RW); 2273 2274 rq->bio = rq->biotail = bio; 2275 blk_rq_bio_prep(q, rq, bio); 2276 2277 rq->buffer = rq->data = NULL; 2278 rq->data_len = len; 2279 return 0; 2280} 2281 2282EXPORT_SYMBOL(blk_rq_map_kern); 2283 2284/** 2285 * blk_execute_rq_nowait - insert a request into queue for execution 2286 * @q: queue to insert the request in 2287 * @bd_disk: matching gendisk 2288 * @rq: request to insert 2289 * @at_head: insert request at head or tail of queue 2290 * @done: I/O completion handler 2291 * 2292 * Description: 2293 * Insert a fully prepared request at the back of the io scheduler queue 2294 * for execution. Don't wait for completion. 2295 */ 2296void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk, 2297 struct request *rq, int at_head, 2298 void (*done)(struct request *)) 2299{ 2300 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2301 2302 rq->rq_disk = bd_disk; 2303 rq->flags |= REQ_NOMERGE; 2304 rq->end_io = done; 2305 elv_add_request(q, rq, where, 1); 2306 generic_unplug_device(q); 2307} 2308 2309/** 2310 * blk_execute_rq - insert a request into queue for execution 2311 * @q: queue to insert the request in 2312 * @bd_disk: matching gendisk 2313 * @rq: request to insert 2314 * @at_head: insert request at head or tail of queue 2315 * 2316 * Description: 2317 * Insert a fully prepared request at the back of the io scheduler queue 2318 * for execution and wait for completion. 2319 */ 2320int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk, 2321 struct request *rq, int at_head) 2322{ 2323 DECLARE_COMPLETION(wait); 2324 char sense[SCSI_SENSE_BUFFERSIZE]; 2325 int err = 0; 2326 2327 /* 2328 * we need an extra reference to the request, so we can look at 2329 * it after io completion 2330 */ 2331 rq->ref_count++; 2332 2333 if (!rq->sense) { 2334 memset(sense, 0, sizeof(sense)); 2335 rq->sense = sense; 2336 rq->sense_len = 0; 2337 } 2338 2339 rq->waiting = &wait; 2340 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 2341 wait_for_completion(&wait); 2342 rq->waiting = NULL; 2343 2344 if (rq->errors) 2345 err = -EIO; 2346 2347 return err; 2348} 2349 2350EXPORT_SYMBOL(blk_execute_rq); 2351 2352/** 2353 * blkdev_issue_flush - queue a flush 2354 * @bdev: blockdev to issue flush for 2355 * @error_sector: error sector 2356 * 2357 * Description: 2358 * Issue a flush for the block device in question. Caller can supply 2359 * room for storing the error offset in case of a flush error, if they 2360 * wish to.
Caller must run wait_for_completion() on its own. 2361 */ 2362int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 2363{ 2364 request_queue_t *q; 2365 2366 if (bdev->bd_disk == NULL) 2367 return -ENXIO; 2368 2369 q = bdev_get_queue(bdev); 2370 if (!q) 2371 return -ENXIO; 2372 if (!q->issue_flush_fn) 2373 return -EOPNOTSUPP; 2374 2375 return q->issue_flush_fn(q, bdev->bd_disk, error_sector); 2376} 2377 2378EXPORT_SYMBOL(blkdev_issue_flush); 2379 2380static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) 2381{ 2382 int rw = rq_data_dir(rq); 2383 2384 if (!blk_fs_request(rq) || !rq->rq_disk) 2385 return; 2386 2387 if (!new_io) { 2388 __disk_stat_inc(rq->rq_disk, merges[rw]); 2389 } else { 2390 disk_round_stats(rq->rq_disk); 2391 rq->rq_disk->in_flight++; 2392 } 2393} 2394 2395/* 2396 * add-request adds a request to the linked list. 2397 * queue lock is held and interrupts disabled, as we muck with the 2398 * request queue list. 2399 */ 2400static inline void add_request(request_queue_t * q, struct request * req) 2401{ 2402 drive_stat_acct(req, req->nr_sectors, 1); 2403 2404 if (q->activity_fn) 2405 q->activity_fn(q->activity_data, rq_data_dir(req)); 2406 2407 /* 2408 * elevator indicated where it wants this request to be 2409 * inserted at elevator_merge time 2410 */ 2411 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 2412} 2413 2414/* 2415 * disk_round_stats() - Round off the performance stats on a struct 2416 * disk_stats. 2417 * 2418 * The average IO queue length and utilisation statistics are maintained 2419 * by observing the current state of the queue length and the amount of 2420 * time it has been in this state for. 2421 * 2422 * Normally, that accounting is done on IO completion, but that can result 2423 * in more than a second's worth of IO being accounted for within any one 2424 * second, leading to >100% utilisation. To deal with that, we call this 2425 * function to do a round-off before returning the results when reading 2426 * /proc/diskstats. This accounts immediately for all queue usage up to 2427 * the current jiffies and restarts the counters again. 2428 */ 2429void disk_round_stats(struct gendisk *disk) 2430{ 2431 unsigned long now = jiffies; 2432 2433 if (now == disk->stamp) 2434 return; 2435 2436 if (disk->in_flight) { 2437 __disk_stat_add(disk, time_in_queue, 2438 disk->in_flight * (now - disk->stamp)); 2439 __disk_stat_add(disk, io_ticks, (now - disk->stamp)); 2440 } 2441 disk->stamp = now; 2442} 2443 2444/* 2445 * queue lock must be held 2446 */ 2447static void __blk_put_request(request_queue_t *q, struct request *req) 2448{ 2449 struct request_list *rl = req->rl; 2450 2451 if (unlikely(!q)) 2452 return; 2453 if (unlikely(--req->ref_count)) 2454 return; 2455 2456 elv_completed_request(q, req); 2457 2458 req->rq_status = RQ_INACTIVE; 2459 req->rl = NULL; 2460 2461 /* 2462 * Request may not have originated from ll_rw_blk. if not, 2463 * it didn't come out of our reserved rq pools 2464 */ 2465 if (rl) { 2466 int rw = rq_data_dir(req); 2467 int priv = req->flags & REQ_ELVPRIV; 2468 2469 BUG_ON(!list_empty(&req->queuelist)); 2470 2471 blk_free_request(q, req); 2472 freed_request(q, rw, priv); 2473 } 2474} 2475 2476void blk_put_request(struct request *req) 2477{ 2478 unsigned long flags; 2479 request_queue_t *q = req->q; 2480 2481 /* 2482 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the 2483 * following if (q) test. 
2484 */ 2485 if (q) { 2486 spin_lock_irqsave(q->queue_lock, flags); 2487 __blk_put_request(q, req); 2488 spin_unlock_irqrestore(q->queue_lock, flags); 2489 } 2490} 2491 2492EXPORT_SYMBOL(blk_put_request); 2493 2494/** 2495 * blk_end_sync_rq - executes a completion event on a request 2496 * @rq: request to complete 2497 */ 2498void blk_end_sync_rq(struct request *rq) 2499{ 2500 struct completion *waiting = rq->waiting; 2501 2502 rq->waiting = NULL; 2503 __blk_put_request(rq->q, rq); 2504 2505 /* 2506 * complete last, if this is a stack request the process (and thus 2507 * the rq pointer) could be invalid right after this complete() 2508 */ 2509 complete(waiting); 2510} 2511EXPORT_SYMBOL(blk_end_sync_rq); 2512 2513/** 2514 * blk_congestion_wait - wait for a queue to become uncongested 2515 * @rw: READ or WRITE 2516 * @timeout: timeout in jiffies 2517 * 2518 * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion. 2519 * If no queues are congested then just wait for the next request to be 2520 * returned. 2521 */ 2522long blk_congestion_wait(int rw, long timeout) 2523{ 2524 long ret; 2525 DEFINE_WAIT(wait); 2526 wait_queue_head_t *wqh = &congestion_wqh[rw]; 2527 2528 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 2529 ret = io_schedule_timeout(timeout); 2530 finish_wait(wqh, &wait); 2531 return ret; 2532} 2533 2534EXPORT_SYMBOL(blk_congestion_wait); 2535 2536/* 2537 * Has to be called with the request spinlock acquired 2538 */ 2539static int attempt_merge(request_queue_t *q, struct request *req, 2540 struct request *next) 2541{ 2542 if (!rq_mergeable(req) || !rq_mergeable(next)) 2543 return 0; 2544 2545 /* 2546 * not contiguous 2547 */ 2548 if (req->sector + req->nr_sectors != next->sector) 2549 return 0; 2550 2551 if (rq_data_dir(req) != rq_data_dir(next) 2552 || req->rq_disk != next->rq_disk 2553 || next->waiting || next->special) 2554 return 0; 2555 2556 /* 2557 * If we are allowed to merge, then append bio list 2558 * from next to rq and release next. merge_requests_fn 2559 * will have updated segment counts, update sector 2560 * counts here. 2561 */ 2562 if (!q->merge_requests_fn(q, req, next)) 2563 return 0; 2564 2565 /* 2566 * At this point we have either done a back merge 2567 * or front merge. We need the smaller start_time of 2568 * the merged requests to be the current request 2569 * for accounting purposes.
2570 */ 2571 if (time_after(req->start_time, next->start_time)) 2572 req->start_time = next->start_time; 2573 2574 req->biotail->bi_next = next->bio; 2575 req->biotail = next->biotail; 2576 2577 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; 2578 2579 elv_merge_requests(q, req, next); 2580 2581 if (req->rq_disk) { 2582 disk_round_stats(req->rq_disk); 2583 req->rq_disk->in_flight--; 2584 } 2585 2586 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 2587 2588 __blk_put_request(q, next); 2589 return 1; 2590} 2591 2592static inline int attempt_back_merge(request_queue_t *q, struct request *rq) 2593{ 2594 struct request *next = elv_latter_request(q, rq); 2595 2596 if (next) 2597 return attempt_merge(q, rq, next); 2598 2599 return 0; 2600} 2601 2602static inline int attempt_front_merge(request_queue_t *q, struct request *rq) 2603{ 2604 struct request *prev = elv_former_request(q, rq); 2605 2606 if (prev) 2607 return attempt_merge(q, prev, rq); 2608 2609 return 0; 2610} 2611 2612/** 2613 * blk_attempt_remerge - attempt to remerge active head with next request 2614 * @q: The &request_queue_t belonging to the device 2615 * @rq: The head request (usually) 2616 * 2617 * Description: 2618 * For head-active devices, the queue can easily be unplugged so quickly 2619 * that proper merging is not done on the front request. This may hurt 2620 * performance greatly for some devices. The block layer cannot safely 2621 * do merging on that first request for these queues, but the driver can 2622 * call this function and make it happen any way. Only the driver knows 2623 * when it is safe to do so. 2624 **/ 2625void blk_attempt_remerge(request_queue_t *q, struct request *rq) 2626{ 2627 unsigned long flags; 2628 2629 spin_lock_irqsave(q->queue_lock, flags); 2630 attempt_back_merge(q, rq); 2631 spin_unlock_irqrestore(q->queue_lock, flags); 2632} 2633 2634EXPORT_SYMBOL(blk_attempt_remerge); 2635 2636static int __make_request(request_queue_t *q, struct bio *bio) 2637{ 2638 struct request *req; 2639 int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; 2640 unsigned short prio; 2641 sector_t sector; 2642 2643 sector = bio->bi_sector; 2644 nr_sectors = bio_sectors(bio); 2645 cur_nr_sectors = bio_cur_sectors(bio); 2646 prio = bio_prio(bio); 2647 2648 rw = bio_data_dir(bio); 2649 sync = bio_sync(bio); 2650 2651 /* 2652 * low level driver can indicate that it wants pages above a 2653 * certain limit bounced to low memory (ie for highmem, or even 2654 * ISA dma in theory) 2655 */ 2656 blk_queue_bounce(q, &bio); 2657 2658 spin_lock_prefetch(q->queue_lock); 2659 2660 barrier = bio_barrier(bio); 2661 if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) { 2662 err = -EOPNOTSUPP; 2663 goto end_io; 2664 } 2665 2666 spin_lock_irq(q->queue_lock); 2667 2668 if (unlikely(barrier) || elv_queue_empty(q)) 2669 goto get_rq; 2670 2671 el_ret = elv_merge(q, &req, bio); 2672 switch (el_ret) { 2673 case ELEVATOR_BACK_MERGE: 2674 BUG_ON(!rq_mergeable(req)); 2675 2676 if (!q->back_merge_fn(q, req, bio)) 2677 break; 2678 2679 req->biotail->bi_next = bio; 2680 req->biotail = bio; 2681 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2682 req->ioprio = ioprio_best(req->ioprio, prio); 2683 drive_stat_acct(req, nr_sectors, 0); 2684 if (!attempt_back_merge(q, req)) 2685 elv_merged_request(q, req); 2686 goto out; 2687 2688 case ELEVATOR_FRONT_MERGE: 2689 BUG_ON(!rq_mergeable(req)); 2690 2691 if (!q->front_merge_fn(q, req, bio)) 2692 break; 2693 2694 bio->bi_next = req->bio; 2695 req->bio = bio; 2696 
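 /* The new bio is now at the front of the request, so unlike the back merge case above, the start sector and first-segment fields below must be refreshed from this bio as well as the total size. */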
2697 /* 2698 * may not be valid. if the low level driver said 2699 * it didn't need a bounce buffer then it better 2700 * not touch req->buffer either... 2701 */ 2702 req->buffer = bio_data(bio); 2703 req->current_nr_sectors = cur_nr_sectors; 2704 req->hard_cur_sectors = cur_nr_sectors; 2705 req->sector = req->hard_sector = sector; 2706 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2707 req->ioprio = ioprio_best(req->ioprio, prio); 2708 drive_stat_acct(req, nr_sectors, 0); 2709 if (!attempt_front_merge(q, req)) 2710 elv_merged_request(q, req); 2711 goto out; 2712 2713 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 2714 default: 2715 ; 2716 } 2717 2718get_rq: 2719 /* 2720 * Grab a free request. This is might sleep but can not fail. 2721 * Returns with the queue unlocked. 2722 */ 2723 req = get_request_wait(q, rw, bio); 2724 2725 /* 2726 * After dropping the lock and possibly sleeping here, our request 2727 * may now be mergeable after it had proven unmergeable (above). 2728 * We don't worry about that case for efficiency. It won't happen 2729 * often, and the elevators are able to handle it. 2730 */ 2731 2732 req->flags |= REQ_CMD; 2733 2734 /* 2735 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 2736 */ 2737 if (bio_rw_ahead(bio) || bio_failfast(bio)) 2738 req->flags |= REQ_FAILFAST; 2739 2740 /* 2741 * REQ_BARRIER implies no merging, but lets make it explicit 2742 */ 2743 if (unlikely(barrier)) 2744 req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 2745 2746 req->errors = 0; 2747 req->hard_sector = req->sector = sector; 2748 req->hard_nr_sectors = req->nr_sectors = nr_sectors; 2749 req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors; 2750 req->nr_phys_segments = bio_phys_segments(q, bio); 2751 req->nr_hw_segments = bio_hw_segments(q, bio); 2752 req->buffer = bio_data(bio); /* see ->buffer comment above */ 2753 req->waiting = NULL; 2754 req->bio = req->biotail = bio; 2755 req->ioprio = prio; 2756 req->rq_disk = bio->bi_bdev->bd_disk; 2757 req->start_time = jiffies; 2758 2759 spin_lock_irq(q->queue_lock); 2760 if (elv_queue_empty(q)) 2761 blk_plug_device(q); 2762 add_request(q, req); 2763out: 2764 if (sync) 2765 __generic_unplug_device(q); 2766 2767 spin_unlock_irq(q->queue_lock); 2768 return 0; 2769 2770end_io: 2771 bio_endio(bio, nr_sectors << 9, err); 2772 return 0; 2773} 2774 2775/* 2776 * If bio->bi_dev is a partition, remap the location 2777 */ 2778static inline void blk_partition_remap(struct bio *bio) 2779{ 2780 struct block_device *bdev = bio->bi_bdev; 2781 2782 if (bdev != bdev->bd_contains) { 2783 struct hd_struct *p = bdev->bd_part; 2784 const int rw = bio_data_dir(bio); 2785 2786 p->sectors[rw] += bio_sectors(bio); 2787 p->ios[rw]++; 2788 2789 bio->bi_sector += p->start_sect; 2790 bio->bi_bdev = bdev->bd_contains; 2791 } 2792} 2793 2794static void handle_bad_sector(struct bio *bio) 2795{ 2796 char b[BDEVNAME_SIZE]; 2797 2798 printk(KERN_INFO "attempt to access beyond end of device\n"); 2799 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 2800 bdevname(bio->bi_bdev, b), 2801 bio->bi_rw, 2802 (unsigned long long)bio->bi_sector + bio_sectors(bio), 2803 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 2804 2805 set_bit(BIO_EOF, &bio->bi_flags); 2806} 2807 2808/** 2809 * generic_make_request: hand a buffer to its device driver for I/O 2810 * @bio: The bio describing the location in memory and on the device. 2811 * 2812 * generic_make_request() is used to make I/O requests of block 2813 * devices. 
It is passed a &struct bio, which describes the I/O that needs 2814 * to be done. 2815 * 2816 * generic_make_request() does not return any status. The 2817 * success/failure status of the request, along with notification of 2818 * completion, is delivered asynchronously through the bio->bi_end_io 2819 * function described (one day) else where. 2820 * 2821 * The caller of generic_make_request must make sure that bi_io_vec 2822 * are set to describe the memory buffer, and that bi_dev and bi_sector are 2823 * set to describe the device address, and the 2824 * bi_end_io and optionally bi_private are set to describe how 2825 * completion notification should be signaled. 2826 * 2827 * generic_make_request and the drivers it calls may use bi_next if this 2828 * bio happens to be merged with someone else, and may change bi_dev and 2829 * bi_sector for remaps as it sees fit. So the values of these fields 2830 * should NOT be depended on after the call to generic_make_request. 2831 */ 2832void generic_make_request(struct bio *bio) 2833{ 2834 request_queue_t *q; 2835 sector_t maxsector; 2836 int ret, nr_sectors = bio_sectors(bio); 2837 2838 might_sleep(); 2839 /* Test device or partition size, when known. */ 2840 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 2841 if (maxsector) { 2842 sector_t sector = bio->bi_sector; 2843 2844 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 2845 /* 2846 * This may well happen - the kernel calls bread() 2847 * without checking the size of the device, e.g., when 2848 * mounting a device. 2849 */ 2850 handle_bad_sector(bio); 2851 goto end_io; 2852 } 2853 } 2854 2855 /* 2856 * Resolve the mapping until finished. (drivers are 2857 * still free to implement/resolve their own stacking 2858 * by explicitly returning 0) 2859 * 2860 * NOTE: we don't repeat the blk_size check for each new device. 2861 * Stacking drivers are expected to know what they are doing. 2862 */ 2863 do { 2864 char b[BDEVNAME_SIZE]; 2865 2866 q = bdev_get_queue(bio->bi_bdev); 2867 if (!q) { 2868 printk(KERN_ERR 2869 "generic_make_request: Trying to access " 2870 "nonexistent block-device %s (%Lu)\n", 2871 bdevname(bio->bi_bdev, b), 2872 (long long) bio->bi_sector); 2873end_io: 2874 bio_endio(bio, bio->bi_size, -EIO); 2875 break; 2876 } 2877 2878 if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) { 2879 printk("bio too big device %s (%u > %u)\n", 2880 bdevname(bio->bi_bdev, b), 2881 bio_sectors(bio), 2882 q->max_hw_sectors); 2883 goto end_io; 2884 } 2885 2886 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 2887 goto end_io; 2888 2889 /* 2890 * If this device has partitions, remap block n 2891 * of partition p to block n+start(p) of the disk. 2892 */ 2893 blk_partition_remap(bio); 2894 2895 ret = q->make_request_fn(q, bio); 2896 } while (ret); 2897} 2898 2899EXPORT_SYMBOL(generic_make_request); 2900 2901/** 2902 * submit_bio: submit a bio to the block device layer for I/O 2903 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 2904 * @bio: The &struct bio which describes the I/O 2905 * 2906 * submit_bio() is very similar in purpose to generic_make_request(), and 2907 * uses that function to do most of the work. Both are fairly rough 2908 * interfaces, @bio must be presetup and ready for I/O. 
2909 * 2910 */ 2911void submit_bio(int rw, struct bio *bio) 2912{ 2913 int count = bio_sectors(bio); 2914 2915 BIO_BUG_ON(!bio->bi_size); 2916 BIO_BUG_ON(!bio->bi_io_vec); 2917 bio->bi_rw |= rw; 2918 if (rw & WRITE) 2919 mod_page_state(pgpgout, count); 2920 else 2921 mod_page_state(pgpgin, count); 2922 2923 if (unlikely(block_dump)) { 2924 char b[BDEVNAME_SIZE]; 2925 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 2926 current->comm, current->pid, 2927 (rw & WRITE) ? "WRITE" : "READ", 2928 (unsigned long long)bio->bi_sector, 2929 bdevname(bio->bi_bdev,b)); 2930 } 2931 2932 generic_make_request(bio); 2933} 2934 2935EXPORT_SYMBOL(submit_bio); 2936 2937static void blk_recalc_rq_segments(struct request *rq) 2938{ 2939 struct bio *bio, *prevbio = NULL; 2940 int nr_phys_segs, nr_hw_segs; 2941 unsigned int phys_size, hw_size; 2942 request_queue_t *q = rq->q; 2943 2944 if (!rq->bio) 2945 return; 2946 2947 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 2948 rq_for_each_bio(bio, rq) { 2949 /* Force bio hw/phys segs to be recalculated. */ 2950 bio->bi_flags &= ~(1 << BIO_SEG_VALID); 2951 2952 nr_phys_segs += bio_phys_segments(q, bio); 2953 nr_hw_segs += bio_hw_segments(q, bio); 2954 if (prevbio) { 2955 int pseg = phys_size + prevbio->bi_size + bio->bi_size; 2956 int hseg = hw_size + prevbio->bi_size + bio->bi_size; 2957 2958 if (blk_phys_contig_segment(q, prevbio, bio) && 2959 pseg <= q->max_segment_size) { 2960 nr_phys_segs--; 2961 phys_size += prevbio->bi_size + bio->bi_size; 2962 } else 2963 phys_size = 0; 2964 2965 if (blk_hw_contig_segment(q, prevbio, bio) && 2966 hseg <= q->max_segment_size) { 2967 nr_hw_segs--; 2968 hw_size += prevbio->bi_size + bio->bi_size; 2969 } else 2970 hw_size = 0; 2971 } 2972 prevbio = bio; 2973 } 2974 2975 rq->nr_phys_segments = nr_phys_segs; 2976 rq->nr_hw_segments = nr_hw_segs; 2977} 2978 2979static void blk_recalc_rq_sectors(struct request *rq, int nsect) 2980{ 2981 if (blk_fs_request(rq)) { 2982 rq->hard_sector += nsect; 2983 rq->hard_nr_sectors -= nsect; 2984 2985 /* 2986 * Move the I/O submission pointers ahead if required. 2987 */ 2988 if ((rq->nr_sectors >= rq->hard_nr_sectors) && 2989 (rq->sector <= rq->hard_sector)) { 2990 rq->sector = rq->hard_sector; 2991 rq->nr_sectors = rq->hard_nr_sectors; 2992 rq->hard_cur_sectors = bio_cur_sectors(rq->bio); 2993 rq->current_nr_sectors = rq->hard_cur_sectors; 2994 rq->buffer = bio_data(rq->bio); 2995 } 2996 2997 /* 2998 * if total number of sectors is less than the first segment 2999 * size, something has gone terribly wrong 3000 */ 3001 if (rq->nr_sectors < rq->current_nr_sectors) { 3002 printk("blk: request botched\n"); 3003 rq->nr_sectors = rq->current_nr_sectors; 3004 } 3005 } 3006} 3007 3008static int __end_that_request_first(struct request *req, int uptodate, 3009 int nr_bytes) 3010{ 3011 int total_bytes, bio_nbytes, error, next_idx = 0; 3012 struct bio *bio; 3013 3014 /* 3015 * extend uptodate bool to allow < 0 value to be direct io error 3016 */ 3017 error = 0; 3018 if (end_io_error(uptodate)) 3019 error = !uptodate ? -EIO : uptodate; 3020 3021 /* 3022 * for a REQ_BLOCK_PC request, we want to carry any eventual 3023 * sense key with us all the way through 3024 */ 3025 if (!blk_pc_request(req)) 3026 req->errors = 0; 3027 3028 if (!uptodate) { 3029 if (blk_fs_request(req) && !(req->flags & REQ_QUIET)) 3030 printk("end_request: I/O error, dev %s, sector %llu\n", 3031 req->rq_disk ? 
req->rq_disk->disk_name : "?", 3032 (unsigned long long)req->sector); 3033 } 3034 3035 if (blk_fs_request(req) && req->rq_disk) { 3036 const int rw = rq_data_dir(req); 3037 3038 __disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9); 3039 } 3040 3041 total_bytes = bio_nbytes = 0; 3042 while ((bio = req->bio) != NULL) { 3043 int nbytes; 3044 3045 if (nr_bytes >= bio->bi_size) { 3046 req->bio = bio->bi_next; 3047 nbytes = bio->bi_size; 3048 bio_endio(bio, nbytes, error); 3049 next_idx = 0; 3050 bio_nbytes = 0; 3051 } else { 3052 int idx = bio->bi_idx + next_idx; 3053 3054 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 3055 blk_dump_rq_flags(req, "__end_that"); 3056 printk("%s: bio idx %d >= vcnt %d\n", 3057 __FUNCTION__, 3058 bio->bi_idx, bio->bi_vcnt); 3059 break; 3060 } 3061 3062 nbytes = bio_iovec_idx(bio, idx)->bv_len; 3063 BIO_BUG_ON(nbytes > bio->bi_size); 3064 3065 /* 3066 * not a complete bvec done 3067 */ 3068 if (unlikely(nbytes > nr_bytes)) { 3069 bio_nbytes += nr_bytes; 3070 total_bytes += nr_bytes; 3071 break; 3072 } 3073 3074 /* 3075 * advance to the next vector 3076 */ 3077 next_idx++; 3078 bio_nbytes += nbytes; 3079 } 3080 3081 total_bytes += nbytes; 3082 nr_bytes -= nbytes; 3083 3084 if ((bio = req->bio)) { 3085 /* 3086 * end more in this run, or just return 'not-done' 3087 */ 3088 if (unlikely(nr_bytes <= 0)) 3089 break; 3090 } 3091 } 3092 3093 /* 3094 * completely done 3095 */ 3096 if (!req->bio) 3097 return 0; 3098 3099 /* 3100 * if the request wasn't completed, update state 3101 */ 3102 if (bio_nbytes) { 3103 bio_endio(bio, bio_nbytes, error); 3104 bio->bi_idx += next_idx; 3105 bio_iovec(bio)->bv_offset += nr_bytes; 3106 bio_iovec(bio)->bv_len -= nr_bytes; 3107 } 3108 3109 blk_recalc_rq_sectors(req, total_bytes >> 9); 3110 blk_recalc_rq_segments(req); 3111 return 1; 3112} 3113 3114/** 3115 * end_that_request_first - end I/O on a request 3116 * @req: the request being processed 3117 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3118 * @nr_sectors: number of sectors to end I/O on 3119 * 3120 * Description: 3121 * Ends I/O on a number of sectors attached to @req, and sets it up 3122 * for the next range of segments (if any) in the cluster. 3123 * 3124 * Return: 3125 * 0 - we are done with this request, call end_that_request_last() 3126 * 1 - still buffers pending for this request 3127 **/ 3128int end_that_request_first(struct request *req, int uptodate, int nr_sectors) 3129{ 3130 return __end_that_request_first(req, uptodate, nr_sectors << 9); 3131} 3132 3133EXPORT_SYMBOL(end_that_request_first); 3134 3135/** 3136 * end_that_request_chunk - end I/O on a request 3137 * @req: the request being processed 3138 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3139 * @nr_bytes: number of bytes to complete 3140 * 3141 * Description: 3142 * Ends I/O on a number of bytes attached to @req, and sets it up 3143 * for the next range of segments (if any). Like end_that_request_first(), 3144 * but deals with bytes instead of sectors. 
3145 * 3146 * Return: 3147 * 0 - we are done with this request, call end_that_request_last() 3148 * 1 - still buffers pending for this request 3149 **/ 3150int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) 3151{ 3152 return __end_that_request_first(req, uptodate, nr_bytes); 3153} 3154 3155EXPORT_SYMBOL(end_that_request_chunk); 3156 3157/* 3158 * queue lock must be held 3159 */ 3160void end_that_request_last(struct request *req) 3161{ 3162 struct gendisk *disk = req->rq_disk; 3163 3164 if (unlikely(laptop_mode) && blk_fs_request(req)) 3165 laptop_io_completion(); 3166 3167 if (disk && blk_fs_request(req)) { 3168 unsigned long duration = jiffies - req->start_time; 3169 const int rw = rq_data_dir(req); 3170 3171 __disk_stat_inc(disk, ios[rw]); 3172 __disk_stat_add(disk, ticks[rw], duration); 3173 disk_round_stats(disk); 3174 disk->in_flight--; 3175 } 3176 if (req->end_io) 3177 req->end_io(req); 3178 else 3179 __blk_put_request(req->q, req); 3180} 3181 3182EXPORT_SYMBOL(end_that_request_last); 3183 3184void end_request(struct request *req, int uptodate) 3185{ 3186 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { 3187 add_disk_randomness(req->rq_disk); 3188 blkdev_dequeue_request(req); 3189 end_that_request_last(req); 3190 } 3191} 3192 3193EXPORT_SYMBOL(end_request); 3194 3195void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio) 3196{ 3197 /* first three bits are identical in rq->flags and bio->bi_rw */ 3198 rq->flags |= (bio->bi_rw & 7); 3199 3200 rq->nr_phys_segments = bio_phys_segments(q, bio); 3201 rq->nr_hw_segments = bio_hw_segments(q, bio); 3202 rq->current_nr_sectors = bio_cur_sectors(bio); 3203 rq->hard_cur_sectors = rq->current_nr_sectors; 3204 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 3205 rq->buffer = bio_data(bio); 3206 3207 rq->bio = rq->biotail = bio; 3208} 3209 3210EXPORT_SYMBOL(blk_rq_bio_prep); 3211 3212int kblockd_schedule_work(struct work_struct *work) 3213{ 3214 return queue_work(kblockd_workqueue, work); 3215} 3216 3217EXPORT_SYMBOL(kblockd_schedule_work); 3218 3219void kblockd_flush(void) 3220{ 3221 flush_workqueue(kblockd_workqueue); 3222} 3223EXPORT_SYMBOL(kblockd_flush); 3224 3225int __init blk_dev_init(void) 3226{ 3227 kblockd_workqueue = create_workqueue("kblockd"); 3228 if (!kblockd_workqueue) 3229 panic("Failed to create kblockd\n"); 3230 3231 request_cachep = kmem_cache_create("blkdev_requests", 3232 sizeof(struct request), 0, SLAB_PANIC, NULL, NULL); 3233 3234 requestq_cachep = kmem_cache_create("blkdev_queue", 3235 sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL); 3236 3237 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3238 sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL); 3239 3240 blk_max_low_pfn = max_low_pfn; 3241 blk_max_pfn = max_pfn; 3242 3243 return 0; 3244} 3245 3246/* 3247 * IO Context helper functions 3248 */ 3249void put_io_context(struct io_context *ioc) 3250{ 3251 if (ioc == NULL) 3252 return; 3253 3254 BUG_ON(atomic_read(&ioc->refcount) == 0); 3255 3256 if (atomic_dec_and_test(&ioc->refcount)) { 3257 if (ioc->aic && ioc->aic->dtor) 3258 ioc->aic->dtor(ioc->aic); 3259 if (ioc->cic && ioc->cic->dtor) 3260 ioc->cic->dtor(ioc->cic); 3261 3262 kmem_cache_free(iocontext_cachep, ioc); 3263 } 3264} 3265EXPORT_SYMBOL(put_io_context); 3266 3267/* Called by the exitting task */ 3268void exit_io_context(void) 3269{ 3270 unsigned long flags; 3271 struct io_context *ioc; 3272 3273 local_irq_save(flags); 3274 task_lock(current); 3275 ioc = current->io_context; 3276 
current->io_context = NULL; 3277 ioc->task = NULL; 3278 task_unlock(current); 3279 local_irq_restore(flags); 3280 3281 if (ioc->aic && ioc->aic->exit) 3282 ioc->aic->exit(ioc->aic); 3283 if (ioc->cic && ioc->cic->exit) 3284 ioc->cic->exit(ioc->cic); 3285 3286 put_io_context(ioc); 3287} 3288 3289/* 3290 * If the current task has no IO context then create one and initialise it. 3291 * Otherwise, return its existing IO context. 3292 * 3293 * This returned IO context doesn't have a specifically elevated refcount, 3294 * but since the current task itself holds a reference, the context can be 3295 * used in general code, so long as it stays within `current` context. 3296 */ 3297struct io_context *current_io_context(gfp_t gfp_flags) 3298{ 3299 struct task_struct *tsk = current; 3300 struct io_context *ret; 3301 3302 ret = tsk->io_context; 3303 if (likely(ret)) 3304 return ret; 3305 3306 ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); 3307 if (ret) { 3308 atomic_set(&ret->refcount, 1); 3309 ret->task = current; 3310 ret->set_ioprio = NULL; 3311 ret->last_waited = jiffies; /* doesn't matter... */ 3312 ret->nr_batch_requests = 0; /* because this is 0 */ 3313 ret->aic = NULL; 3314 ret->cic = NULL; 3315 tsk->io_context = ret; 3316 } 3317 3318 return ret; 3319} 3320EXPORT_SYMBOL(current_io_context); 3321 3322/* 3323 * If the current task has no IO context then create one and initialise it. 3324 * If it does have a context, take a ref on it. 3325 * 3326 * This is always called in the context of the task which submitted the I/O. 3327 */ 3328struct io_context *get_io_context(gfp_t gfp_flags) 3329{ 3330 struct io_context *ret; 3331 ret = current_io_context(gfp_flags); 3332 if (likely(ret)) 3333 atomic_inc(&ret->refcount); 3334 return ret; 3335} 3336EXPORT_SYMBOL(get_io_context); 3337 3338void copy_io_context(struct io_context **pdst, struct io_context **psrc) 3339{ 3340 struct io_context *src = *psrc; 3341 struct io_context *dst = *pdst; 3342 3343 if (src) { 3344 BUG_ON(atomic_read(&src->refcount) == 0); 3345 atomic_inc(&src->refcount); 3346 put_io_context(dst); 3347 *pdst = src; 3348 } 3349} 3350EXPORT_SYMBOL(copy_io_context); 3351 3352void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) 3353{ 3354 struct io_context *temp; 3355 temp = *ioc1; 3356 *ioc1 = *ioc2; 3357 *ioc2 = temp; 3358} 3359EXPORT_SYMBOL(swap_io_context); 3360 3361/* 3362 * sysfs parts below 3363 */ 3364struct queue_sysfs_entry { 3365 struct attribute attr; 3366 ssize_t (*show)(struct request_queue *, char *); 3367 ssize_t (*store)(struct request_queue *, const char *, size_t); 3368}; 3369 3370static ssize_t 3371queue_var_show(unsigned int var, char *page) 3372{ 3373 return sprintf(page, "%d\n", var); 3374} 3375 3376static ssize_t 3377queue_var_store(unsigned long *var, const char *page, size_t count) 3378{ 3379 char *p = (char *) page; 3380 3381 *var = simple_strtoul(p, &p, 10); 3382 return count; 3383} 3384 3385static ssize_t queue_requests_show(struct request_queue *q, char *page) 3386{ 3387 return queue_var_show(q->nr_requests, (page)); 3388} 3389 3390static ssize_t 3391queue_requests_store(struct request_queue *q, const char *page, size_t count) 3392{ 3393 struct request_list *rl = &q->rq; 3394 3395 int ret = queue_var_store(&q->nr_requests, page, count); 3396 if (q->nr_requests < BLKDEV_MIN_RQ) 3397 q->nr_requests = BLKDEV_MIN_RQ; 3398 blk_queue_congestion_threshold(q); 3399 3400 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 3401 set_queue_congested(q, READ); 3402 else if (rl->count[READ] < 
queue_congestion_off_threshold(q)) 3403 clear_queue_congested(q, READ); 3404 3405 if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) 3406 set_queue_congested(q, WRITE); 3407 else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) 3408 clear_queue_congested(q, WRITE); 3409 3410 if (rl->count[READ] >= q->nr_requests) { 3411 blk_set_queue_full(q, READ); 3412 } else if (rl->count[READ]+1 <= q->nr_requests) { 3413 blk_clear_queue_full(q, READ); 3414 wake_up(&rl->wait[READ]); 3415 } 3416 3417 if (rl->count[WRITE] >= q->nr_requests) { 3418 blk_set_queue_full(q, WRITE); 3419 } else if (rl->count[WRITE]+1 <= q->nr_requests) { 3420 blk_clear_queue_full(q, WRITE); 3421 wake_up(&rl->wait[WRITE]); 3422 } 3423 return ret; 3424} 3425 3426static ssize_t queue_ra_show(struct request_queue *q, char *page) 3427{ 3428 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3429 3430 return queue_var_show(ra_kb, (page)); 3431} 3432 3433static ssize_t 3434queue_ra_store(struct request_queue *q, const char *page, size_t count) 3435{ 3436 unsigned long ra_kb; 3437 ssize_t ret = queue_var_store(&ra_kb, page, count); 3438 3439 spin_lock_irq(q->queue_lock); 3440 if (ra_kb > (q->max_sectors >> 1)) 3441 ra_kb = (q->max_sectors >> 1); 3442 3443 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 3444 spin_unlock_irq(q->queue_lock); 3445 3446 return ret; 3447} 3448 3449static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) 3450{ 3451 int max_sectors_kb = q->max_sectors >> 1; 3452 3453 return queue_var_show(max_sectors_kb, (page)); 3454} 3455 3456static ssize_t 3457queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 3458{ 3459 unsigned long max_sectors_kb, 3460 max_hw_sectors_kb = q->max_hw_sectors >> 1, 3461 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 3462 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 3463 int ra_kb; 3464 3465 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 3466 return -EINVAL; 3467 /* 3468 * Take the queue lock to update the readahead and max_sectors 3469 * values synchronously: 3470 */ 3471 spin_lock_irq(q->queue_lock); 3472 /* 3473 * Trim readahead window as well, if necessary: 3474 */ 3475 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3476 if (ra_kb > max_sectors_kb) 3477 q->backing_dev_info.ra_pages = 3478 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); 3479 3480 q->max_sectors = max_sectors_kb << 1; 3481 spin_unlock_irq(q->queue_lock); 3482 3483 return ret; 3484} 3485 3486static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) 3487{ 3488 int max_hw_sectors_kb = q->max_hw_sectors >> 1; 3489 3490 return queue_var_show(max_hw_sectors_kb, (page)); 3491} 3492 3493 3494static struct queue_sysfs_entry queue_requests_entry = { 3495 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 3496 .show = queue_requests_show, 3497 .store = queue_requests_store, 3498}; 3499 3500static struct queue_sysfs_entry queue_ra_entry = { 3501 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, 3502 .show = queue_ra_show, 3503 .store = queue_ra_store, 3504}; 3505 3506static struct queue_sysfs_entry queue_max_sectors_entry = { 3507 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, 3508 .show = queue_max_sectors_show, 3509 .store = queue_max_sectors_store, 3510}; 3511 3512static struct queue_sysfs_entry queue_max_hw_sectors_entry = { 3513 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, 3514 .show = queue_max_hw_sectors_show, 3515}; 
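/*
 * Illustrative sketch, not part of the original file: a new read-only queue
 * attribute would follow the same pattern as the entries above -- a show
 * routine plus a queue_sysfs_entry, which then also needs a pointer added to
 * the default_attrs[] table below. The "example" name and the choice of
 * q->nr_batching as the value shown are made up purely for illustration.
 */
static ssize_t queue_example_show(struct request_queue *q, char *page)
{
	/* queue_var_show() formats a single integer, as the entries above do */
	return queue_var_show(q->nr_batching, (page));
}

static struct queue_sysfs_entry queue_example_entry = {
	.attr = {.name = "example", .mode = S_IRUGO },
	.show = queue_example_show,	/* no .store: queue_attr_store() returns -EIO */
};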
3516 3517static struct queue_sysfs_entry queue_iosched_entry = { 3518 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 3519 .show = elv_iosched_show, 3520 .store = elv_iosched_store, 3521}; 3522 3523static struct attribute *default_attrs[] = { 3524 &queue_requests_entry.attr, 3525 &queue_ra_entry.attr, 3526 &queue_max_hw_sectors_entry.attr, 3527 &queue_max_sectors_entry.attr, 3528 &queue_iosched_entry.attr, 3529 NULL, 3530}; 3531 3532#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 3533 3534static ssize_t 3535queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3536{ 3537 struct queue_sysfs_entry *entry = to_queue(attr); 3538 struct request_queue *q; 3539 3540 q = container_of(kobj, struct request_queue, kobj); 3541 if (!entry->show) 3542 return -EIO; 3543 3544 return entry->show(q, page); 3545} 3546 3547static ssize_t 3548queue_attr_store(struct kobject *kobj, struct attribute *attr, 3549 const char *page, size_t length) 3550{ 3551 struct queue_sysfs_entry *entry = to_queue(attr); 3552 struct request_queue *q; 3553 3554 q = container_of(kobj, struct request_queue, kobj); 3555 if (!entry->store) 3556 return -EIO; 3557 3558 return entry->store(q, page, length); 3559} 3560 3561static struct sysfs_ops queue_sysfs_ops = { 3562 .show = queue_attr_show, 3563 .store = queue_attr_store, 3564}; 3565 3566static struct kobj_type queue_ktype = { 3567 .sysfs_ops = &queue_sysfs_ops, 3568 .default_attrs = default_attrs, 3569}; 3570 3571int blk_register_queue(struct gendisk *disk) 3572{ 3573 int ret; 3574 3575 request_queue_t *q = disk->queue; 3576 3577 if (!q || !q->request_fn) 3578 return -ENXIO; 3579 3580 q->kobj.parent = kobject_get(&disk->kobj); 3581 if (!q->kobj.parent) 3582 return -EBUSY; 3583 3584 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); 3585 q->kobj.ktype = &queue_ktype; 3586 3587 ret = kobject_register(&q->kobj); 3588 if (ret < 0) 3589 return ret; 3590 3591 ret = elv_register_queue(q); 3592 if (ret) { 3593 kobject_unregister(&q->kobj); 3594 return ret; 3595 } 3596 3597 return 0; 3598} 3599 3600void blk_unregister_queue(struct gendisk *disk) 3601{ 3602 request_queue_t *q = disk->queue; 3603 3604 if (q && q->request_fn) { 3605 elv_unregister_queue(q); 3606 3607 kobject_unregister(&q->kobj); 3608 kobject_put(&disk->kobj); 3609 } 3610}
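/*
 * Illustrative sketch, not part of the original file: how a simple block
 * driver of this era typically consumes the machinery above. The names
 * example_request_fn and example_lock are made up; error handling and the
 * actual data transfer are omitted.
 */
static spinlock_t example_lock = SPIN_LOCK_UNLOCKED;	/* driver-owned queue lock */

static void example_request_fn(request_queue_t *q)
{
	struct request *req;

	/* ->request_fn is invoked with q->queue_lock held */
	while ((req = elv_next_request(q)) != NULL) {
		if (!blk_fs_request(req)) {
			end_request(req, 0);	/* fail non-fs requests */
			continue;
		}
		/*
		 * transfer req->current_nr_sectors sectors starting at
		 * req->sector, using req->buffer, then complete that chunk;
		 * end_request() ends req->hard_cur_sectors and dequeues the
		 * request once it is fully done (see end_request() above)
		 */
		end_request(req, 1);
	}
}

/*
 * At initialisation time the driver would do something like:
 *
 *	q = blk_init_queue(example_request_fn, &example_lock);
 *	disk->queue = q;
 *	add_disk(disk);		/- registers the queue via blk_register_queue() -/
 */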