Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 77b2555b52a894a2e39a42e43d993df875c46a6a 3747 lines 97 kB view raw
1/* 2 * linux/drivers/block/ll_rw_blk.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 6 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 7 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 8 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 9 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 10 */ 11 12/* 13 * This handles all read/write requests to block devices 14 */ 15#include <linux/config.h> 16#include <linux/kernel.h> 17#include <linux/module.h> 18#include <linux/backing-dev.h> 19#include <linux/bio.h> 20#include <linux/blkdev.h> 21#include <linux/highmem.h> 22#include <linux/mm.h> 23#include <linux/kernel_stat.h> 24#include <linux/string.h> 25#include <linux/init.h> 26#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 27#include <linux/completion.h> 28#include <linux/slab.h> 29#include <linux/swap.h> 30#include <linux/writeback.h> 31#include <linux/blkdev.h> 32 33/* 34 * for max sense size 35 */ 36#include <scsi/scsi_cmnd.h> 37 38static void blk_unplug_work(void *data); 39static void blk_unplug_timeout(unsigned long data); 40static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); 41 42/* 43 * For the allocated request tables 44 */ 45static kmem_cache_t *request_cachep; 46 47/* 48 * For queue allocation 49 */ 50static kmem_cache_t *requestq_cachep; 51 52/* 53 * For io context allocations 54 */ 55static kmem_cache_t *iocontext_cachep; 56 57static wait_queue_head_t congestion_wqh[2] = { 58 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 59 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 60 }; 61 62/* 63 * Controlling structure to kblockd 64 */ 65static struct workqueue_struct *kblockd_workqueue; 66 67unsigned long blk_max_low_pfn, blk_max_pfn; 68 69EXPORT_SYMBOL(blk_max_low_pfn); 70EXPORT_SYMBOL(blk_max_pfn); 71 72/* Amount of time in which a process may batch requests */ 73#define BLK_BATCH_TIME (HZ/50UL) 74 75/* Number of requests a "batching" process may submit */ 76#define BLK_BATCH_REQ 32 77 78/* 79 * Return the threshold (number of used requests) at which the queue is 80 * considered to be congested. It include a little hysteresis to keep the 81 * context switch rate down. 82 */ 83static inline int queue_congestion_on_threshold(struct request_queue *q) 84{ 85 return q->nr_congestion_on; 86} 87 88/* 89 * The threshold at which a queue is considered to be uncongested 90 */ 91static inline int queue_congestion_off_threshold(struct request_queue *q) 92{ 93 return q->nr_congestion_off; 94} 95 96static void blk_queue_congestion_threshold(struct request_queue *q) 97{ 98 int nr; 99 100 nr = q->nr_requests - (q->nr_requests / 8) + 1; 101 if (nr > q->nr_requests) 102 nr = q->nr_requests; 103 q->nr_congestion_on = nr; 104 105 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 106 if (nr < 1) 107 nr = 1; 108 q->nr_congestion_off = nr; 109} 110 111/* 112 * A queue has just exitted congestion. Note this in the global counter of 113 * congested queues, and wake up anyone who was waiting for requests to be 114 * put back. 115 */ 116static void clear_queue_congested(request_queue_t *q, int rw) 117{ 118 enum bdi_state bit; 119 wait_queue_head_t *wqh = &congestion_wqh[rw]; 120 121 bit = (rw == WRITE) ? 
BDI_write_congested : BDI_read_congested; 122 clear_bit(bit, &q->backing_dev_info.state); 123 smp_mb__after_clear_bit(); 124 if (waitqueue_active(wqh)) 125 wake_up(wqh); 126} 127 128/* 129 * A queue has just entered congestion. Flag that in the queue's VM-visible 130 * state flags and increment the global gounter of congested queues. 131 */ 132static void set_queue_congested(request_queue_t *q, int rw) 133{ 134 enum bdi_state bit; 135 136 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; 137 set_bit(bit, &q->backing_dev_info.state); 138} 139 140/** 141 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 142 * @bdev: device 143 * 144 * Locates the passed device's request queue and returns the address of its 145 * backing_dev_info 146 * 147 * Will return NULL if the request queue cannot be located. 148 */ 149struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 150{ 151 struct backing_dev_info *ret = NULL; 152 request_queue_t *q = bdev_get_queue(bdev); 153 154 if (q) 155 ret = &q->backing_dev_info; 156 return ret; 157} 158 159EXPORT_SYMBOL(blk_get_backing_dev_info); 160 161void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data) 162{ 163 q->activity_fn = fn; 164 q->activity_data = data; 165} 166 167EXPORT_SYMBOL(blk_queue_activity_fn); 168 169/** 170 * blk_queue_prep_rq - set a prepare_request function for queue 171 * @q: queue 172 * @pfn: prepare_request function 173 * 174 * It's possible for a queue to register a prepare_request callback which 175 * is invoked before the request is handed to the request_fn. The goal of 176 * the function is to prepare a request for I/O, it can be used to build a 177 * cdb from the request data for instance. 178 * 179 */ 180void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn) 181{ 182 q->prep_rq_fn = pfn; 183} 184 185EXPORT_SYMBOL(blk_queue_prep_rq); 186 187/** 188 * blk_queue_merge_bvec - set a merge_bvec function for queue 189 * @q: queue 190 * @mbfn: merge_bvec_fn 191 * 192 * Usually queues have static limitations on the max sectors or segments that 193 * we can put in a request. Stacking drivers may have some settings that 194 * are dynamic, and thus we have to query the queue whether it is ok to 195 * add a new bio_vec to a bio at a given offset or not. If the block device 196 * has such limitations, it needs to register a merge_bvec_fn to control 197 * the size of bio's sent to it. Note that a block device *must* allow a 198 * single page to be added to an empty bio. The block device driver may want 199 * to use the bio_split() function to deal with these bio's. By default 200 * no merge_bvec_fn is defined for a queue, and only the fixed limits are 201 * honored. 202 */ 203void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn) 204{ 205 q->merge_bvec_fn = mbfn; 206} 207 208EXPORT_SYMBOL(blk_queue_merge_bvec); 209 210/** 211 * blk_queue_make_request - define an alternate make_request function for a device 212 * @q: the request queue for the device to be affected 213 * @mfn: the alternate make_request function 214 * 215 * Description: 216 * The normal way for &struct bios to be passed to a device 217 * driver is for them to be collected into requests on a request 218 * queue, and then to allow the device driver to select requests 219 * off that queue when it is ready. This works well for many block 220 * devices. 
However some block devices (typically virtual devices 221 * such as md or lvm) do not benefit from the processing on the 222 * request queue, and are served best by having the requests passed 223 * directly to them. This can be achieved by providing a function 224 * to blk_queue_make_request(). 225 * 226 * Caveat: 227 * The driver that does this *must* be able to deal appropriately 228 * with buffers in "highmemory". This can be accomplished by either calling 229 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 230 * blk_queue_bounce() to create a buffer in normal memory. 231 **/ 232void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) 233{ 234 /* 235 * set defaults 236 */ 237 q->nr_requests = BLKDEV_MAX_RQ; 238 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 239 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 240 q->make_request_fn = mfn; 241 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 242 q->backing_dev_info.state = 0; 243 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 244 blk_queue_max_sectors(q, MAX_SECTORS); 245 blk_queue_hardsect_size(q, 512); 246 blk_queue_dma_alignment(q, 511); 247 blk_queue_congestion_threshold(q); 248 q->nr_batching = BLK_BATCH_REQ; 249 250 q->unplug_thresh = 4; /* hmm */ 251 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 252 if (q->unplug_delay == 0) 253 q->unplug_delay = 1; 254 255 INIT_WORK(&q->unplug_work, blk_unplug_work, q); 256 257 q->unplug_timer.function = blk_unplug_timeout; 258 q->unplug_timer.data = (unsigned long)q; 259 260 /* 261 * by default assume old behaviour and bounce for any highmem page 262 */ 263 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 264 265 blk_queue_activity_fn(q, NULL, NULL); 266 267 INIT_LIST_HEAD(&q->drain_list); 268} 269 270EXPORT_SYMBOL(blk_queue_make_request); 271 272static inline void rq_init(request_queue_t *q, struct request *rq) 273{ 274 INIT_LIST_HEAD(&rq->queuelist); 275 276 rq->errors = 0; 277 rq->rq_status = RQ_ACTIVE; 278 rq->bio = rq->biotail = NULL; 279 rq->ioprio = 0; 280 rq->buffer = NULL; 281 rq->ref_count = 1; 282 rq->q = q; 283 rq->waiting = NULL; 284 rq->special = NULL; 285 rq->data_len = 0; 286 rq->data = NULL; 287 rq->nr_phys_segments = 0; 288 rq->sense = NULL; 289 rq->end_io = NULL; 290 rq->end_io_data = NULL; 291} 292 293/** 294 * blk_queue_ordered - does this queue support ordered writes 295 * @q: the request queue 296 * @flag: see below 297 * 298 * Description: 299 * For journalled file systems, doing ordered writes on a commit 300 * block instead of explicitly doing wait_on_buffer (which is bad 301 * for performance) can be a big win. Block drivers supporting this 302 * feature should call this function and indicate so. 
303 * 304 **/ 305void blk_queue_ordered(request_queue_t *q, int flag) 306{ 307 switch (flag) { 308 case QUEUE_ORDERED_NONE: 309 if (q->flush_rq) 310 kmem_cache_free(request_cachep, q->flush_rq); 311 q->flush_rq = NULL; 312 q->ordered = flag; 313 break; 314 case QUEUE_ORDERED_TAG: 315 q->ordered = flag; 316 break; 317 case QUEUE_ORDERED_FLUSH: 318 q->ordered = flag; 319 if (!q->flush_rq) 320 q->flush_rq = kmem_cache_alloc(request_cachep, 321 GFP_KERNEL); 322 break; 323 default: 324 printk("blk_queue_ordered: bad value %d\n", flag); 325 break; 326 } 327} 328 329EXPORT_SYMBOL(blk_queue_ordered); 330 331/** 332 * blk_queue_issue_flush_fn - set function for issuing a flush 333 * @q: the request queue 334 * @iff: the function to be called issuing the flush 335 * 336 * Description: 337 * If a driver supports issuing a flush command, the support is notified 338 * to the block layer by defining it through this call. 339 * 340 **/ 341void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff) 342{ 343 q->issue_flush_fn = iff; 344} 345 346EXPORT_SYMBOL(blk_queue_issue_flush_fn); 347 348/* 349 * Cache flushing for ordered writes handling 350 */ 351static void blk_pre_flush_end_io(struct request *flush_rq) 352{ 353 struct request *rq = flush_rq->end_io_data; 354 request_queue_t *q = rq->q; 355 356 rq->flags |= REQ_BAR_PREFLUSH; 357 358 if (!flush_rq->errors) 359 elv_requeue_request(q, rq); 360 else { 361 q->end_flush_fn(q, flush_rq); 362 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 363 q->request_fn(q); 364 } 365} 366 367static void blk_post_flush_end_io(struct request *flush_rq) 368{ 369 struct request *rq = flush_rq->end_io_data; 370 request_queue_t *q = rq->q; 371 372 rq->flags |= REQ_BAR_POSTFLUSH; 373 374 q->end_flush_fn(q, flush_rq); 375 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 376 q->request_fn(q); 377} 378 379struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq) 380{ 381 struct request *flush_rq = q->flush_rq; 382 383 BUG_ON(!blk_barrier_rq(rq)); 384 385 if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags)) 386 return NULL; 387 388 rq_init(q, flush_rq); 389 flush_rq->elevator_private = NULL; 390 flush_rq->flags = REQ_BAR_FLUSH; 391 flush_rq->rq_disk = rq->rq_disk; 392 flush_rq->rl = NULL; 393 394 /* 395 * prepare_flush returns 0 if no flush is needed, just mark both 396 * pre and post flush as done in that case 397 */ 398 if (!q->prepare_flush_fn(q, flush_rq)) { 399 rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH; 400 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 401 return rq; 402 } 403 404 /* 405 * some drivers dequeue requests right away, some only after io 406 * completion. make sure the request is dequeued. 
407 */ 408 if (!list_empty(&rq->queuelist)) 409 blkdev_dequeue_request(rq); 410 411 elv_deactivate_request(q, rq); 412 413 flush_rq->end_io_data = rq; 414 flush_rq->end_io = blk_pre_flush_end_io; 415 416 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 417 return flush_rq; 418} 419 420static void blk_start_post_flush(request_queue_t *q, struct request *rq) 421{ 422 struct request *flush_rq = q->flush_rq; 423 424 BUG_ON(!blk_barrier_rq(rq)); 425 426 rq_init(q, flush_rq); 427 flush_rq->elevator_private = NULL; 428 flush_rq->flags = REQ_BAR_FLUSH; 429 flush_rq->rq_disk = rq->rq_disk; 430 flush_rq->rl = NULL; 431 432 if (q->prepare_flush_fn(q, flush_rq)) { 433 flush_rq->end_io_data = rq; 434 flush_rq->end_io = blk_post_flush_end_io; 435 436 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 437 q->request_fn(q); 438 } 439} 440 441static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq, 442 int sectors) 443{ 444 if (sectors > rq->nr_sectors) 445 sectors = rq->nr_sectors; 446 447 rq->nr_sectors -= sectors; 448 return rq->nr_sectors; 449} 450 451static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq, 452 int sectors, int queue_locked) 453{ 454 if (q->ordered != QUEUE_ORDERED_FLUSH) 455 return 0; 456 if (!blk_fs_request(rq) || !blk_barrier_rq(rq)) 457 return 0; 458 if (blk_barrier_postflush(rq)) 459 return 0; 460 461 if (!blk_check_end_barrier(q, rq, sectors)) { 462 unsigned long flags = 0; 463 464 if (!queue_locked) 465 spin_lock_irqsave(q->queue_lock, flags); 466 467 blk_start_post_flush(q, rq); 468 469 if (!queue_locked) 470 spin_unlock_irqrestore(q->queue_lock, flags); 471 } 472 473 return 1; 474} 475 476/** 477 * blk_complete_barrier_rq - complete possible barrier request 478 * @q: the request queue for the device 479 * @rq: the request 480 * @sectors: number of sectors to complete 481 * 482 * Description: 483 * Used in driver end_io handling to determine whether to postpone 484 * completion of a barrier request until a post flush has been done. This 485 * is the unlocked variant, used if the caller doesn't already hold the 486 * queue lock. 487 **/ 488int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors) 489{ 490 return __blk_complete_barrier_rq(q, rq, sectors, 0); 491} 492EXPORT_SYMBOL(blk_complete_barrier_rq); 493 494/** 495 * blk_complete_barrier_rq_locked - complete possible barrier request 496 * @q: the request queue for the device 497 * @rq: the request 498 * @sectors: number of sectors to complete 499 * 500 * Description: 501 * See blk_complete_barrier_rq(). This variant must be used if the caller 502 * holds the queue lock. 503 **/ 504int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq, 505 int sectors) 506{ 507 return __blk_complete_barrier_rq(q, rq, sectors, 1); 508} 509EXPORT_SYMBOL(blk_complete_barrier_rq_locked); 510 511/** 512 * blk_queue_bounce_limit - set bounce buffer limit for queue 513 * @q: the request queue for the device 514 * @dma_addr: bus address limit 515 * 516 * Description: 517 * Different hardware can have different requirements as to what pages 518 * it can do I/O directly to. A low level driver can call 519 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 520 * buffers for doing I/O to pages residing above @page. By default 521 * the block layer sets this to the highest numbered "low" memory page. 
522 **/ 523void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) 524{ 525 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; 526 527 /* 528 * set appropriate bounce gfp mask -- unfortunately we don't have a 529 * full 4GB zone, so we have to resort to low memory for any bounces. 530 * ISA has its own < 16MB zone. 531 */ 532 if (bounce_pfn < blk_max_low_pfn) { 533 BUG_ON(dma_addr < BLK_BOUNCE_ISA); 534 init_emergency_isa_pool(); 535 q->bounce_gfp = GFP_NOIO | GFP_DMA; 536 } else 537 q->bounce_gfp = GFP_NOIO; 538 539 q->bounce_pfn = bounce_pfn; 540} 541 542EXPORT_SYMBOL(blk_queue_bounce_limit); 543 544/** 545 * blk_queue_max_sectors - set max sectors for a request for this queue 546 * @q: the request queue for the device 547 * @max_sectors: max sectors in the usual 512b unit 548 * 549 * Description: 550 * Enables a low level driver to set an upper limit on the size of 551 * received requests. 552 **/ 553void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors) 554{ 555 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 556 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 557 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); 558 } 559 560 q->max_sectors = q->max_hw_sectors = max_sectors; 561} 562 563EXPORT_SYMBOL(blk_queue_max_sectors); 564 565/** 566 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 567 * @q: the request queue for the device 568 * @max_segments: max number of segments 569 * 570 * Description: 571 * Enables a low level driver to set an upper limit on the number of 572 * physical data segments in a request. This would be the largest sized 573 * scatter list the driver could handle. 574 **/ 575void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments) 576{ 577 if (!max_segments) { 578 max_segments = 1; 579 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 580 } 581 582 q->max_phys_segments = max_segments; 583} 584 585EXPORT_SYMBOL(blk_queue_max_phys_segments); 586 587/** 588 * blk_queue_max_hw_segments - set max hw segments for a request for this queue 589 * @q: the request queue for the device 590 * @max_segments: max number of segments 591 * 592 * Description: 593 * Enables a low level driver to set an upper limit on the number of 594 * hw data segments in a request. This would be the largest number of 595 * address/length pairs the host adapter can actually give as once 596 * to the device. 
597 **/ 598void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments) 599{ 600 if (!max_segments) { 601 max_segments = 1; 602 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 603 } 604 605 q->max_hw_segments = max_segments; 606} 607 608EXPORT_SYMBOL(blk_queue_max_hw_segments); 609 610/** 611 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 612 * @q: the request queue for the device 613 * @max_size: max size of segment in bytes 614 * 615 * Description: 616 * Enables a low level driver to set an upper limit on the size of a 617 * coalesced segment 618 **/ 619void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size) 620{ 621 if (max_size < PAGE_CACHE_SIZE) { 622 max_size = PAGE_CACHE_SIZE; 623 printk("%s: set to minimum %d\n", __FUNCTION__, max_size); 624 } 625 626 q->max_segment_size = max_size; 627} 628 629EXPORT_SYMBOL(blk_queue_max_segment_size); 630 631/** 632 * blk_queue_hardsect_size - set hardware sector size for the queue 633 * @q: the request queue for the device 634 * @size: the hardware sector size, in bytes 635 * 636 * Description: 637 * This should typically be set to the lowest possible sector size 638 * that the hardware can operate on (possible without reverting to 639 * even internal read-modify-write operations). Usually the default 640 * of 512 covers most hardware. 641 **/ 642void blk_queue_hardsect_size(request_queue_t *q, unsigned short size) 643{ 644 q->hardsect_size = size; 645} 646 647EXPORT_SYMBOL(blk_queue_hardsect_size); 648 649/* 650 * Returns the minimum that is _not_ zero, unless both are zero. 651 */ 652#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 653 654/** 655 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 656 * @t: the stacking driver (top) 657 * @b: the underlying device (bottom) 658 **/ 659void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b) 660{ 661 /* zero is "infinity" */ 662 t->max_sectors = t->max_hw_sectors = 663 min_not_zero(t->max_sectors,b->max_sectors); 664 665 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); 666 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 667 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 668 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 669} 670 671EXPORT_SYMBOL(blk_queue_stack_limits); 672 673/** 674 * blk_queue_segment_boundary - set boundary rules for segment merging 675 * @q: the request queue for the device 676 * @mask: the memory boundary mask 677 **/ 678void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask) 679{ 680 if (mask < PAGE_CACHE_SIZE - 1) { 681 mask = PAGE_CACHE_SIZE - 1; 682 printk("%s: set to minimum %lx\n", __FUNCTION__, mask); 683 } 684 685 q->seg_boundary_mask = mask; 686} 687 688EXPORT_SYMBOL(blk_queue_segment_boundary); 689 690/** 691 * blk_queue_dma_alignment - set dma length and memory alignment 692 * @q: the request queue for the device 693 * @mask: alignment mask 694 * 695 * description: 696 * set required memory and length aligment for direct dma transactions. 697 * this is used when buiding direct io requests for the queue. 
698 * 699 **/ 700void blk_queue_dma_alignment(request_queue_t *q, int mask) 701{ 702 q->dma_alignment = mask; 703} 704 705EXPORT_SYMBOL(blk_queue_dma_alignment); 706 707/** 708 * blk_queue_find_tag - find a request by its tag and queue 709 * 710 * @q: The request queue for the device 711 * @tag: The tag of the request 712 * 713 * Notes: 714 * Should be used when a device returns a tag and you want to match 715 * it with a request. 716 * 717 * no locks need be held. 718 **/ 719struct request *blk_queue_find_tag(request_queue_t *q, int tag) 720{ 721 struct blk_queue_tag *bqt = q->queue_tags; 722 723 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) 724 return NULL; 725 726 return bqt->tag_index[tag]; 727} 728 729EXPORT_SYMBOL(blk_queue_find_tag); 730 731/** 732 * __blk_queue_free_tags - release tag maintenance info 733 * @q: the request queue for the device 734 * 735 * Notes: 736 * blk_cleanup_queue() will take care of calling this function, if tagging 737 * has been used. So there's no need to call this directly. 738 **/ 739static void __blk_queue_free_tags(request_queue_t *q) 740{ 741 struct blk_queue_tag *bqt = q->queue_tags; 742 743 if (!bqt) 744 return; 745 746 if (atomic_dec_and_test(&bqt->refcnt)) { 747 BUG_ON(bqt->busy); 748 BUG_ON(!list_empty(&bqt->busy_list)); 749 750 kfree(bqt->tag_index); 751 bqt->tag_index = NULL; 752 753 kfree(bqt->tag_map); 754 bqt->tag_map = NULL; 755 756 kfree(bqt); 757 } 758 759 q->queue_tags = NULL; 760 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); 761} 762 763/** 764 * blk_queue_free_tags - release tag maintenance info 765 * @q: the request queue for the device 766 * 767 * Notes: 768 * This is used to disabled tagged queuing to a device, yet leave 769 * queue in function. 770 **/ 771void blk_queue_free_tags(request_queue_t *q) 772{ 773 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 774} 775 776EXPORT_SYMBOL(blk_queue_free_tags); 777 778static int 779init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) 780{ 781 struct request **tag_index; 782 unsigned long *tag_map; 783 int nr_ulongs; 784 785 if (depth > q->nr_requests * 2) { 786 depth = q->nr_requests * 2; 787 printk(KERN_ERR "%s: adjusted depth to %d\n", 788 __FUNCTION__, depth); 789 } 790 791 tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); 792 if (!tag_index) 793 goto fail; 794 795 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 796 tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 797 if (!tag_map) 798 goto fail; 799 800 memset(tag_index, 0, depth * sizeof(struct request *)); 801 memset(tag_map, 0, nr_ulongs * sizeof(unsigned long)); 802 tags->real_max_depth = depth; 803 tags->max_depth = depth; 804 tags->tag_index = tag_index; 805 tags->tag_map = tag_map; 806 807 return 0; 808fail: 809 kfree(tag_index); 810 return -ENOMEM; 811} 812 813/** 814 * blk_queue_init_tags - initialize the queue tag info 815 * @q: the request queue for the device 816 * @depth: the maximum queue depth supported 817 * @tags: the tag to use 818 **/ 819int blk_queue_init_tags(request_queue_t *q, int depth, 820 struct blk_queue_tag *tags) 821{ 822 int rc; 823 824 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 825 826 if (!tags && !q->queue_tags) { 827 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); 828 if (!tags) 829 goto fail; 830 831 if (init_tag_map(q, tags, depth)) 832 goto fail; 833 834 INIT_LIST_HEAD(&tags->busy_list); 835 tags->busy = 0; 836 atomic_set(&tags->refcnt, 1); 837 } else if (q->queue_tags) { 838 if ((rc = 
blk_queue_resize_tags(q, depth))) 839 return rc; 840 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 841 return 0; 842 } else 843 atomic_inc(&tags->refcnt); 844 845 /* 846 * assign it, all done 847 */ 848 q->queue_tags = tags; 849 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); 850 return 0; 851fail: 852 kfree(tags); 853 return -ENOMEM; 854} 855 856EXPORT_SYMBOL(blk_queue_init_tags); 857 858/** 859 * blk_queue_resize_tags - change the queueing depth 860 * @q: the request queue for the device 861 * @new_depth: the new max command queueing depth 862 * 863 * Notes: 864 * Must be called with the queue lock held. 865 **/ 866int blk_queue_resize_tags(request_queue_t *q, int new_depth) 867{ 868 struct blk_queue_tag *bqt = q->queue_tags; 869 struct request **tag_index; 870 unsigned long *tag_map; 871 int max_depth, nr_ulongs; 872 873 if (!bqt) 874 return -ENXIO; 875 876 /* 877 * if we already have large enough real_max_depth. just 878 * adjust max_depth. *NOTE* as requests with tag value 879 * between new_depth and real_max_depth can be in-flight, tag 880 * map can not be shrunk blindly here. 881 */ 882 if (new_depth <= bqt->real_max_depth) { 883 bqt->max_depth = new_depth; 884 return 0; 885 } 886 887 /* 888 * save the old state info, so we can copy it back 889 */ 890 tag_index = bqt->tag_index; 891 tag_map = bqt->tag_map; 892 max_depth = bqt->real_max_depth; 893 894 if (init_tag_map(q, bqt, new_depth)) 895 return -ENOMEM; 896 897 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); 898 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; 899 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); 900 901 kfree(tag_index); 902 kfree(tag_map); 903 return 0; 904} 905 906EXPORT_SYMBOL(blk_queue_resize_tags); 907 908/** 909 * blk_queue_end_tag - end tag operations for a request 910 * @q: the request queue for the device 911 * @rq: the request that has completed 912 * 913 * Description: 914 * Typically called when end_that_request_first() returns 0, meaning 915 * all transfers have been done for a request. It's important to call 916 * this function before end_that_request_last(), as that will put the 917 * request back on the free list thus corrupting the internal tag list. 918 * 919 * Notes: 920 * queue lock must be held. 921 **/ 922void blk_queue_end_tag(request_queue_t *q, struct request *rq) 923{ 924 struct blk_queue_tag *bqt = q->queue_tags; 925 int tag = rq->tag; 926 927 BUG_ON(tag == -1); 928 929 if (unlikely(tag >= bqt->real_max_depth)) 930 /* 931 * This can happen after tag depth has been reduced. 932 * FIXME: how about a warning or info message here? 933 */ 934 return; 935 936 if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { 937 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", 938 __FUNCTION__, tag); 939 return; 940 } 941 942 list_del_init(&rq->queuelist); 943 rq->flags &= ~REQ_QUEUED; 944 rq->tag = -1; 945 946 if (unlikely(bqt->tag_index[tag] == NULL)) 947 printk(KERN_ERR "%s: tag %d is missing\n", 948 __FUNCTION__, tag); 949 950 bqt->tag_index[tag] = NULL; 951 bqt->busy--; 952} 953 954EXPORT_SYMBOL(blk_queue_end_tag); 955 956/** 957 * blk_queue_start_tag - find a free tag and assign it 958 * @q: the request queue for the device 959 * @rq: the block request that needs tagging 960 * 961 * Description: 962 * This can either be used as a stand-alone helper, or possibly be 963 * assigned as the queue &prep_rq_fn (in which case &struct request 964 * automagically gets a tag assigned). 
Note that this function 965 * assumes that any type of request can be queued! if this is not 966 * true for your device, you must check the request type before 967 * calling this function. The request will also be removed from 968 * the request queue, so it's the drivers responsibility to readd 969 * it if it should need to be restarted for some reason. 970 * 971 * Notes: 972 * queue lock must be held. 973 **/ 974int blk_queue_start_tag(request_queue_t *q, struct request *rq) 975{ 976 struct blk_queue_tag *bqt = q->queue_tags; 977 int tag; 978 979 if (unlikely((rq->flags & REQ_QUEUED))) { 980 printk(KERN_ERR 981 "%s: request %p for device [%s] already tagged %d", 982 __FUNCTION__, rq, 983 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 984 BUG(); 985 } 986 987 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 988 if (tag >= bqt->max_depth) 989 return 1; 990 991 __set_bit(tag, bqt->tag_map); 992 993 rq->flags |= REQ_QUEUED; 994 rq->tag = tag; 995 bqt->tag_index[tag] = rq; 996 blkdev_dequeue_request(rq); 997 list_add(&rq->queuelist, &bqt->busy_list); 998 bqt->busy++; 999 return 0; 1000} 1001 1002EXPORT_SYMBOL(blk_queue_start_tag); 1003 1004/** 1005 * blk_queue_invalidate_tags - invalidate all pending tags 1006 * @q: the request queue for the device 1007 * 1008 * Description: 1009 * Hardware conditions may dictate a need to stop all pending requests. 1010 * In this case, we will safely clear the block side of the tag queue and 1011 * readd all requests to the request queue in the right order. 1012 * 1013 * Notes: 1014 * queue lock must be held. 1015 **/ 1016void blk_queue_invalidate_tags(request_queue_t *q) 1017{ 1018 struct blk_queue_tag *bqt = q->queue_tags; 1019 struct list_head *tmp, *n; 1020 struct request *rq; 1021 1022 list_for_each_safe(tmp, n, &bqt->busy_list) { 1023 rq = list_entry_rq(tmp); 1024 1025 if (rq->tag == -1) { 1026 printk(KERN_ERR 1027 "%s: bad tag found on list\n", __FUNCTION__); 1028 list_del_init(&rq->queuelist); 1029 rq->flags &= ~REQ_QUEUED; 1030 } else 1031 blk_queue_end_tag(q, rq); 1032 1033 rq->flags &= ~REQ_STARTED; 1034 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1035 } 1036} 1037 1038EXPORT_SYMBOL(blk_queue_invalidate_tags); 1039 1040static char *rq_flags[] = { 1041 "REQ_RW", 1042 "REQ_FAILFAST", 1043 "REQ_SOFTBARRIER", 1044 "REQ_HARDBARRIER", 1045 "REQ_CMD", 1046 "REQ_NOMERGE", 1047 "REQ_STARTED", 1048 "REQ_DONTPREP", 1049 "REQ_QUEUED", 1050 "REQ_PC", 1051 "REQ_BLOCK_PC", 1052 "REQ_SENSE", 1053 "REQ_FAILED", 1054 "REQ_QUIET", 1055 "REQ_SPECIAL", 1056 "REQ_DRIVE_CMD", 1057 "REQ_DRIVE_TASK", 1058 "REQ_DRIVE_TASKFILE", 1059 "REQ_PREEMPT", 1060 "REQ_PM_SUSPEND", 1061 "REQ_PM_RESUME", 1062 "REQ_PM_SHUTDOWN", 1063}; 1064 1065void blk_dump_rq_flags(struct request *rq, char *msg) 1066{ 1067 int bit; 1068 1069 printk("%s: dev %s: flags = ", msg, 1070 rq->rq_disk ? 
rq->rq_disk->disk_name : "?"); 1071 bit = 0; 1072 do { 1073 if (rq->flags & (1 << bit)) 1074 printk("%s ", rq_flags[bit]); 1075 bit++; 1076 } while (bit < __REQ_NR_BITS); 1077 1078 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, 1079 rq->nr_sectors, 1080 rq->current_nr_sectors); 1081 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); 1082 1083 if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) { 1084 printk("cdb: "); 1085 for (bit = 0; bit < sizeof(rq->cmd); bit++) 1086 printk("%02x ", rq->cmd[bit]); 1087 printk("\n"); 1088 } 1089} 1090 1091EXPORT_SYMBOL(blk_dump_rq_flags); 1092 1093void blk_recount_segments(request_queue_t *q, struct bio *bio) 1094{ 1095 struct bio_vec *bv, *bvprv = NULL; 1096 int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster; 1097 int high, highprv = 1; 1098 1099 if (unlikely(!bio->bi_io_vec)) 1100 return; 1101 1102 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1103 hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0; 1104 bio_for_each_segment(bv, bio, i) { 1105 /* 1106 * the trick here is making sure that a high page is never 1107 * considered part of another segment, since that might 1108 * change with the bounce page. 1109 */ 1110 high = page_to_pfn(bv->bv_page) >= q->bounce_pfn; 1111 if (high || highprv) 1112 goto new_hw_segment; 1113 if (cluster) { 1114 if (seg_size + bv->bv_len > q->max_segment_size) 1115 goto new_segment; 1116 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) 1117 goto new_segment; 1118 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 1119 goto new_segment; 1120 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1121 goto new_hw_segment; 1122 1123 seg_size += bv->bv_len; 1124 hw_seg_size += bv->bv_len; 1125 bvprv = bv; 1126 continue; 1127 } 1128new_segment: 1129 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && 1130 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) { 1131 hw_seg_size += bv->bv_len; 1132 } else { 1133new_hw_segment: 1134 if (hw_seg_size > bio->bi_hw_front_size) 1135 bio->bi_hw_front_size = hw_seg_size; 1136 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; 1137 nr_hw_segs++; 1138 } 1139 1140 nr_phys_segs++; 1141 bvprv = bv; 1142 seg_size = bv->bv_len; 1143 highprv = high; 1144 } 1145 if (hw_seg_size > bio->bi_hw_back_size) 1146 bio->bi_hw_back_size = hw_seg_size; 1147 if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size) 1148 bio->bi_hw_front_size = hw_seg_size; 1149 bio->bi_phys_segments = nr_phys_segs; 1150 bio->bi_hw_segments = nr_hw_segs; 1151 bio->bi_flags |= (1 << BIO_SEG_VALID); 1152} 1153 1154 1155static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio, 1156 struct bio *nxt) 1157{ 1158 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) 1159 return 0; 1160 1161 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) 1162 return 0; 1163 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1164 return 0; 1165 1166 /* 1167 * bio and nxt are contigous in memory, check if the queue allows 1168 * these two to be merged into one 1169 */ 1170 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 1171 return 1; 1172 1173 return 0; 1174} 1175 1176static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio, 1177 struct bio *nxt) 1178{ 1179 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1180 blk_recount_segments(q, bio); 1181 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) 1182 blk_recount_segments(q, nxt); 1183 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || 1184 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_front_size + 
bio->bi_hw_back_size)) 1185 return 0; 1186 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1187 return 0; 1188 1189 return 1; 1190} 1191 1192/* 1193 * map a request to scatterlist, return number of sg entries setup. Caller 1194 * must make sure sg can hold rq->nr_phys_segments entries 1195 */ 1196int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg) 1197{ 1198 struct bio_vec *bvec, *bvprv; 1199 struct bio *bio; 1200 int nsegs, i, cluster; 1201 1202 nsegs = 0; 1203 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1204 1205 /* 1206 * for each bio in rq 1207 */ 1208 bvprv = NULL; 1209 rq_for_each_bio(bio, rq) { 1210 /* 1211 * for each segment in bio 1212 */ 1213 bio_for_each_segment(bvec, bio, i) { 1214 int nbytes = bvec->bv_len; 1215 1216 if (bvprv && cluster) { 1217 if (sg[nsegs - 1].length + nbytes > q->max_segment_size) 1218 goto new_segment; 1219 1220 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 1221 goto new_segment; 1222 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 1223 goto new_segment; 1224 1225 sg[nsegs - 1].length += nbytes; 1226 } else { 1227new_segment: 1228 memset(&sg[nsegs],0,sizeof(struct scatterlist)); 1229 sg[nsegs].page = bvec->bv_page; 1230 sg[nsegs].length = nbytes; 1231 sg[nsegs].offset = bvec->bv_offset; 1232 1233 nsegs++; 1234 } 1235 bvprv = bvec; 1236 } /* segments in bio */ 1237 } /* bios in rq */ 1238 1239 return nsegs; 1240} 1241 1242EXPORT_SYMBOL(blk_rq_map_sg); 1243 1244/* 1245 * the standard queue merge functions, can be overridden with device 1246 * specific ones if so desired 1247 */ 1248 1249static inline int ll_new_mergeable(request_queue_t *q, 1250 struct request *req, 1251 struct bio *bio) 1252{ 1253 int nr_phys_segs = bio_phys_segments(q, bio); 1254 1255 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1256 req->flags |= REQ_NOMERGE; 1257 if (req == q->last_merge) 1258 q->last_merge = NULL; 1259 return 0; 1260 } 1261 1262 /* 1263 * A hw segment is just getting larger, bump just the phys 1264 * counter. 1265 */ 1266 req->nr_phys_segments += nr_phys_segs; 1267 return 1; 1268} 1269 1270static inline int ll_new_hw_segment(request_queue_t *q, 1271 struct request *req, 1272 struct bio *bio) 1273{ 1274 int nr_hw_segs = bio_hw_segments(q, bio); 1275 int nr_phys_segs = bio_phys_segments(q, bio); 1276 1277 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 1278 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1279 req->flags |= REQ_NOMERGE; 1280 if (req == q->last_merge) 1281 q->last_merge = NULL; 1282 return 0; 1283 } 1284 1285 /* 1286 * This will form the start of a new hw segment. Bump both 1287 * counters. 
1288 */ 1289 req->nr_hw_segments += nr_hw_segs; 1290 req->nr_phys_segments += nr_phys_segs; 1291 return 1; 1292} 1293 1294static int ll_back_merge_fn(request_queue_t *q, struct request *req, 1295 struct bio *bio) 1296{ 1297 int len; 1298 1299 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { 1300 req->flags |= REQ_NOMERGE; 1301 if (req == q->last_merge) 1302 q->last_merge = NULL; 1303 return 0; 1304 } 1305 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) 1306 blk_recount_segments(q, req->biotail); 1307 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1308 blk_recount_segments(q, bio); 1309 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; 1310 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && 1311 !BIOVEC_VIRT_OVERSIZE(len)) { 1312 int mergeable = ll_new_mergeable(q, req, bio); 1313 1314 if (mergeable) { 1315 if (req->nr_hw_segments == 1) 1316 req->bio->bi_hw_front_size = len; 1317 if (bio->bi_hw_segments == 1) 1318 bio->bi_hw_back_size = len; 1319 } 1320 return mergeable; 1321 } 1322 1323 return ll_new_hw_segment(q, req, bio); 1324} 1325 1326static int ll_front_merge_fn(request_queue_t *q, struct request *req, 1327 struct bio *bio) 1328{ 1329 int len; 1330 1331 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { 1332 req->flags |= REQ_NOMERGE; 1333 if (req == q->last_merge) 1334 q->last_merge = NULL; 1335 return 0; 1336 } 1337 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; 1338 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1339 blk_recount_segments(q, bio); 1340 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) 1341 blk_recount_segments(q, req->bio); 1342 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && 1343 !BIOVEC_VIRT_OVERSIZE(len)) { 1344 int mergeable = ll_new_mergeable(q, req, bio); 1345 1346 if (mergeable) { 1347 if (bio->bi_hw_segments == 1) 1348 bio->bi_hw_front_size = len; 1349 if (req->nr_hw_segments == 1) 1350 req->biotail->bi_hw_back_size = len; 1351 } 1352 return mergeable; 1353 } 1354 1355 return ll_new_hw_segment(q, req, bio); 1356} 1357 1358static int ll_merge_requests_fn(request_queue_t *q, struct request *req, 1359 struct request *next) 1360{ 1361 int total_phys_segments; 1362 int total_hw_segments; 1363 1364 /* 1365 * First check if the either of the requests are re-queued 1366 * requests. Can't merge them if they are. 1367 */ 1368 if (req->special || next->special) 1369 return 0; 1370 1371 /* 1372 * Will it become too large? 1373 */ 1374 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) 1375 return 0; 1376 1377 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 1378 if (blk_phys_contig_segment(q, req->biotail, next->bio)) 1379 total_phys_segments--; 1380 1381 if (total_phys_segments > q->max_phys_segments) 1382 return 0; 1383 1384 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 1385 if (blk_hw_contig_segment(q, req->biotail, next->bio)) { 1386 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; 1387 /* 1388 * propagate the combined length to the end of the requests 1389 */ 1390 if (req->nr_hw_segments == 1) 1391 req->bio->bi_hw_front_size = len; 1392 if (next->nr_hw_segments == 1) 1393 next->biotail->bi_hw_back_size = len; 1394 total_hw_segments--; 1395 } 1396 1397 if (total_hw_segments > q->max_hw_segments) 1398 return 0; 1399 1400 /* Merge is OK... 
*/ 1401 req->nr_phys_segments = total_phys_segments; 1402 req->nr_hw_segments = total_hw_segments; 1403 return 1; 1404} 1405 1406/* 1407 * "plug" the device if there are no outstanding requests: this will 1408 * force the transfer to start only after we have put all the requests 1409 * on the list. 1410 * 1411 * This is called with interrupts off and no requests on the queue and 1412 * with the queue lock held. 1413 */ 1414void blk_plug_device(request_queue_t *q) 1415{ 1416 WARN_ON(!irqs_disabled()); 1417 1418 /* 1419 * don't plug a stopped queue, it must be paired with blk_start_queue() 1420 * which will restart the queueing 1421 */ 1422 if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) 1423 return; 1424 1425 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1426 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1427} 1428 1429EXPORT_SYMBOL(blk_plug_device); 1430 1431/* 1432 * remove the queue from the plugged list, if present. called with 1433 * queue lock held and interrupts disabled. 1434 */ 1435int blk_remove_plug(request_queue_t *q) 1436{ 1437 WARN_ON(!irqs_disabled()); 1438 1439 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1440 return 0; 1441 1442 del_timer(&q->unplug_timer); 1443 return 1; 1444} 1445 1446EXPORT_SYMBOL(blk_remove_plug); 1447 1448/* 1449 * remove the plug and let it rip.. 1450 */ 1451void __generic_unplug_device(request_queue_t *q) 1452{ 1453 if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))) 1454 return; 1455 1456 if (!blk_remove_plug(q)) 1457 return; 1458 1459 q->request_fn(q); 1460} 1461EXPORT_SYMBOL(__generic_unplug_device); 1462 1463/** 1464 * generic_unplug_device - fire a request queue 1465 * @q: The &request_queue_t in question 1466 * 1467 * Description: 1468 * Linux uses plugging to build bigger requests queues before letting 1469 * the device have at them. If a queue is plugged, the I/O scheduler 1470 * is still adding and merging requests on the queue. Once the queue 1471 * gets unplugged, the request_fn defined for the queue is invoked and 1472 * transfers started. 1473 **/ 1474void generic_unplug_device(request_queue_t *q) 1475{ 1476 spin_lock_irq(q->queue_lock); 1477 __generic_unplug_device(q); 1478 spin_unlock_irq(q->queue_lock); 1479} 1480EXPORT_SYMBOL(generic_unplug_device); 1481 1482static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 1483 struct page *page) 1484{ 1485 request_queue_t *q = bdi->unplug_io_data; 1486 1487 /* 1488 * devices don't necessarily have an ->unplug_fn defined 1489 */ 1490 if (q->unplug_fn) 1491 q->unplug_fn(q); 1492} 1493 1494static void blk_unplug_work(void *data) 1495{ 1496 request_queue_t *q = data; 1497 1498 q->unplug_fn(q); 1499} 1500 1501static void blk_unplug_timeout(unsigned long data) 1502{ 1503 request_queue_t *q = (request_queue_t *)data; 1504 1505 kblockd_schedule_work(&q->unplug_work); 1506} 1507 1508/** 1509 * blk_start_queue - restart a previously stopped queue 1510 * @q: The &request_queue_t in question 1511 * 1512 * Description: 1513 * blk_start_queue() will clear the stop flag on the queue, and call 1514 * the request_fn for the queue if it was in a stopped state when 1515 * entered. Also see blk_stop_queue(). Queue lock must be held. 
1516 **/ 1517void blk_start_queue(request_queue_t *q) 1518{ 1519 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1520 1521 /* 1522 * one level of recursion is ok and is much faster than kicking 1523 * the unplug handling 1524 */ 1525 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1526 q->request_fn(q); 1527 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1528 } else { 1529 blk_plug_device(q); 1530 kblockd_schedule_work(&q->unplug_work); 1531 } 1532} 1533 1534EXPORT_SYMBOL(blk_start_queue); 1535 1536/** 1537 * blk_stop_queue - stop a queue 1538 * @q: The &request_queue_t in question 1539 * 1540 * Description: 1541 * The Linux block layer assumes that a block driver will consume all 1542 * entries on the request queue when the request_fn strategy is called. 1543 * Often this will not happen, because of hardware limitations (queue 1544 * depth settings). If a device driver gets a 'queue full' response, 1545 * or if it simply chooses not to queue more I/O at one point, it can 1546 * call this function to prevent the request_fn from being called until 1547 * the driver has signalled it's ready to go again. This happens by calling 1548 * blk_start_queue() to restart queue operations. Queue lock must be held. 1549 **/ 1550void blk_stop_queue(request_queue_t *q) 1551{ 1552 blk_remove_plug(q); 1553 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1554} 1555EXPORT_SYMBOL(blk_stop_queue); 1556 1557/** 1558 * blk_sync_queue - cancel any pending callbacks on a queue 1559 * @q: the queue 1560 * 1561 * Description: 1562 * The block layer may perform asynchronous callback activity 1563 * on a queue, such as calling the unplug function after a timeout. 1564 * A block device may call blk_sync_queue to ensure that any 1565 * such activity is cancelled, thus allowing it to release resources 1566 * the the callbacks might use. The caller must already have made sure 1567 * that its ->make_request_fn will not re-add plugging prior to calling 1568 * this function. 1569 * 1570 */ 1571void blk_sync_queue(struct request_queue *q) 1572{ 1573 del_timer_sync(&q->unplug_timer); 1574 kblockd_flush(); 1575} 1576EXPORT_SYMBOL(blk_sync_queue); 1577 1578/** 1579 * blk_run_queue - run a single device queue 1580 * @q: The queue to run 1581 */ 1582void blk_run_queue(struct request_queue *q) 1583{ 1584 unsigned long flags; 1585 1586 spin_lock_irqsave(q->queue_lock, flags); 1587 blk_remove_plug(q); 1588 if (!elv_queue_empty(q)) 1589 q->request_fn(q); 1590 spin_unlock_irqrestore(q->queue_lock, flags); 1591} 1592EXPORT_SYMBOL(blk_run_queue); 1593 1594/** 1595 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed 1596 * @q: the request queue to be released 1597 * 1598 * Description: 1599 * blk_cleanup_queue is the pair to blk_init_queue() or 1600 * blk_queue_make_request(). It should be called when a request queue is 1601 * being released; typically when a block device is being de-registered. 1602 * Currently, its primary task it to free all the &struct request 1603 * structures that were allocated to the queue and the queue itself. 1604 * 1605 * Caveat: 1606 * Hopefully the low level driver will have finished any 1607 * outstanding requests first... 
1608 **/ 1609void blk_cleanup_queue(request_queue_t * q) 1610{ 1611 struct request_list *rl = &q->rq; 1612 1613 if (!atomic_dec_and_test(&q->refcnt)) 1614 return; 1615 1616 if (q->elevator) 1617 elevator_exit(q->elevator); 1618 1619 blk_sync_queue(q); 1620 1621 if (rl->rq_pool) 1622 mempool_destroy(rl->rq_pool); 1623 1624 if (q->queue_tags) 1625 __blk_queue_free_tags(q); 1626 1627 blk_queue_ordered(q, QUEUE_ORDERED_NONE); 1628 1629 kmem_cache_free(requestq_cachep, q); 1630} 1631 1632EXPORT_SYMBOL(blk_cleanup_queue); 1633 1634static int blk_init_free_list(request_queue_t *q) 1635{ 1636 struct request_list *rl = &q->rq; 1637 1638 rl->count[READ] = rl->count[WRITE] = 0; 1639 rl->starved[READ] = rl->starved[WRITE] = 0; 1640 init_waitqueue_head(&rl->wait[READ]); 1641 init_waitqueue_head(&rl->wait[WRITE]); 1642 init_waitqueue_head(&rl->drain); 1643 1644 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 1645 mempool_free_slab, request_cachep, q->node); 1646 1647 if (!rl->rq_pool) 1648 return -ENOMEM; 1649 1650 return 0; 1651} 1652 1653static int __make_request(request_queue_t *, struct bio *); 1654 1655request_queue_t *blk_alloc_queue(int gfp_mask) 1656{ 1657 return blk_alloc_queue_node(gfp_mask, -1); 1658} 1659EXPORT_SYMBOL(blk_alloc_queue); 1660 1661request_queue_t *blk_alloc_queue_node(int gfp_mask, int node_id) 1662{ 1663 request_queue_t *q; 1664 1665 q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id); 1666 if (!q) 1667 return NULL; 1668 1669 memset(q, 0, sizeof(*q)); 1670 init_timer(&q->unplug_timer); 1671 atomic_set(&q->refcnt, 1); 1672 1673 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1674 q->backing_dev_info.unplug_io_data = q; 1675 1676 return q; 1677} 1678EXPORT_SYMBOL(blk_alloc_queue_node); 1679 1680/** 1681 * blk_init_queue - prepare a request queue for use with a block device 1682 * @rfn: The function to be called to process requests that have been 1683 * placed on the queue. 1684 * @lock: Request queue spin lock 1685 * 1686 * Description: 1687 * If a block device wishes to use the standard request handling procedures, 1688 * which sorts requests and coalesces adjacent requests, then it must 1689 * call blk_init_queue(). The function @rfn will be called when there 1690 * are requests on the queue that need to be processed. If the device 1691 * supports plugging, then @rfn may not be called immediately when requests 1692 * are available on the queue, but may be called at some time later instead. 1693 * Plugged queues are generally unplugged when a buffer belonging to one 1694 * of the requests on the queue is needed, or due to memory pressure. 1695 * 1696 * @rfn is not required, or even expected, to remove all requests off the 1697 * queue, but only as many as it can handle at a time. If it does leave 1698 * requests on the queue, it is responsible for arranging that the requests 1699 * get dealt with eventually. 1700 * 1701 * The queue spin lock must be held while manipulating the requests on the 1702 * request queue. 1703 * 1704 * Function returns a pointer to the initialized request queue, or NULL if 1705 * it didn't succeed. 1706 * 1707 * Note: 1708 * blk_init_queue() must be paired with a blk_cleanup_queue() call 1709 * when the block device is deactivated (such as at module unload). 
1710 **/ 1711 1712request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 1713{ 1714 return blk_init_queue_node(rfn, lock, -1); 1715} 1716EXPORT_SYMBOL(blk_init_queue); 1717 1718request_queue_t * 1719blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1720{ 1721 request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 1722 1723 if (!q) 1724 return NULL; 1725 1726 q->node = node_id; 1727 if (blk_init_free_list(q)) 1728 goto out_init; 1729 1730 /* 1731 * if caller didn't supply a lock, they get per-queue locking with 1732 * our embedded lock 1733 */ 1734 if (!lock) { 1735 spin_lock_init(&q->__queue_lock); 1736 lock = &q->__queue_lock; 1737 } 1738 1739 q->request_fn = rfn; 1740 q->back_merge_fn = ll_back_merge_fn; 1741 q->front_merge_fn = ll_front_merge_fn; 1742 q->merge_requests_fn = ll_merge_requests_fn; 1743 q->prep_rq_fn = NULL; 1744 q->unplug_fn = generic_unplug_device; 1745 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 1746 q->queue_lock = lock; 1747 1748 blk_queue_segment_boundary(q, 0xffffffff); 1749 1750 blk_queue_make_request(q, __make_request); 1751 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 1752 1753 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 1754 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 1755 1756 /* 1757 * all done 1758 */ 1759 if (!elevator_init(q, NULL)) { 1760 blk_queue_congestion_threshold(q); 1761 return q; 1762 } 1763 1764 blk_cleanup_queue(q); 1765out_init: 1766 kmem_cache_free(requestq_cachep, q); 1767 return NULL; 1768} 1769EXPORT_SYMBOL(blk_init_queue_node); 1770 1771int blk_get_queue(request_queue_t *q) 1772{ 1773 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1774 atomic_inc(&q->refcnt); 1775 return 0; 1776 } 1777 1778 return 1; 1779} 1780 1781EXPORT_SYMBOL(blk_get_queue); 1782 1783static inline void blk_free_request(request_queue_t *q, struct request *rq) 1784{ 1785 elv_put_request(q, rq); 1786 mempool_free(rq, q->rq.rq_pool); 1787} 1788 1789static inline struct request * 1790blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask) 1791{ 1792 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 1793 1794 if (!rq) 1795 return NULL; 1796 1797 /* 1798 * first three bits are identical in rq->flags and bio->bi_rw, 1799 * see bio.h and blkdev.h 1800 */ 1801 rq->flags = rw; 1802 1803 if (!elv_set_request(q, rq, bio, gfp_mask)) 1804 return rq; 1805 1806 mempool_free(rq, q->rq.rq_pool); 1807 return NULL; 1808} 1809 1810/* 1811 * ioc_batching returns true if the ioc is a valid batching request and 1812 * should be given priority access to a request. 1813 */ 1814static inline int ioc_batching(request_queue_t *q, struct io_context *ioc) 1815{ 1816 if (!ioc) 1817 return 0; 1818 1819 /* 1820 * Make sure the process is able to allocate at least 1 request 1821 * even if the batch times out, otherwise we could theoretically 1822 * lose wakeups. 1823 */ 1824 return ioc->nr_batch_requests == q->nr_batching || 1825 (ioc->nr_batch_requests > 0 1826 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 1827} 1828 1829/* 1830 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 1831 * will cause the process to be a "batcher" on all queues in the system. This 1832 * is the behaviour we want though - once it gets a wakeup it should be given 1833 * a nice run. 
1834 */ 1835static void ioc_set_batching(request_queue_t *q, struct io_context *ioc) 1836{ 1837 if (!ioc || ioc_batching(q, ioc)) 1838 return; 1839 1840 ioc->nr_batch_requests = q->nr_batching; 1841 ioc->last_waited = jiffies; 1842} 1843 1844static void __freed_request(request_queue_t *q, int rw) 1845{ 1846 struct request_list *rl = &q->rq; 1847 1848 if (rl->count[rw] < queue_congestion_off_threshold(q)) 1849 clear_queue_congested(q, rw); 1850 1851 if (rl->count[rw] + 1 <= q->nr_requests) { 1852 if (waitqueue_active(&rl->wait[rw])) 1853 wake_up(&rl->wait[rw]); 1854 1855 blk_clear_queue_full(q, rw); 1856 } 1857} 1858 1859/* 1860 * A request has just been released. Account for it, update the full and 1861 * congestion status, wake up any waiters. Called under q->queue_lock. 1862 */ 1863static void freed_request(request_queue_t *q, int rw) 1864{ 1865 struct request_list *rl = &q->rq; 1866 1867 rl->count[rw]--; 1868 1869 __freed_request(q, rw); 1870 1871 if (unlikely(rl->starved[rw ^ 1])) 1872 __freed_request(q, rw ^ 1); 1873 1874 if (!rl->count[READ] && !rl->count[WRITE]) { 1875 smp_mb(); 1876 if (unlikely(waitqueue_active(&rl->drain))) 1877 wake_up(&rl->drain); 1878 } 1879} 1880 1881#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 1882/* 1883 * Get a free request, queue_lock must be held. 1884 * Returns NULL on failure, with queue_lock held. 1885 * Returns !NULL on success, with queue_lock *not held*. 1886 */ 1887static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, 1888 int gfp_mask) 1889{ 1890 struct request *rq = NULL; 1891 struct request_list *rl = &q->rq; 1892 struct io_context *ioc = current_io_context(GFP_ATOMIC); 1893 1894 if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) 1895 goto out; 1896 1897 if (rl->count[rw]+1 >= q->nr_requests) { 1898 /* 1899 * The queue will fill after this allocation, so set it as 1900 * full, and mark this process as "batching". This process 1901 * will be allowed to complete a batch of requests, others 1902 * will be blocked. 1903 */ 1904 if (!blk_queue_full(q, rw)) { 1905 ioc_set_batching(q, ioc); 1906 blk_set_queue_full(q, rw); 1907 } 1908 } 1909 1910 switch (elv_may_queue(q, rw, bio)) { 1911 case ELV_MQUEUE_NO: 1912 goto rq_starved; 1913 case ELV_MQUEUE_MAY: 1914 break; 1915 case ELV_MQUEUE_MUST: 1916 goto get_rq; 1917 } 1918 1919 if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) { 1920 /* 1921 * The queue is full and the allocating process is not a 1922 * "batcher", and not exempted by the IO scheduler 1923 */ 1924 goto out; 1925 } 1926 1927get_rq: 1928 /* 1929 * Only allow batching queuers to allocate up to 50% over the defined 1930 * limit of requests, otherwise we could have thousands of requests 1931 * allocated with any setting of ->nr_requests 1932 */ 1933 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 1934 goto out; 1935 1936 rl->count[rw]++; 1937 rl->starved[rw] = 0; 1938 if (rl->count[rw] >= queue_congestion_on_threshold(q)) 1939 set_queue_congested(q, rw); 1940 spin_unlock_irq(q->queue_lock); 1941 1942 rq = blk_alloc_request(q, rw, bio, gfp_mask); 1943 if (!rq) { 1944 /* 1945 * Allocation failed presumably due to memory. Undo anything 1946 * we might have messed up. 1947 * 1948 * Allocating task should really be put onto the front of the 1949 * wait queue, but this is pretty rare. 
1950 */ 1951 spin_lock_irq(q->queue_lock); 1952 freed_request(q, rw); 1953 1954 /* 1955 * in the very unlikely event that allocation failed and no 1956 * requests for this direction was pending, mark us starved 1957 * so that freeing of a request in the other direction will 1958 * notice us. another possible fix would be to split the 1959 * rq mempool into READ and WRITE 1960 */ 1961rq_starved: 1962 if (unlikely(rl->count[rw] == 0)) 1963 rl->starved[rw] = 1; 1964 1965 goto out; 1966 } 1967 1968 if (ioc_batching(q, ioc)) 1969 ioc->nr_batch_requests--; 1970 1971 rq_init(q, rq); 1972 rq->rl = rl; 1973out: 1974 return rq; 1975} 1976 1977/* 1978 * No available requests for this queue, unplug the device and wait for some 1979 * requests to become available. 1980 * 1981 * Called with q->queue_lock held, and returns with it unlocked. 1982 */ 1983static struct request *get_request_wait(request_queue_t *q, int rw, 1984 struct bio *bio) 1985{ 1986 struct request *rq; 1987 1988 rq = get_request(q, rw, bio, GFP_NOIO); 1989 while (!rq) { 1990 DEFINE_WAIT(wait); 1991 struct request_list *rl = &q->rq; 1992 1993 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 1994 TASK_UNINTERRUPTIBLE); 1995 1996 rq = get_request(q, rw, bio, GFP_NOIO); 1997 1998 if (!rq) { 1999 struct io_context *ioc; 2000 2001 __generic_unplug_device(q); 2002 spin_unlock_irq(q->queue_lock); 2003 io_schedule(); 2004 2005 /* 2006 * After sleeping, we become a "batching" process and 2007 * will be able to allocate at least one request, and 2008 * up to a big batch of them for a small period time. 2009 * See ioc_batching, ioc_set_batching 2010 */ 2011 ioc = current_io_context(GFP_NOIO); 2012 ioc_set_batching(q, ioc); 2013 2014 spin_lock_irq(q->queue_lock); 2015 } 2016 finish_wait(&rl->wait[rw], &wait); 2017 } 2018 2019 return rq; 2020} 2021 2022struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) 2023{ 2024 struct request *rq; 2025 2026 BUG_ON(rw != READ && rw != WRITE); 2027 2028 spin_lock_irq(q->queue_lock); 2029 if (gfp_mask & __GFP_WAIT) { 2030 rq = get_request_wait(q, rw, NULL); 2031 } else { 2032 rq = get_request(q, rw, NULL, gfp_mask); 2033 if (!rq) 2034 spin_unlock_irq(q->queue_lock); 2035 } 2036 /* q->queue_lock is unlocked at this point */ 2037 2038 return rq; 2039} 2040EXPORT_SYMBOL(blk_get_request); 2041 2042/** 2043 * blk_requeue_request - put a request back on queue 2044 * @q: request queue where request should be inserted 2045 * @rq: request to be inserted 2046 * 2047 * Description: 2048 * Drivers often keep queueing requests until the hardware cannot accept 2049 * more, when that condition happens we need to put the request back 2050 * on the queue. Must be called with queue lock held. 2051 */ 2052void blk_requeue_request(request_queue_t *q, struct request *rq) 2053{ 2054 if (blk_rq_tagged(rq)) 2055 blk_queue_end_tag(q, rq); 2056 2057 elv_requeue_request(q, rq); 2058} 2059 2060EXPORT_SYMBOL(blk_requeue_request); 2061 2062/** 2063 * blk_insert_request - insert a special request in to a request queue 2064 * @q: request queue where request should be inserted 2065 * @rq: request to be inserted 2066 * @at_head: insert request at head or tail of queue 2067 * @data: private data 2068 * 2069 * Description: 2070 * Many block devices need to execute commands asynchronously, so they don't 2071 * block the whole kernel from preemption during request execution. 
This is 2072 * accomplished normally by inserting aritficial requests tagged as 2073 * REQ_SPECIAL in to the corresponding request queue, and letting them be 2074 * scheduled for actual execution by the request queue. 2075 * 2076 * We have the option of inserting the head or the tail of the queue. 2077 * Typically we use the tail for new ioctls and so forth. We use the head 2078 * of the queue for things like a QUEUE_FULL message from a device, or a 2079 * host that is unable to accept a particular command. 2080 */ 2081void blk_insert_request(request_queue_t *q, struct request *rq, 2082 int at_head, void *data) 2083{ 2084 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2085 unsigned long flags; 2086 2087 /* 2088 * tell I/O scheduler that this isn't a regular read/write (ie it 2089 * must not attempt merges on this) and that it acts as a soft 2090 * barrier 2091 */ 2092 rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER; 2093 2094 rq->special = data; 2095 2096 spin_lock_irqsave(q->queue_lock, flags); 2097 2098 /* 2099 * If command is tagged, release the tag 2100 */ 2101 if (blk_rq_tagged(rq)) 2102 blk_queue_end_tag(q, rq); 2103 2104 drive_stat_acct(rq, rq->nr_sectors, 1); 2105 __elv_add_request(q, rq, where, 0); 2106 2107 if (blk_queue_plugged(q)) 2108 __generic_unplug_device(q); 2109 else 2110 q->request_fn(q); 2111 spin_unlock_irqrestore(q->queue_lock, flags); 2112} 2113 2114EXPORT_SYMBOL(blk_insert_request); 2115 2116/** 2117 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 2118 * @q: request queue where request should be inserted 2119 * @rq: request structure to fill 2120 * @ubuf: the user buffer 2121 * @len: length of user data 2122 * 2123 * Description: 2124 * Data will be mapped directly for zero copy io, if possible. Otherwise 2125 * a kernel bounce buffer is used. 2126 * 2127 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2128 * still in process context. 2129 * 2130 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2131 * before being submitted to the device, as pages mapped may be out of 2132 * reach. It's the callers responsibility to make sure this happens. The 2133 * original bio must be passed back in to blk_rq_unmap_user() for proper 2134 * unmapping. 2135 */ 2136int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf, 2137 unsigned int len) 2138{ 2139 unsigned long uaddr; 2140 struct bio *bio; 2141 int reading; 2142 2143 if (len > (q->max_sectors << 9)) 2144 return -EINVAL; 2145 if (!len || !ubuf) 2146 return -EINVAL; 2147 2148 reading = rq_data_dir(rq) == READ; 2149 2150 /* 2151 * if alignment requirement is satisfied, map in user pages for 2152 * direct dma. 
else, set up kernel bounce buffers 2153 */ 2154 uaddr = (unsigned long) ubuf; 2155 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) 2156 bio = bio_map_user(q, NULL, uaddr, len, reading); 2157 else 2158 bio = bio_copy_user(q, uaddr, len, reading); 2159 2160 if (!IS_ERR(bio)) { 2161 rq->bio = rq->biotail = bio; 2162 blk_rq_bio_prep(q, rq, bio); 2163 2164 rq->buffer = rq->data = NULL; 2165 rq->data_len = len; 2166 return 0; 2167 } 2168 2169 /* 2170 * bio is the err-ptr 2171 */ 2172 return PTR_ERR(bio); 2173} 2174 2175EXPORT_SYMBOL(blk_rq_map_user); 2176 2177/** 2178 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 2179 * @q: request queue where request should be inserted 2180 * @rq: request to map data to 2181 * @iov: pointer to the iovec 2182 * @iov_count: number of elements in the iovec 2183 * 2184 * Description: 2185 * Data will be mapped directly for zero copy io, if possible. Otherwise 2186 * a kernel bounce buffer is used. 2187 * 2188 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2189 * still in process context. 2190 * 2191 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2192 * before being submitted to the device, as pages mapped may be out of 2193 * reach. It's the callers responsibility to make sure this happens. The 2194 * original bio must be passed back in to blk_rq_unmap_user() for proper 2195 * unmapping. 2196 */ 2197int blk_rq_map_user_iov(request_queue_t *q, struct request *rq, 2198 struct sg_iovec *iov, int iov_count) 2199{ 2200 struct bio *bio; 2201 2202 if (!iov || iov_count <= 0) 2203 return -EINVAL; 2204 2205 /* we don't allow misaligned data like bio_map_user() does. If the 2206 * user is using sg, they're expected to know the alignment constraints 2207 * and respect them accordingly */ 2208 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); 2209 if (IS_ERR(bio)) 2210 return PTR_ERR(bio); 2211 2212 rq->bio = rq->biotail = bio; 2213 blk_rq_bio_prep(q, rq, bio); 2214 rq->buffer = rq->data = NULL; 2215 rq->data_len = bio->bi_size; 2216 return 0; 2217} 2218 2219EXPORT_SYMBOL(blk_rq_map_user_iov); 2220 2221/** 2222 * blk_rq_unmap_user - unmap a request with user data 2223 * @bio: bio to be unmapped 2224 * @ulen: length of user buffer 2225 * 2226 * Description: 2227 * Unmap a bio previously mapped by blk_rq_map_user(). 
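 *
 * Example (editorial sketch; q, rq, disk, ubuf and len are the caller's own
 * variables): map the user buffer, execute the request, then unmap the
 * original bio.
 *
 *	if (blk_rq_map_user(q, rq, ubuf, len))
 *		goto fail;
 *	bio = rq->bio;
 *	blk_execute_rq(q, disk, rq, 0);
 *	blk_rq_unmap_user(bio, len);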
2228 */ 2229int blk_rq_unmap_user(struct bio *bio, unsigned int ulen) 2230{ 2231 int ret = 0; 2232 2233 if (bio) { 2234 if (bio_flagged(bio, BIO_USER_MAPPED)) 2235 bio_unmap_user(bio); 2236 else 2237 ret = bio_uncopy_user(bio); 2238 } 2239 2240 return 0; 2241} 2242 2243EXPORT_SYMBOL(blk_rq_unmap_user); 2244 2245/** 2246 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 2247 * @q: request queue where request should be inserted 2248 * @rq: request to fill 2249 * @kbuf: the kernel buffer 2250 * @len: length of user data 2251 * @gfp_mask: memory allocation flags 2252 */ 2253int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf, 2254 unsigned int len, unsigned int gfp_mask) 2255{ 2256 struct bio *bio; 2257 2258 if (len > (q->max_sectors << 9)) 2259 return -EINVAL; 2260 if (!len || !kbuf) 2261 return -EINVAL; 2262 2263 bio = bio_map_kern(q, kbuf, len, gfp_mask); 2264 if (IS_ERR(bio)) 2265 return PTR_ERR(bio); 2266 2267 if (rq_data_dir(rq) == WRITE) 2268 bio->bi_rw |= (1 << BIO_RW); 2269 2270 rq->bio = rq->biotail = bio; 2271 blk_rq_bio_prep(q, rq, bio); 2272 2273 rq->buffer = rq->data = NULL; 2274 rq->data_len = len; 2275 return 0; 2276} 2277 2278EXPORT_SYMBOL(blk_rq_map_kern); 2279 2280/** 2281 * blk_execute_rq_nowait - insert a request into queue for execution 2282 * @q: queue to insert the request in 2283 * @bd_disk: matching gendisk 2284 * @rq: request to insert 2285 * @at_head: insert request at head or tail of queue 2286 * @done: I/O completion handler 2287 * 2288 * Description: 2289 * Insert a fully prepared request at the back of the io scheduler queue 2290 * for execution. Don't wait for completion. 2291 */ 2292void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk, 2293 struct request *rq, int at_head, 2294 void (*done)(struct request *)) 2295{ 2296 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2297 2298 rq->rq_disk = bd_disk; 2299 rq->flags |= REQ_NOMERGE; 2300 rq->end_io = done; 2301 elv_add_request(q, rq, where, 1); 2302 generic_unplug_device(q); 2303} 2304 2305/** 2306 * blk_execute_rq - insert a request into queue for execution 2307 * @q: queue to insert the request in 2308 * @bd_disk: matching gendisk 2309 * @rq: request to insert 2310 * @at_head: insert request at head or tail of queue 2311 * 2312 * Description: 2313 * Insert a fully prepared request at the back of the io scheduler queue 2314 * for execution and wait for completion. 2315 */ 2316int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk, 2317 struct request *rq, int at_head) 2318{ 2319 DECLARE_COMPLETION(wait); 2320 char sense[SCSI_SENSE_BUFFERSIZE]; 2321 int err = 0; 2322 2323 /* 2324 * we need an extra reference to the request, so we can look at 2325 * it after io completion 2326 */ 2327 rq->ref_count++; 2328 2329 if (!rq->sense) { 2330 memset(sense, 0, sizeof(sense)); 2331 rq->sense = sense; 2332 rq->sense_len = 0; 2333 } 2334 2335 rq->waiting = &wait; 2336 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 2337 wait_for_completion(&wait); 2338 rq->waiting = NULL; 2339 2340 if (rq->errors) 2341 err = -EIO; 2342 2343 return err; 2344} 2345 2346EXPORT_SYMBOL(blk_execute_rq); 2347 2348/** 2349 * blkdev_issue_flush - queue a flush 2350 * @bdev: blockdev to issue flush for 2351 * @error_sector: error sector 2352 * 2353 * Description: 2354 * Issue a flush for the block device in question. Caller can supply 2355 * room for storing the error offset in case of a flush error, if they 2356 * wish to. 
Caller must run wait_for_completion() on its own. 2357 */ 2358int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 2359{ 2360 request_queue_t *q; 2361 2362 if (bdev->bd_disk == NULL) 2363 return -ENXIO; 2364 2365 q = bdev_get_queue(bdev); 2366 if (!q) 2367 return -ENXIO; 2368 if (!q->issue_flush_fn) 2369 return -EOPNOTSUPP; 2370 2371 return q->issue_flush_fn(q, bdev->bd_disk, error_sector); 2372} 2373 2374EXPORT_SYMBOL(blkdev_issue_flush); 2375 2376/** 2377 * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices 2378 * @q: device queue 2379 * @disk: gendisk 2380 * @error_sector: error offset 2381 * 2382 * Description: 2383 * Devices understanding the SCSI command set, can use this function as 2384 * a helper for issuing a cache flush. Note: driver is required to store 2385 * the error offset (in case of error flushing) in ->sector of struct 2386 * request. 2387 */ 2388int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk, 2389 sector_t *error_sector) 2390{ 2391 struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT); 2392 int ret; 2393 2394 rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER; 2395 rq->sector = 0; 2396 memset(rq->cmd, 0, sizeof(rq->cmd)); 2397 rq->cmd[0] = 0x35; 2398 rq->cmd_len = 12; 2399 rq->data = NULL; 2400 rq->data_len = 0; 2401 rq->timeout = 60 * HZ; 2402 2403 ret = blk_execute_rq(q, disk, rq, 0); 2404 2405 if (ret && error_sector) 2406 *error_sector = rq->sector; 2407 2408 blk_put_request(rq); 2409 return ret; 2410} 2411 2412EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn); 2413 2414static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) 2415{ 2416 int rw = rq_data_dir(rq); 2417 2418 if (!blk_fs_request(rq) || !rq->rq_disk) 2419 return; 2420 2421 if (rw == READ) { 2422 __disk_stat_add(rq->rq_disk, read_sectors, nr_sectors); 2423 if (!new_io) 2424 __disk_stat_inc(rq->rq_disk, read_merges); 2425 } else if (rw == WRITE) { 2426 __disk_stat_add(rq->rq_disk, write_sectors, nr_sectors); 2427 if (!new_io) 2428 __disk_stat_inc(rq->rq_disk, write_merges); 2429 } 2430 if (new_io) { 2431 disk_round_stats(rq->rq_disk); 2432 rq->rq_disk->in_flight++; 2433 } 2434} 2435 2436/* 2437 * add-request adds a request to the linked list. 2438 * queue lock is held and interrupts disabled, as we muck with the 2439 * request queue list. 2440 */ 2441static inline void add_request(request_queue_t * q, struct request * req) 2442{ 2443 drive_stat_acct(req, req->nr_sectors, 1); 2444 2445 if (q->activity_fn) 2446 q->activity_fn(q->activity_data, rq_data_dir(req)); 2447 2448 /* 2449 * elevator indicated where it wants this request to be 2450 * inserted at elevator_merge time 2451 */ 2452 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 2453} 2454 2455/* 2456 * disk_round_stats() - Round off the performance stats on a struct 2457 * disk_stats. 2458 * 2459 * The average IO queue length and utilisation statistics are maintained 2460 * by observing the current state of the queue length and the amount of 2461 * time it has been in this state for. 2462 * 2463 * Normally, that accounting is done on IO completion, but that can result 2464 * in more than a second's worth of IO being accounted for within any one 2465 * second, leading to >100% utilisation. To deal with that, we call this 2466 * function to do a round-off before returning the results when reading 2467 * /proc/diskstats. This accounts immediately for all queue usage up to 2468 * the current jiffies and restarts the counters again. 
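 *
 * Worked example (editorial): if 3 requests have been in flight since the
 * last call 10 jiffies ago, this call adds 3 * 10 = 30 to time_in_queue and
 * 10 to io_ticks, then restarts both intervals at the current jiffy.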
2469 */ 2470void disk_round_stats(struct gendisk *disk) 2471{ 2472 unsigned long now = jiffies; 2473 2474 __disk_stat_add(disk, time_in_queue, 2475 disk->in_flight * (now - disk->stamp)); 2476 disk->stamp = now; 2477 2478 if (disk->in_flight) 2479 __disk_stat_add(disk, io_ticks, (now - disk->stamp_idle)); 2480 disk->stamp_idle = now; 2481} 2482 2483/* 2484 * queue lock must be held 2485 */ 2486static void __blk_put_request(request_queue_t *q, struct request *req) 2487{ 2488 struct request_list *rl = req->rl; 2489 2490 if (unlikely(!q)) 2491 return; 2492 if (unlikely(--req->ref_count)) 2493 return; 2494 2495 req->rq_status = RQ_INACTIVE; 2496 req->rl = NULL; 2497 2498 /* 2499 * Request may not have originated from ll_rw_blk. if not, 2500 * it didn't come out of our reserved rq pools 2501 */ 2502 if (rl) { 2503 int rw = rq_data_dir(req); 2504 2505 elv_completed_request(q, req); 2506 2507 BUG_ON(!list_empty(&req->queuelist)); 2508 2509 blk_free_request(q, req); 2510 freed_request(q, rw); 2511 } 2512} 2513 2514void blk_put_request(struct request *req) 2515{ 2516 /* 2517 * if req->rl isn't set, this request didnt originate from the 2518 * block layer, so it's safe to just disregard it 2519 */ 2520 if (req->rl) { 2521 unsigned long flags; 2522 request_queue_t *q = req->q; 2523 2524 spin_lock_irqsave(q->queue_lock, flags); 2525 __blk_put_request(q, req); 2526 spin_unlock_irqrestore(q->queue_lock, flags); 2527 } 2528} 2529 2530EXPORT_SYMBOL(blk_put_request); 2531 2532/** 2533 * blk_end_sync_rq - executes a completion event on a request 2534 * @rq: request to complete 2535 */ 2536void blk_end_sync_rq(struct request *rq) 2537{ 2538 struct completion *waiting = rq->waiting; 2539 2540 rq->waiting = NULL; 2541 __blk_put_request(rq->q, rq); 2542 2543 /* 2544 * complete last, if this is a stack request the process (and thus 2545 * the rq pointer) could be invalid right after this complete() 2546 */ 2547 complete(waiting); 2548} 2549EXPORT_SYMBOL(blk_end_sync_rq); 2550 2551/** 2552 * blk_congestion_wait - wait for a queue to become uncongested 2553 * @rw: READ or WRITE 2554 * @timeout: timeout in jiffies 2555 * 2556 * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion. 2557 * If no queues are congested then just wait for the next request to be 2558 * returned. 2559 */ 2560long blk_congestion_wait(int rw, long timeout) 2561{ 2562 long ret; 2563 DEFINE_WAIT(wait); 2564 wait_queue_head_t *wqh = &congestion_wqh[rw]; 2565 2566 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 2567 ret = io_schedule_timeout(timeout); 2568 finish_wait(wqh, &wait); 2569 return ret; 2570} 2571 2572EXPORT_SYMBOL(blk_congestion_wait); 2573 2574/* 2575 * Has to be called with the request spinlock acquired 2576 */ 2577static int attempt_merge(request_queue_t *q, struct request *req, 2578 struct request *next) 2579{ 2580 if (!rq_mergeable(req) || !rq_mergeable(next)) 2581 return 0; 2582 2583 /* 2584 * not contigious 2585 */ 2586 if (req->sector + req->nr_sectors != next->sector) 2587 return 0; 2588 2589 if (rq_data_dir(req) != rq_data_dir(next) 2590 || req->rq_disk != next->rq_disk 2591 || next->waiting || next->special) 2592 return 0; 2593 2594 /* 2595 * If we are allowed to merge, then append bio list 2596 * from next to rq and release next. merge_requests_fn 2597 * will have updated segment counts, update sector 2598 * counts here. 2599 */ 2600 if (!q->merge_requests_fn(q, req, next)) 2601 return 0; 2602 2603 /* 2604 * At this point we have either done a back merge 2605 * or front merge. 
We need the smaller start_time of 2606 * the merged requests to be the current request 2607 * for accounting purposes. 2608 */ 2609 if (time_after(req->start_time, next->start_time)) 2610 req->start_time = next->start_time; 2611 2612 req->biotail->bi_next = next->bio; 2613 req->biotail = next->biotail; 2614 2615 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; 2616 2617 elv_merge_requests(q, req, next); 2618 2619 if (req->rq_disk) { 2620 disk_round_stats(req->rq_disk); 2621 req->rq_disk->in_flight--; 2622 } 2623 2624 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 2625 2626 __blk_put_request(q, next); 2627 return 1; 2628} 2629 2630static inline int attempt_back_merge(request_queue_t *q, struct request *rq) 2631{ 2632 struct request *next = elv_latter_request(q, rq); 2633 2634 if (next) 2635 return attempt_merge(q, rq, next); 2636 2637 return 0; 2638} 2639 2640static inline int attempt_front_merge(request_queue_t *q, struct request *rq) 2641{ 2642 struct request *prev = elv_former_request(q, rq); 2643 2644 if (prev) 2645 return attempt_merge(q, prev, rq); 2646 2647 return 0; 2648} 2649 2650/** 2651 * blk_attempt_remerge - attempt to remerge active head with next request 2652 * @q: The &request_queue_t belonging to the device 2653 * @rq: The head request (usually) 2654 * 2655 * Description: 2656 * For head-active devices, the queue can easily be unplugged so quickly 2657 * that proper merging is not done on the front request. This may hurt 2658 * performance greatly for some devices. The block layer cannot safely 2659 * do merging on that first request for these queues, but the driver can 2660 * call this function and make it happen any way. Only the driver knows 2661 * when it is safe to do so. 2662 **/ 2663void blk_attempt_remerge(request_queue_t *q, struct request *rq) 2664{ 2665 unsigned long flags; 2666 2667 spin_lock_irqsave(q->queue_lock, flags); 2668 attempt_back_merge(q, rq); 2669 spin_unlock_irqrestore(q->queue_lock, flags); 2670} 2671 2672EXPORT_SYMBOL(blk_attempt_remerge); 2673 2674static int __make_request(request_queue_t *q, struct bio *bio) 2675{ 2676 struct request *req; 2677 int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; 2678 unsigned short prio; 2679 sector_t sector; 2680 2681 sector = bio->bi_sector; 2682 nr_sectors = bio_sectors(bio); 2683 cur_nr_sectors = bio_cur_sectors(bio); 2684 prio = bio_prio(bio); 2685 2686 rw = bio_data_dir(bio); 2687 sync = bio_sync(bio); 2688 2689 /* 2690 * low level driver can indicate that it wants pages above a 2691 * certain limit bounced to low memory (ie for highmem, or even 2692 * ISA dma in theory) 2693 */ 2694 blk_queue_bounce(q, &bio); 2695 2696 spin_lock_prefetch(q->queue_lock); 2697 2698 barrier = bio_barrier(bio); 2699 if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) { 2700 err = -EOPNOTSUPP; 2701 goto end_io; 2702 } 2703 2704 spin_lock_irq(q->queue_lock); 2705 2706 if (unlikely(barrier) || elv_queue_empty(q)) 2707 goto get_rq; 2708 2709 el_ret = elv_merge(q, &req, bio); 2710 switch (el_ret) { 2711 case ELEVATOR_BACK_MERGE: 2712 BUG_ON(!rq_mergeable(req)); 2713 2714 if (!q->back_merge_fn(q, req, bio)) 2715 break; 2716 2717 req->biotail->bi_next = bio; 2718 req->biotail = bio; 2719 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2720 req->ioprio = ioprio_best(req->ioprio, prio); 2721 drive_stat_acct(req, nr_sectors, 0); 2722 if (!attempt_back_merge(q, req)) 2723 elv_merged_request(q, req); 2724 goto out; 2725 2726 case ELEVATOR_FRONT_MERGE: 2727 BUG_ON(!rq_mergeable(req)); 2728 
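			/*
			 * Front merge: the new bio ends where this request
			 * begins, so hook it in as the new head and pull the
			 * request's start sector and buffer pointer back to it.
			 */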
2729 if (!q->front_merge_fn(q, req, bio)) 2730 break; 2731 2732 bio->bi_next = req->bio; 2733 req->bio = bio; 2734 2735 /* 2736 * may not be valid. if the low level driver said 2737 * it didn't need a bounce buffer then it better 2738 * not touch req->buffer either... 2739 */ 2740 req->buffer = bio_data(bio); 2741 req->current_nr_sectors = cur_nr_sectors; 2742 req->hard_cur_sectors = cur_nr_sectors; 2743 req->sector = req->hard_sector = sector; 2744 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2745 req->ioprio = ioprio_best(req->ioprio, prio); 2746 drive_stat_acct(req, nr_sectors, 0); 2747 if (!attempt_front_merge(q, req)) 2748 elv_merged_request(q, req); 2749 goto out; 2750 2751 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 2752 default: 2753 ; 2754 } 2755 2756get_rq: 2757 /* 2758 * Grab a free request. This is might sleep but can not fail. 2759 * Returns with the queue unlocked. 2760 */ 2761 req = get_request_wait(q, rw, bio); 2762 2763 /* 2764 * After dropping the lock and possibly sleeping here, our request 2765 * may now be mergeable after it had proven unmergeable (above). 2766 * We don't worry about that case for efficiency. It won't happen 2767 * often, and the elevators are able to handle it. 2768 */ 2769 2770 req->flags |= REQ_CMD; 2771 2772 /* 2773 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 2774 */ 2775 if (bio_rw_ahead(bio) || bio_failfast(bio)) 2776 req->flags |= REQ_FAILFAST; 2777 2778 /* 2779 * REQ_BARRIER implies no merging, but lets make it explicit 2780 */ 2781 if (unlikely(barrier)) 2782 req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 2783 2784 req->errors = 0; 2785 req->hard_sector = req->sector = sector; 2786 req->hard_nr_sectors = req->nr_sectors = nr_sectors; 2787 req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors; 2788 req->nr_phys_segments = bio_phys_segments(q, bio); 2789 req->nr_hw_segments = bio_hw_segments(q, bio); 2790 req->buffer = bio_data(bio); /* see ->buffer comment above */ 2791 req->waiting = NULL; 2792 req->bio = req->biotail = bio; 2793 req->ioprio = prio; 2794 req->rq_disk = bio->bi_bdev->bd_disk; 2795 req->start_time = jiffies; 2796 2797 spin_lock_irq(q->queue_lock); 2798 if (elv_queue_empty(q)) 2799 blk_plug_device(q); 2800 add_request(q, req); 2801out: 2802 if (sync) 2803 __generic_unplug_device(q); 2804 2805 spin_unlock_irq(q->queue_lock); 2806 return 0; 2807 2808end_io: 2809 bio_endio(bio, nr_sectors << 9, err); 2810 return 0; 2811} 2812 2813/* 2814 * If bio->bi_dev is a partition, remap the location 2815 */ 2816static inline void blk_partition_remap(struct bio *bio) 2817{ 2818 struct block_device *bdev = bio->bi_bdev; 2819 2820 if (bdev != bdev->bd_contains) { 2821 struct hd_struct *p = bdev->bd_part; 2822 2823 switch (bio_data_dir(bio)) { 2824 case READ: 2825 p->read_sectors += bio_sectors(bio); 2826 p->reads++; 2827 break; 2828 case WRITE: 2829 p->write_sectors += bio_sectors(bio); 2830 p->writes++; 2831 break; 2832 } 2833 bio->bi_sector += p->start_sect; 2834 bio->bi_bdev = bdev->bd_contains; 2835 } 2836} 2837 2838void blk_finish_queue_drain(request_queue_t *q) 2839{ 2840 struct request_list *rl = &q->rq; 2841 struct request *rq; 2842 int requeued = 0; 2843 2844 spin_lock_irq(q->queue_lock); 2845 clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); 2846 2847 while (!list_empty(&q->drain_list)) { 2848 rq = list_entry_rq(q->drain_list.next); 2849 2850 list_del_init(&rq->queuelist); 2851 elv_requeue_request(q, rq); 2852 requeued++; 2853 } 2854 2855 if (requeued) 2856 q->request_fn(q); 2857 2858 
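	/*
	 * Drain mode is off and all deferred requests have been requeued;
	 * wake up anyone blocked on request allocation or on the drain.
	 */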
spin_unlock_irq(q->queue_lock); 2859 2860 wake_up(&rl->wait[0]); 2861 wake_up(&rl->wait[1]); 2862 wake_up(&rl->drain); 2863} 2864 2865static int wait_drain(request_queue_t *q, struct request_list *rl, int dispatch) 2866{ 2867 int wait = rl->count[READ] + rl->count[WRITE]; 2868 2869 if (dispatch) 2870 wait += !list_empty(&q->queue_head); 2871 2872 return wait; 2873} 2874 2875/* 2876 * We rely on the fact that only requests allocated through blk_alloc_request() 2877 * have io scheduler private data structures associated with them. Any other 2878 * type of request (allocated on stack or through kmalloc()) should not go 2879 * to the io scheduler core, but be attached to the queue head instead. 2880 */ 2881void blk_wait_queue_drained(request_queue_t *q, int wait_dispatch) 2882{ 2883 struct request_list *rl = &q->rq; 2884 DEFINE_WAIT(wait); 2885 2886 spin_lock_irq(q->queue_lock); 2887 set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); 2888 2889 while (wait_drain(q, rl, wait_dispatch)) { 2890 prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE); 2891 2892 if (wait_drain(q, rl, wait_dispatch)) { 2893 __generic_unplug_device(q); 2894 spin_unlock_irq(q->queue_lock); 2895 io_schedule(); 2896 spin_lock_irq(q->queue_lock); 2897 } 2898 2899 finish_wait(&rl->drain, &wait); 2900 } 2901 2902 spin_unlock_irq(q->queue_lock); 2903} 2904 2905/* 2906 * block waiting for the io scheduler being started again. 2907 */ 2908static inline void block_wait_queue_running(request_queue_t *q) 2909{ 2910 DEFINE_WAIT(wait); 2911 2912 while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) { 2913 struct request_list *rl = &q->rq; 2914 2915 prepare_to_wait_exclusive(&rl->drain, &wait, 2916 TASK_UNINTERRUPTIBLE); 2917 2918 /* 2919 * re-check the condition. avoids using prepare_to_wait() 2920 * in the fast path (queue is running) 2921 */ 2922 if (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) 2923 io_schedule(); 2924 2925 finish_wait(&rl->drain, &wait); 2926 } 2927} 2928 2929static void handle_bad_sector(struct bio *bio) 2930{ 2931 char b[BDEVNAME_SIZE]; 2932 2933 printk(KERN_INFO "attempt to access beyond end of device\n"); 2934 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 2935 bdevname(bio->bi_bdev, b), 2936 bio->bi_rw, 2937 (unsigned long long)bio->bi_sector + bio_sectors(bio), 2938 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 2939 2940 set_bit(BIO_EOF, &bio->bi_flags); 2941} 2942 2943/** 2944 * generic_make_request: hand a buffer to its device driver for I/O 2945 * @bio: The bio describing the location in memory and on the device. 2946 * 2947 * generic_make_request() is used to make I/O requests of block 2948 * devices. It is passed a &struct bio, which describes the I/O that needs 2949 * to be done. 2950 * 2951 * generic_make_request() does not return any status. The 2952 * success/failure status of the request, along with notification of 2953 * completion, is delivered asynchronously through the bio->bi_end_io 2954 * function described (one day) else where. 2955 * 2956 * The caller of generic_make_request must make sure that bi_io_vec 2957 * are set to describe the memory buffer, and that bi_dev and bi_sector are 2958 * set to describe the device address, and the 2959 * bi_end_io and optionally bi_private are set to describe how 2960 * completion notification should be signaled. 2961 * 2962 * generic_make_request and the drivers it calls may use bi_next if this 2963 * bio happens to be merged with someone else, and may change bi_dev and 2964 * bi_sector for remaps as it sees fit. 
So the values of these fields 2965 * should NOT be depended on after the call to generic_make_request. 2966 */ 2967void generic_make_request(struct bio *bio) 2968{ 2969 request_queue_t *q; 2970 sector_t maxsector; 2971 int ret, nr_sectors = bio_sectors(bio); 2972 2973 might_sleep(); 2974 /* Test device or partition size, when known. */ 2975 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 2976 if (maxsector) { 2977 sector_t sector = bio->bi_sector; 2978 2979 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 2980 /* 2981 * This may well happen - the kernel calls bread() 2982 * without checking the size of the device, e.g., when 2983 * mounting a device. 2984 */ 2985 handle_bad_sector(bio); 2986 goto end_io; 2987 } 2988 } 2989 2990 /* 2991 * Resolve the mapping until finished. (drivers are 2992 * still free to implement/resolve their own stacking 2993 * by explicitly returning 0) 2994 * 2995 * NOTE: we don't repeat the blk_size check for each new device. 2996 * Stacking drivers are expected to know what they are doing. 2997 */ 2998 do { 2999 char b[BDEVNAME_SIZE]; 3000 3001 q = bdev_get_queue(bio->bi_bdev); 3002 if (!q) { 3003 printk(KERN_ERR 3004 "generic_make_request: Trying to access " 3005 "nonexistent block-device %s (%Lu)\n", 3006 bdevname(bio->bi_bdev, b), 3007 (long long) bio->bi_sector); 3008end_io: 3009 bio_endio(bio, bio->bi_size, -EIO); 3010 break; 3011 } 3012 3013 if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) { 3014 printk("bio too big device %s (%u > %u)\n", 3015 bdevname(bio->bi_bdev, b), 3016 bio_sectors(bio), 3017 q->max_hw_sectors); 3018 goto end_io; 3019 } 3020 3021 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 3022 goto end_io; 3023 3024 block_wait_queue_running(q); 3025 3026 /* 3027 * If this device has partitions, remap block n 3028 * of partition p to block n+start(p) of the disk. 3029 */ 3030 blk_partition_remap(bio); 3031 3032 ret = q->make_request_fn(q, bio); 3033 } while (ret); 3034} 3035 3036EXPORT_SYMBOL(generic_make_request); 3037 3038/** 3039 * submit_bio: submit a bio to the block device layer for I/O 3040 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 3041 * @bio: The &struct bio which describes the I/O 3042 * 3043 * submit_bio() is very similar in purpose to generic_make_request(), and 3044 * uses that function to do most of the work. Both are fairly rough 3045 * interfaces, @bio must be presetup and ready for I/O. 3046 * 3047 */ 3048void submit_bio(int rw, struct bio *bio) 3049{ 3050 int count = bio_sectors(bio); 3051 3052 BIO_BUG_ON(!bio->bi_size); 3053 BIO_BUG_ON(!bio->bi_io_vec); 3054 bio->bi_rw |= rw; 3055 if (rw & WRITE) 3056 mod_page_state(pgpgout, count); 3057 else 3058 mod_page_state(pgpgin, count); 3059 3060 if (unlikely(block_dump)) { 3061 char b[BDEVNAME_SIZE]; 3062 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 3063 current->comm, current->pid, 3064 (rw & WRITE) ? "WRITE" : "READ", 3065 (unsigned long long)bio->bi_sector, 3066 bdevname(bio->bi_bdev,b)); 3067 } 3068 3069 generic_make_request(bio); 3070} 3071 3072EXPORT_SYMBOL(submit_bio); 3073 3074static void blk_recalc_rq_segments(struct request *rq) 3075{ 3076 struct bio *bio, *prevbio = NULL; 3077 int nr_phys_segs, nr_hw_segs; 3078 unsigned int phys_size, hw_size; 3079 request_queue_t *q = rq->q; 3080 3081 if (!rq->bio) 3082 return; 3083 3084 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 3085 rq_for_each_bio(bio, rq) { 3086 /* Force bio hw/phys segs to be recalculated. 
*/ 3087 bio->bi_flags &= ~(1 << BIO_SEG_VALID); 3088 3089 nr_phys_segs += bio_phys_segments(q, bio); 3090 nr_hw_segs += bio_hw_segments(q, bio); 3091 if (prevbio) { 3092 int pseg = phys_size + prevbio->bi_size + bio->bi_size; 3093 int hseg = hw_size + prevbio->bi_size + bio->bi_size; 3094 3095 if (blk_phys_contig_segment(q, prevbio, bio) && 3096 pseg <= q->max_segment_size) { 3097 nr_phys_segs--; 3098 phys_size += prevbio->bi_size + bio->bi_size; 3099 } else 3100 phys_size = 0; 3101 3102 if (blk_hw_contig_segment(q, prevbio, bio) && 3103 hseg <= q->max_segment_size) { 3104 nr_hw_segs--; 3105 hw_size += prevbio->bi_size + bio->bi_size; 3106 } else 3107 hw_size = 0; 3108 } 3109 prevbio = bio; 3110 } 3111 3112 rq->nr_phys_segments = nr_phys_segs; 3113 rq->nr_hw_segments = nr_hw_segs; 3114} 3115 3116static void blk_recalc_rq_sectors(struct request *rq, int nsect) 3117{ 3118 if (blk_fs_request(rq)) { 3119 rq->hard_sector += nsect; 3120 rq->hard_nr_sectors -= nsect; 3121 3122 /* 3123 * Move the I/O submission pointers ahead if required. 3124 */ 3125 if ((rq->nr_sectors >= rq->hard_nr_sectors) && 3126 (rq->sector <= rq->hard_sector)) { 3127 rq->sector = rq->hard_sector; 3128 rq->nr_sectors = rq->hard_nr_sectors; 3129 rq->hard_cur_sectors = bio_cur_sectors(rq->bio); 3130 rq->current_nr_sectors = rq->hard_cur_sectors; 3131 rq->buffer = bio_data(rq->bio); 3132 } 3133 3134 /* 3135 * if total number of sectors is less than the first segment 3136 * size, something has gone terribly wrong 3137 */ 3138 if (rq->nr_sectors < rq->current_nr_sectors) { 3139 printk("blk: request botched\n"); 3140 rq->nr_sectors = rq->current_nr_sectors; 3141 } 3142 } 3143} 3144 3145static int __end_that_request_first(struct request *req, int uptodate, 3146 int nr_bytes) 3147{ 3148 int total_bytes, bio_nbytes, error, next_idx = 0; 3149 struct bio *bio; 3150 3151 /* 3152 * extend uptodate bool to allow < 0 value to be direct io error 3153 */ 3154 error = 0; 3155 if (end_io_error(uptodate)) 3156 error = !uptodate ? -EIO : uptodate; 3157 3158 /* 3159 * for a REQ_BLOCK_PC request, we want to carry any eventual 3160 * sense key with us all the way through 3161 */ 3162 if (!blk_pc_request(req)) 3163 req->errors = 0; 3164 3165 if (!uptodate) { 3166 if (blk_fs_request(req) && !(req->flags & REQ_QUIET)) 3167 printk("end_request: I/O error, dev %s, sector %llu\n", 3168 req->rq_disk ? 
req->rq_disk->disk_name : "?", 3169 (unsigned long long)req->sector); 3170 } 3171 3172 total_bytes = bio_nbytes = 0; 3173 while ((bio = req->bio) != NULL) { 3174 int nbytes; 3175 3176 if (nr_bytes >= bio->bi_size) { 3177 req->bio = bio->bi_next; 3178 nbytes = bio->bi_size; 3179 bio_endio(bio, nbytes, error); 3180 next_idx = 0; 3181 bio_nbytes = 0; 3182 } else { 3183 int idx = bio->bi_idx + next_idx; 3184 3185 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 3186 blk_dump_rq_flags(req, "__end_that"); 3187 printk("%s: bio idx %d >= vcnt %d\n", 3188 __FUNCTION__, 3189 bio->bi_idx, bio->bi_vcnt); 3190 break; 3191 } 3192 3193 nbytes = bio_iovec_idx(bio, idx)->bv_len; 3194 BIO_BUG_ON(nbytes > bio->bi_size); 3195 3196 /* 3197 * not a complete bvec done 3198 */ 3199 if (unlikely(nbytes > nr_bytes)) { 3200 bio_nbytes += nr_bytes; 3201 total_bytes += nr_bytes; 3202 break; 3203 } 3204 3205 /* 3206 * advance to the next vector 3207 */ 3208 next_idx++; 3209 bio_nbytes += nbytes; 3210 } 3211 3212 total_bytes += nbytes; 3213 nr_bytes -= nbytes; 3214 3215 if ((bio = req->bio)) { 3216 /* 3217 * end more in this run, or just return 'not-done' 3218 */ 3219 if (unlikely(nr_bytes <= 0)) 3220 break; 3221 } 3222 } 3223 3224 /* 3225 * completely done 3226 */ 3227 if (!req->bio) 3228 return 0; 3229 3230 /* 3231 * if the request wasn't completed, update state 3232 */ 3233 if (bio_nbytes) { 3234 bio_endio(bio, bio_nbytes, error); 3235 bio->bi_idx += next_idx; 3236 bio_iovec(bio)->bv_offset += nr_bytes; 3237 bio_iovec(bio)->bv_len -= nr_bytes; 3238 } 3239 3240 blk_recalc_rq_sectors(req, total_bytes >> 9); 3241 blk_recalc_rq_segments(req); 3242 return 1; 3243} 3244 3245/** 3246 * end_that_request_first - end I/O on a request 3247 * @req: the request being processed 3248 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3249 * @nr_sectors: number of sectors to end I/O on 3250 * 3251 * Description: 3252 * Ends I/O on a number of sectors attached to @req, and sets it up 3253 * for the next range of segments (if any) in the cluster. 3254 * 3255 * Return: 3256 * 0 - we are done with this request, call end_that_request_last() 3257 * 1 - still buffers pending for this request 3258 **/ 3259int end_that_request_first(struct request *req, int uptodate, int nr_sectors) 3260{ 3261 return __end_that_request_first(req, uptodate, nr_sectors << 9); 3262} 3263 3264EXPORT_SYMBOL(end_that_request_first); 3265 3266/** 3267 * end_that_request_chunk - end I/O on a request 3268 * @req: the request being processed 3269 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3270 * @nr_bytes: number of bytes to complete 3271 * 3272 * Description: 3273 * Ends I/O on a number of bytes attached to @req, and sets it up 3274 * for the next range of segments (if any). Like end_that_request_first(), 3275 * but deals with bytes instead of sectors. 
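 *
 * Example (editorial sketch): a driver that has transferred 'bytes' of the
 * current request would typically do
 *
 *	if (!end_that_request_chunk(rq, 1, bytes)) {
 *		blkdev_dequeue_request(rq);
 *		end_that_request_last(rq);
 *	}
 *
 * calling end_that_request_last() with the queue lock held, as noted below.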
3276 * 3277 * Return: 3278 * 0 - we are done with this request, call end_that_request_last() 3279 * 1 - still buffers pending for this request 3280 **/ 3281int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) 3282{ 3283 return __end_that_request_first(req, uptodate, nr_bytes); 3284} 3285 3286EXPORT_SYMBOL(end_that_request_chunk); 3287 3288/* 3289 * queue lock must be held 3290 */ 3291void end_that_request_last(struct request *req) 3292{ 3293 struct gendisk *disk = req->rq_disk; 3294 3295 if (unlikely(laptop_mode) && blk_fs_request(req)) 3296 laptop_io_completion(); 3297 3298 if (disk && blk_fs_request(req)) { 3299 unsigned long duration = jiffies - req->start_time; 3300 switch (rq_data_dir(req)) { 3301 case WRITE: 3302 __disk_stat_inc(disk, writes); 3303 __disk_stat_add(disk, write_ticks, duration); 3304 break; 3305 case READ: 3306 __disk_stat_inc(disk, reads); 3307 __disk_stat_add(disk, read_ticks, duration); 3308 break; 3309 } 3310 disk_round_stats(disk); 3311 disk->in_flight--; 3312 } 3313 if (req->end_io) 3314 req->end_io(req); 3315 else 3316 __blk_put_request(req->q, req); 3317} 3318 3319EXPORT_SYMBOL(end_that_request_last); 3320 3321void end_request(struct request *req, int uptodate) 3322{ 3323 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { 3324 add_disk_randomness(req->rq_disk); 3325 blkdev_dequeue_request(req); 3326 end_that_request_last(req); 3327 } 3328} 3329 3330EXPORT_SYMBOL(end_request); 3331 3332void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio) 3333{ 3334 /* first three bits are identical in rq->flags and bio->bi_rw */ 3335 rq->flags |= (bio->bi_rw & 7); 3336 3337 rq->nr_phys_segments = bio_phys_segments(q, bio); 3338 rq->nr_hw_segments = bio_hw_segments(q, bio); 3339 rq->current_nr_sectors = bio_cur_sectors(bio); 3340 rq->hard_cur_sectors = rq->current_nr_sectors; 3341 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 3342 rq->buffer = bio_data(bio); 3343 3344 rq->bio = rq->biotail = bio; 3345} 3346 3347EXPORT_SYMBOL(blk_rq_bio_prep); 3348 3349int kblockd_schedule_work(struct work_struct *work) 3350{ 3351 return queue_work(kblockd_workqueue, work); 3352} 3353 3354EXPORT_SYMBOL(kblockd_schedule_work); 3355 3356void kblockd_flush(void) 3357{ 3358 flush_workqueue(kblockd_workqueue); 3359} 3360EXPORT_SYMBOL(kblockd_flush); 3361 3362int __init blk_dev_init(void) 3363{ 3364 kblockd_workqueue = create_workqueue("kblockd"); 3365 if (!kblockd_workqueue) 3366 panic("Failed to create kblockd\n"); 3367 3368 request_cachep = kmem_cache_create("blkdev_requests", 3369 sizeof(struct request), 0, SLAB_PANIC, NULL, NULL); 3370 3371 requestq_cachep = kmem_cache_create("blkdev_queue", 3372 sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL); 3373 3374 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3375 sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL); 3376 3377 blk_max_low_pfn = max_low_pfn; 3378 blk_max_pfn = max_pfn; 3379 3380 return 0; 3381} 3382 3383/* 3384 * IO Context helper functions 3385 */ 3386void put_io_context(struct io_context *ioc) 3387{ 3388 if (ioc == NULL) 3389 return; 3390 3391 BUG_ON(atomic_read(&ioc->refcount) == 0); 3392 3393 if (atomic_dec_and_test(&ioc->refcount)) { 3394 if (ioc->aic && ioc->aic->dtor) 3395 ioc->aic->dtor(ioc->aic); 3396 if (ioc->cic && ioc->cic->dtor) 3397 ioc->cic->dtor(ioc->cic); 3398 3399 kmem_cache_free(iocontext_cachep, ioc); 3400 } 3401} 3402EXPORT_SYMBOL(put_io_context); 3403 3404/* Called by the exitting task */ 3405void exit_io_context(void) 3406{ 3407 
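	/*
	 * Detach the io_context from the exiting task under task_lock()
	 * with interrupts disabled, run the per-elevator exit hooks, then
	 * drop the task's reference.
	 */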
unsigned long flags; 3408 struct io_context *ioc; 3409 3410 local_irq_save(flags); 3411 task_lock(current); 3412 ioc = current->io_context; 3413 current->io_context = NULL; 3414 ioc->task = NULL; 3415 task_unlock(current); 3416 local_irq_restore(flags); 3417 3418 if (ioc->aic && ioc->aic->exit) 3419 ioc->aic->exit(ioc->aic); 3420 if (ioc->cic && ioc->cic->exit) 3421 ioc->cic->exit(ioc->cic); 3422 3423 put_io_context(ioc); 3424} 3425 3426/* 3427 * If the current task has no IO context then create one and initialise it. 3428 * Otherwise, return its existing IO context. 3429 * 3430 * This returned IO context doesn't have a specifically elevated refcount, 3431 * but since the current task itself holds a reference, the context can be 3432 * used in general code, so long as it stays within `current` context. 3433 */ 3434struct io_context *current_io_context(int gfp_flags) 3435{ 3436 struct task_struct *tsk = current; 3437 struct io_context *ret; 3438 3439 ret = tsk->io_context; 3440 if (likely(ret)) 3441 return ret; 3442 3443 ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); 3444 if (ret) { 3445 atomic_set(&ret->refcount, 1); 3446 ret->task = current; 3447 ret->set_ioprio = NULL; 3448 ret->last_waited = jiffies; /* doesn't matter... */ 3449 ret->nr_batch_requests = 0; /* because this is 0 */ 3450 ret->aic = NULL; 3451 ret->cic = NULL; 3452 tsk->io_context = ret; 3453 } 3454 3455 return ret; 3456} 3457EXPORT_SYMBOL(current_io_context); 3458 3459/* 3460 * If the current task has no IO context then create one and initialise it. 3461 * If it does have a context, take a ref on it. 3462 * 3463 * This is always called in the context of the task which submitted the I/O. 3464 */ 3465struct io_context *get_io_context(int gfp_flags) 3466{ 3467 struct io_context *ret; 3468 ret = current_io_context(gfp_flags); 3469 if (likely(ret)) 3470 atomic_inc(&ret->refcount); 3471 return ret; 3472} 3473EXPORT_SYMBOL(get_io_context); 3474 3475void copy_io_context(struct io_context **pdst, struct io_context **psrc) 3476{ 3477 struct io_context *src = *psrc; 3478 struct io_context *dst = *pdst; 3479 3480 if (src) { 3481 BUG_ON(atomic_read(&src->refcount) == 0); 3482 atomic_inc(&src->refcount); 3483 put_io_context(dst); 3484 *pdst = src; 3485 } 3486} 3487EXPORT_SYMBOL(copy_io_context); 3488 3489void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) 3490{ 3491 struct io_context *temp; 3492 temp = *ioc1; 3493 *ioc1 = *ioc2; 3494 *ioc2 = temp; 3495} 3496EXPORT_SYMBOL(swap_io_context); 3497 3498/* 3499 * sysfs parts below 3500 */ 3501struct queue_sysfs_entry { 3502 struct attribute attr; 3503 ssize_t (*show)(struct request_queue *, char *); 3504 ssize_t (*store)(struct request_queue *, const char *, size_t); 3505}; 3506 3507static ssize_t 3508queue_var_show(unsigned int var, char *page) 3509{ 3510 return sprintf(page, "%d\n", var); 3511} 3512 3513static ssize_t 3514queue_var_store(unsigned long *var, const char *page, size_t count) 3515{ 3516 char *p = (char *) page; 3517 3518 *var = simple_strtoul(p, &p, 10); 3519 return count; 3520} 3521 3522static ssize_t queue_requests_show(struct request_queue *q, char *page) 3523{ 3524 return queue_var_show(q->nr_requests, (page)); 3525} 3526 3527static ssize_t 3528queue_requests_store(struct request_queue *q, const char *page, size_t count) 3529{ 3530 struct request_list *rl = &q->rq; 3531 3532 int ret = queue_var_store(&q->nr_requests, page, count); 3533 if (q->nr_requests < BLKDEV_MIN_RQ) 3534 q->nr_requests = BLKDEV_MIN_RQ; 3535 blk_queue_congestion_threshold(q); 
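	/*
	 * nr_requests has changed: re-evaluate congestion and "full" state
	 * against the new limit and wake any waiters that now fit.
	 */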
3536 3537 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 3538 set_queue_congested(q, READ); 3539 else if (rl->count[READ] < queue_congestion_off_threshold(q)) 3540 clear_queue_congested(q, READ); 3541 3542 if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) 3543 set_queue_congested(q, WRITE); 3544 else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) 3545 clear_queue_congested(q, WRITE); 3546 3547 if (rl->count[READ] >= q->nr_requests) { 3548 blk_set_queue_full(q, READ); 3549 } else if (rl->count[READ]+1 <= q->nr_requests) { 3550 blk_clear_queue_full(q, READ); 3551 wake_up(&rl->wait[READ]); 3552 } 3553 3554 if (rl->count[WRITE] >= q->nr_requests) { 3555 blk_set_queue_full(q, WRITE); 3556 } else if (rl->count[WRITE]+1 <= q->nr_requests) { 3557 blk_clear_queue_full(q, WRITE); 3558 wake_up(&rl->wait[WRITE]); 3559 } 3560 return ret; 3561} 3562 3563static ssize_t queue_ra_show(struct request_queue *q, char *page) 3564{ 3565 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3566 3567 return queue_var_show(ra_kb, (page)); 3568} 3569 3570static ssize_t 3571queue_ra_store(struct request_queue *q, const char *page, size_t count) 3572{ 3573 unsigned long ra_kb; 3574 ssize_t ret = queue_var_store(&ra_kb, page, count); 3575 3576 spin_lock_irq(q->queue_lock); 3577 if (ra_kb > (q->max_sectors >> 1)) 3578 ra_kb = (q->max_sectors >> 1); 3579 3580 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 3581 spin_unlock_irq(q->queue_lock); 3582 3583 return ret; 3584} 3585 3586static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) 3587{ 3588 int max_sectors_kb = q->max_sectors >> 1; 3589 3590 return queue_var_show(max_sectors_kb, (page)); 3591} 3592 3593static ssize_t 3594queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 3595{ 3596 unsigned long max_sectors_kb, 3597 max_hw_sectors_kb = q->max_hw_sectors >> 1, 3598 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 3599 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 3600 int ra_kb; 3601 3602 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 3603 return -EINVAL; 3604 /* 3605 * Take the queue lock to update the readahead and max_sectors 3606 * values synchronously: 3607 */ 3608 spin_lock_irq(q->queue_lock); 3609 /* 3610 * Trim readahead window as well, if necessary: 3611 */ 3612 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3613 if (ra_kb > max_sectors_kb) 3614 q->backing_dev_info.ra_pages = 3615 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); 3616 3617 q->max_sectors = max_sectors_kb << 1; 3618 spin_unlock_irq(q->queue_lock); 3619 3620 return ret; 3621} 3622 3623static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) 3624{ 3625 int max_hw_sectors_kb = q->max_hw_sectors >> 1; 3626 3627 return queue_var_show(max_hw_sectors_kb, (page)); 3628} 3629 3630 3631static struct queue_sysfs_entry queue_requests_entry = { 3632 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 3633 .show = queue_requests_show, 3634 .store = queue_requests_store, 3635}; 3636 3637static struct queue_sysfs_entry queue_ra_entry = { 3638 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, 3639 .show = queue_ra_show, 3640 .store = queue_ra_store, 3641}; 3642 3643static struct queue_sysfs_entry queue_max_sectors_entry = { 3644 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, 3645 .show = queue_max_sectors_show, 3646 .store = queue_max_sectors_store, 3647}; 3648 3649static struct queue_sysfs_entry 
queue_max_hw_sectors_entry = { 3650 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, 3651 .show = queue_max_hw_sectors_show, 3652}; 3653 3654static struct queue_sysfs_entry queue_iosched_entry = { 3655 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 3656 .show = elv_iosched_show, 3657 .store = elv_iosched_store, 3658}; 3659 3660static struct attribute *default_attrs[] = { 3661 &queue_requests_entry.attr, 3662 &queue_ra_entry.attr, 3663 &queue_max_hw_sectors_entry.attr, 3664 &queue_max_sectors_entry.attr, 3665 &queue_iosched_entry.attr, 3666 NULL, 3667}; 3668 3669#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 3670 3671static ssize_t 3672queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3673{ 3674 struct queue_sysfs_entry *entry = to_queue(attr); 3675 struct request_queue *q; 3676 3677 q = container_of(kobj, struct request_queue, kobj); 3678 if (!entry->show) 3679 return -EIO; 3680 3681 return entry->show(q, page); 3682} 3683 3684static ssize_t 3685queue_attr_store(struct kobject *kobj, struct attribute *attr, 3686 const char *page, size_t length) 3687{ 3688 struct queue_sysfs_entry *entry = to_queue(attr); 3689 struct request_queue *q; 3690 3691 q = container_of(kobj, struct request_queue, kobj); 3692 if (!entry->store) 3693 return -EIO; 3694 3695 return entry->store(q, page, length); 3696} 3697 3698static struct sysfs_ops queue_sysfs_ops = { 3699 .show = queue_attr_show, 3700 .store = queue_attr_store, 3701}; 3702 3703static struct kobj_type queue_ktype = { 3704 .sysfs_ops = &queue_sysfs_ops, 3705 .default_attrs = default_attrs, 3706}; 3707 3708int blk_register_queue(struct gendisk *disk) 3709{ 3710 int ret; 3711 3712 request_queue_t *q = disk->queue; 3713 3714 if (!q || !q->request_fn) 3715 return -ENXIO; 3716 3717 q->kobj.parent = kobject_get(&disk->kobj); 3718 if (!q->kobj.parent) 3719 return -EBUSY; 3720 3721 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); 3722 q->kobj.ktype = &queue_ktype; 3723 3724 ret = kobject_register(&q->kobj); 3725 if (ret < 0) 3726 return ret; 3727 3728 ret = elv_register_queue(q); 3729 if (ret) { 3730 kobject_unregister(&q->kobj); 3731 return ret; 3732 } 3733 3734 return 0; 3735} 3736 3737void blk_unregister_queue(struct gendisk *disk) 3738{ 3739 request_queue_t *q = disk->queue; 3740 3741 if (q && q->request_fn) { 3742 elv_unregister_queue(q); 3743 3744 kobject_unregister(&q->kobj); 3745 kobject_put(&disk->kobj); 3746 } 3747}
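
/*
 * Editorial appendix (not part of ll_rw_blk.c): a minimal sketch of how a
 * simple driver ties the exports above together. All "my_" identifiers are
 * hypothetical, error handling is omitted, and elv_next_request() comes from
 * the elevator core rather than this file.
 *
 *	static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;
 *	static request_queue_t *my_queue;
 *
 *	static void my_request_fn(request_queue_t *q)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = elv_next_request(q)) != NULL) {
 *			my_transfer(rq->sector, rq->current_nr_sectors,
 *				    rq->buffer);
 *			end_request(rq, 1);
 *		}
 *	}
 *
 *	my_queue = blk_init_queue(my_request_fn, &my_lock);
 *
 * end_request() completes the current chunk; once the whole request is done
 * it dequeues the request and calls end_that_request_last() itself.
 */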