Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v2.6.19-rc2 · 1281 lines · 31 kB
/*
 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 *
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/blktrace_api.h>
#include <scsi/sg.h>            /* for struct sg_iovec */

#define BIO_POOL_SIZE 256

static kmem_cache_t *bio_slab __read_mostly;

#define BIOVEC_NR_POOLS 6

/*
 * a small number of entries is fine, not going to be performance critical.
 * basically we just need to survive
 */
#define BIO_SPLIT_ENTRIES 8
mempool_t *bio_split_pool __read_mostly;

struct biovec_slab {
        int nr_vecs;
        char *name;
        kmem_cache_t *slab;
};

/*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
 * unsigned short
 */

#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
        BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV

/*
 * bio_set is used to allow other portions of the IO system to
 * allocate their own private memory pools for bio and iovec structures.
 * These memory pools in turn all allocate from the bio_slab
 * and the bvec_slabs[].
 */
struct bio_set {
        mempool_t *bio_pool;
        mempool_t *bvec_pools[BIOVEC_NR_POOLS];
};

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
static struct bio_set *fs_bio_set;

static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
        struct bio_vec *bvl;

        /*
         * see comment near bvec_array define!
         */
        switch (nr) {
                case   1        : *idx = 0; break;
                case   2 ...   4: *idx = 1; break;
                case   5 ...  16: *idx = 2; break;
                case  17 ...  64: *idx = 3; break;
                case  65 ... 128: *idx = 4; break;
                case 129 ... BIO_MAX_PAGES: *idx = 5; break;
                default:
                        return NULL;
        }
        /*
         * idx now points to the pool we want to allocate from
         */

        bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
        if (bvl) {
                struct biovec_slab *bp = bvec_slabs + *idx;

                memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
        }

        return bvl;
}

void bio_free(struct bio *bio, struct bio_set *bio_set)
{
        const int pool_idx = BIO_POOL_IDX(bio);

        BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);

        mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
        mempool_free(bio, bio_set->bio_pool);
}

/*
 * default destructor for a bio allocated with bio_alloc_bioset()
 */
static void bio_fs_destructor(struct bio *bio)
{
        bio_free(bio, fs_bio_set);
}

void bio_init(struct bio *bio)
{
        bio->bi_next = NULL;
        bio->bi_bdev = NULL;
        bio->bi_flags = 1 << BIO_UPTODATE;
        bio->bi_rw = 0;
        bio->bi_vcnt = 0;
        bio->bi_idx = 0;
        bio->bi_phys_segments = 0;
        bio->bi_hw_segments = 0;
        bio->bi_hw_front_size = 0;
        bio->bi_hw_back_size = 0;
        bio->bi_size = 0;
        bio->bi_max_vecs = 0;
        bio->bi_end_io = NULL;
        atomic_set(&bio->bi_cnt, 1);
        bio->bi_private = NULL;
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask:   the GFP_ mask given to the slab allocator
 * @nr_iovecs:  number of iovecs to pre-allocate
 * @bs:         the bio_set to allocate from
 *
 * Description:
 *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
 *   If %__GFP_WAIT is set then we will block on the internal pool waiting
 *   for a &struct bio to become free.
 *
 *   allocate bio and iovecs from the memory pools specified by the
 *   bio_set structure.
 **/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
        struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);

        if (likely(bio)) {
                struct bio_vec *bvl = NULL;

                bio_init(bio);
                if (likely(nr_iovecs)) {
                        unsigned long idx = 0; /* shut up gcc */

                        bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
                        if (unlikely(!bvl)) {
                                mempool_free(bio, bs->bio_pool);
                                bio = NULL;
                                goto out;
                        }
                        bio->bi_flags |= idx << BIO_POOL_OFFSET;
                        bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
                }
                bio->bi_io_vec = bvl;
        }
out:
        return bio;
}

struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
{
        struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);

        if (bio)
                bio->bi_destructor = bio_fs_destructor;

        return bio;
}
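/*
 * Example usage (a minimal sketch, not part of the API above; bdev,
 * sector, page and my_end_io are placeholders assumed to be set up by
 * the caller): a typical caller pairs bio_alloc() with bio_put() and
 * fills in the target and completion handler before submitting.
 *
 *      struct bio *bio = bio_alloc(GFP_NOIO, 1);
 *      if (bio) {
 *              bio->bi_bdev = bdev;
 *              bio->bi_sector = sector;
 *              bio->bi_end_io = my_end_io;
 *              bio_add_page(bio, page, PAGE_SIZE, 0);
 *              submit_bio(READ, bio);
 *      }
 *
 * The matching bio_put() normally happens from the end_io path once the
 * last reference is dropped.
 */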
void zero_fill_bio(struct bio *bio)
{
        unsigned long flags;
        struct bio_vec *bv;
        int i;

        bio_for_each_segment(bv, bio, i) {
                char *data = bvec_kmap_irq(bv, &flags);
                memset(data, 0, bv->bv_len);
                flush_dcache_page(bv->bv_page);
                bvec_kunmap_irq(data, &flags);
        }
}
EXPORT_SYMBOL(zero_fill_bio);

/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc or bio_get. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
        BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

        /*
         * last put frees it
         */
        if (atomic_dec_and_test(&bio->bi_cnt)) {
                bio->bi_next = NULL;
                bio->bi_destructor(bio);
        }
}

inline int bio_phys_segments(request_queue_t *q, struct bio *bio)
{
        if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
                blk_recount_segments(q, bio);

        return bio->bi_phys_segments;
}

inline int bio_hw_segments(request_queue_t *q, struct bio *bio)
{
        if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
                blk_recount_segments(q, bio);

        return bio->bi_hw_segments;
}

/**
 *      __bio_clone     -       clone a bio
 *      @bio: destination bio
 *      @bio_src: bio to clone
 *
 *      Clone a &bio. Caller will own the returned bio, but not
 *      the actual data it points to. Reference count of returned
 *      bio will be one.
 */
void __bio_clone(struct bio *bio, struct bio *bio_src)
{
        request_queue_t *q = bdev_get_queue(bio_src->bi_bdev);

        memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
                bio_src->bi_max_vecs * sizeof(struct bio_vec));

        bio->bi_sector = bio_src->bi_sector;
        bio->bi_bdev = bio_src->bi_bdev;
        bio->bi_flags |= 1 << BIO_CLONED;
        bio->bi_rw = bio_src->bi_rw;
        bio->bi_vcnt = bio_src->bi_vcnt;
        bio->bi_size = bio_src->bi_size;
        bio->bi_idx = bio_src->bi_idx;
        bio_phys_segments(q, bio);
        bio_hw_segments(q, bio);
}

/**
 *      bio_clone       -       clone a bio
 *      @bio: bio to clone
 *      @gfp_mask: allocation priority
 *
 *      Like __bio_clone, only also allocates the returned bio
 */
struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
        struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);

        if (b) {
                b->bi_destructor = bio_fs_destructor;
                __bio_clone(b, bio);
        }

        return b;
}

/**
 * bio_get_nr_vecs - return approx number of vecs
 * @bdev:  I/O target
 *
 * Return the approximate number of pages we can send to this target.
 * There's no guarantee that you will be able to fit this number of pages
 * into a bio; it does not account for dynamic restrictions that vary
 * on offset.
 */
int bio_get_nr_vecs(struct block_device *bdev)
{
        request_queue_t *q = bdev_get_queue(bdev);
        int nr_pages;

        nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (nr_pages > q->max_phys_segments)
                nr_pages = q->max_phys_segments;
        if (nr_pages > q->max_hw_segments)
                nr_pages = q->max_hw_segments;

        return nr_pages;
}
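/*
 * Example (a minimal sketch; bdev is assumed to be a valid, opened block
 * device): size a new bio from the queue limits before filling it.
 *
 *      int nr = bio_get_nr_vecs(bdev);
 *      struct bio *bio = bio_alloc(GFP_KERNEL, nr);
 *
 * The value is only an upper-bound hint; bio_add_page() below may still
 * refuse pages earlier because of offset-dependent restrictions.
 */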
static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
                          *page, unsigned int len, unsigned int offset,
                          unsigned short max_sectors)
{
        int retried_segments = 0;
        struct bio_vec *bvec;

        /*
         * cloned bio must not modify vec list
         */
        if (unlikely(bio_flagged(bio, BIO_CLONED)))
                return 0;

        if (((bio->bi_size + len) >> 9) > max_sectors)
                return 0;

        /*
         * For filesystems with a blocksize smaller than the pagesize
         * we will often be called with the same page as last time and
         * a consecutive offset.  Optimize this special case.
         */
        if (bio->bi_vcnt > 0) {
                struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

                if (page == prev->bv_page &&
                    offset == prev->bv_offset + prev->bv_len) {
                        prev->bv_len += len;
                        if (q->merge_bvec_fn &&
                            q->merge_bvec_fn(q, bio, prev) < len) {
                                prev->bv_len -= len;
                                return 0;
                        }

                        goto done;
                }
        }

        if (bio->bi_vcnt >= bio->bi_max_vecs)
                return 0;

        /*
         * we might lose a segment or two here, but rather that than
         * make this too complex.
         */

        while (bio->bi_phys_segments >= q->max_phys_segments
               || bio->bi_hw_segments >= q->max_hw_segments
               || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {

                if (retried_segments)
                        return 0;

                retried_segments = 1;
                blk_recount_segments(q, bio);
        }

        /*
         * setup the new entry, we might clear it again later if we
         * cannot add the page
         */
        bvec = &bio->bi_io_vec[bio->bi_vcnt];
        bvec->bv_page = page;
        bvec->bv_len = len;
        bvec->bv_offset = offset;

        /*
         * if queue has other restrictions (eg varying max sector size
         * depending on offset), it can specify a merge_bvec_fn in the
         * queue to get further control
         */
        if (q->merge_bvec_fn) {
                /*
                 * merge_bvec_fn() returns number of bytes it can accept
                 * at this offset
                 */
                if (q->merge_bvec_fn(q, bio, bvec) < len) {
                        bvec->bv_page = NULL;
                        bvec->bv_len = 0;
                        bvec->bv_offset = 0;
                        return 0;
                }
        }

        /* If we may be able to merge these biovecs, force a recount */
        if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
            BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
                bio->bi_flags &= ~(1 << BIO_SEG_VALID);

        bio->bi_vcnt++;
        bio->bi_phys_segments++;
        bio->bi_hw_segments++;
 done:
        bio->bi_size += len;
        return len;
}

/**
 *      bio_add_pc_page -       attempt to add page to bio
 *      @q: the target queue
 *      @bio: destination bio
 *      @page: page to add
 *      @len: vec entry length
 *      @offset: vec entry offset
 *
 *      Attempt to add a page to the bio_vec maplist. This can fail for a
 *      number of reasons, such as the bio being full or target block
 *      device limitations. The target block device must allow bios
 *      smaller than PAGE_SIZE, so it is always possible to add a single
 *      page to an empty bio. This should only be used by REQ_PC bios.
 */
int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page,
                    unsigned int len, unsigned int offset)
{
        return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
}

/**
 *      bio_add_page    -       attempt to add page to bio
 *      @bio: destination bio
 *      @page: page to add
 *      @len: vec entry length
 *      @offset: vec entry offset
 *
 *      Attempt to add a page to the bio_vec maplist. This can fail for a
 *      number of reasons, such as the bio being full or target block
 *      device limitations. The target block device must allow bios
 *      smaller than PAGE_SIZE, so it is always possible to add a single
 *      page to an empty bio.
 */
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
                 unsigned int offset)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
}
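/*
 * Example (a minimal sketch; bio is assumed already allocated with its
 * target device set, and pages[]/nr set up by the caller): add pages
 * until the bio refuses one, then submit what fit.
 *
 *      int i;
 *      for (i = 0; i < nr; i++)
 *              if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
 *                      break;
 *
 * A short return value means "bio full for this target"; the remaining
 * pages belong in a second bio.
 */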
struct bio_map_data {
        struct bio_vec *iovecs;
        void __user *userptr;
};

static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
{
        memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
        bio->bi_private = bmd;
}

static void bio_free_map_data(struct bio_map_data *bmd)
{
        kfree(bmd->iovecs);
        kfree(bmd);
}

static struct bio_map_data *bio_alloc_map_data(int nr_segs)
{
        struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);

        if (!bmd)
                return NULL;

        bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
        if (bmd->iovecs)
                return bmd;

        kfree(bmd);
        return NULL;
}

/**
 *      bio_uncopy_user -       finish previously mapped bio
 *      @bio: bio being terminated
 *
 *      Free pages allocated from bio_copy_user() and write back data
 *      to user space in case of a read.
 */
int bio_uncopy_user(struct bio *bio)
{
        struct bio_map_data *bmd = bio->bi_private;
        const int read = bio_data_dir(bio) == READ;
        struct bio_vec *bvec;
        int i, ret = 0;

        __bio_for_each_segment(bvec, bio, i, 0) {
                char *addr = page_address(bvec->bv_page);
                unsigned int len = bmd->iovecs[i].bv_len;

                if (read && !ret && copy_to_user(bmd->userptr, addr, len))
                        ret = -EFAULT;

                __free_page(bvec->bv_page);
                bmd->userptr += len;
        }
        bio_free_map_data(bmd);
        bio_put(bio);
        return ret;
}
/**
 *      bio_copy_user   -       copy user data to bio
 *      @q: destination block queue
 *      @uaddr: start of user address
 *      @len: length in bytes
 *      @write_to_vm: bool indicating writing to pages or not
 *
 *      Prepares and returns a bio for indirect user io, bouncing data
 *      to/from kernel pages as necessary. Must be paired with a call to
 *      bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr,
                          unsigned int len, int write_to_vm)
{
        unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long start = uaddr >> PAGE_SHIFT;
        struct bio_map_data *bmd;
        struct bio_vec *bvec;
        struct page *page;
        struct bio *bio;
        int i, ret;

        bmd = bio_alloc_map_data(end - start);
        if (!bmd)
                return ERR_PTR(-ENOMEM);

        bmd->userptr = (void __user *) uaddr;

        ret = -ENOMEM;
        bio = bio_alloc(GFP_KERNEL, end - start);
        if (!bio)
                goto out_bmd;

        bio->bi_rw |= (!write_to_vm << BIO_RW);

        ret = 0;
        while (len) {
                unsigned int bytes = PAGE_SIZE;

                if (bytes > len)
                        bytes = len;

                page = alloc_page(q->bounce_gfp | GFP_KERNEL);
                if (!page) {
                        ret = -ENOMEM;
                        break;
                }

                if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
                        ret = -EINVAL;
                        break;
                }

                len -= bytes;
        }

        if (ret)
                goto cleanup;

        /*
         * success
         */
        if (!write_to_vm) {
                char __user *p = (char __user *) uaddr;

                /*
                 * for a write, copy in data to kernel pages
                 */
                ret = -EFAULT;
                bio_for_each_segment(bvec, bio, i) {
                        char *addr = page_address(bvec->bv_page);

                        if (copy_from_user(addr, p, bvec->bv_len))
                                goto cleanup;
                        p += bvec->bv_len;
                }
        }

        bio_set_map_data(bmd, bio);
        return bio;
cleanup:
        bio_for_each_segment(bvec, bio, i)
                __free_page(bvec->bv_page);

        bio_put(bio);
out_bmd:
        bio_free_map_data(bmd);
        return ERR_PTR(ret);
}
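/*
 * Example lifecycle (a minimal sketch; q, uaddr and len come from the
 * caller, e.g. an ioctl path): bounce user data through kernel pages,
 * do the io, then let bio_uncopy_user() copy back and free the pages.
 *
 *      struct bio *bio = bio_copy_user(q, uaddr, len, 1);
 *      if (IS_ERR(bio))
 *              return PTR_ERR(bio);
 *      ... submit bio and wait for completion ...
 *      ret = bio_uncopy_user(bio);
 */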
static struct bio *__bio_map_user_iov(request_queue_t *q,
                                      struct block_device *bdev,
                                      struct sg_iovec *iov, int iov_count,
                                      int write_to_vm)
{
        int i, j;
        int nr_pages = 0;
        struct page **pages;
        struct bio *bio;
        int cur_page = 0;
        int ret, offset;

        for (i = 0; i < iov_count; i++) {
                unsigned long uaddr = (unsigned long)iov[i].iov_base;
                unsigned long len = iov[i].iov_len;
                unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                unsigned long start = uaddr >> PAGE_SHIFT;

                nr_pages += end - start;
                /*
                 * transfer and buffer must be aligned to at least hardsector
                 * size for now, in the future we can relax this restriction
                 */
                if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
                        return ERR_PTR(-EINVAL);
        }

        if (!nr_pages)
                return ERR_PTR(-EINVAL);

        bio = bio_alloc(GFP_KERNEL, nr_pages);
        if (!bio)
                return ERR_PTR(-ENOMEM);

        ret = -ENOMEM;
        pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
        if (!pages)
                goto out;

        for (i = 0; i < iov_count; i++) {
                unsigned long uaddr = (unsigned long)iov[i].iov_base;
                unsigned long len = iov[i].iov_len;
                unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                unsigned long start = uaddr >> PAGE_SHIFT;
                const int local_nr_pages = end - start;
                const int page_limit = cur_page + local_nr_pages;

                down_read(&current->mm->mmap_sem);
                ret = get_user_pages(current, current->mm, uaddr,
                                     local_nr_pages,
                                     write_to_vm, 0, &pages[cur_page], NULL);
                up_read(&current->mm->mmap_sem);

                if (ret < local_nr_pages) {
                        ret = -EFAULT;
                        goto out_unmap;
                }

                offset = uaddr & ~PAGE_MASK;
                for (j = cur_page; j < page_limit; j++) {
                        unsigned int bytes = PAGE_SIZE - offset;

                        if (len <= 0)
                                break;

                        if (bytes > len)
                                bytes = len;

                        /*
                         * sorry...
                         */
                        if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
                            bytes)
                                break;

                        len -= bytes;
                        offset = 0;
                }

                cur_page = j;
                /*
                 * release the pages we didn't map into the bio, if any
                 */
                while (j < page_limit)
                        page_cache_release(pages[j++]);
        }

        kfree(pages);

        /*
         * set data direction, and check if mapped pages need bouncing
         */
        if (!write_to_vm)
                bio->bi_rw |= (1 << BIO_RW);

        bio->bi_bdev = bdev;
        bio->bi_flags |= (1 << BIO_USER_MAPPED);
        return bio;

 out_unmap:
        for (i = 0; i < nr_pages; i++) {
                if (!pages[i])
                        break;
                page_cache_release(pages[i]);
        }
 out:
        kfree(pages);
        bio_put(bio);
        return ERR_PTR(ret);
}

/**
 *      bio_map_user    -       map user address into bio
 *      @q: the request_queue_t for the bio
 *      @bdev: destination block device
 *      @uaddr: start of user address
 *      @len: length in bytes
 *      @write_to_vm: bool indicating writing to pages or not
 *
 *      Map the user space address into a bio suitable for io to a block
 *      device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
                         unsigned long uaddr, unsigned int len, int write_to_vm)
{
        struct sg_iovec iov;

        iov.iov_base = (void __user *)uaddr;
        iov.iov_len = len;

        return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
}

/**
 *      bio_map_user_iov - map user sg_iovec table into bio
 *      @q: the request_queue_t for the bio
 *      @bdev: destination block device
 *      @iov: the iovec.
 *      @iov_count: number of elements in the iovec
 *      @write_to_vm: bool indicating writing to pages or not
 *
 *      Map the user space address into a bio suitable for io to a block
 *      device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user_iov(request_queue_t *q, struct block_device *bdev,
                             struct sg_iovec *iov, int iov_count,
                             int write_to_vm)
{
        struct bio *bio;
        int len = 0, i;

        bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);

        if (IS_ERR(bio))
                return bio;

        /*
         * subtle -- if __bio_map_user() ended up bouncing a bio,
         * it would normally disappear when its bi_end_io is run.
         * however, we need it for the unmap, so grab an extra
         * reference to it
         */
        bio_get(bio);

        for (i = 0; i < iov_count; i++)
                len += iov[i].iov_len;

        if (bio->bi_size == len)
                return bio;

        /*
         * don't support partial mappings
         */
        bio_endio(bio, bio->bi_size, 0);
        bio_unmap_user(bio);
        return ERR_PTR(-EINVAL);
}

static void __bio_unmap_user(struct bio *bio)
{
        struct bio_vec *bvec;
        int i;

        /*
         * make sure we dirty pages we wrote to
         */
        __bio_for_each_segment(bvec, bio, i, 0) {
                if (bio_data_dir(bio) == READ)
                        set_page_dirty_lock(bvec->bv_page);

                page_cache_release(bvec->bv_page);
        }

        bio_put(bio);
}

/**
 *      bio_unmap_user  -       unmap a bio
 *      @bio:           the bio being unmapped
 *
 *      Unmap a bio previously mapped by bio_map_user(). Must be called from
 *      process context.
 *
 *      bio_unmap_user() may sleep.
 */
void bio_unmap_user(struct bio *bio)
{
        __bio_unmap_user(bio);
        bio_put(bio);
}
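/*
 * Example (a minimal sketch; q, bdev, uaddr and len are assumed valid and
 * hardsector-aligned): zero-copy mapping of a user buffer, undone with
 * bio_unmap_user() once the io has completed.
 *
 *      struct bio *bio = bio_map_user(q, bdev, uaddr, len, 1);
 *      if (IS_ERR(bio))
 *              return PTR_ERR(bio);
 *      ... submit bio and wait for completion ...
 *      bio_unmap_user(bio);
 */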
static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
{
        if (bio->bi_size)
                return 1;

        bio_put(bio);
        return 0;
}


static struct bio *__bio_map_kern(request_queue_t *q, void *data,
                                  unsigned int len, gfp_t gfp_mask)
{
        unsigned long kaddr = (unsigned long)data;
        unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long start = kaddr >> PAGE_SHIFT;
        const int nr_pages = end - start;
        int offset, i;
        struct bio *bio;

        bio = bio_alloc(gfp_mask, nr_pages);
        if (!bio)
                return ERR_PTR(-ENOMEM);

        offset = offset_in_page(kaddr);
        for (i = 0; i < nr_pages; i++) {
                unsigned int bytes = PAGE_SIZE - offset;

                if (len <= 0)
                        break;

                if (bytes > len)
                        bytes = len;

                if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
                                    offset) < bytes)
                        break;

                data += bytes;
                len -= bytes;
                offset = 0;
        }

        bio->bi_end_io = bio_map_kern_endio;
        return bio;
}

/**
 *      bio_map_kern    -       map kernel address into bio
 *      @q: the request_queue_t for the bio
 *      @data: pointer to buffer to map
 *      @len: length in bytes
 *      @gfp_mask: allocation flags for bio allocation
 *
 *      Map the kernel address into a bio suitable for io to a block
 *      device. Returns an error pointer in case of error.
 */
struct bio *bio_map_kern(request_queue_t *q, void *data, unsigned int len,
                         gfp_t gfp_mask)
{
        struct bio *bio;

        bio = __bio_map_kern(q, data, len, gfp_mask);
        if (IS_ERR(bio))
                return bio;

        if (bio->bi_size == len)
                return bio;

        /*
         * Don't support partial mappings.
         */
        bio_put(bio);
        return ERR_PTR(-EINVAL);
}
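/*
 * Example (a minimal sketch; buf must be linearly-mapped kernel memory,
 * e.g. from kmalloc -- vmalloc memory would break the virt_to_page()
 * use above):
 *
 *      struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);
 *      if (IS_ERR(bio))
 *              return PTR_ERR(bio);
 *      ... submit bio; bio_map_kern_endio drops the reference on completion ...
 */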
/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe.  So what we can do is to
 * mark the pages dirty _before_ performing IO.  And in interrupt context,
 * check that the pages are still dirty.  If so, fine.  If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages.  The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all.  So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages().  This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, pdflush) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
        struct bio_vec *bvec = bio->bi_io_vec;
        int i;

        for (i = 0; i < bio->bi_vcnt; i++) {
                struct page *page = bvec[i].bv_page;

                if (page && !PageCompound(page))
                        set_page_dirty_lock(page);
        }
}

static void bio_release_pages(struct bio *bio)
{
        struct bio_vec *bvec = bio->bi_io_vec;
        int i;

        for (i = 0; i < bio->bi_vcnt; i++) {
                struct page *page = bvec[i].bv_page;

                if (page)
                        put_page(page);
        }
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine.  If, however, some pages are clean then they must
 * have been written out during the direct-IO read.  So we take another ref on
 * the BIO and the offending pages and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on.  It will run one page_cache_release() against each page and will
 * run one bio_put() against the BIO.
 */

static void bio_dirty_fn(void *data);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn, NULL);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(void *data)
{
        unsigned long flags;
        struct bio *bio;

        spin_lock_irqsave(&bio_dirty_lock, flags);
        bio = bio_dirty_list;
        bio_dirty_list = NULL;
        spin_unlock_irqrestore(&bio_dirty_lock, flags);

        while (bio) {
                struct bio *next = bio->bi_private;

                bio_set_pages_dirty(bio);
                bio_release_pages(bio);
                bio_put(bio);
                bio = next;
        }
}

void bio_check_pages_dirty(struct bio *bio)
{
        struct bio_vec *bvec = bio->bi_io_vec;
        int nr_clean_pages = 0;
        int i;

        for (i = 0; i < bio->bi_vcnt; i++) {
                struct page *page = bvec[i].bv_page;

                if (PageDirty(page) || PageCompound(page)) {
                        page_cache_release(page);
                        bvec[i].bv_page = NULL;
                } else {
                        nr_clean_pages++;
                }
        }

        if (nr_clean_pages) {
                unsigned long flags;

                spin_lock_irqsave(&bio_dirty_lock, flags);
                bio->bi_private = bio_dirty_list;
                bio_dirty_list = bio;
                spin_unlock_irqrestore(&bio_dirty_lock, flags);
                schedule_work(&bio_dirty_work);
        } else {
                bio_put(bio);
        }
}

/**
 * bio_endio - end I/O on a bio
 * @bio:        bio
 * @bytes_done: number of bytes completed
 * @error:      error, if any
 *
 * Description:
 *   bio_endio() will end I/O on @bytes_done number of bytes. This may be
 *   just a partial part of the bio, or it may be the whole bio. bio_endio()
 *   is the preferred way to end I/O on a bio; it takes care of decrementing
 *   bi_size and clearing BIO_UPTODATE on error. @error is 0 on success,
 *   and one of the established -Exxxx (-EIO, for instance) error values in
 *   case something went wrong. No one should call bi_end_io() directly on
 *   a bio unless they own it and thus know that it has an end_io function.
 **/
void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
{
        if (error)
                clear_bit(BIO_UPTODATE, &bio->bi_flags);

        if (unlikely(bytes_done > bio->bi_size)) {
                printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
                       bytes_done, bio->bi_size);
                bytes_done = bio->bi_size;
        }

        bio->bi_size -= bytes_done;
        bio->bi_sector += (bytes_done >> 9);

        if (bio->bi_end_io)
                bio->bi_end_io(bio, bytes_done, error);
}
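/*
 * Example (a minimal sketch of a driver completion path; "uptodate" is a
 * placeholder for the driver's own success flag): complete io by calling
 * bio_endio() on each finished bio, never bi_end_io() directly.
 *
 *      bio_endio(bio, bio->bi_size, uptodate ? 0 : -EIO);
 */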
void bio_pair_release(struct bio_pair *bp)
{
        if (atomic_dec_and_test(&bp->cnt)) {
                struct bio *master = bp->bio1.bi_private;

                bio_endio(master, master->bi_size, bp->error);
                mempool_free(bp, bp->bio2.bi_private);
        }
}

static int bio_pair_end_1(struct bio * bi, unsigned int done, int err)
{
        struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);

        if (err)
                bp->error = err;

        if (bi->bi_size)
                return 1;

        bio_pair_release(bp);
        return 0;
}

static int bio_pair_end_2(struct bio * bi, unsigned int done, int err)
{
        struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);

        if (err)
                bp->error = err;

        if (bi->bi_size)
                return 1;

        bio_pair_release(bp);
        return 0;
}

/*
 * split a bio - only worry about a bio with a single page
 * in its iovec
 */
struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
{
        struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);

        if (!bp)
                return bp;

        blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
                                bi->bi_sector + first_sectors);

        BUG_ON(bi->bi_vcnt != 1);
        BUG_ON(bi->bi_idx != 0);
        atomic_set(&bp->cnt, 3);
        bp->error = 0;
        bp->bio1 = *bi;
        bp->bio2 = *bi;
        bp->bio2.bi_sector += first_sectors;
        bp->bio2.bi_size -= first_sectors << 9;
        bp->bio1.bi_size = first_sectors << 9;

        bp->bv1 = bi->bi_io_vec[0];
        bp->bv2 = bi->bi_io_vec[0];
        bp->bv2.bv_offset += first_sectors << 9;
        bp->bv2.bv_len -= first_sectors << 9;
        bp->bv1.bv_len = first_sectors << 9;

        bp->bio1.bi_io_vec = &bp->bv1;
        bp->bio2.bi_io_vec = &bp->bv2;

        bp->bio1.bi_max_vecs = 1;
        bp->bio2.bi_max_vecs = 1;

        bp->bio1.bi_end_io = bio_pair_end_1;
        bp->bio2.bi_end_io = bio_pair_end_2;

        bp->bio1.bi_private = bi;
        bp->bio2.bi_private = pool;

        return bp;
}
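/*
 * Example (a minimal sketch, in the style of stacked drivers such as
 * md/raid0 when a single-page bio straddles a boundary; "split_at" is a
 * placeholder sector count): split, submit both halves, then drop our
 * reference on the pair.
 *
 *      struct bio_pair *bp = bio_split(bio, bio_split_pool, split_at);
 *      if (bp) {
 *              generic_make_request(&bp->bio1);
 *              generic_make_request(&bp->bio2);
 *              bio_pair_release(bp);
 *      }
 */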
/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
{
        int i;

        for (i = 0; i < BIOVEC_NR_POOLS; i++) {
                struct biovec_slab *bp = bvec_slabs + i;
                mempool_t **bvp = bs->bvec_pools + i;

                if (pool_entries > 1 && i >= scale)
                        pool_entries >>= 1;

                *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
                if (!*bvp)
                        return -ENOMEM;
        }
        return 0;
}

static void biovec_free_pools(struct bio_set *bs)
{
        int i;

        for (i = 0; i < BIOVEC_NR_POOLS; i++) {
                mempool_t *bvp = bs->bvec_pools[i];

                if (bvp)
                        mempool_destroy(bvp);
        }

}

void bioset_free(struct bio_set *bs)
{
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);

        biovec_free_pools(bs);

        kfree(bs);
}

struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
{
        struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);

        if (!bs)
                return NULL;

        bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab);
        if (!bs->bio_pool)
                goto bad;

        if (!biovec_create_pools(bs, bvec_pool_size, scale))
                return bs;

bad:
        bioset_free(bs);
        return NULL;
}

static void __init biovec_init_slabs(void)
{
        int i;

        for (i = 0; i < BIOVEC_NR_POOLS; i++) {
                int size;
                struct biovec_slab *bvs = bvec_slabs + i;

                size = bvs->nr_vecs * sizeof(struct bio_vec);
                bvs->slab = kmem_cache_create(bvs->name, size, 0,
                                SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
        }
}
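/*
 * Example (a minimal sketch; the pool sizes are illustrative, not
 * prescriptive): a driver that must make forward progress under memory
 * pressure creates its own bio_set instead of sharing fs_bio_set, then
 * allocates against it.
 *
 *      struct bio_set *bs = bioset_create(8, 1, 0);
 *      if (bs) {
 *              struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, bs);
 *              ...
 *              bioset_free(bs);
 *      }
 */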
static int __init init_bio(void)
{
        int megabytes, bvec_pool_entries;
        int scale = BIOVEC_NR_POOLS;

        bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
                                SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);

        biovec_init_slabs();

        megabytes = nr_free_pages() >> (20 - PAGE_SHIFT);

        /*
         * find out where to start scaling
         */
        if (megabytes <= 16)
                scale = 0;
        else if (megabytes <= 32)
                scale = 1;
        else if (megabytes <= 64)
                scale = 2;
        else if (megabytes <= 96)
                scale = 3;
        else if (megabytes <= 128)
                scale = 4;

        /*
         * Limit number of entries reserved -- mempools are only used when
         * the system is completely unable to allocate memory, so we only
         * need enough to make progress.
         */
        bvec_pool_entries = 1 + scale;

        fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
        if (!fs_bio_set)
                panic("bio: can't allocate bios\n");

        bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
                                                     sizeof(struct bio_pair));
        if (!bio_split_pool)
                panic("bio: can't create split pool\n");

        return 0;
}

subsys_initcall(init_bio);

EXPORT_SYMBOL(bio_alloc);
EXPORT_SYMBOL(bio_put);
EXPORT_SYMBOL(bio_free);
EXPORT_SYMBOL(bio_endio);
EXPORT_SYMBOL(bio_init);
EXPORT_SYMBOL(__bio_clone);
EXPORT_SYMBOL(bio_clone);
EXPORT_SYMBOL(bio_phys_segments);
EXPORT_SYMBOL(bio_hw_segments);
EXPORT_SYMBOL(bio_add_page);
EXPORT_SYMBOL(bio_add_pc_page);
EXPORT_SYMBOL(bio_get_nr_vecs);
EXPORT_SYMBOL(bio_map_user);
EXPORT_SYMBOL(bio_unmap_user);
EXPORT_SYMBOL(bio_map_kern);
EXPORT_SYMBOL(bio_pair_release);
EXPORT_SYMBOL(bio_split);
EXPORT_SYMBOL(bio_split_pool);
EXPORT_SYMBOL(bio_copy_user);
EXPORT_SYMBOL(bio_uncopy_user);
EXPORT_SYMBOL(bioset_create);
EXPORT_SYMBOL(bioset_free);
EXPORT_SYMBOL(bio_alloc_bioset);