Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

/*
 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/blktrace_api.h>
#include <scsi/sg.h>		/* for struct sg_iovec */

#define BIO_POOL_SIZE 256

static kmem_cache_t *bio_slab __read_mostly;

#define BIOVEC_NR_POOLS 6

/*
 * a small number of entries is fine, not going to be performance critical.
 * basically we just need to survive
 */
#define BIO_SPLIT_ENTRIES 8
mempool_t *bio_split_pool __read_mostly;

struct biovec_slab {
	int nr_vecs;
	char *name;
	kmem_cache_t *slab;
};

/*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
 * unsigned short
 */

#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV

/*
 * bio_set is used to allow other portions of the IO system to
 * allocate their own private memory pools for bio and iovec structures.
 * These memory pools in turn all allocate from the bio_slab
 * and the bvec_slabs[].
 */
struct bio_set {
	mempool_t *bio_pool;
	mempool_t *bvec_pools[BIOVEC_NR_POOLS];
};

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
static struct bio_set *fs_bio_set;
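
/*
 * Editor's note (added for clarity, not in the original source): the six
 * bvec_slabs buckets round a vec request up to the next bucket size, so a
 * request for 10 vecs is served from "biovec-16" (idx 2) and a request for
 * 65 from "biovec-128" (idx 4); see the switch in bvec_alloc_bs() below.
 * Requests above BIO_MAX_PAGES cannot be satisfied at all.
 */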

static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
	struct bio_vec *bvl;
	struct biovec_slab *bp;

	/*
	 * see comment near bvec_array define!
	 */
	switch (nr) {
		case   1        : *idx = 0; break;
		case   2 ...   4: *idx = 1; break;
		case   5 ...  16: *idx = 2; break;
		case  17 ...  64: *idx = 3; break;
		case  65 ... 128: *idx = 4; break;
		case 129 ... BIO_MAX_PAGES: *idx = 5; break;
		default:
			return NULL;
	}
	/*
	 * idx now points to the pool we want to allocate from
	 */

	bp = bvec_slabs + *idx;
	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
	if (bvl)
		memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));

	return bvl;
}

void bio_free(struct bio *bio, struct bio_set *bio_set)
{
	const int pool_idx = BIO_POOL_IDX(bio);

	BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);

	mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
	mempool_free(bio, bio_set->bio_pool);
}

/*
 * default destructor for a bio allocated with bio_alloc_bioset()
 */
static void bio_fs_destructor(struct bio *bio)
{
	bio_free(bio, fs_bio_set);
}

void bio_init(struct bio *bio)
{
	bio->bi_next = NULL;
	bio->bi_bdev = NULL;
	bio->bi_flags = 1 << BIO_UPTODATE;
	bio->bi_rw = 0;
	bio->bi_vcnt = 0;
	bio->bi_idx = 0;
	bio->bi_phys_segments = 0;
	bio->bi_hw_segments = 0;
	bio->bi_hw_front_size = 0;
	bio->bi_hw_back_size = 0;
	bio->bi_size = 0;
	bio->bi_max_vecs = 0;
	bio->bi_end_io = NULL;
	atomic_set(&bio->bi_cnt, 1);
	bio->bi_private = NULL;
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask:   the GFP_ mask given to the slab allocator
 * @nr_iovecs:  number of iovecs to pre-allocate
 * @bs:         the bio_set to allocate from
 *
 * Description:
 *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
 *   If %__GFP_WAIT is set then we will block on the internal pool waiting
 *   for a &struct bio to become free.
 *
 *   allocate bio and iovecs from the memory pools specified by the
 *   bio_set structure.
 **/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);

	if (likely(bio)) {
		struct bio_vec *bvl = NULL;

		bio_init(bio);
		if (likely(nr_iovecs)) {
			unsigned long idx;

			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
			if (unlikely(!bvl)) {
				mempool_free(bio, bs->bio_pool);
				bio = NULL;
				goto out;
			}
			bio->bi_flags |= idx << BIO_POOL_OFFSET;
			bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
		}
		bio->bi_io_vec = bvl;
	}
out:
	return bio;
}

struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
{
	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);

	if (bio)
		bio->bi_destructor = bio_fs_destructor;

	return bio;
}

void zero_fill_bio(struct bio *bio)
{
	unsigned long flags;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment(bv, bio, i) {
		char *data = bvec_kmap_irq(bv, &flags);
		memset(data, 0, bv->bv_len);
		flush_dcache_page(bv->bv_page);
		bvec_kunmap_irq(data, &flags);
	}
}
EXPORT_SYMBOL(zero_fill_bio);
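
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a typical synchronous caller of bio_alloc(), using the era-appropriate
 * three-argument bi_end_io convention. The helper names are hypothetical.
 * Guarded by #if 0 so it cannot affect the build.
 */
#if 0
static int example_end_io(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)		/* partial completion, keep waiting */
		return 1;

	complete((struct completion *) bio->bi_private);
	return 0;
}

static int example_read_page(struct block_device *bdev, sector_t sector,
			     struct page *page)
{
	DECLARE_COMPLETION(done);
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int ret;

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;		/* must be set before bio_add_page() */
	bio->bi_sector = sector;
	bio->bi_private = &done;
	bio->bi_end_io = example_end_io;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	submit_bio(READ, bio);
	wait_for_completion(&done);

	ret = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
	bio_put(bio);			/* drop the allocation reference */
	return ret;
}
#endif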

/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc or bio_get. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

	/*
	 * last put frees it
	 */
	if (atomic_dec_and_test(&bio->bi_cnt)) {
		bio->bi_next = NULL;
		bio->bi_destructor(bio);
	}
}

inline int bio_phys_segments(request_queue_t *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_phys_segments;
}

inline int bio_hw_segments(request_queue_t *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_hw_segments;
}

/**
 * __bio_clone - clone a bio
 * @bio: destination bio
 * @bio_src: bio to clone
 *
 * Clone a &bio. Caller will own the returned bio, but not
 * the actual data it points to. Reference count of returned
 * bio will be one.
 */
void __bio_clone(struct bio *bio, struct bio *bio_src)
{
	request_queue_t *q = bdev_get_queue(bio_src->bi_bdev);

	memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
		bio_src->bi_max_vecs * sizeof(struct bio_vec));

	bio->bi_sector = bio_src->bi_sector;
	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_flags |= 1 << BIO_CLONED;
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_vcnt = bio_src->bi_vcnt;
	bio->bi_size = bio_src->bi_size;
	bio->bi_idx = bio_src->bi_idx;
	bio_phys_segments(q, bio);
	bio_hw_segments(q, bio);
}

/**
 * bio_clone - clone a bio
 * @bio: bio to clone
 * @gfp_mask: allocation priority
 *
 * Like __bio_clone, only also allocates the returned bio
 */
struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);

	if (b) {
		b->bi_destructor = bio_fs_destructor;
		__bio_clone(b, bio);
	}

	return b;
}
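
/*
 * Illustrative sketch (editor's addition, not in the original file): the
 * clone-and-redirect pattern used by stacking drivers such as dm and md.
 * All example_* names are hypothetical. Guarded by #if 0.
 */
#if 0
static int example_clone_end_io(struct bio *clone, unsigned int done, int err)
{
	struct bio *master = clone->bi_private;

	if (clone->bi_size)
		return 1;

	/* propagate completion to the master bio, then drop the clone */
	bio_endio(master, master->bi_size, err);
	bio_put(clone);
	return 0;
}

static void example_remap(struct bio *master, struct block_device *real_dev,
			  sector_t sector_offset)
{
	/* GFP_NOIO waits on the mempool, so the clone cannot be NULL */
	struct bio *clone = bio_clone(master, GFP_NOIO);

	/* redirect the clone; the master's pages are shared, not copied */
	clone->bi_bdev = real_dev;
	clone->bi_sector += sector_offset;
	clone->bi_end_io = example_clone_end_io;
	clone->bi_private = master;

	generic_make_request(clone);
}
#endif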

/**
 * bio_get_nr_vecs - return approx number of vecs
 * @bdev:  I/O target
 *
 * Return the approximate number of pages we can send to this target.
 * There's no guarantee that you will be able to fit this number of pages
 * into a bio, it does not account for dynamic restrictions that vary
 * on offset.
 */
int bio_get_nr_vecs(struct block_device *bdev)
{
	request_queue_t *q = bdev_get_queue(bdev);
	int nr_pages;

	nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (nr_pages > q->max_phys_segments)
		nr_pages = q->max_phys_segments;
	if (nr_pages > q->max_hw_segments)
		nr_pages = q->max_hw_segments;

	return nr_pages;
}

static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
			  *page, unsigned int len, unsigned int offset,
			  unsigned short max_sectors)
{
	int retried_segments = 0;
	struct bio_vec *bvec;

	/*
	 * cloned bio must not modify vec list
	 */
	if (unlikely(bio_flagged(bio, BIO_CLONED)))
		return 0;

	if (((bio->bi_size + len) >> 9) > max_sectors)
		return 0;

	/*
	 * For filesystems with a blocksize smaller than the pagesize
	 * we will often be called with the same page as last time and
	 * a consecutive offset. Optimize this special case.
	 */
	if (bio->bi_vcnt > 0) {
		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

		if (page == prev->bv_page &&
		    offset == prev->bv_offset + prev->bv_len) {
			prev->bv_len += len;
			if (q->merge_bvec_fn &&
			    q->merge_bvec_fn(q, bio, prev) < len) {
				prev->bv_len -= len;
				return 0;
			}

			goto done;
		}
	}

	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return 0;

	/*
	 * we might lose a segment or two here, but better that than
	 * making this too complex.
	 */

	while (bio->bi_phys_segments >= q->max_phys_segments
	       || bio->bi_hw_segments >= q->max_hw_segments
	       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {

		if (retried_segments)
			return 0;

		retried_segments = 1;
		blk_recount_segments(q, bio);
	}

	/*
	 * setup the new entry, we might clear it again later if we
	 * cannot add the page
	 */
	bvec = &bio->bi_io_vec[bio->bi_vcnt];
	bvec->bv_page = page;
	bvec->bv_len = len;
	bvec->bv_offset = offset;

	/*
	 * if queue has other restrictions (eg varying max sector size
	 * depending on offset), it can specify a merge_bvec_fn in the
	 * queue to get further control
	 */
	if (q->merge_bvec_fn) {
		/*
		 * merge_bvec_fn() returns number of bytes it can accept
		 * at this offset
		 */
		if (q->merge_bvec_fn(q, bio, bvec) < len) {
			bvec->bv_page = NULL;
			bvec->bv_len = 0;
			bvec->bv_offset = 0;
			return 0;
		}
	}

	/* If we may be able to merge these biovecs, force a recount */
	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
		bio->bi_flags &= ~(1 << BIO_SEG_VALID);

	bio->bi_vcnt++;
	bio->bi_phys_segments++;
	bio->bi_hw_segments++;
 done:
	bio->bi_size += len;
	return len;
}

/**
 * bio_add_pc_page - attempt to add page to bio
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or limitations of the
 * target block device. The target block device must allow bios
 * smaller than PAGE_SIZE, so it is always possible to add a single
 * page to an empty bio. This should only be used by REQ_PC bios.
 */
int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page,
		    unsigned int len, unsigned int offset)
{
	return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
}

/**
 * bio_add_page - attempt to add page to bio
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or limitations of the
 * target block device. The target block device must allow bios
 * smaller than PAGE_SIZE, so it is always possible to add a single
 * page to an empty bio.
 */
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
		 unsigned int offset)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
}
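
/*
 * Illustrative sketch (editor's addition, not in the original file):
 * building a multi-page bio with bio_get_nr_vecs()/bio_add_page(). The
 * caller is expected to set rw/bi_end_io and submit, starting a new bio
 * for any pages the queue refused. Names are hypothetical; #if 0 guarded.
 */
#if 0
static struct bio *example_build_bio(struct block_device *bdev,
				     sector_t sector, struct page **pages,
				     int nr_pages)
{
	int i, nr_vecs = min(nr_pages, bio_get_nr_vecs(bdev));
	struct bio *bio = bio_alloc(GFP_NOIO, nr_vecs);

	if (!bio)		/* nr_vecs may exceed BIO_MAX_PAGES */
		return NULL;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;

	for (i = 0; i < nr_pages; i++) {
		/*
		 * bio_add_page() returns the byte count it accepted;
		 * anything short of a full page means the bio is full
		 * and the remaining pages belong in a new bio.
		 */
		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
			break;
	}

	return bio;
}
#endif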

struct bio_map_data {
	struct bio_vec *iovecs;
	void __user *userptr;
};

static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
{
	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
	bio->bi_private = bmd;
}

static void bio_free_map_data(struct bio_map_data *bmd)
{
	kfree(bmd->iovecs);
	kfree(bmd);
}

static struct bio_map_data *bio_alloc_map_data(int nr_segs)
{
	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);

	if (!bmd)
		return NULL;

	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
	if (bmd->iovecs)
		return bmd;

	kfree(bmd);
	return NULL;
}

/**
 * bio_uncopy_user - finish previously mapped bio
 * @bio: bio being terminated
 *
 * Free pages allocated from bio_copy_user() and write back data
 * to user space in case of a read.
 */
int bio_uncopy_user(struct bio *bio)
{
	struct bio_map_data *bmd = bio->bi_private;
	const int read = bio_data_dir(bio) == READ;
	struct bio_vec *bvec;
	int i, ret = 0;

	__bio_for_each_segment(bvec, bio, i, 0) {
		char *addr = page_address(bvec->bv_page);
		unsigned int len = bmd->iovecs[i].bv_len;

		if (read && !ret && copy_to_user(bmd->userptr, addr, len))
			ret = -EFAULT;

		__free_page(bvec->bv_page);
		bmd->userptr += len;
	}
	bio_free_map_data(bmd);
	bio_put(bio);
	return ret;
}

/**
 * bio_copy_user - copy user data to bio
 * @q: destination block queue
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with a call to
 * bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr,
			  unsigned int len, int write_to_vm)
{
	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = uaddr >> PAGE_SHIFT;
	struct bio_map_data *bmd;
	struct bio_vec *bvec;
	struct page *page;
	struct bio *bio;
	int i, ret;

	bmd = bio_alloc_map_data(end - start);
	if (!bmd)
		return ERR_PTR(-ENOMEM);

	bmd->userptr = (void __user *) uaddr;

	ret = -ENOMEM;
	bio = bio_alloc(GFP_KERNEL, end - start);
	if (!bio)
		goto out_bmd;

	bio->bi_rw |= (!write_to_vm << BIO_RW);

	ret = 0;
	while (len) {
		unsigned int bytes = PAGE_SIZE;

		if (bytes > len)
			bytes = len;

		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
		if (!page) {
			ret = -ENOMEM;
			break;
		}

		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
			ret = -EINVAL;
			break;
		}

		len -= bytes;
	}

	if (ret)
		goto cleanup;

	/*
	 * success
	 */
	if (!write_to_vm) {
		char __user *p = (char __user *) uaddr;

		/*
		 * for a write, copy in data to kernel pages
		 */
		ret = -EFAULT;
		bio_for_each_segment(bvec, bio, i) {
			char *addr = page_address(bvec->bv_page);

			if (copy_from_user(addr, p, bvec->bv_len))
				goto cleanup;
			p += bvec->bv_len;
		}
	}

	bio_set_map_data(bmd, bio);
	return bio;
cleanup:
	bio_for_each_segment(bvec, bio, i)
		__free_page(bvec->bv_page);

	bio_put(bio);
out_bmd:
	bio_free_map_data(bmd);
	return ERR_PTR(ret);
}
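
/*
 * Illustrative sketch (editor's addition, not in the original file): the
 * bounce-copy path as a pass-through ioctl handler might use it, pairing
 * bio_copy_user() with bio_uncopy_user(). The submit-and-wait helper is
 * hypothetical and stands in for the usual request plumbing. #if 0 guarded.
 */
#if 0
static int example_bounce_rw(request_queue_t *q, unsigned long uaddr,
			     unsigned int len, int reading)
{
	struct bio *bio = bio_copy_user(q, uaddr, len, reading);
	int ret, uncopy;

	if (IS_ERR(bio))
		return PTR_ERR(bio);

	ret = example_submit_and_wait(bio);	/* hypothetical helper */

	/* frees the bounce pages and, for a read, copies them back out */
	uncopy = bio_uncopy_user(bio);

	return ret ? ret : uncopy;
}
#endif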

static struct bio *__bio_map_user_iov(request_queue_t *q,
				      struct block_device *bdev,
				      struct sg_iovec *iov, int iov_count,
				      int write_to_vm)
{
	int i, j;
	int nr_pages = 0;
	struct page **pages;
	struct bio *bio;
	int cur_page = 0;
	int ret, offset;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;

		nr_pages += end - start;
		/*
		 * transfer and buffer must be aligned to at least hardsector
		 * size for now, in the future we can relax this restriction
		 */
		if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
			return ERR_PTR(-EINVAL);
	}

	if (!nr_pages)
		return ERR_PTR(-EINVAL);

	bio = bio_alloc(GFP_KERNEL, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto out;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;
		const int local_nr_pages = end - start;
		const int page_limit = cur_page + local_nr_pages;

		down_read(&current->mm->mmap_sem);
		ret = get_user_pages(current, current->mm, uaddr,
				     local_nr_pages,
				     write_to_vm, 0, &pages[cur_page], NULL);
		up_read(&current->mm->mmap_sem);

		if (ret < local_nr_pages) {
			ret = -EFAULT;
			goto out_unmap;
		}

		offset = uaddr & ~PAGE_MASK;
		for (j = cur_page; j < page_limit; j++) {
			unsigned int bytes = PAGE_SIZE - offset;

			if (len <= 0)
				break;

			if (bytes > len)
				bytes = len;

			/*
			 * sorry...
			 */
			if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
			    bytes)
				break;

			len -= bytes;
			offset = 0;
		}

		cur_page = j;
		/*
		 * release the pages we didn't map into the bio, if any
		 */
		while (j < page_limit)
			page_cache_release(pages[j++]);
	}

	kfree(pages);

	/*
	 * set data direction, and check if mapped pages need bouncing
	 */
	if (!write_to_vm)
		bio->bi_rw |= (1 << BIO_RW);

	bio->bi_bdev = bdev;
	bio->bi_flags |= (1 << BIO_USER_MAPPED);
	return bio;

 out_unmap:
	for (i = 0; i < nr_pages; i++) {
		if (!pages[i])
			break;
		page_cache_release(pages[i]);
	}
 out:
	kfree(pages);
	bio_put(bio);
	return ERR_PTR(ret);
}

/**
 * bio_map_user - map user address into bio
 * @q: the request_queue_t for the bio
 * @bdev: destination block device
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
			 unsigned long uaddr, unsigned int len, int write_to_vm)
{
	struct sg_iovec iov;

	iov.iov_base = (void __user *)uaddr;
	iov.iov_len = len;

	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
}

/**
 * bio_map_user_iov - map user sg_iovec table into bio
 * @q: the request_queue_t for the bio
 * @bdev: destination block device
 * @iov: the iovec.
 * @iov_count: number of elements in the iovec
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user_iov(request_queue_t *q, struct block_device *bdev,
			     struct sg_iovec *iov, int iov_count,
			     int write_to_vm)
{
	struct bio *bio;
	int len = 0, i;

	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);

	if (IS_ERR(bio))
		return bio;

	/*
	 * subtle -- if __bio_map_user() ended up bouncing a bio,
	 * it would normally disappear when its bi_end_io is run.
	 * however, we need it for the unmap, so grab an extra
	 * reference to it
	 */
	bio_get(bio);

	for (i = 0; i < iov_count; i++)
		len += iov[i].iov_len;

	if (bio->bi_size == len)
		return bio;

	/*
	 * don't support partial mappings
	 */
	bio_endio(bio, bio->bi_size, 0);
	bio_unmap_user(bio);
	return ERR_PTR(-EINVAL);
}
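
/*
 * Illustrative sketch (editor's addition, not in the original file): the
 * zero-copy direct-io pattern around bio_map_user()/bio_unmap_user(), as
 * a SCSI-generic style ioctl might use it. The submit-and-wait helper is
 * hypothetical. Guarded by #if 0.
 */
#if 0
static int example_direct_rw(request_queue_t *q, struct block_device *bdev,
			     unsigned long uaddr, unsigned int len, int reading)
{
	struct bio *bio = bio_map_user(q, bdev, uaddr, len, reading);
	int ret;

	if (IS_ERR(bio))
		return PTR_ERR(bio);

	/* the user pages stay pinned until bio_unmap_user() drops them */
	ret = example_submit_and_wait(bio);	/* hypothetical helper */
	bio_unmap_user(bio);

	return ret;
}
#endif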

static void __bio_unmap_user(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	/*
	 * make sure we dirty pages we wrote to
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		if (bio_data_dir(bio) == READ)
			set_page_dirty_lock(bvec->bv_page);

		page_cache_release(bvec->bv_page);
	}

	bio_put(bio);
}

/**
 * bio_unmap_user - unmap a bio
 * @bio: the bio being unmapped
 *
 * Unmap a bio previously mapped by bio_map_user(). Must be called from
 * process context.
 *
 * bio_unmap_user() may sleep.
 */
void bio_unmap_user(struct bio *bio)
{
	__bio_unmap_user(bio);
	bio_put(bio);
}

static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	bio_put(bio);
	return 0;
}

static struct bio *__bio_map_kern(request_queue_t *q, void *data,
				  unsigned int len, gfp_t gfp_mask)
{
	unsigned long kaddr = (unsigned long)data;
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
	int offset, i;
	struct bio *bio;

	bio = bio_alloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	offset = offset_in_page(kaddr);
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;

		if (len <= 0)
			break;

		if (bytes > len)
			bytes = len;

		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
				    offset) < bytes)
			break;

		data += bytes;
		len -= bytes;
		offset = 0;
	}

	bio->bi_end_io = bio_map_kern_endio;
	return bio;
}

/**
 * bio_map_kern - map kernel address into bio
 * @q: the request_queue_t for the bio
 * @data: pointer to buffer to map
 * @len: length in bytes
 * @gfp_mask: allocation flags for bio allocation
 *
 * Map the kernel address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_kern(request_queue_t *q, void *data, unsigned int len,
			 gfp_t gfp_mask)
{
	struct bio *bio;

	bio = __bio_map_kern(q, data, len, gfp_mask);
	if (IS_ERR(bio))
		return bio;

	if (bio->bi_size == len)
		return bio;

	/*
	 * Don't support partial mappings.
	 */
	bio_put(bio);
	return ERR_PTR(-EINVAL);
}
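
/*
 * Illustrative sketch (editor's addition, not in the original file):
 * issuing IO on a virtually contiguous kernel buffer via bio_map_kern().
 * bio_map_kern_endio() above already drops the bio reference when the IO
 * completes, so the caller need not bio_put(). Names are hypothetical;
 * guarded by #if 0.
 */
#if 0
static int example_write_kern_buf(request_queue_t *q,
				  struct block_device *bdev,
				  sector_t sector, void *buf, unsigned int len)
{
	struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);

	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	submit_bio(WRITE, bio);
	return 0;
}
#endif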

/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe. So what we can do is to
 * mark the pages dirty _before_ performing IO. And in interrupt context,
 * check that the pages are still dirty. If so, fine. If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages. The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all. So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages(). This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, pdflush) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page && !PageCompound(page))
			set_page_dirty_lock(page);
	}
}

static void bio_release_pages(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page)
			put_page(page);
	}
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine. If, however, some pages are clean then they must
 * have been written out during the direct-IO read. So we take another ref on
 * the BIO and the offending pages and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on. It will run one page_cache_release() against each page and will
 * run one bio_put() against the BIO.
 */

static void bio_dirty_fn(void *data);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn, NULL);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(void *data)
{
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&bio_dirty_lock, flags);
	bio = bio_dirty_list;
	bio_dirty_list = NULL;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);

	while (bio) {
		struct bio *next = bio->bi_private;

		bio_set_pages_dirty(bio);
		bio_release_pages(bio);
		bio_put(bio);
		bio = next;
	}
}

void bio_check_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int nr_clean_pages = 0;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (PageDirty(page) || PageCompound(page)) {
			page_cache_release(page);
			bvec[i].bv_page = NULL;
		} else {
			nr_clean_pages++;
		}
	}

	if (nr_clean_pages) {
		unsigned long flags;

		spin_lock_irqsave(&bio_dirty_lock, flags);
		bio->bi_private = bio_dirty_list;
		bio_dirty_list = bio;
		spin_unlock_irqrestore(&bio_dirty_lock, flags);
		schedule_work(&bio_dirty_work);
	} else {
		bio_put(bio);
	}
}

/**
 * bio_endio - end I/O on a bio
 * @bio: bio
 * @bytes_done: number of bytes completed
 * @error: error, if any
 *
 * Description:
 *   bio_endio() will end I/O on @bytes_done number of bytes. This may be
 *   just a partial part of the bio, or it may be the whole bio. bio_endio()
 *   is the preferred way to end I/O on a bio, it takes care of decrementing
 *   bi_size and clearing BIO_UPTODATE on error. @error is 0 on success,
 *   and one of the established -Exxxx (-EIO, for instance) error values in
 *   case something went wrong. No one should call bi_end_io() directly on
 *   a bio unless they own it and thus know that it has an end_io function.
 **/
void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);

	if (unlikely(bytes_done > bio->bi_size)) {
		printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
		       bytes_done, bio->bi_size);
		bytes_done = bio->bi_size;
	}

	bio->bi_size -= bytes_done;
	bio->bi_sector += (bytes_done >> 9);

	if (bio->bi_end_io)
		bio->bi_end_io(bio, bytes_done, error);
}

void bio_pair_release(struct bio_pair *bp)
{
	if (atomic_dec_and_test(&bp->cnt)) {
		struct bio *master = bp->bio1.bi_private;

		bio_endio(master, master->bi_size, bp->error);
		mempool_free(bp, bp->bio2.bi_private);
	}
}

static int bio_pair_end_1(struct bio *bi, unsigned int done, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);

	if (err)
		bp->error = err;

	if (bi->bi_size)
		return 1;

	bio_pair_release(bp);
	return 0;
}

static int bio_pair_end_2(struct bio *bi, unsigned int done, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);

	if (err)
		bp->error = err;

	if (bi->bi_size)
		return 1;

	bio_pair_release(bp);
	return 0;
}

/*
 * split a bio - only worry about a bio with a single page
 * in its iovec
 */
struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
{
	struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);

	if (!bp)
		return bp;

	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
			      bi->bi_sector + first_sectors);

	BUG_ON(bi->bi_vcnt != 1);
	BUG_ON(bi->bi_idx != 0);
	atomic_set(&bp->cnt, 3);
	bp->error = 0;
	bp->bio1 = *bi;
	bp->bio2 = *bi;
	bp->bio2.bi_sector += first_sectors;
	bp->bio2.bi_size -= first_sectors << 9;
	bp->bio1.bi_size = first_sectors << 9;

	bp->bv1 = bi->bi_io_vec[0];
	bp->bv2 = bi->bi_io_vec[0];
	bp->bv2.bv_offset += first_sectors << 9;
	bp->bv2.bv_len -= first_sectors << 9;
	bp->bv1.bv_len = first_sectors << 9;

	bp->bio1.bi_io_vec = &bp->bv1;
	bp->bio2.bi_io_vec = &bp->bv2;

	bp->bio1.bi_max_vecs = 1;
	bp->bio2.bi_max_vecs = 1;

	bp->bio1.bi_end_io = bio_pair_end_1;
	bp->bio2.bi_end_io = bio_pair_end_2;

	bp->bio1.bi_private = bi;
	bp->bio2.bi_private = pool;

	return bp;
}
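
/*
 * Illustrative sketch (editor's addition, not in the original file): the
 * raid0-style caller of bio_split() for a bio that straddles a chunk
 * boundary. Only legal for single-page bios (see the BUG_ONs above); the
 * boundary computation is hypothetical. Guarded by #if 0.
 */
#if 0
static void example_split_submit(struct bio *bio, sector_t boundary)
{
	/* sectors of the bio that fit below the boundary */
	int first = boundary - bio->bi_sector;
	struct bio_pair *bp = bio_split(bio, bio_split_pool, first);

	/* the pair holds the master bio; ending both halves ends it */
	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);
}
#endif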

/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		struct biovec_slab *bp = bvec_slabs + i;
		mempool_t **bvp = bs->bvec_pools + i;

		if (pool_entries > 1 && i >= scale)
			pool_entries >>= 1;

		*bvp = mempool_create_slab_pool(pool_entries, bp->slab);
		if (!*bvp)
			return -ENOMEM;
	}
	return 0;
}

static void biovec_free_pools(struct bio_set *bs)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		mempool_t *bvp = bs->bvec_pools[i];

		if (bvp)
			mempool_destroy(bvp);
	}
}

void bioset_free(struct bio_set *bs)
{
	if (bs->bio_pool)
		mempool_destroy(bs->bio_pool);

	biovec_free_pools(bs);

	kfree(bs);
}

struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
{
	struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);

	if (!bs)
		return NULL;

	bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab);
	if (!bs->bio_pool)
		goto bad;

	if (!biovec_create_pools(bs, bvec_pool_size, scale))
		return bs;

bad:
	bioset_free(bs);
	return NULL;
}
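
/*
 * Illustrative sketch (editor's addition, not in the original file): how a
 * driver keeps its own bio_set so its forward progress does not depend on
 * fs_bio_set, pairing bio_alloc_bioset() with a destructor that returns
 * the bio to the private pools. Names are hypothetical; #if 0 guarded.
 */
#if 0
static struct bio_set *example_bs;	/* from bioset_create(64, 64, 4) */

static void example_destructor(struct bio *bio)
{
	bio_free(bio, example_bs);
}

static struct bio *example_private_alloc(int nr_iovecs)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, nr_iovecs, example_bs);

	if (bio)
		bio->bi_destructor = example_destructor;
	return bio;
}
#endif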

static void __init biovec_init_slabs(void)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		int size;
		struct biovec_slab *bvs = bvec_slabs + i;

		size = bvs->nr_vecs * sizeof(struct bio_vec);
		bvs->slab = kmem_cache_create(bvs->name, size, 0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
	}
}

static int __init init_bio(void)
{
	int megabytes, bvec_pool_entries;
	int scale = BIOVEC_NR_POOLS;

	bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
				     SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);

	biovec_init_slabs();

	megabytes = nr_free_pages() >> (20 - PAGE_SHIFT);

	/*
	 * find out where to start scaling
	 */
	if (megabytes <= 16)
		scale = 0;
	else if (megabytes <= 32)
		scale = 1;
	else if (megabytes <= 64)
		scale = 2;
	else if (megabytes <= 96)
		scale = 3;
	else if (megabytes <= 128)
		scale = 4;

	/*
	 * Limit number of entries reserved -- mempools are only used when
	 * the system is completely unable to allocate memory, so we only
	 * need enough to make progress.
	 */
	bvec_pool_entries = 1 + scale;

	fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
	if (!fs_bio_set)
		panic("bio: can't allocate bios\n");

	bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
						     sizeof(struct bio_pair));
	if (!bio_split_pool)
		panic("bio: can't create split pool\n");

	return 0;
}

subsys_initcall(init_bio);

EXPORT_SYMBOL(bio_alloc);
EXPORT_SYMBOL(bio_put);
EXPORT_SYMBOL(bio_free);
EXPORT_SYMBOL(bio_endio);
EXPORT_SYMBOL(bio_init);
EXPORT_SYMBOL(__bio_clone);
EXPORT_SYMBOL(bio_clone);
EXPORT_SYMBOL(bio_phys_segments);
EXPORT_SYMBOL(bio_hw_segments);
EXPORT_SYMBOL(bio_add_page);
EXPORT_SYMBOL(bio_add_pc_page);
EXPORT_SYMBOL(bio_get_nr_vecs);
EXPORT_SYMBOL(bio_map_user);
EXPORT_SYMBOL(bio_unmap_user);
EXPORT_SYMBOL(bio_map_kern);
EXPORT_SYMBOL(bio_pair_release);
EXPORT_SYMBOL(bio_split);
EXPORT_SYMBOL(bio_split_pool);
EXPORT_SYMBOL(bio_copy_user);
EXPORT_SYMBOL(bio_uncopy_user);
EXPORT_SYMBOL(bioset_create);
EXPORT_SYMBOL(bioset_free);
EXPORT_SYMBOL(bio_alloc_bioset);