Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at commit 77b2555b52a894a2e39a42e43d993df875c46a6a (1242 lines, 29 kB)

/*
 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
 *
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <scsi/sg.h>		/* for struct sg_iovec */

#define BIO_POOL_SIZE 256

static kmem_cache_t *bio_slab;

#define BIOVEC_NR_POOLS 6

/*
 * a small number of entries is fine, not going to be performance critical.
 * basically we just need to survive
 */
#define BIO_SPLIT_ENTRIES 8
mempool_t *bio_split_pool;

struct biovec_slab {
	int nr_vecs;
	char *name;
	kmem_cache_t *slab;
};

/*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
 * unsigned short
 */

#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV

/*
 * bio_set is used to allow other portions of the IO system to
 * allocate their own private memory pools for bio and iovec structures.
 * These memory pools in turn all allocate from the bio_slab
 * and the bvec_slabs[].
 */
struct bio_set {
	mempool_t *bio_pool;
	mempool_t *bvec_pools[BIOVEC_NR_POOLS];
};

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
static struct bio_set *fs_bio_set;

static inline struct bio_vec *bvec_alloc_bs(unsigned int __nocast gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
	struct bio_vec *bvl;
	struct biovec_slab *bp;

	/*
	 * see comment near bvec_array define!
	 */
	switch (nr) {
		case   1        : *idx = 0; break;
		case   2 ...   4: *idx = 1; break;
		case   5 ...  16: *idx = 2; break;
		case  17 ...  64: *idx = 3; break;
		case  65 ... 128: *idx = 4; break;
		case 129 ... BIO_MAX_PAGES: *idx = 5; break;
		default:
			return NULL;
	}
	/*
	 * idx now points to the pool we want to allocate from
	 */

	bp = bvec_slabs + *idx;
	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
	if (bvl)
		memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));

	return bvl;
}

void bio_free(struct bio *bio, struct bio_set *bio_set)
{
	const int pool_idx = BIO_POOL_IDX(bio);

	BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);

	mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
	mempool_free(bio, bio_set->bio_pool);
}

/*
 * default destructor for a bio allocated with bio_alloc_bioset()
 */
static void bio_fs_destructor(struct bio *bio)
{
	bio_free(bio, fs_bio_set);
}

inline void bio_init(struct bio *bio)
{
	bio->bi_next = NULL;
	bio->bi_flags = 1 << BIO_UPTODATE;
	bio->bi_rw = 0;
	bio->bi_vcnt = 0;
	bio->bi_idx = 0;
	bio->bi_phys_segments = 0;
	bio->bi_hw_segments = 0;
	bio->bi_hw_front_size = 0;
	bio->bi_hw_back_size = 0;
	bio->bi_size = 0;
	bio->bi_max_vecs = 0;
	bio->bi_end_io = NULL;
	atomic_set(&bio->bi_cnt, 1);
	bio->bi_private = NULL;
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask: the GFP_ mask given to the slab allocator
 * @nr_iovecs: number of iovecs to pre-allocate
 * @bs: the bio_set to allocate from
 *
 * Description:
 *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
 *   If %__GFP_WAIT is set then we will block on the internal pool waiting
 *   for a &struct bio to become free.
 *
 *   allocate bio and iovecs from the memory pools specified by the
 *   bio_set structure.
 **/
struct bio *bio_alloc_bioset(unsigned int __nocast gfp_mask, int nr_iovecs, struct bio_set *bs)
{
	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);

	if (likely(bio)) {
		struct bio_vec *bvl = NULL;

		bio_init(bio);
		if (likely(nr_iovecs)) {
			unsigned long idx;

			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
			if (unlikely(!bvl)) {
				mempool_free(bio, bs->bio_pool);
				bio = NULL;
				goto out;
			}
			bio->bi_flags |= idx << BIO_POOL_OFFSET;
			bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
		}
		bio->bi_io_vec = bvl;
	}
out:
	return bio;
}

struct bio *bio_alloc(unsigned int __nocast gfp_mask, int nr_iovecs)
{
	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);

	if (bio)
		bio->bi_destructor = bio_fs_destructor;

	return bio;
}
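
/*
 * Illustrative sketch (not part of bio.c): the minimal allocate/release
 * cycle a caller of bio_alloc() goes through.  The helper name
 * example_alloc_and_release() is hypothetical; it relies only on
 * bio_alloc()/bio_put() as defined in this file and on GFP_ flags.
 */
static int example_alloc_and_release(void)
{
	/* reserve room for up to 4 biovec entries; GFP_KERNEL may sleep */
	struct bio *bio = bio_alloc(GFP_KERNEL, 4);

	if (!bio)
		return -ENOMEM;

	/*
	 * bi_cnt starts at 1 (see bio_init()), so a single bio_put()
	 * drops the last reference and runs bio_fs_destructor().
	 */
	bio_put(bio);
	return 0;
}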

void zero_fill_bio(struct bio *bio)
{
	unsigned long flags;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment(bv, bio, i) {
		char *data = bvec_kmap_irq(bv, &flags);
		memset(data, 0, bv->bv_len);
		flush_dcache_page(bv->bv_page);
		bvec_kunmap_irq(data, &flags);
	}
}
EXPORT_SYMBOL(zero_fill_bio);

/**
 * bio_put - release a reference to a bio
 * @bio: bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc or bio_get. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

	/*
	 * last put frees it
	 */
	if (atomic_dec_and_test(&bio->bi_cnt)) {
		bio->bi_next = NULL;
		bio->bi_destructor(bio);
	}
}

inline int bio_phys_segments(request_queue_t *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_phys_segments;
}

inline int bio_hw_segments(request_queue_t *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_hw_segments;
}

/**
 * __bio_clone - clone a bio
 * @bio: destination bio
 * @bio_src: bio to clone
 *
 * Clone a &bio. Caller will own the returned bio, but not
 * the actual data it points to. Reference count of returned
 * bio will be one.
 */
inline void __bio_clone(struct bio *bio, struct bio *bio_src)
{
	request_queue_t *q = bdev_get_queue(bio_src->bi_bdev);

	memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
		bio_src->bi_max_vecs * sizeof(struct bio_vec));

	bio->bi_sector = bio_src->bi_sector;
	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_flags |= 1 << BIO_CLONED;
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_vcnt = bio_src->bi_vcnt;
	bio->bi_size = bio_src->bi_size;
	bio->bi_idx = bio_src->bi_idx;
	bio_phys_segments(q, bio);
	bio_hw_segments(q, bio);
}

/**
 * bio_clone - clone a bio
 * @bio: bio to clone
 * @gfp_mask: allocation priority
 *
 * Like __bio_clone, only also allocates the returned bio
 */
struct bio *bio_clone(struct bio *bio, unsigned int __nocast gfp_mask)
{
	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);

	if (b) {
		b->bi_destructor = bio_fs_destructor;
		__bio_clone(b, bio);
	}

	return b;
}
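
/*
 * Illustrative sketch (not part of bio.c): the clone-and-redirect pattern a
 * stacking driver (md/dm style) might build on bio_clone().  The names
 * example_redirect() and example_clone_end_io() are hypothetical; only
 * bio_clone(), bio_endio(), bio_put() and generic_make_request() (declared
 * by the block headers this file already includes) are real interfaces.
 */
static int example_clone_end_io(struct bio *clone, unsigned int bytes_done,
				int error)
{
	struct bio *orig = clone->bi_private;

	if (clone->bi_size)
		return 1;		/* partial completion, wait for the rest */

	/* whole clone done: complete the original request, drop the clone */
	bio_endio(orig, orig->bi_size, error);
	bio_put(clone);
	return 0;
}

static void example_redirect(struct bio *bio, struct block_device *target,
			     sector_t offset)
{
	/* private copy of the vec table; the data pages are shared */
	struct bio *clone = bio_clone(bio, GFP_NOIO);

	if (!clone) {
		bio_endio(bio, bio->bi_size, -ENOMEM);
		return;
	}

	clone->bi_bdev = target;	/* real backing device */
	clone->bi_sector += offset;	/* remap the start sector */
	clone->bi_private = bio;
	clone->bi_end_io = example_clone_end_io;

	generic_make_request(clone);
}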

/**
 * bio_get_nr_vecs - return approx number of vecs
 * @bdev: I/O target
 *
 * Return the approximate number of pages we can send to this target.
 * There's no guarantee that you will be able to fit this number of pages
 * into a bio, it does not account for dynamic restrictions that vary
 * on offset.
 */
int bio_get_nr_vecs(struct block_device *bdev)
{
	request_queue_t *q = bdev_get_queue(bdev);
	int nr_pages;

	nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (nr_pages > q->max_phys_segments)
		nr_pages = q->max_phys_segments;
	if (nr_pages > q->max_hw_segments)
		nr_pages = q->max_hw_segments;

	return nr_pages;
}

static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
			  *page, unsigned int len, unsigned int offset)
{
	int retried_segments = 0;
	struct bio_vec *bvec;

	/*
	 * cloned bio must not modify vec list
	 */
	if (unlikely(bio_flagged(bio, BIO_CLONED)))
		return 0;

	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return 0;

	if (((bio->bi_size + len) >> 9) > q->max_sectors)
		return 0;

	/*
	 * we might lose a segment or two here, but rather that than
	 * make this too complex.
	 */

	while (bio->bi_phys_segments >= q->max_phys_segments
	       || bio->bi_hw_segments >= q->max_hw_segments
	       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {

		if (retried_segments)
			return 0;

		retried_segments = 1;
		blk_recount_segments(q, bio);
	}

	/*
	 * setup the new entry, we might clear it again later if we
	 * cannot add the page
	 */
	bvec = &bio->bi_io_vec[bio->bi_vcnt];
	bvec->bv_page = page;
	bvec->bv_len = len;
	bvec->bv_offset = offset;

	/*
	 * if queue has other restrictions (eg varying max sector size
	 * depending on offset), it can specify a merge_bvec_fn in the
	 * queue to get further control
	 */
	if (q->merge_bvec_fn) {
		/*
		 * merge_bvec_fn() returns number of bytes it can accept
		 * at this offset
		 */
		if (q->merge_bvec_fn(q, bio, bvec) < len) {
			bvec->bv_page = NULL;
			bvec->bv_len = 0;
			bvec->bv_offset = 0;
			return 0;
		}
	}

	/* If we may be able to merge these biovecs, force a recount */
	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
		bio->bi_flags &= ~(1 << BIO_SEG_VALID);

	bio->bi_vcnt++;
	bio->bi_phys_segments++;
	bio->bi_hw_segments++;
	bio->bi_size += len;
	return len;
}

/**
 * bio_add_page - attempt to add page to bio
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block
 * device limitations. The target block device must allow bios
 * smaller than PAGE_SIZE, so it is always possible to add a single
 * page to an empty bio.
 */
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
		 unsigned int offset)
{
	return __bio_add_page(bdev_get_queue(bio->bi_bdev), bio, page,
			      len, offset);
}
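
/*
 * Illustrative sketch (not part of bio.c): building a bio from an array of
 * pages and submitting it.  example_submit_pages(), its end_io handler and
 * its parameters are hypothetical; bio_get_nr_vecs() only gives an upper
 * bound, so the bio_add_page() return value must still be checked.
 */
static int example_pages_end_io(struct bio *bio, unsigned int bytes_done,
				int error)
{
	if (bio->bi_size)
		return 1;		/* more completions still to come */

	/* whole bio done: report status to the submitter here, then drop it */
	bio_put(bio);
	return 0;
}

static int example_submit_pages(struct block_device *bdev, sector_t sector,
				struct page **pages, int nr_pages, int rw)
{
	int nr_vecs = min(nr_pages, bio_get_nr_vecs(bdev));
	struct bio *bio = bio_alloc(GFP_NOIO, nr_vecs);
	int i;

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = example_pages_end_io;

	for (i = 0; i < nr_vecs; i++) {
		/* stop at the first page the queue limits refuse to take */
		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
			break;
	}

	if (!bio->bi_size) {		/* could not add even one page */
		bio_put(bio);
		return -EIO;
	}

	submit_bio(rw, bio);
	return i;			/* number of pages actually queued */
}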

struct bio_map_data {
	struct bio_vec *iovecs;
	void __user *userptr;
};

static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
{
	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
	bio->bi_private = bmd;
}

static void bio_free_map_data(struct bio_map_data *bmd)
{
	kfree(bmd->iovecs);
	kfree(bmd);
}

static struct bio_map_data *bio_alloc_map_data(int nr_segs)
{
	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);

	if (!bmd)
		return NULL;

	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
	if (bmd->iovecs)
		return bmd;

	kfree(bmd);
	return NULL;
}

/**
 * bio_uncopy_user - finish previously mapped bio
 * @bio: bio being terminated
 *
 * Free pages allocated from bio_copy_user() and write back data
 * to user space in case of a read.
 */
int bio_uncopy_user(struct bio *bio)
{
	struct bio_map_data *bmd = bio->bi_private;
	const int read = bio_data_dir(bio) == READ;
	struct bio_vec *bvec;
	int i, ret = 0;

	__bio_for_each_segment(bvec, bio, i, 0) {
		char *addr = page_address(bvec->bv_page);
		unsigned int len = bmd->iovecs[i].bv_len;

		if (read && !ret && copy_to_user(bmd->userptr, addr, len))
			ret = -EFAULT;

		__free_page(bvec->bv_page);
		bmd->userptr += len;
	}
	bio_free_map_data(bmd);
	bio_put(bio);
	return ret;
}

/**
 * bio_copy_user - copy user data to bio
 * @q: destination block queue
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with a call to
 * bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr,
			  unsigned int len, int write_to_vm)
{
	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = uaddr >> PAGE_SHIFT;
	struct bio_map_data *bmd;
	struct bio_vec *bvec;
	struct page *page;
	struct bio *bio;
	int i, ret;

	bmd = bio_alloc_map_data(end - start);
	if (!bmd)
		return ERR_PTR(-ENOMEM);

	bmd->userptr = (void __user *) uaddr;

	ret = -ENOMEM;
	bio = bio_alloc(GFP_KERNEL, end - start);
	if (!bio)
		goto out_bmd;

	bio->bi_rw |= (!write_to_vm << BIO_RW);

	ret = 0;
	while (len) {
		unsigned int bytes = PAGE_SIZE;

		if (bytes > len)
			bytes = len;

		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
		if (!page) {
			ret = -ENOMEM;
			break;
		}

		if (__bio_add_page(q, bio, page, bytes, 0) < bytes) {
			ret = -EINVAL;
			break;
		}

		len -= bytes;
	}

	if (ret)
		goto cleanup;

	/*
	 * success
	 */
	if (!write_to_vm) {
		char __user *p = (char __user *) uaddr;

		/*
		 * for a write, copy in data to kernel pages
		 */
		ret = -EFAULT;
		bio_for_each_segment(bvec, bio, i) {
			char *addr = page_address(bvec->bv_page);

			if (copy_from_user(addr, p, bvec->bv_len))
				goto cleanup;
			p += bvec->bv_len;
		}
	}

	bio_set_map_data(bmd, bio);
	return bio;
cleanup:
	bio_for_each_segment(bvec, bio, i)
		__free_page(bvec->bv_page);

	bio_put(bio);
out_bmd:
	bio_free_map_data(bmd);
	return ERR_PTR(ret);
}
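
/*
 * Illustrative sketch (not part of bio.c): the bounce-buffer style transfer
 * that bio_copy_user()/bio_uncopy_user() are designed to be paired for.
 * example_indirect_read() is hypothetical and elides the actual submission
 * and wait; it only shows the pairing and the write_to_vm convention.
 */
static int example_indirect_read(struct block_device *bdev,
				 unsigned long uaddr, unsigned int len,
				 sector_t sector)
{
	request_queue_t *q = bdev_get_queue(bdev);
	struct bio *bio;

	/* write_to_vm != 0: device data will be written back to user memory */
	bio = bio_copy_user(q, uaddr, len, 1);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;

	/* ... submit the bio and wait for it to complete here ... */

	/*
	 * bio_uncopy_user() copies the bounce pages back to user space
	 * (because this bio's direction is READ), frees them and drops
	 * the bio.
	 */
	return bio_uncopy_user(bio);
}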

static struct bio *__bio_map_user_iov(request_queue_t *q,
				      struct block_device *bdev,
				      struct sg_iovec *iov, int iov_count,
				      int write_to_vm)
{
	int i, j;
	int nr_pages = 0;
	struct page **pages;
	struct bio *bio;
	int cur_page = 0;
	int ret, offset;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;

		nr_pages += end - start;
		/*
		 * transfer and buffer must be aligned to at least hardsector
		 * size for now, in the future we can relax this restriction
		 */
		if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
			return ERR_PTR(-EINVAL);
	}

	if (!nr_pages)
		return ERR_PTR(-EINVAL);

	bio = bio_alloc(GFP_KERNEL, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto out;

	memset(pages, 0, nr_pages * sizeof(struct page *));

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;
		const int local_nr_pages = end - start;
		const int page_limit = cur_page + local_nr_pages;

		down_read(&current->mm->mmap_sem);
		ret = get_user_pages(current, current->mm, uaddr,
				     local_nr_pages,
				     write_to_vm, 0, &pages[cur_page], NULL);
		up_read(&current->mm->mmap_sem);

		if (ret < local_nr_pages)
			goto out_unmap;

		offset = uaddr & ~PAGE_MASK;
		for (j = cur_page; j < page_limit; j++) {
			unsigned int bytes = PAGE_SIZE - offset;

			if (len <= 0)
				break;

			if (bytes > len)
				bytes = len;

			/*
			 * sorry...
			 */
			if (__bio_add_page(q, bio, pages[j], bytes, offset) < bytes)
				break;

			len -= bytes;
			offset = 0;
		}

		cur_page = j;
		/*
		 * release the pages we didn't map into the bio, if any
		 */
		while (j < page_limit)
			page_cache_release(pages[j++]);
	}

	kfree(pages);

	/*
	 * set data direction, and check if mapped pages need bouncing
	 */
	if (!write_to_vm)
		bio->bi_rw |= (1 << BIO_RW);

	bio->bi_bdev = bdev;
	bio->bi_flags |= (1 << BIO_USER_MAPPED);
	return bio;

 out_unmap:
	for (i = 0; i < nr_pages; i++) {
		if (!pages[i])
			break;
		page_cache_release(pages[i]);
	}
 out:
	kfree(pages);
	bio_put(bio);
	return ERR_PTR(ret);
}

/**
 * bio_map_user - map user address into bio
 * @q: the request_queue_t for the bio
 * @bdev: destination block device
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
			 unsigned long uaddr, unsigned int len, int write_to_vm)
{
	struct sg_iovec iov;

	iov.iov_base = (void __user *)uaddr;
	iov.iov_len = len;

	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
}
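
/*
 * Illustrative sketch (not part of bio.c): zero-copy mapping of a user
 * buffer, in the style of an SG_IO type ioctl path.  example_map_user_read()
 * is hypothetical; note the buffer address and length must satisfy
 * queue_dma_alignment(), or bio_map_user() returns ERR_PTR(-EINVAL).
 */
static int example_map_user_read(struct block_device *bdev,
				 unsigned long uaddr, unsigned int len,
				 sector_t sector)
{
	request_queue_t *q = bdev_get_queue(bdev);
	struct bio *bio;

	/* pins the user pages with get_user_pages() and builds the bio */
	bio = bio_map_user(q, bdev, uaddr, len, 1);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_sector = sector;	/* bi_bdev was already set for us */

	/* ... submit the bio and wait for it to complete here ... */

	/* dirties the pages we read into and unpins them */
	bio_unmap_user(bio);
	return 0;
}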

/**
 * bio_map_user_iov - map user sg_iovec table into bio
 * @q: the request_queue_t for the bio
 * @bdev: destination block device
 * @iov: the iovec.
 * @iov_count: number of elements in the iovec
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user_iov(request_queue_t *q, struct block_device *bdev,
			     struct sg_iovec *iov, int iov_count,
			     int write_to_vm)
{
	struct bio *bio;
	int len = 0, i;

	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);

	if (IS_ERR(bio))
		return bio;

	/*
	 * subtle -- if __bio_map_user() ended up bouncing a bio,
	 * it would normally disappear when its bi_end_io is run.
	 * however, we need it for the unmap, so grab an extra
	 * reference to it
	 */
	bio_get(bio);

	for (i = 0; i < iov_count; i++)
		len += iov[i].iov_len;

	if (bio->bi_size == len)
		return bio;

	/*
	 * don't support partial mappings
	 */
	bio_endio(bio, bio->bi_size, 0);
	bio_unmap_user(bio);
	return ERR_PTR(-EINVAL);
}

static void __bio_unmap_user(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	/*
	 * make sure we dirty pages we wrote to
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		if (bio_data_dir(bio) == READ)
			set_page_dirty_lock(bvec->bv_page);

		page_cache_release(bvec->bv_page);
	}

	bio_put(bio);
}

/**
 * bio_unmap_user - unmap a bio
 * @bio: the bio being unmapped
 *
 * Unmap a bio previously mapped by bio_map_user(). Must be called with
 * a process context.
 *
 * bio_unmap_user() may sleep.
 */
void bio_unmap_user(struct bio *bio)
{
	__bio_unmap_user(bio);
	bio_put(bio);
}

static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	bio_put(bio);
	return 0;
}

static struct bio *__bio_map_kern(request_queue_t *q, void *data,
				  unsigned int len, unsigned int gfp_mask)
{
	unsigned long kaddr = (unsigned long)data;
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
	int offset, i;
	struct bio *bio;

	bio = bio_alloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	offset = offset_in_page(kaddr);
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;

		if (len <= 0)
			break;

		if (bytes > len)
			bytes = len;

		if (__bio_add_page(q, bio, virt_to_page(data), bytes,
				   offset) < bytes)
			break;

		data += bytes;
		len -= bytes;
		offset = 0;
	}

	bio->bi_end_io = bio_map_kern_endio;
	return bio;
}

/**
 * bio_map_kern - map kernel address into bio
 * @q: the request_queue_t for the bio
 * @data: pointer to buffer to map
 * @len: length in bytes
 * @gfp_mask: allocation flags for bio allocation
 *
 * Map the kernel address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_kern(request_queue_t *q, void *data, unsigned int len,
			 unsigned int gfp_mask)
{
	struct bio *bio;

	bio = __bio_map_kern(q, data, len, gfp_mask);
	if (IS_ERR(bio))
		return bio;

	if (bio->bi_size == len)
		return bio;

	/*
	 * Don't support partial mappings.
	 */
	bio_put(bio);
	return ERR_PTR(-EINVAL);
}
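
/*
 * Illustrative sketch (not part of bio.c): mapping a kernel buffer for
 * block I/O with bio_map_kern().  example_map_kernel_buffer() is a
 * hypothetical helper; the buffer must be addressable with
 * virt_to_page(), i.e. not vmalloc()ed memory.
 */
static struct bio *example_map_kernel_buffer(struct block_device *bdev,
					     void *buf, unsigned int len,
					     sector_t sector)
{
	request_queue_t *q = bdev_get_queue(bdev);
	struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);

	if (IS_ERR(bio))
		return bio;

	/* bio_map_kern() filled in the pages; finish the addressing here */
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;

	/*
	 * bi_end_io is already bio_map_kern_endio(), which simply drops
	 * the bio once the whole transfer has completed.
	 */
	return bio;
}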

/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe. So what we can do is to
 * mark the pages dirty _before_ performing IO. And in interrupt context,
 * check that the pages are still dirty. If so, fine. If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages. The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all. So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages(). This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, pdflush) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page && !PageCompound(page))
			set_page_dirty_lock(page);
	}
}

static void bio_release_pages(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page)
			put_page(page);
	}
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine. If, however, some pages are clean then they must
 * have been written out during the direct-IO read. So we take another ref on
 * the BIO and the offending pages and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on. It will run one page_cache_release() against each page and will
 * run one bio_put() against the BIO.
 */

static void bio_dirty_fn(void *data);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn, NULL);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(void *data)
{
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&bio_dirty_lock, flags);
	bio = bio_dirty_list;
	bio_dirty_list = NULL;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);

	while (bio) {
		struct bio *next = bio->bi_private;

		bio_set_pages_dirty(bio);
		bio_release_pages(bio);
		bio_put(bio);
		bio = next;
	}
}

void bio_check_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int nr_clean_pages = 0;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (PageDirty(page) || PageCompound(page)) {
			page_cache_release(page);
			bvec[i].bv_page = NULL;
		} else {
			nr_clean_pages++;
		}
	}

	if (nr_clean_pages) {
		unsigned long flags;

		spin_lock_irqsave(&bio_dirty_lock, flags);
		bio->bi_private = bio_dirty_list;
		bio_dirty_list = bio;
		spin_unlock_irqrestore(&bio_dirty_lock, flags);
		schedule_work(&bio_dirty_work);
	} else {
		bio_put(bio);
	}
}
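
/*
 * Illustrative sketch (not part of bio.c): how a direct-IO read path would
 * use the two helpers above, following the comment block.  The names
 * example_dio_submit() and example_dio_end_io() are hypothetical stand-ins
 * for what fs/direct-io.c does with a fully built, user-page backed bio.
 */
static int example_dio_end_io(struct bio *bio, unsigned int bytes_done,
			      int error)
{
	if (bio->bi_size)
		return 1;

	/*
	 * Interrupt context: we cannot call set_page_dirty() here, so only
	 * re-dirty (via the workqueue) the pages that were cleaned while
	 * the read was in flight.  This also releases the pages and the bio.
	 */
	bio_check_pages_dirty(bio);
	return 0;
}

static void example_dio_submit(struct bio *bio)
{
	bio->bi_end_io = example_dio_end_io;

	/* mark the pages dirty up front, while we may still take the locks */
	bio_set_pages_dirty(bio);
	submit_bio(READ, bio);
}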

/**
 * bio_endio - end I/O on a bio
 * @bio: bio
 * @bytes_done: number of bytes completed
 * @error: error, if any
 *
 * Description:
 *   bio_endio() will end I/O on @bytes_done number of bytes. This may be
 *   just a part of the bio, or it may be the whole bio. bio_endio()
 *   is the preferred way to end I/O on a bio, it takes care of decrementing
 *   bi_size and clearing BIO_UPTODATE on error. @error is 0 on success, and
 *   one of the established -Exxxx (-EIO, for instance) error values in
 *   case something went wrong. No one should call bi_end_io() directly on
 *   a bio unless they own it and thus know that it has an end_io function.
 **/
void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);

	if (unlikely(bytes_done > bio->bi_size)) {
		printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
			bytes_done, bio->bi_size);
		bytes_done = bio->bi_size;
	}

	bio->bi_size -= bytes_done;
	bio->bi_sector += (bytes_done >> 9);

	if (bio->bi_end_io)
		bio->bi_end_io(bio, bytes_done, error);
}
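
/*
 * Illustrative sketch (not part of bio.c): the completion convention that
 * bio_endio() implies for bi_end_io handlers in this kernel.  A driver may
 * complete a bio in several bio_endio() calls (or all at once with
 * bio_endio(bio, bio->bi_size, 0)); the handler sees every call but should
 * only finish up once bi_size has dropped to zero.  example_end_io() is a
 * hypothetical handler.
 */
static int example_end_io(struct bio *bio, unsigned int bytes_done, int error)
{
	/* bio_endio() already subtracted bytes_done from bi_size */
	if (bio->bi_size)
		return 1;		/* partial completion, keep waiting */

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		printk(KERN_ERR "example: I/O error %d\n", error);

	bio_put(bio);			/* drop the submitter's reference */
	return 0;
}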

void bio_pair_release(struct bio_pair *bp)
{
	if (atomic_dec_and_test(&bp->cnt)) {
		struct bio *master = bp->bio1.bi_private;

		bio_endio(master, master->bi_size, bp->error);
		mempool_free(bp, bp->bio2.bi_private);
	}
}

static int bio_pair_end_1(struct bio *bi, unsigned int done, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);

	if (err)
		bp->error = err;

	if (bi->bi_size)
		return 1;

	bio_pair_release(bp);
	return 0;
}

static int bio_pair_end_2(struct bio *bi, unsigned int done, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);

	if (err)
		bp->error = err;

	if (bi->bi_size)
		return 1;

	bio_pair_release(bp);
	return 0;
}

/*
 * split a bio - only worry about a bio with a single page
 * in its iovec
 */
struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
{
	struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);

	if (!bp)
		return bp;

	BUG_ON(bi->bi_vcnt != 1);
	BUG_ON(bi->bi_idx != 0);
	atomic_set(&bp->cnt, 3);
	bp->error = 0;
	bp->bio1 = *bi;
	bp->bio2 = *bi;
	bp->bio2.bi_sector += first_sectors;
	bp->bio2.bi_size -= first_sectors << 9;
	bp->bio1.bi_size = first_sectors << 9;

	bp->bv1 = bi->bi_io_vec[0];
	bp->bv2 = bi->bi_io_vec[0];
	bp->bv2.bv_offset += first_sectors << 9;
	bp->bv2.bv_len -= first_sectors << 9;
	bp->bv1.bv_len = first_sectors << 9;

	bp->bio1.bi_io_vec = &bp->bv1;
	bp->bio2.bi_io_vec = &bp->bv2;

	bp->bio1.bi_end_io = bio_pair_end_1;
	bp->bio2.bi_end_io = bio_pair_end_2;

	bp->bio1.bi_private = bi;
	bp->bio2.bi_private = pool;

	return bp;
}

static void *bio_pair_alloc(unsigned int __nocast gfp_flags, void *data)
{
	return kmalloc(sizeof(struct bio_pair), gfp_flags);
}

static void bio_pair_free(void *bp, void *data)
{
	kfree(bp);
}

/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		struct biovec_slab *bp = bvec_slabs + i;
		mempool_t **bvp = bs->bvec_pools + i;

		if (i >= scale)
			pool_entries >>= 1;

		*bvp = mempool_create(pool_entries, mempool_alloc_slab,
					mempool_free_slab, bp->slab);
		if (!*bvp)
			return -ENOMEM;
	}
	return 0;
}

static void biovec_free_pools(struct bio_set *bs)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		mempool_t *bvp = bs->bvec_pools[i];

		if (bvp)
			mempool_destroy(bvp);
	}
}

void bioset_free(struct bio_set *bs)
{
	if (bs->bio_pool)
		mempool_destroy(bs->bio_pool);

	biovec_free_pools(bs);

	kfree(bs);
}

struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
{
	struct bio_set *bs = kmalloc(sizeof(*bs), GFP_KERNEL);

	if (!bs)
		return NULL;

	memset(bs, 0, sizeof(*bs));
	bs->bio_pool = mempool_create(bio_pool_size, mempool_alloc_slab,
			mempool_free_slab, bio_slab);

	if (!bs->bio_pool)
		goto bad;

	if (!biovec_create_pools(bs, bvec_pool_size, scale))
		return bs;

bad:
	bioset_free(bs);
	return NULL;
}
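
/*
 * Illustrative sketch (not part of bio.c): a driver setting up a private
 * bio_set so its allocations do not compete with fs_bio_set, which is what
 * the bio_set/bioset_create() machinery above exists for.  The names
 * example_bio_set, example_init_pools(), example_destructor() and
 * example_alloc() are made up; the pool sizes are arbitrary.
 */
static struct bio_set *example_bio_set;

static int example_init_pools(void)
{
	/* 32 reserved bios; biovec pools shrink past index 2 (the scale arg) */
	example_bio_set = bioset_create(32, 8, 2);
	if (!example_bio_set)
		return -ENOMEM;
	return 0;
}

static void example_destructor(struct bio *bio)
{
	/* return the bio and its biovec to our private pools */
	bio_free(bio, example_bio_set);
}

static struct bio *example_alloc(int nr_vecs)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, nr_vecs, example_bio_set);

	/*
	 * Unlike bio_alloc(), no destructor is installed for us; the owner
	 * of a private bio_set must set bi_destructor so that the final
	 * bio_put() frees the bio via bio_free() against the right bio_set.
	 */
	if (bio)
		bio->bi_destructor = example_destructor;
	return bio;
}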

static void __init biovec_init_slabs(void)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		int size;
		struct biovec_slab *bvs = bvec_slabs + i;

		size = bvs->nr_vecs * sizeof(struct bio_vec);
		bvs->slab = kmem_cache_create(bvs->name, size, 0,
				SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
	}
}

static int __init init_bio(void)
{
	int megabytes, bvec_pool_entries;
	int scale = BIOVEC_NR_POOLS;

	bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
				SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);

	biovec_init_slabs();

	megabytes = nr_free_pages() >> (20 - PAGE_SHIFT);

	/*
	 * find out where to start scaling
	 */
	if (megabytes <= 16)
		scale = 0;
	else if (megabytes <= 32)
		scale = 1;
	else if (megabytes <= 64)
		scale = 2;
	else if (megabytes <= 96)
		scale = 3;
	else if (megabytes <= 128)
		scale = 4;

	/*
	 * scale number of entries
	 */
	bvec_pool_entries = megabytes * 2;
	if (bvec_pool_entries > 256)
		bvec_pool_entries = 256;

	fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
	if (!fs_bio_set)
		panic("bio: can't allocate bios\n");

	bio_split_pool = mempool_create(BIO_SPLIT_ENTRIES,
				bio_pair_alloc, bio_pair_free, NULL);
	if (!bio_split_pool)
		panic("bio: can't create split pool\n");

	return 0;
}

subsys_initcall(init_bio);

EXPORT_SYMBOL(bio_alloc);
EXPORT_SYMBOL(bio_put);
EXPORT_SYMBOL(bio_free);
EXPORT_SYMBOL(bio_endio);
EXPORT_SYMBOL(bio_init);
EXPORT_SYMBOL(__bio_clone);
EXPORT_SYMBOL(bio_clone);
EXPORT_SYMBOL(bio_phys_segments);
EXPORT_SYMBOL(bio_hw_segments);
EXPORT_SYMBOL(bio_add_page);
EXPORT_SYMBOL(bio_get_nr_vecs);
EXPORT_SYMBOL(bio_map_user);
EXPORT_SYMBOL(bio_unmap_user);
EXPORT_SYMBOL(bio_map_kern);
EXPORT_SYMBOL(bio_pair_release);
EXPORT_SYMBOL(bio_split);
EXPORT_SYMBOL(bio_split_pool);
EXPORT_SYMBOL(bio_copy_user);
EXPORT_SYMBOL(bio_uncopy_user);
EXPORT_SYMBOL(bioset_create);
EXPORT_SYMBOL(bioset_free);
EXPORT_SYMBOL(bio_alloc_bioset);