Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git, at v2.6.13-rc4
/*
 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
 *
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>

#define BIO_POOL_SIZE 256

static kmem_cache_t *bio_slab;

#define BIOVEC_NR_POOLS 6

/*
 * a small number of entries is fine, not going to be performance critical.
 * basically we just need to survive
 */
#define BIO_SPLIT_ENTRIES 8
mempool_t *bio_split_pool;

struct biovec_slab {
	int nr_vecs;
	char *name;
	kmem_cache_t *slab;
};

/*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
 * unsigned short
 */

#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV

/*
 * bio_set is used to allow other portions of the IO system to
 * allocate their own private memory pools for bio and iovec structures.
 * These memory pools in turn all allocate from the bio_slab
 * and the bvec_slabs[].
 */
struct bio_set {
	mempool_t *bio_pool;
	mempool_t *bvec_pools[BIOVEC_NR_POOLS];
};

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
static struct bio_set *fs_bio_set;

static inline struct bio_vec *bvec_alloc_bs(unsigned int __nocast gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
	struct bio_vec *bvl;
	struct biovec_slab *bp;

	/*
	 * see comment near bvec_array define!
	 */
	switch (nr) {
		case   1        : *idx = 0; break;
		case   2 ...   4: *idx = 1; break;
		case   5 ...  16: *idx = 2; break;
		case  17 ...  64: *idx = 3; break;
		case  65 ... 128: *idx = 4; break;
		case 129 ... BIO_MAX_PAGES: *idx = 5; break;
		default:
			return NULL;
	}
	/*
	 * idx now points to the pool we want to allocate from
	 */

	bp = bvec_slabs + *idx;
	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
	if (bvl)
		memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));

	return bvl;
}
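/*
 * Worked example of the mapping above: a request for, say, 30 vecs falls in
 * the "17 ... 64" range, so *idx becomes 3 and the vectors come from the
 * biovec-64 pool; bio_alloc_bioset() below then rounds bi_max_vecs up to 64
 * (bvec_slabs[3].nr_vecs) for the caller.
 */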
/*
 * default destructor for a bio allocated with bio_alloc_bioset()
 */
static void bio_destructor(struct bio *bio)
{
	const int pool_idx = BIO_POOL_IDX(bio);
	struct bio_set *bs = bio->bi_set;

	BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);

	mempool_free(bio->bi_io_vec, bs->bvec_pools[pool_idx]);
	mempool_free(bio, bs->bio_pool);
}

inline void bio_init(struct bio *bio)
{
	bio->bi_next = NULL;
	bio->bi_flags = 1 << BIO_UPTODATE;
	bio->bi_rw = 0;
	bio->bi_vcnt = 0;
	bio->bi_idx = 0;
	bio->bi_phys_segments = 0;
	bio->bi_hw_segments = 0;
	bio->bi_hw_front_size = 0;
	bio->bi_hw_back_size = 0;
	bio->bi_size = 0;
	bio->bi_max_vecs = 0;
	bio->bi_end_io = NULL;
	atomic_set(&bio->bi_cnt, 1);
	bio->bi_private = NULL;
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask: the GFP_ mask given to the slab allocator
 * @nr_iovecs: number of iovecs to pre-allocate
 * @bs: the bio_set to allocate from
 *
 * Description:
 *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
 *   If %__GFP_WAIT is set then we will block on the internal pool waiting
 *   for a &struct bio to become free.
 *
 *   allocate bio and iovecs from the memory pools specified by the
 *   bio_set structure.
 **/
struct bio *bio_alloc_bioset(unsigned int __nocast gfp_mask, int nr_iovecs, struct bio_set *bs)
{
	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);

	if (likely(bio)) {
		struct bio_vec *bvl = NULL;

		bio_init(bio);
		if (likely(nr_iovecs)) {
			unsigned long idx;

			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
			if (unlikely(!bvl)) {
				mempool_free(bio, bs->bio_pool);
				bio = NULL;
				goto out;
			}
			bio->bi_flags |= idx << BIO_POOL_OFFSET;
			bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
		}
		bio->bi_io_vec = bvl;
		bio->bi_destructor = bio_destructor;
		bio->bi_set = bs;
	}
out:
	return bio;
}

struct bio *bio_alloc(unsigned int __nocast gfp_mask, int nr_iovecs)
{
	return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
}

void zero_fill_bio(struct bio *bio)
{
	unsigned long flags;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment(bv, bio, i) {
		char *data = bvec_kmap_irq(bv, &flags);
		memset(data, 0, bv->bv_len);
		flush_dcache_page(bv->bv_page);
		bvec_kunmap_irq(data, &flags);
	}
}
EXPORT_SYMBOL(zero_fill_bio);
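/*
 * A minimal sketch of how a caller that wants its own reserves, rather than
 * the shared fs_bio_set used by bio_alloc(), might allocate from a private
 * bio_set. The function name example_private_bioset and the pool sizes are
 * illustrative values only, not taken from any real caller.
 */
static void example_private_bioset(void)
{
	struct bio_set *bs;
	struct bio *bio;

	/* illustrative pool sizes: 32 bios, 4 entries per biovec pool, scale 0 */
	bs = bioset_create(32, 4, 0);
	if (!bs)
		return;

	/* allocate a bio with room for up to 4 iovecs from the private pools */
	bio = bio_alloc_bioset(GFP_NOIO, 4, bs);
	if (bio) {
		/* ... fill in bi_bdev, bi_sector and pages, then drop our reference ... */
		bio_put(bio);
	}

	bioset_free(bs);
}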
/**
 * bio_put - release a reference to a bio
 * @bio: bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc or bio_get. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

	/*
	 * last put frees it
	 */
	if (atomic_dec_and_test(&bio->bi_cnt)) {
		bio->bi_next = NULL;
		bio->bi_destructor(bio);
	}
}

inline int bio_phys_segments(request_queue_t *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_phys_segments;
}

inline int bio_hw_segments(request_queue_t *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_hw_segments;
}

/**
 * __bio_clone - clone a bio
 * @bio: destination bio
 * @bio_src: bio to clone
 *
 * Clone a &bio. Caller will own the returned bio, but not
 * the actual data it points to. Reference count of returned
 * bio will be one.
 */
inline void __bio_clone(struct bio *bio, struct bio *bio_src)
{
	request_queue_t *q = bdev_get_queue(bio_src->bi_bdev);

	memcpy(bio->bi_io_vec, bio_src->bi_io_vec, bio_src->bi_max_vecs * sizeof(struct bio_vec));

	bio->bi_sector = bio_src->bi_sector;
	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_flags |= 1 << BIO_CLONED;
	bio->bi_rw = bio_src->bi_rw;

	/*
	 * notes -- maybe just leave bi_idx alone. assume identical mapping
	 * for the clone
	 */
	bio->bi_vcnt = bio_src->bi_vcnt;
	bio->bi_size = bio_src->bi_size;
	bio->bi_idx = bio_src->bi_idx;
	bio_phys_segments(q, bio);
	bio_hw_segments(q, bio);
}

/**
 * bio_clone - clone a bio
 * @bio: bio to clone
 * @gfp_mask: allocation priority
 *
 * Like __bio_clone, only also allocates the returned bio
 */
struct bio *bio_clone(struct bio *bio, unsigned int __nocast gfp_mask)
{
	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);

	if (b)
		__bio_clone(b, bio);

	return b;
}

/**
 * bio_get_nr_vecs - return approx number of vecs
 * @bdev: I/O target
 *
 * Return the approximate number of pages we can send to this target.
 * There's no guarantee that you will be able to fit this number of pages
 * into a bio, it does not account for dynamic restrictions that vary
 * on offset.
 */
int bio_get_nr_vecs(struct block_device *bdev)
{
	request_queue_t *q = bdev_get_queue(bdev);
	int nr_pages;

	nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (nr_pages > q->max_phys_segments)
		nr_pages = q->max_phys_segments;
	if (nr_pages > q->max_hw_segments)
		nr_pages = q->max_hw_segments;

	return nr_pages;
}
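/*
 * A minimal sketch of the clone API as a stacking driver (md/dm style) might
 * use it: clone the incoming bio, redirect the clone at another device and
 * offset, and complete the original from the clone's end_io. The names
 * example_stacked_end_io and example_remap are illustrative placeholders,
 * not code from any real driver, and the usual block-layer declarations
 * (generic_make_request and friends) are assumed to be in scope.
 */
static int example_stacked_end_io(struct bio *clone, unsigned int bytes_done, int error)
{
	struct bio *orig = clone->bi_private;

	if (clone->bi_size)
		return 1;

	/* pass completion (and any error) up to the original bio */
	bio_endio(orig, orig->bi_size, error);
	bio_put(clone);
	return 0;
}

static void example_remap(struct bio *bio, struct block_device *target, sector_t offset)
{
	struct bio *clone = bio_clone(bio, GFP_NOIO);

	if (!clone) {
		bio_endio(bio, bio->bi_size, -ENOMEM);
		return;
	}

	/* redirect the clone; the data pages are shared with the original */
	clone->bi_bdev = target;
	clone->bi_sector += offset;
	clone->bi_end_io = example_stacked_end_io;
	clone->bi_private = bio;

	generic_make_request(clone);
}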
static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
			  *page, unsigned int len, unsigned int offset)
{
	int retried_segments = 0;
	struct bio_vec *bvec;

	/*
	 * cloned bio must not modify vec list
	 */
	if (unlikely(bio_flagged(bio, BIO_CLONED)))
		return 0;

	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return 0;

	if (((bio->bi_size + len) >> 9) > q->max_sectors)
		return 0;

	/*
	 * we might lose a segment or two here, but rather that than
	 * make this too complex.
	 */

	while (bio->bi_phys_segments >= q->max_phys_segments
	       || bio->bi_hw_segments >= q->max_hw_segments
	       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {

		if (retried_segments)
			return 0;

		retried_segments = 1;
		blk_recount_segments(q, bio);
	}

	/*
	 * setup the new entry, we might clear it again later if we
	 * cannot add the page
	 */
	bvec = &bio->bi_io_vec[bio->bi_vcnt];
	bvec->bv_page = page;
	bvec->bv_len = len;
	bvec->bv_offset = offset;

	/*
	 * if queue has other restrictions (eg varying max sector size
	 * depending on offset), it can specify a merge_bvec_fn in the
	 * queue to get further control
	 */
	if (q->merge_bvec_fn) {
		/*
		 * merge_bvec_fn() returns number of bytes it can accept
		 * at this offset
		 */
		if (q->merge_bvec_fn(q, bio, bvec) < len) {
			bvec->bv_page = NULL;
			bvec->bv_len = 0;
			bvec->bv_offset = 0;
			return 0;
		}
	}

	/* If we may be able to merge these biovecs, force a recount */
	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
		bio->bi_flags &= ~(1 << BIO_SEG_VALID);

	bio->bi_vcnt++;
	bio->bi_phys_segments++;
	bio->bi_hw_segments++;
	bio->bi_size += len;
	return len;
}

/**
 * bio_add_page - attempt to add page to bio
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block
 * device limitations. The target block device must allow bio's
 * smaller than PAGE_SIZE, so it is always possible to add a single
 * page to an empty bio.
 */
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
		 unsigned int offset)
{
	return __bio_add_page(bdev_get_queue(bio->bi_bdev), bio, page,
			      len, offset);
}
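/*
 * A minimal sketch of building and submitting a one-page read bio with the
 * interfaces above. example_read_endio and example_read_page are illustrative
 * names only; a real caller would hand the page to someone and typically wait
 * on a completion instead of simply freeing it in the end_io handler.
 */
static int example_read_endio(struct bio *bio, unsigned int bytes_done, int error)
{
	if (bio->bi_size)
		return 1;

	/* sketch only: the page now holds the data (or error is set) */
	__free_page(bio->bi_io_vec[0].bv_page);
	bio_put(bio);
	return 0;
}

static void example_read_page(struct block_device *bdev, sector_t sector)
{
	struct page *page = alloc_page(GFP_KERNEL);
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (!page || !bio)
		goto out;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = example_read_endio;

	/* per the comment above, a single page on an empty bio is expected to fit */
	bio_add_page(bio, page, PAGE_SIZE, 0);

	submit_bio(READ, bio);
	return;
out:
	if (page)
		__free_page(page);
	if (bio)
		bio_put(bio);
}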
struct bio_map_data {
	struct bio_vec *iovecs;
	void __user *userptr;
};

static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
{
	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
	bio->bi_private = bmd;
}

static void bio_free_map_data(struct bio_map_data *bmd)
{
	kfree(bmd->iovecs);
	kfree(bmd);
}

static struct bio_map_data *bio_alloc_map_data(int nr_segs)
{
	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);

	if (!bmd)
		return NULL;

	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
	if (bmd->iovecs)
		return bmd;

	kfree(bmd);
	return NULL;
}

/**
 * bio_uncopy_user - finish previously mapped bio
 * @bio: bio being terminated
 *
 * Free pages allocated from bio_copy_user() and write back data
 * to user space in case of a read.
 */
int bio_uncopy_user(struct bio *bio)
{
	struct bio_map_data *bmd = bio->bi_private;
	const int read = bio_data_dir(bio) == READ;
	struct bio_vec *bvec;
	int i, ret = 0;

	__bio_for_each_segment(bvec, bio, i, 0) {
		char *addr = page_address(bvec->bv_page);
		unsigned int len = bmd->iovecs[i].bv_len;

		if (read && !ret && copy_to_user(bmd->userptr, addr, len))
			ret = -EFAULT;

		__free_page(bvec->bv_page);
		bmd->userptr += len;
	}
	bio_free_map_data(bmd);
	bio_put(bio);
	return ret;
}

/**
 * bio_copy_user - copy user data to bio
 * @q: destination block queue
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with a call to
 * bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr,
			  unsigned int len, int write_to_vm)
{
	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = uaddr >> PAGE_SHIFT;
	struct bio_map_data *bmd;
	struct bio_vec *bvec;
	struct page *page;
	struct bio *bio;
	int i, ret;

	bmd = bio_alloc_map_data(end - start);
	if (!bmd)
		return ERR_PTR(-ENOMEM);

	bmd->userptr = (void __user *) uaddr;

	ret = -ENOMEM;
	bio = bio_alloc(GFP_KERNEL, end - start);
	if (!bio)
		goto out_bmd;

	bio->bi_rw |= (!write_to_vm << BIO_RW);

	ret = 0;
	while (len) {
		unsigned int bytes = PAGE_SIZE;

		if (bytes > len)
			bytes = len;

		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
		if (!page) {
			ret = -ENOMEM;
			break;
		}

		if (__bio_add_page(q, bio, page, bytes, 0) < bytes) {
			ret = -EINVAL;
			break;
		}

		len -= bytes;
	}

	if (ret)
		goto cleanup;

	/*
	 * success
	 */
	if (!write_to_vm) {
		char __user *p = (char __user *) uaddr;

		/*
		 * for a write, copy in data to kernel pages
		 */
		ret = -EFAULT;
		bio_for_each_segment(bvec, bio, i) {
			char *addr = page_address(bvec->bv_page);

			if (copy_from_user(addr, p, bvec->bv_len))
				goto cleanup;
			p += bvec->bv_len;
		}
	}

	bio_set_map_data(bmd, bio);
	return bio;
cleanup:
	bio_for_each_segment(bvec, bio, i)
		__free_page(bvec->bv_page);

	bio_put(bio);
out_bmd:
	bio_free_map_data(bmd);
	return ERR_PTR(ret);
}
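/*
 * A minimal sketch of the bounce-copy pair above: bio_copy_user() builds the
 * bio (copying the user data in up front for a write), and bio_uncopy_user()
 * copies the data back out and frees the bounce pages once the io has
 * completed. example_bounced_read is an illustrative name, and submission
 * and completion handling are elided.
 */
static int example_bounced_read(request_queue_t *q, unsigned long uaddr, unsigned int len)
{
	struct bio *bio;

	/* write_to_vm == 1: the device writes into the (bounce) pages */
	bio = bio_copy_user(q, uaddr, len, 1);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	/* ... fill in bi_bdev/bi_sector, submit, and wait for completion ... */

	/* copies the bounce pages back to uaddr and frees them */
	return bio_uncopy_user(bio);
}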
static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
				  unsigned long uaddr, unsigned int len,
				  int write_to_vm)
{
	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = uaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
	int ret, offset, i;
	struct page **pages;
	struct bio *bio;

	/*
	 * transfer and buffer must be aligned to at least hardsector
	 * size for now, in the future we can relax this restriction
	 */
	if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
		return ERR_PTR(-EINVAL);

	bio = bio_alloc(GFP_KERNEL, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto out;

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, uaddr, nr_pages,
			     write_to_vm, 0, pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (ret < nr_pages)
		goto out;

	bio->bi_bdev = bdev;

	offset = uaddr & ~PAGE_MASK;
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;

		if (len <= 0)
			break;

		if (bytes > len)
			bytes = len;

		/*
		 * sorry...
		 */
		if (__bio_add_page(q, bio, pages[i], bytes, offset) < bytes)
			break;

		len -= bytes;
		offset = 0;
	}

	/*
	 * release the pages we didn't map into the bio, if any
	 */
	while (i < nr_pages)
		page_cache_release(pages[i++]);

	kfree(pages);

	/*
	 * set data direction, and check if mapped pages need bouncing
	 */
	if (!write_to_vm)
		bio->bi_rw |= (1 << BIO_RW);

	bio->bi_flags |= (1 << BIO_USER_MAPPED);
	return bio;
out:
	kfree(pages);
	bio_put(bio);
	return ERR_PTR(ret);
}

/**
 * bio_map_user - map user address into bio
 * @q: the request_queue_t for the bio
 * @bdev: destination block device
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
			 unsigned long uaddr, unsigned int len, int write_to_vm)
{
	struct bio *bio;

	bio = __bio_map_user(q, bdev, uaddr, len, write_to_vm);

	if (IS_ERR(bio))
		return bio;

	/*
	 * subtle -- if __bio_map_user() ended up bouncing a bio,
	 * it would normally disappear when its bi_end_io is run.
	 * however, we need it for the unmap, so grab an extra
	 * reference to it
	 */
	bio_get(bio);

	if (bio->bi_size == len)
		return bio;

	/*
	 * don't support partial mappings
	 */
	bio_endio(bio, bio->bi_size, 0);
	bio_unmap_user(bio);
	return ERR_PTR(-EINVAL);
}

static void __bio_unmap_user(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	/*
	 * make sure we dirty pages we wrote to
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		if (bio_data_dir(bio) == READ)
			set_page_dirty_lock(bvec->bv_page);

		page_cache_release(bvec->bv_page);
	}

	bio_put(bio);
}

/**
 * bio_unmap_user - unmap a bio
 * @bio: the bio being unmapped
 *
 * Unmap a bio previously mapped by bio_map_user(). Must be called with
 * a process context.
 *
 * bio_unmap_user() may sleep.
 */
void bio_unmap_user(struct bio *bio)
{
	__bio_unmap_user(bio);
	bio_put(bio);
}
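/*
 * A minimal sketch of the zero-copy variant: map the user buffer directly
 * with bio_map_user() and unmap it with bio_unmap_user() after completion.
 * example_mapped_io is an illustrative name only; note the buffer and length
 * must satisfy the queue's dma alignment or -EINVAL comes back.
 */
static int example_mapped_io(struct block_device *bdev, unsigned long uaddr,
			     unsigned int len, int write_to_vm)
{
	request_queue_t *q = bdev_get_queue(bdev);
	struct bio *bio;

	bio = bio_map_user(q, bdev, uaddr, len, write_to_vm);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	/* ... set bi_sector, submit, and wait for completion ... */

	/* drops the extra reference bio_map_user() took and releases the pages */
	bio_unmap_user(bio);
	return 0;
}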
/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe. So what we can do is to
 * mark the pages dirty _before_ performing IO. And in interrupt context,
 * check that the pages are still dirty. If so, fine. If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages. The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all. So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages(). This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, pdflush) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page && !PageCompound(page))
			set_page_dirty_lock(page);
	}
}

static void bio_release_pages(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page)
			put_page(page);
	}
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine. If, however, some pages are clean then they must
 * have been written out during the direct-IO read. So we take another ref on
 * the BIO and the offending pages and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on. It will run one page_cache_release() against each page and will
 * run one bio_put() against the BIO.
 */

static void bio_dirty_fn(void *data);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn, NULL);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(void *data)
{
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&bio_dirty_lock, flags);
	bio = bio_dirty_list;
	bio_dirty_list = NULL;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);

	while (bio) {
		struct bio *next = bio->bi_private;

		bio_set_pages_dirty(bio);
		bio_release_pages(bio);
		bio_put(bio);
		bio = next;
	}
}

void bio_check_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int nr_clean_pages = 0;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (PageDirty(page) || PageCompound(page)) {
			page_cache_release(page);
			bvec[i].bv_page = NULL;
		} else {
			nr_clean_pages++;
		}
	}

	if (nr_clean_pages) {
		unsigned long flags;

		spin_lock_irqsave(&bio_dirty_lock, flags);
		bio->bi_private = bio_dirty_list;
		bio_dirty_list = bio;
		spin_unlock_irqrestore(&bio_dirty_lock, flags);
		schedule_work(&bio_dirty_work);
	} else {
		bio_put(bio);
	}
}
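/*
 * Putting the two together: a direct-IO read path is expected to call
 * bio_set_pages_dirty() after get_user_pages() but before submission, and
 * then call bio_check_pages_dirty() from its completion handler. Any pages
 * the VM cleaned in the meantime are re-dirtied later from process context
 * via the bio_dirty_work item scheduled above.
 */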
/**
 * bio_endio - end I/O on a bio
 * @bio: bio
 * @bytes_done: number of bytes completed
 * @error: error, if any
 *
 * Description:
 *   bio_endio() will end I/O on @bytes_done number of bytes. This may be
 *   just a partial part of the bio, or it may be the whole bio. bio_endio()
 *   is the preferred way to end I/O on a bio, it takes care of decrementing
 *   bi_size and clearing BIO_UPTODATE on error. @error is 0 on success,
 *   and one of the established -Exxxx (-EIO, for instance) error values in
 *   case something went wrong. No one should call bi_end_io() directly on
 *   a bio unless they own it and thus know that it has an end_io function.
 **/
void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);

	if (unlikely(bytes_done > bio->bi_size)) {
		printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
						bytes_done, bio->bi_size);
		bytes_done = bio->bi_size;
	}

	bio->bi_size -= bytes_done;
	bio->bi_sector += (bytes_done >> 9);

	if (bio->bi_end_io)
		bio->bi_end_io(bio, bytes_done, error);
}

void bio_pair_release(struct bio_pair *bp)
{
	if (atomic_dec_and_test(&bp->cnt)) {
		struct bio *master = bp->bio1.bi_private;

		bio_endio(master, master->bi_size, bp->error);
		mempool_free(bp, bp->bio2.bi_private);
	}
}

static int bio_pair_end_1(struct bio * bi, unsigned int done, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);

	if (err)
		bp->error = err;

	if (bi->bi_size)
		return 1;

	bio_pair_release(bp);
	return 0;
}

static int bio_pair_end_2(struct bio * bi, unsigned int done, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);

	if (err)
		bp->error = err;

	if (bi->bi_size)
		return 1;

	bio_pair_release(bp);
	return 0;
}

/*
 * split a bio - only worry about a bio with a single page
 * in its iovec
 */
struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
{
	struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);

	if (!bp)
		return bp;

	BUG_ON(bi->bi_vcnt != 1);
	BUG_ON(bi->bi_idx != 0);
	atomic_set(&bp->cnt, 3);
	bp->error = 0;
	bp->bio1 = *bi;
	bp->bio2 = *bi;
	bp->bio2.bi_sector += first_sectors;
	bp->bio2.bi_size -= first_sectors << 9;
	bp->bio1.bi_size = first_sectors << 9;

	bp->bv1 = bi->bi_io_vec[0];
	bp->bv2 = bi->bi_io_vec[0];
	bp->bv2.bv_offset += first_sectors << 9;
	bp->bv2.bv_len -= first_sectors << 9;
	bp->bv1.bv_len = first_sectors << 9;

	bp->bio1.bi_io_vec = &bp->bv1;
	bp->bio2.bi_io_vec = &bp->bv2;

	bp->bio1.bi_end_io = bio_pair_end_1;
	bp->bio2.bi_end_io = bio_pair_end_2;

	bp->bio1.bi_private = bi;
	bp->bio2.bi_private = pool;

	return bp;
}

static void *bio_pair_alloc(unsigned int __nocast gfp_flags, void *data)
{
	return kmalloc(sizeof(struct bio_pair), gfp_flags);
}

static void bio_pair_free(void *bp, void *data)
{
	kfree(bp);
}
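/*
 * A minimal sketch of splitting a single-page bio across a boundary, roughly
 * as a raid0/linear-style driver might when a request straddles two devices.
 * example_split_and_submit is an illustrative name; first_sectors is the size
 * of the first half, in sectors, and a real driver would remap each half's
 * bi_bdev/bi_sector before submitting.
 */
static void example_split_and_submit(struct bio *bio, int first_sectors)
{
	struct bio_pair *bp;

	/* bio must have exactly one segment and bi_idx == 0, see the BUG_ONs above */
	bp = bio_split(bio, bio_split_pool, first_sectors);
	if (!bp) {
		bio_endio(bio, bio->bi_size, -ENOMEM);
		return;
	}

	/*
	 * bp->cnt starts at 3: one reference for each half's end_io and one
	 * for us; the original bio completes once all three are released.
	 */
	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);
}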
/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		struct biovec_slab *bp = bvec_slabs + i;
		mempool_t **bvp = bs->bvec_pools + i;

		if (i >= scale)
			pool_entries >>= 1;

		*bvp = mempool_create(pool_entries, mempool_alloc_slab,
				      mempool_free_slab, bp->slab);
		if (!*bvp)
			return -ENOMEM;
	}
	return 0;
}

static void biovec_free_pools(struct bio_set *bs)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		mempool_t *bvp = bs->bvec_pools[i];

		if (bvp)
			mempool_destroy(bvp);
	}

}

void bioset_free(struct bio_set *bs)
{
	if (bs->bio_pool)
		mempool_destroy(bs->bio_pool);

	biovec_free_pools(bs);

	kfree(bs);
}

struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
{
	struct bio_set *bs = kmalloc(sizeof(*bs), GFP_KERNEL);

	if (!bs)
		return NULL;

	memset(bs, 0, sizeof(*bs));
	bs->bio_pool = mempool_create(bio_pool_size, mempool_alloc_slab,
			mempool_free_slab, bio_slab);

	if (!bs->bio_pool)
		goto bad;

	if (!biovec_create_pools(bs, bvec_pool_size, scale))
		return bs;

bad:
	bioset_free(bs);
	return NULL;
}

static void __init biovec_init_slabs(void)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		int size;
		struct biovec_slab *bvs = bvec_slabs + i;

		size = bvs->nr_vecs * sizeof(struct bio_vec);
		bvs->slab = kmem_cache_create(bvs->name, size, 0,
				SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
	}
}

static int __init init_bio(void)
{
	int megabytes, bvec_pool_entries;
	int scale = BIOVEC_NR_POOLS;

	bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
				SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);

	biovec_init_slabs();

	megabytes = nr_free_pages() >> (20 - PAGE_SHIFT);

	/*
	 * find out where to start scaling
	 */
	if (megabytes <= 16)
		scale = 0;
	else if (megabytes <= 32)
		scale = 1;
	else if (megabytes <= 64)
		scale = 2;
	else if (megabytes <= 96)
		scale = 3;
	else if (megabytes <= 128)
		scale = 4;

	/*
	 * scale number of entries
	 */
	bvec_pool_entries = megabytes * 2;
	if (bvec_pool_entries > 256)
		bvec_pool_entries = 256;

	fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
	if (!fs_bio_set)
		panic("bio: can't allocate bios\n");

	bio_split_pool = mempool_create(BIO_SPLIT_ENTRIES,
				bio_pair_alloc, bio_pair_free, NULL);
	if (!bio_split_pool)
		panic("bio: can't create split pool\n");

	return 0;
}

subsys_initcall(init_bio);

EXPORT_SYMBOL(bio_alloc);
EXPORT_SYMBOL(bio_put);
EXPORT_SYMBOL(bio_endio);
EXPORT_SYMBOL(bio_init);
EXPORT_SYMBOL(__bio_clone);
EXPORT_SYMBOL(bio_clone);
EXPORT_SYMBOL(bio_phys_segments);
EXPORT_SYMBOL(bio_hw_segments);
EXPORT_SYMBOL(bio_add_page);
EXPORT_SYMBOL(bio_get_nr_vecs);
EXPORT_SYMBOL(bio_map_user);
EXPORT_SYMBOL(bio_unmap_user);
EXPORT_SYMBOL(bio_pair_release);
EXPORT_SYMBOL(bio_split);
EXPORT_SYMBOL(bio_split_pool);
EXPORT_SYMBOL(bio_copy_user);
EXPORT_SYMBOL(bio_uncopy_user);
EXPORT_SYMBOL(bioset_create);
EXPORT_SYMBOL(bioset_free);
EXPORT_SYMBOL(bio_alloc_bioset);
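/*
 * Worked example of the scaling in init_bio() above: on a machine with
 * roughly 96 MB free at boot, scale becomes 3 and bvec_pool_entries becomes
 * min(96 * 2, 256) = 192, so biovec_create_pools() reserves 192 entries for
 * the first three biovec pools and then halves the count for each larger
 * pool size after that (96, 48, 24).
 */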