Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at commit 7143740d26098aca84ecc7376ccfe2c58fd0412e (1205 lines, 29 kB)
/*
 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/blktrace_api.h>
#include <scsi/sg.h>		/* for struct sg_iovec */

#define BIO_POOL_SIZE 2

static struct kmem_cache *bio_slab __read_mostly;

#define BIOVEC_NR_POOLS 6

/*
 * a small number of entries is fine, not going to be performance critical.
 * basically we just need to survive
 */
#define BIO_SPLIT_ENTRIES 2
mempool_t *bio_split_pool __read_mostly;

struct biovec_slab {
	int nr_vecs;
	char *name;
	struct kmem_cache *slab;
};

/*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
 * unsigned short
 */

#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV

/*
 * bio_set is used to allow other portions of the IO system to
 * allocate their own private memory pools for bio and iovec structures.
 * These memory pools in turn all allocate from the bio_slab
 * and the bvec_slabs[].
 */
struct bio_set {
	mempool_t *bio_pool;
	mempool_t *bvec_pools[BIOVEC_NR_POOLS];
};

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
static struct bio_set *fs_bio_set;

static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
	struct bio_vec *bvl;

	/*
	 * see comment near bvec_array define!
	 */
	switch (nr) {
	case 1:		*idx = 0; break;
	case 2 ... 4:	*idx = 1; break;
	case 5 ... 16:	*idx = 2; break;
	case 17 ... 64:	*idx = 3; break;
	case 65 ... 128: *idx = 4; break;
	case 129 ... BIO_MAX_PAGES: *idx = 5; break;
	default:
		return NULL;
	}
	/*
	 * idx now points to the pool we want to allocate from
	 */

	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
	if (bvl) {
		struct biovec_slab *bp = bvec_slabs + *idx;

		memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
	}

	return bvl;
}
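/*
 * Worked example (hypothetical request, for illustration): asking for
 * nr = 10 vecs matches "case 5 ... 16" above, so *idx becomes 2 and the
 * allocation is served from the "biovec-16" slab pool. The caller gets
 * room for 16 bio_vecs even though only 10 were requested, and
 * bio_alloc_bioset() below records 16 in bio->bi_max_vecs.
 */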
void bio_free(struct bio *bio, struct bio_set *bio_set)
{
	if (bio->bi_io_vec) {
		const int pool_idx = BIO_POOL_IDX(bio);

		BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);

		mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
	}

	mempool_free(bio, bio_set->bio_pool);
}

/*
 * default destructor for a bio allocated with bio_alloc_bioset()
 */
static void bio_fs_destructor(struct bio *bio)
{
	bio_free(bio, fs_bio_set);
}

void bio_init(struct bio *bio)
{
	memset(bio, 0, sizeof(*bio));
	bio->bi_flags = 1 << BIO_UPTODATE;
	atomic_set(&bio->bi_cnt, 1);
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask:	the GFP_ mask given to the slab allocator
 * @nr_iovecs:	number of iovecs to pre-allocate
 * @bs:		the bio_set to allocate from
 *
 * Description:
 *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
 *   If %__GFP_WAIT is set then we will block on the internal pool waiting
 *   for a &struct bio to become free.
 *
 *   allocate bio and iovecs from the memory pools specified by the
 *   bio_set structure.
 **/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);

	if (likely(bio)) {
		struct bio_vec *bvl = NULL;

		bio_init(bio);
		if (likely(nr_iovecs)) {
			unsigned long idx = 0; /* shut up gcc */

			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
			if (unlikely(!bvl)) {
				mempool_free(bio, bs->bio_pool);
				bio = NULL;
				goto out;
			}
			bio->bi_flags |= idx << BIO_POOL_OFFSET;
			bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
		}
		bio->bi_io_vec = bvl;
	}
out:
	return bio;
}

struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
{
	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);

	if (bio)
		bio->bi_destructor = bio_fs_destructor;

	return bio;
}

void zero_fill_bio(struct bio *bio)
{
	unsigned long flags;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment(bv, bio, i) {
		char *data = bvec_kmap_irq(bv, &flags);
		memset(data, 0, bv->bv_len);
		flush_dcache_page(bv->bv_page);
		bvec_kunmap_irq(data, &flags);
	}
}
EXPORT_SYMBOL(zero_fill_bio);
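/*
 * Usage sketch (hypothetical caller; bdev, sector, page and my_end_io are
 * assumed to exist): allocate a bio from fs_bio_set, point it at a device
 * and submit it. The allocation reference is dropped with bio_put(),
 * typically from the completion handler:
 *
 *	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
 *
 *	if (!bio)
 *		return -ENOMEM;
 *	bio->bi_bdev = bdev;
 *	bio->bi_sector = sector;
 *	bio_add_page(bio, page, PAGE_SIZE, 0);
 *	bio->bi_end_io = my_end_io;	// see bio_endio() below
 *	submit_bio(READ, bio);
 */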
/**
 * bio_put - release a reference to a bio
 * @bio:	bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc or bio_get. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

	/*
	 * last put frees it
	 */
	if (atomic_dec_and_test(&bio->bi_cnt)) {
		bio->bi_next = NULL;
		bio->bi_destructor(bio);
	}
}

inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_phys_segments;
}

inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_hw_segments;
}

/**
 * __bio_clone - clone a bio
 * @bio: destination bio
 * @bio_src: bio to clone
 *
 * Clone a &bio. Caller will own the returned bio, but not
 * the actual data it points to. Reference count of returned
 * bio will be one.
 */
void __bio_clone(struct bio *bio, struct bio *bio_src)
{
	memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
		bio_src->bi_max_vecs * sizeof(struct bio_vec));

	/*
	 * most users will be overriding ->bi_bdev with a new target,
	 * so we don't set nor calculate new physical/hw segment counts here
	 */
	bio->bi_sector = bio_src->bi_sector;
	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_flags |= 1 << BIO_CLONED;
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_vcnt = bio_src->bi_vcnt;
	bio->bi_size = bio_src->bi_size;
	bio->bi_idx = bio_src->bi_idx;
}

/**
 * bio_clone - clone a bio
 * @bio: bio to clone
 * @gfp_mask: allocation priority
 *
 * Like __bio_clone, only also allocates the returned bio
 */
struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);

	if (b) {
		b->bi_destructor = bio_fs_destructor;
		__bio_clone(b, bio);
	}

	return b;
}

/**
 * bio_get_nr_vecs - return approx number of vecs
 * @bdev: I/O target
 *
 * Return the approximate number of pages we can send to this target.
 * There's no guarantee that you will be able to fit this number of pages
 * into a bio, it does not account for dynamic restrictions that vary
 * on offset.
 */
int bio_get_nr_vecs(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);
	int nr_pages;

	nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (nr_pages > q->max_phys_segments)
		nr_pages = q->max_phys_segments;
	if (nr_pages > q->max_hw_segments)
		nr_pages = q->max_hw_segments;

	return nr_pages;
}
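/*
 * Usage sketch (hypothetical stacking driver): clone a bio and redirect
 * the clone at another device while sharing the original's pages, as the
 * __bio_clone() comment above anticipates:
 *
 *	struct bio *clone = bio_clone(bio, GFP_NOIO);
 *
 *	if (!clone)
 *		return -ENOMEM;
 *	clone->bi_bdev = target_bdev;		// assumed remap target
 *	clone->bi_sector = remapped_sector;	// assumed remap offset
 *	clone->bi_end_io = clone_end_io;	// hypothetical handler
 *	generic_make_request(clone);
 */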
static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
			  *page, unsigned int len, unsigned int offset,
			  unsigned short max_sectors)
{
	int retried_segments = 0;
	struct bio_vec *bvec;

	/*
	 * cloned bio must not modify vec list
	 */
	if (unlikely(bio_flagged(bio, BIO_CLONED)))
		return 0;

	if (((bio->bi_size + len) >> 9) > max_sectors)
		return 0;

	/*
	 * For filesystems with a blocksize smaller than the pagesize
	 * we will often be called with the same page as last time and
	 * a consecutive offset. Optimize this special case.
	 */
	if (bio->bi_vcnt > 0) {
		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

		if (page == prev->bv_page &&
		    offset == prev->bv_offset + prev->bv_len) {
			prev->bv_len += len;
			if (q->merge_bvec_fn &&
			    q->merge_bvec_fn(q, bio, prev) < len) {
				prev->bv_len -= len;
				return 0;
			}

			goto done;
		}
	}

	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return 0;

	/*
	 * we might lose a segment or two here, but rather that than
	 * make this too complex.
	 */

	while (bio->bi_phys_segments >= q->max_phys_segments
	       || bio->bi_hw_segments >= q->max_hw_segments
	       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {

		if (retried_segments)
			return 0;

		retried_segments = 1;
		blk_recount_segments(q, bio);
	}

	/*
	 * setup the new entry, we might clear it again later if we
	 * cannot add the page
	 */
	bvec = &bio->bi_io_vec[bio->bi_vcnt];
	bvec->bv_page = page;
	bvec->bv_len = len;
	bvec->bv_offset = offset;

	/*
	 * if queue has other restrictions (eg varying max sector size
	 * depending on offset), it can specify a merge_bvec_fn in the
	 * queue to get further control
	 */
	if (q->merge_bvec_fn) {
		/*
		 * merge_bvec_fn() returns number of bytes it can accept
		 * at this offset
		 */
		if (q->merge_bvec_fn(q, bio, bvec) < len) {
			bvec->bv_page = NULL;
			bvec->bv_len = 0;
			bvec->bv_offset = 0;
			return 0;
		}
	}

	/* If we may be able to merge these biovecs, force a recount */
	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
		bio->bi_flags &= ~(1 << BIO_SEG_VALID);

	bio->bi_vcnt++;
	bio->bi_phys_segments++;
	bio->bi_hw_segments++;
done:
	bio->bi_size += len;
	return len;
}

/**
 * bio_add_pc_page - attempt to add page to bio
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block
 * device limitations. The target block device must allow bio's
 * smaller than PAGE_SIZE, so it is always possible to add a single
 * page to an empty bio. This should only be used by REQ_PC bios.
 */
int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
		    unsigned int len, unsigned int offset)
{
	return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
}

/**
 * bio_add_page - attempt to add page to bio
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block
 * device limitations. The target block device must allow bio's
 * smaller than PAGE_SIZE, so it is always possible to add a single
 * page to an empty bio.
 */
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
		 unsigned int offset)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
}
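/*
 * Usage sketch (hypothetical caller with a pages[] array and an already
 * set up bio): bio_add_page() returns the number of bytes added, which
 * is less than @len once the bio is full or the queue refuses the merge,
 * so callers loop and submit:
 *
 *	while (i < nr_pages) {
 *		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE) {
 *			submit_bio(rw, bio);	// bio full: send it off
 *			bio = bio_alloc(GFP_NOIO, nr_pages - i);
 *			continue;		// re-init bio, retry the page
 *		}
 *		i++;
 *	}
 */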
struct bio_map_data {
	struct bio_vec *iovecs;
	void __user *userptr;
};

static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
{
	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
	bio->bi_private = bmd;
}

static void bio_free_map_data(struct bio_map_data *bmd)
{
	kfree(bmd->iovecs);
	kfree(bmd);
}

static struct bio_map_data *bio_alloc_map_data(int nr_segs)
{
	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);

	if (!bmd)
		return NULL;

	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
	if (bmd->iovecs)
		return bmd;

	kfree(bmd);
	return NULL;
}

/**
 * bio_uncopy_user - finish previously mapped bio
 * @bio: bio being terminated
 *
 * Free pages allocated from bio_copy_user() and write back data
 * to user space in case of a read.
 */
int bio_uncopy_user(struct bio *bio)
{
	struct bio_map_data *bmd = bio->bi_private;
	const int read = bio_data_dir(bio) == READ;
	struct bio_vec *bvec;
	int i, ret = 0;

	__bio_for_each_segment(bvec, bio, i, 0) {
		char *addr = page_address(bvec->bv_page);
		unsigned int len = bmd->iovecs[i].bv_len;

		if (read && !ret && copy_to_user(bmd->userptr, addr, len))
			ret = -EFAULT;

		__free_page(bvec->bv_page);
		bmd->userptr += len;
	}
	bio_free_map_data(bmd);
	bio_put(bio);
	return ret;
}

/**
 * bio_copy_user - copy user data to bio
 * @q: destination block queue
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with a call to
 * bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
			  unsigned int len, int write_to_vm)
{
	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = uaddr >> PAGE_SHIFT;
	struct bio_map_data *bmd;
	struct bio_vec *bvec;
	struct page *page;
	struct bio *bio;
	int i, ret;

	bmd = bio_alloc_map_data(end - start);
	if (!bmd)
		return ERR_PTR(-ENOMEM);

	bmd->userptr = (void __user *) uaddr;

	ret = -ENOMEM;
	bio = bio_alloc(GFP_KERNEL, end - start);
	if (!bio)
		goto out_bmd;

	bio->bi_rw |= (!write_to_vm << BIO_RW);

	ret = 0;
	while (len) {
		unsigned int bytes = PAGE_SIZE;

		if (bytes > len)
			bytes = len;

		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
		if (!page) {
			ret = -ENOMEM;
			break;
		}

		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
			break;

		len -= bytes;
	}

	if (ret)
		goto cleanup;

	/*
	 * success
	 */
	if (!write_to_vm) {
		char __user *p = (char __user *) uaddr;

		/*
		 * for a write, copy in data to kernel pages
		 */
		ret = -EFAULT;
		bio_for_each_segment(bvec, bio, i) {
			char *addr = page_address(bvec->bv_page);

			if (copy_from_user(addr, p, bvec->bv_len))
				goto cleanup;
			p += bvec->bv_len;
		}
	}

	bio_set_map_data(bmd, bio);
	return bio;
cleanup:
	bio_for_each_segment(bvec, bio, i)
		__free_page(bvec->bv_page);

	bio_put(bio);
out_bmd:
	bio_free_map_data(bmd);
	return ERR_PTR(ret);
}
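/*
 * Usage sketch (hypothetical SG_IO-style caller): bio_copy_user() bounces
 * through freshly allocated kernel pages, so any user address works. The
 * matching bio_uncopy_user() frees the bounce pages and, for a read,
 * copies the data back out to user space:
 *
 *	bio = bio_copy_user(q, uaddr, len, reading);
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);
 *	// ... submit the bio and wait for completion ...
 *	ret = bio_uncopy_user(bio);
 */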
static struct bio *__bio_map_user_iov(struct request_queue *q,
				      struct block_device *bdev,
				      struct sg_iovec *iov, int iov_count,
				      int write_to_vm)
{
	int i, j;
	int nr_pages = 0;
	struct page **pages;
	struct bio *bio;
	int cur_page = 0;
	int ret, offset;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;

		nr_pages += end - start;
		/*
		 * buffer must be aligned to at least hardsector size for now
		 */
		if (uaddr & queue_dma_alignment(q))
			return ERR_PTR(-EINVAL);
	}

	if (!nr_pages)
		return ERR_PTR(-EINVAL);

	bio = bio_alloc(GFP_KERNEL, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto out;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;
		const int local_nr_pages = end - start;
		const int page_limit = cur_page + local_nr_pages;

		down_read(&current->mm->mmap_sem);
		ret = get_user_pages(current, current->mm, uaddr,
				     local_nr_pages,
				     write_to_vm, 0, &pages[cur_page], NULL);
		up_read(&current->mm->mmap_sem);

		if (ret < local_nr_pages) {
			ret = -EFAULT;
			goto out_unmap;
		}

		offset = uaddr & ~PAGE_MASK;
		for (j = cur_page; j < page_limit; j++) {
			unsigned int bytes = PAGE_SIZE - offset;

			if (len <= 0)
				break;

			if (bytes > len)
				bytes = len;

			/*
			 * sorry...
			 */
			if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
			    bytes)
				break;

			len -= bytes;
			offset = 0;
		}

		cur_page = j;
		/*
		 * release the pages we didn't map into the bio, if any
		 */
		while (j < page_limit)
			page_cache_release(pages[j++]);
	}

	kfree(pages);

	/*
	 * set data direction, and check if mapped pages need bouncing
	 */
	if (!write_to_vm)
		bio->bi_rw |= (1 << BIO_RW);

	bio->bi_bdev = bdev;
	bio->bi_flags |= (1 << BIO_USER_MAPPED);
	return bio;

 out_unmap:
	for (i = 0; i < nr_pages; i++) {
		if (!pages[i])
			break;
		page_cache_release(pages[i]);
	}
 out:
	kfree(pages);
	bio_put(bio);
	return ERR_PTR(ret);
}

/**
 * bio_map_user - map user address into bio
 * @q: the struct request_queue for the bio
 * @bdev: destination block device
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
			 unsigned long uaddr, unsigned int len, int write_to_vm)
{
	struct sg_iovec iov;

	iov.iov_base = (void __user *)uaddr;
	iov.iov_len = len;

	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
}

/**
 * bio_map_user_iov - map user sg_iovec table into bio
 * @q: the struct request_queue for the bio
 * @bdev: destination block device
 * @iov: the iovec.
 * @iov_count: number of elements in the iovec
 * @write_to_vm: bool indicating writing to pages or not
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
			     struct sg_iovec *iov, int iov_count,
			     int write_to_vm)
{
	struct bio *bio;

	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);

	if (IS_ERR(bio))
		return bio;

	/*
	 * subtle -- if __bio_map_user() ended up bouncing a bio,
	 * it would normally disappear when its bi_end_io is run.
	 * however, we need it for the unmap, so grab an extra
	 * reference to it
	 */
	bio_get(bio);

	return bio;
}
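/*
 * Usage sketch (hypothetical zero-copy caller): in contrast to
 * bio_copy_user() above, bio_map_user() pins the user pages and does I/O
 * to them directly, but the buffer must satisfy queue_dma_alignment():
 *
 *	bio = bio_map_user(q, bdev, uaddr, len, reading);
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);	// e.g. -EINVAL on misalignment
 *	// ... submit the bio and wait for completion ...
 *	bio_unmap_user(bio);		// see below: releases pinned pages
 */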
static void __bio_unmap_user(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	/*
	 * make sure we dirty pages we wrote to
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		if (bio_data_dir(bio) == READ)
			set_page_dirty_lock(bvec->bv_page);

		page_cache_release(bvec->bv_page);
	}

	bio_put(bio);
}

/**
 * bio_unmap_user - unmap a bio
 * @bio: the bio being unmapped
 *
 * Unmap a bio previously mapped by bio_map_user(). Must be called with
 * a process context.
 *
 * bio_unmap_user() may sleep.
 */
void bio_unmap_user(struct bio *bio)
{
	__bio_unmap_user(bio);
	bio_put(bio);
}

static void bio_map_kern_endio(struct bio *bio, int err)
{
	bio_put(bio);
}


static struct bio *__bio_map_kern(struct request_queue *q, void *data,
				  unsigned int len, gfp_t gfp_mask)
{
	unsigned long kaddr = (unsigned long)data;
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
	int offset, i;
	struct bio *bio;

	bio = bio_alloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	offset = offset_in_page(kaddr);
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;

		if (len <= 0)
			break;

		if (bytes > len)
			bytes = len;

		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
				    offset) < bytes)
			break;

		data += bytes;
		len -= bytes;
		offset = 0;
	}

	bio->bi_end_io = bio_map_kern_endio;
	return bio;
}

/**
 * bio_map_kern - map kernel address into bio
 * @q: the struct request_queue for the bio
 * @data: pointer to buffer to map
 * @len: length in bytes
 * @gfp_mask: allocation flags for bio allocation
 *
 * Map the kernel address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
			 gfp_t gfp_mask)
{
	struct bio *bio;

	bio = __bio_map_kern(q, data, len, gfp_mask);
	if (IS_ERR(bio))
		return bio;

	if (bio->bi_size == len)
		return bio;

	/*
	 * Don't support partial mappings.
	 */
	bio_put(bio);
	return ERR_PTR(-EINVAL);
}
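/*
 * Usage sketch (hypothetical driver command path): bio_map_kern() wraps a
 * kernel buffer for block I/O and refuses partial mappings outright. The
 * buffer must live in the linear kernel map (e.g. kmalloc memory), since
 * __bio_map_kern() translates it with virt_to_page(); vmalloc memory
 * won't work:
 *
 *	bio = bio_map_kern(q, buf, buf_len, GFP_KERNEL);
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);
 *	bio->bi_bdev = bdev;
 *	bio->bi_sector = sector;	// assumed target position
 *	submit_bio(WRITE, bio);
 */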
/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe. So what we can do is to
 * mark the pages dirty _before_ performing IO. And in interrupt context,
 * check that the pages are still dirty. If so, fine. If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages. The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all. So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages(). This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, pdflush) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page && !PageCompound(page))
			set_page_dirty_lock(page);
	}
}

void bio_release_pages(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page)
			put_page(page);
	}
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine. If, however, some pages are clean then they must
 * have been written out during the direct-IO read. So we take another ref on
 * the BIO and the offending pages and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on. It will run one page_cache_release() against each page and will
 * run one bio_put() against the BIO.
 */

static void bio_dirty_fn(struct work_struct *work);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(struct work_struct *work)
{
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&bio_dirty_lock, flags);
	bio = bio_dirty_list;
	bio_dirty_list = NULL;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);

	while (bio) {
		struct bio *next = bio->bi_private;

		bio_set_pages_dirty(bio);
		bio_release_pages(bio);
		bio_put(bio);
		bio = next;
	}
}

void bio_check_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int nr_clean_pages = 0;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (PageDirty(page) || PageCompound(page)) {
			page_cache_release(page);
			bvec[i].bv_page = NULL;
		} else {
			nr_clean_pages++;
		}
	}

	if (nr_clean_pages) {
		unsigned long flags;

		spin_lock_irqsave(&bio_dirty_lock, flags);
		bio->bi_private = bio_dirty_list;
		bio_dirty_list = bio;
		spin_unlock_irqrestore(&bio_dirty_lock, flags);
		schedule_work(&bio_dirty_work);
	} else {
		bio_put(bio);
	}
}
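/*
 * Sketch of the dirtying protocol described above (hypothetical direct-IO
 * read path):
 *
 *	bio_set_pages_dirty(bio);	// process context, before submit
 *	submit_bio(READ, bio);
 *	// ... later, in the bi_end_io handler (may be interrupt context):
 *	bio_check_pages_dirty(bio);	// re-dirties clean pages via the
 *					// bio_dirty_work workqueue above
 */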
/**
 * bio_endio - end I/O on a bio
 * @bio:	bio
 * @error:	error, if any
 *
 * Description:
 *   bio_endio() will end I/O on the whole bio. bio_endio() is the
 *   preferred way to end I/O on a bio, it takes care of clearing
 *   BIO_UPTODATE on error. @error is 0 on success, and one of the
 *   established -Exxxx (-EIO, for instance) error values in case
 *   something went wrong. No one should call bi_end_io() directly on a
 *   bio unless they own it and thus know that it has an end_io
 *   function.
 **/
void bio_endio(struct bio *bio, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (bio->bi_end_io)
		bio->bi_end_io(bio, error);
}
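/*
 * Sketch of a completion handler reached through bio_endio() above
 * (hypothetical names; the caller would have set bio->bi_private and
 * bio->bi_end_io before submission):
 *
 *	static void my_end_io(struct bio *bio, int error)
 *	{
 *		struct my_ctx *ctx = bio->bi_private;	// caller-owned state
 *
 *		ctx->error = error;			// 0 or a -Exxxx value
 *		complete(&ctx->done);
 *		bio_put(bio);				// drop our reference
 *	}
 */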
void bio_pair_release(struct bio_pair *bp)
{
	if (atomic_dec_and_test(&bp->cnt)) {
		struct bio *master = bp->bio1.bi_private;

		bio_endio(master, bp->error);
		mempool_free(bp, bp->bio2.bi_private);
	}
}

static void bio_pair_end_1(struct bio *bi, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);

	if (err)
		bp->error = err;

	bio_pair_release(bp);
}

static void bio_pair_end_2(struct bio *bi, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);

	if (err)
		bp->error = err;

	bio_pair_release(bp);
}

/*
 * split a bio - only worry about a bio with a single page
 * in its iovec
 */
struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
{
	struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);

	if (!bp)
		return bp;

	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
			      bi->bi_sector + first_sectors);

	BUG_ON(bi->bi_vcnt != 1);
	BUG_ON(bi->bi_idx != 0);
	atomic_set(&bp->cnt, 3);
	bp->error = 0;
	bp->bio1 = *bi;
	bp->bio2 = *bi;
	bp->bio2.bi_sector += first_sectors;
	bp->bio2.bi_size -= first_sectors << 9;
	bp->bio1.bi_size = first_sectors << 9;

	bp->bv1 = bi->bi_io_vec[0];
	bp->bv2 = bi->bi_io_vec[0];
	bp->bv2.bv_offset += first_sectors << 9;
	bp->bv2.bv_len -= first_sectors << 9;
	bp->bv1.bv_len = first_sectors << 9;

	bp->bio1.bi_io_vec = &bp->bv1;
	bp->bio2.bi_io_vec = &bp->bv2;

	bp->bio1.bi_max_vecs = 1;
	bp->bio2.bi_max_vecs = 1;

	bp->bio1.bi_end_io = bio_pair_end_1;
	bp->bio2.bi_end_io = bio_pair_end_2;

	bp->bio1.bi_private = bi;
	bp->bio2.bi_private = pool;

	return bp;
}
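/*
 * Usage sketch (hypothetical striping driver): a single-page bio crossing
 * a chunk boundary is cut in two and each half submitted separately. The
 * pair's refcount of 3 (set above) means the master bio completes only
 * after both halves end and the caller drops its own reference:
 *
 *	struct bio_pair *bp = bio_split(bio, bio_split_pool,
 *					chunk_sectors - offset_in_chunk);
 *
 *	generic_make_request(&bp->bio1);
 *	generic_make_request(&bp->bio2);
 *	bio_pair_release(bp);		// drop the caller's reference
 */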
/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
static int biovec_create_pools(struct bio_set *bs, int pool_entries)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		struct biovec_slab *bp = bvec_slabs + i;
		mempool_t **bvp = bs->bvec_pools + i;

		*bvp = mempool_create_slab_pool(pool_entries, bp->slab);
		if (!*bvp)
			return -ENOMEM;
	}
	return 0;
}

static void biovec_free_pools(struct bio_set *bs)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		mempool_t *bvp = bs->bvec_pools[i];

		if (bvp)
			mempool_destroy(bvp);
	}

}

void bioset_free(struct bio_set *bs)
{
	if (bs->bio_pool)
		mempool_destroy(bs->bio_pool);

	biovec_free_pools(bs);

	kfree(bs);
}

struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
{
	struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);

	if (!bs)
		return NULL;

	bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab);
	if (!bs->bio_pool)
		goto bad;

	if (!biovec_create_pools(bs, bvec_pool_size))
		return bs;

bad:
	bioset_free(bs);
	return NULL;
}

static void __init biovec_init_slabs(void)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		int size;
		struct biovec_slab *bvs = bvec_slabs + i;

		size = bvs->nr_vecs * sizeof(struct bio_vec);
		bvs->slab = kmem_cache_create(bvs->name, size, 0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
	}
}

static int __init init_bio(void)
{
	bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);

	biovec_init_slabs();

	fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
	if (!fs_bio_set)
		panic("bio: can't allocate bios\n");

	bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
						     sizeof(struct bio_pair));
	if (!bio_split_pool)
		panic("bio: can't create split pool\n");

	return 0;
}

subsys_initcall(init_bio);

EXPORT_SYMBOL(bio_alloc);
EXPORT_SYMBOL(bio_put);
EXPORT_SYMBOL(bio_free);
EXPORT_SYMBOL(bio_endio);
EXPORT_SYMBOL(bio_init);
EXPORT_SYMBOL(__bio_clone);
EXPORT_SYMBOL(bio_clone);
EXPORT_SYMBOL(bio_phys_segments);
EXPORT_SYMBOL(bio_hw_segments);
EXPORT_SYMBOL(bio_add_page);
EXPORT_SYMBOL(bio_add_pc_page);
EXPORT_SYMBOL(bio_get_nr_vecs);
EXPORT_SYMBOL(bio_map_kern);
EXPORT_SYMBOL(bio_pair_release);
EXPORT_SYMBOL(bio_split);
EXPORT_SYMBOL(bio_split_pool);
EXPORT_SYMBOL(bio_copy_user);
EXPORT_SYMBOL(bio_uncopy_user);
EXPORT_SYMBOL(bioset_create);
EXPORT_SYMBOL(bioset_free);
EXPORT_SYMBOL(bio_alloc_bioset);
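/*
 * Usage sketch (hypothetical driver wanting its own pools, as the bio_set
 * comment near the top describes): a private bio_set gives forward-progress
 * guarantees independent of fs_bio_set:
 *
 *	my_bio_set = bioset_create(BIO_POOL_SIZE, 2);
 *	if (!my_bio_set)
 *		return -ENOMEM;
 *	bio = bio_alloc_bioset(GFP_NOIO, nr_vecs, my_bio_set);
 *	// bios allocated this way need a bi_destructor that calls
 *	// bio_free(bio, my_bio_set), mirroring bio_fs_destructor() above
 */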