[XFS] Initial pass at going directly-to-bio on the buffered IO path. This allows us to submit much larger I/Os instead of sending down lots of small buffer_heads. To do this we need to have a rather complicated I/O submission and completion tracking infrastructure. Part of the latter has been merged already a long time ago for direct I/O support. Part of the problem is that we need to track sub-pagesize regions and for that we still need buffer_heads for the time being. Long-term I hope we can move to better data structures and/or maybe move this to fs/mpage.c instead of having it in XFS. Original patch from Nathan Scott with various updates from David Chinner and Christoph Hellwig.

SGI-PV: 947118
SGI-Modid: xfs-linux-melb:xfs-kern:203822a

Signed-off-by: Christoph Hellwig <hch@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>

authored by Christoph Hellwig and committed by Nathan Scott f6d6d4fc ce8e922c

+431 -364
+421 -355
fs/xfs/linux-2.6/xfs_aops.c
··· 43 43 #include <linux/writeback.h> 44 44 45 45 STATIC void xfs_count_page_state(struct page *, int *, int *, int *); 46 - STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *, 47 - struct writeback_control *wbc, void *, int, int); 48 46 49 47 #if defined(XFS_RW_TRACE) 50 48 void ··· 56 58 bhv_desc_t *bdp; 57 59 vnode_t *vp = LINVFS_GET_VP(inode); 58 60 loff_t isize = i_size_read(inode); 59 - loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; 61 + loff_t offset = page_offset(page); 60 62 int delalloc = -1, unmapped = -1, unwritten = -1; 61 63 62 64 if (page_has_buffers(page)) ··· 101 103 queue_work(xfsdatad_workqueue, &ioend->io_work); 102 104 } 103 105 106 + /* 107 + * We're now finished for good with this ioend structure. 108 + * Update the page state via the associated buffer_heads, 109 + * release holds on the inode and bio, and finally free 110 + * up memory. Do not use the ioend after this. 111 + */ 104 112 STATIC void 105 113 xfs_destroy_ioend( 106 114 xfs_ioend_t *ioend) 107 115 { 116 + struct buffer_head *bh, *next; 117 + 118 + for (bh = ioend->io_buffer_head; bh; bh = next) { 119 + next = bh->b_private; 120 + bh->b_end_io(bh, ioend->io_uptodate); 121 + } 122 + 108 123 vn_iowake(ioend->io_vnode); 109 124 mempool_free(ioend, xfs_ioend_pool); 110 125 } 111 126 112 127 /* 128 + * Buffered IO write completion for delayed allocate extents. 129 + * TODO: Update ondisk isize now that we know the file data 130 + * has been flushed (i.e. the notorious "NULL file" problem). 131 + */ 132 + STATIC void 133 + xfs_end_bio_delalloc( 134 + void *data) 135 + { 136 + xfs_ioend_t *ioend = data; 137 + 138 + xfs_destroy_ioend(ioend); 139 + } 140 + 141 + /* 142 + * Buffered IO write completion for regular, written extents. 143 + */ 144 + STATIC void 145 + xfs_end_bio_written( 146 + void *data) 147 + { 148 + xfs_ioend_t *ioend = data; 149 + 150 + xfs_destroy_ioend(ioend); 151 + } 152 + 153 + /* 154 + * IO write completion for unwritten extents. 
155 + * 113 156 * Issue transactions to convert a buffer range from unwritten 114 157 * to written extents. 115 158 */ ··· 162 123 vnode_t *vp = ioend->io_vnode; 163 124 xfs_off_t offset = ioend->io_offset; 164 125 size_t size = ioend->io_size; 165 - struct buffer_head *bh, *next; 166 126 int error; 167 127 168 128 if (ioend->io_uptodate) 169 129 VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error); 170 - 171 - /* ioend->io_buffer_head is only non-NULL for buffered I/O */ 172 - for (bh = ioend->io_buffer_head; bh; bh = next) { 173 - next = bh->b_private; 174 - 175 - bh->b_end_io = NULL; 176 - clear_buffer_unwritten(bh); 177 - end_buffer_async_write(bh, ioend->io_uptodate); 178 - } 179 - 180 130 xfs_destroy_ioend(ioend); 181 131 } 182 132 ··· 177 149 */ 178 150 STATIC xfs_ioend_t * 179 151 xfs_alloc_ioend( 180 - struct inode *inode) 152 + struct inode *inode, 153 + unsigned int type) 181 154 { 182 155 xfs_ioend_t *ioend; 183 156 ··· 191 162 */ 192 163 atomic_set(&ioend->io_remaining, 1); 193 164 ioend->io_uptodate = 1; /* cleared if any I/O fails */ 165 + ioend->io_list = NULL; 166 + ioend->io_type = type; 194 167 ioend->io_vnode = LINVFS_GET_VP(inode); 195 168 ioend->io_buffer_head = NULL; 169 + ioend->io_buffer_tail = NULL; 196 170 atomic_inc(&ioend->io_vnode->v_iocount); 197 171 ioend->io_offset = 0; 198 172 ioend->io_size = 0; 199 173 200 - INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); 174 + if (type == IOMAP_UNWRITTEN) 175 + INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); 176 + else if (type == IOMAP_DELAY) 177 + INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc, ioend); 178 + else 179 + INIT_WORK(&ioend->io_work, xfs_end_bio_written, ioend); 201 180 202 181 return ioend; 203 - } 204 - 205 - void 206 - linvfs_unwritten_done( 207 - struct buffer_head *bh, 208 - int uptodate) 209 - { 210 - xfs_ioend_t *ioend = bh->b_private; 211 - static spinlock_t unwritten_done_lock = SPIN_LOCK_UNLOCKED; 212 - unsigned long flags; 213 - 214 - 
ASSERT(buffer_unwritten(bh)); 215 - bh->b_end_io = NULL; 216 - 217 - if (!uptodate) 218 - ioend->io_uptodate = 0; 219 - 220 - /* 221 - * Deep magic here. We reuse b_private in the buffer_heads to build 222 - * a chain for completing the I/O from user context after we've issued 223 - * a transaction to convert the unwritten extent. 224 - */ 225 - spin_lock_irqsave(&unwritten_done_lock, flags); 226 - bh->b_private = ioend->io_buffer_head; 227 - ioend->io_buffer_head = bh; 228 - spin_unlock_irqrestore(&unwritten_done_lock, flags); 229 - 230 - xfs_finish_ioend(ioend); 231 182 } 232 183 233 184 STATIC int ··· 237 228 xfs_iomap_t *iomapp, 238 229 unsigned long offset) 239 230 { 240 - loff_t full_offset; /* offset from start of file */ 231 + xfs_off_t full_offset; /* offset from start of file */ 241 232 242 233 ASSERT(offset < PAGE_CACHE_SIZE); 243 234 ··· 252 243 return NULL; 253 244 } 254 245 246 + /* 247 + * BIO completion handler for buffered IO. 248 + */ 249 + STATIC int 250 + xfs_end_bio( 251 + struct bio *bio, 252 + unsigned int bytes_done, 253 + int error) 254 + { 255 + xfs_ioend_t *ioend = bio->bi_private; 256 + 257 + if (bio->bi_size) 258 + return 1; 259 + 260 + ASSERT(ioend); 261 + ASSERT(atomic_read(&bio->bi_cnt) >= 1); 262 + 263 + /* Toss bio and pass work off to an xfsdatad thread */ 264 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 265 + ioend->io_uptodate = 0; 266 + bio->bi_private = NULL; 267 + bio->bi_end_io = NULL; 268 + 269 + bio_put(bio); 270 + xfs_finish_ioend(ioend); 271 + return 0; 272 + } 273 + 274 + STATIC void 275 + xfs_submit_ioend_bio( 276 + xfs_ioend_t *ioend, 277 + struct bio *bio) 278 + { 279 + atomic_inc(&ioend->io_remaining); 280 + 281 + bio->bi_private = ioend; 282 + bio->bi_end_io = xfs_end_bio; 283 + 284 + submit_bio(WRITE, bio); 285 + ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); 286 + bio_put(bio); 287 + } 288 + 289 + STATIC struct bio * 290 + xfs_alloc_ioend_bio( 291 + struct buffer_head *bh) 292 + { 293 + struct bio *bio; 294 + int 
nvecs = bio_get_nr_vecs(bh->b_bdev); 295 + 296 + do { 297 + bio = bio_alloc(GFP_NOIO, nvecs); 298 + nvecs >>= 1; 299 + } while (!bio); 300 + 301 + ASSERT(bio->bi_private == NULL); 302 + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 303 + bio->bi_bdev = bh->b_bdev; 304 + bio_get(bio); 305 + return bio; 306 + } 307 + 308 + STATIC void 309 + xfs_start_buffer_writeback( 310 + struct buffer_head *bh) 311 + { 312 + ASSERT(buffer_mapped(bh)); 313 + ASSERT(buffer_locked(bh)); 314 + ASSERT(!buffer_delay(bh)); 315 + ASSERT(!buffer_unwritten(bh)); 316 + 317 + mark_buffer_async_write(bh); 318 + set_buffer_uptodate(bh); 319 + clear_buffer_dirty(bh); 320 + } 321 + 322 + STATIC void 323 + xfs_start_page_writeback( 324 + struct page *page, 325 + struct writeback_control *wbc, 326 + int clear_dirty, 327 + int buffers) 328 + { 329 + ASSERT(PageLocked(page)); 330 + ASSERT(!PageWriteback(page)); 331 + set_page_writeback(page); 332 + if (clear_dirty) 333 + clear_page_dirty(page); 334 + unlock_page(page); 335 + if (!buffers) { 336 + end_page_writeback(page); 337 + wbc->pages_skipped++; /* We didn't write this page */ 338 + } 339 + } 340 + 341 + static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) 342 + { 343 + return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); 344 + } 345 + 346 + /* 347 + * Submit all of the bios for all of the ioends we have saved up, 348 + * covering the initial writepage page and also any probed pages. 
349 + */ 350 + STATIC void 351 + xfs_submit_ioend( 352 + xfs_ioend_t *ioend) 353 + { 354 + xfs_ioend_t *next; 355 + struct buffer_head *bh; 356 + struct bio *bio; 357 + sector_t lastblock = 0; 358 + 359 + do { 360 + next = ioend->io_list; 361 + bio = NULL; 362 + 363 + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 364 + xfs_start_buffer_writeback(bh); 365 + 366 + if (!bio) { 367 + retry: 368 + bio = xfs_alloc_ioend_bio(bh); 369 + } else if (bh->b_blocknr != lastblock + 1) { 370 + xfs_submit_ioend_bio(ioend, bio); 371 + goto retry; 372 + } 373 + 374 + if (bio_add_buffer(bio, bh) != bh->b_size) { 375 + xfs_submit_ioend_bio(ioend, bio); 376 + goto retry; 377 + } 378 + 379 + lastblock = bh->b_blocknr; 380 + } 381 + if (bio) 382 + xfs_submit_ioend_bio(ioend, bio); 383 + xfs_finish_ioend(ioend); 384 + } while ((ioend = next) != NULL); 385 + } 386 + 387 + /* 388 + * Cancel submission of all buffer_heads so far in this endio. 389 + * Toss the endio too. Only ever called for the initial page 390 + * in a writepage request, so only ever one page. 391 + */ 392 + STATIC void 393 + xfs_cancel_ioend( 394 + xfs_ioend_t *ioend) 395 + { 396 + xfs_ioend_t *next; 397 + struct buffer_head *bh, *next_bh; 398 + 399 + do { 400 + next = ioend->io_list; 401 + bh = ioend->io_buffer_head; 402 + do { 403 + next_bh = bh->b_private; 404 + clear_buffer_async_write(bh); 405 + unlock_buffer(bh); 406 + } while ((bh = next_bh) != NULL); 407 + 408 + vn_iowake(ioend->io_vnode); 409 + mempool_free(ioend, xfs_ioend_pool); 410 + } while ((ioend = next) != NULL); 411 + } 412 + 413 + /* 414 + * Test to see if we've been building up a completion structure for 415 + * earlier buffers -- if so, we try to append to this ioend if we 416 + * can, otherwise we finish off any current ioend and start another. 417 + * Return true if we've finished the given ioend. 
418 + */ 419 + STATIC void 420 + xfs_add_to_ioend( 421 + struct inode *inode, 422 + struct buffer_head *bh, 423 + unsigned int p_offset, 424 + unsigned int type, 425 + xfs_ioend_t **result, 426 + int need_ioend) 427 + { 428 + xfs_ioend_t *ioend = *result; 429 + 430 + if (!ioend || need_ioend || type != ioend->io_type) { 431 + xfs_ioend_t *previous = *result; 432 + xfs_off_t offset; 433 + 434 + offset = (xfs_off_t)bh->b_page->index << PAGE_CACHE_SHIFT; 435 + offset += p_offset; 436 + ioend = xfs_alloc_ioend(inode, type); 437 + ioend->io_offset = offset; 438 + ioend->io_buffer_head = bh; 439 + ioend->io_buffer_tail = bh; 440 + if (previous) 441 + previous->io_list = ioend; 442 + *result = ioend; 443 + } else { 444 + ioend->io_buffer_tail->b_private = bh; 445 + ioend->io_buffer_tail = bh; 446 + } 447 + 448 + bh->b_private = NULL; 449 + ioend->io_size += bh->b_size; 450 + } 451 + 255 452 STATIC void 256 453 xfs_map_at_offset( 257 454 struct page *page, 258 455 struct buffer_head *bh, 259 456 unsigned long offset, 260 457 int block_bits, 261 - xfs_iomap_t *iomapp) 458 + xfs_iomap_t *iomapp, 459 + xfs_ioend_t *ioend) 262 460 { 263 461 xfs_daddr_t bn; 264 - loff_t delta; 462 + xfs_off_t delta; 265 463 int sector_shift; 266 464 267 465 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); ··· 492 276 bh->b_bdev = iomapp->iomap_target->bt_bdev; 493 277 set_buffer_mapped(bh); 494 278 clear_buffer_delay(bh); 495 - } 496 - 497 - /* 498 - * Look for a page at index which is unlocked and contains our 499 - * unwritten extent flagged buffers at its head. Returns page 500 - * locked and with an extra reference count, and length of the 501 - * unwritten extent component on this page that we can write, 502 - * in units of filesystem blocks. 
503 - */ 504 - STATIC struct page * 505 - xfs_probe_unwritten_page( 506 - struct address_space *mapping, 507 - pgoff_t index, 508 - xfs_iomap_t *iomapp, 509 - xfs_ioend_t *ioend, 510 - unsigned long max_offset, 511 - unsigned long *fsbs, 512 - unsigned int bbits) 513 - { 514 - struct page *page; 515 - 516 - page = find_trylock_page(mapping, index); 517 - if (!page) 518 - return NULL; 519 - if (PageWriteback(page)) 520 - goto out; 521 - 522 - if (page->mapping && page_has_buffers(page)) { 523 - struct buffer_head *bh, *head; 524 - unsigned long p_offset = 0; 525 - 526 - *fsbs = 0; 527 - bh = head = page_buffers(page); 528 - do { 529 - if (!buffer_unwritten(bh) || !buffer_uptodate(bh)) 530 - break; 531 - if (!xfs_offset_to_map(page, iomapp, p_offset)) 532 - break; 533 - if (p_offset >= max_offset) 534 - break; 535 - xfs_map_at_offset(page, bh, p_offset, bbits, iomapp); 536 - set_buffer_unwritten_io(bh); 537 - bh->b_private = ioend; 538 - p_offset += bh->b_size; 539 - (*fsbs)++; 540 - } while ((bh = bh->b_this_page) != head); 541 - 542 - if (p_offset) 543 - return page; 544 - } 545 - 546 - out: 547 - unlock_page(page); 548 - return NULL; 279 + clear_buffer_unwritten(bh); 549 280 } 550 281 551 282 /* ··· 535 372 return ret; 536 373 } 537 374 538 - STATIC unsigned int 375 + STATIC size_t 539 376 xfs_probe_unmapped_cluster( 540 377 struct inode *inode, 541 378 struct page *startpage, 542 379 struct buffer_head *bh, 543 380 struct buffer_head *head) 544 381 { 382 + size_t len, total = 0; 545 383 pgoff_t tindex, tlast, tloff; 546 - unsigned int pg_offset, len, total = 0; 384 + unsigned int pg_offset; 547 385 struct address_space *mapping = inode->i_mapping; 548 386 549 387 /* First sum forwards in this page */ ··· 578 414 } 579 415 580 416 /* 581 - * Probe for a given page (index) in the inode and test if it is delayed 582 - * and without unwritten buffers. Returns page locked and with an extra 583 - * reference count. 
417 + * Probe for a given page (index) in the inode and test if it is suitable 418 + * for writing as part of an unwritten or delayed allocate extent. 419 + * Returns page locked and with an extra reference count if so, else NULL. 584 420 */ 585 421 STATIC struct page * 586 - xfs_probe_delalloc_page( 422 + xfs_probe_delayed_page( 587 423 struct inode *inode, 588 - pgoff_t index) 424 + pgoff_t index, 425 + unsigned int type) 589 426 { 590 427 struct page *page; 591 428 ··· 602 437 603 438 bh = head = page_buffers(page); 604 439 do { 605 - if (buffer_unwritten(bh)) { 606 - acceptable = 0; 440 + if (buffer_unwritten(bh)) 441 + acceptable = (type == IOMAP_UNWRITTEN); 442 + else if (buffer_delay(bh)) 443 + acceptable = (type == IOMAP_DELAY); 444 + else 607 445 break; 608 - } else if (buffer_delay(bh)) { 609 - acceptable = 1; 610 - } 611 446 } while ((bh = bh->b_this_page) != head); 612 447 613 448 if (acceptable) ··· 619 454 return NULL; 620 455 } 621 456 622 - STATIC int 623 - xfs_map_unwritten( 624 - struct inode *inode, 625 - struct page *start_page, 626 - struct buffer_head *head, 627 - struct buffer_head *curr, 628 - unsigned long p_offset, 629 - int block_bits, 630 - xfs_iomap_t *iomapp, 631 - struct writeback_control *wbc, 632 - int startio, 633 - int all_bh) 634 - { 635 - struct buffer_head *bh = curr; 636 - xfs_iomap_t *tmp; 637 - xfs_ioend_t *ioend; 638 - loff_t offset; 639 - unsigned long nblocks = 0; 640 - 641 - offset = start_page->index; 642 - offset <<= PAGE_CACHE_SHIFT; 643 - offset += p_offset; 644 - 645 - ioend = xfs_alloc_ioend(inode); 646 - 647 - /* First map forwards in the page consecutive buffers 648 - * covering this unwritten extent 649 - */ 650 - do { 651 - if (!buffer_unwritten(bh)) 652 - break; 653 - tmp = xfs_offset_to_map(start_page, iomapp, p_offset); 654 - if (!tmp) 655 - break; 656 - xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp); 657 - set_buffer_unwritten_io(bh); 658 - bh->b_private = ioend; 659 - p_offset += 
bh->b_size; 660 - nblocks++; 661 - } while ((bh = bh->b_this_page) != head); 662 - 663 - atomic_add(nblocks, &ioend->io_remaining); 664 - 665 - /* If we reached the end of the page, map forwards in any 666 - * following pages which are also covered by this extent. 667 - */ 668 - if (bh == head) { 669 - struct address_space *mapping = inode->i_mapping; 670 - pgoff_t tindex, tloff, tlast; 671 - unsigned long bs; 672 - unsigned int pg_offset, bbits = inode->i_blkbits; 673 - struct page *page; 674 - 675 - tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; 676 - tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT; 677 - tloff = min(tlast, tloff); 678 - for (tindex = start_page->index + 1; tindex < tloff; tindex++) { 679 - page = xfs_probe_unwritten_page(mapping, 680 - tindex, iomapp, ioend, 681 - PAGE_CACHE_SIZE, &bs, bbits); 682 - if (!page) 683 - break; 684 - nblocks += bs; 685 - atomic_add(bs, &ioend->io_remaining); 686 - xfs_convert_page(inode, page, iomapp, wbc, ioend, 687 - startio, all_bh); 688 - /* stop if converting the next page might add 689 - * enough blocks that the corresponding byte 690 - * count won't fit in our ulong page buf length */ 691 - if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) 692 - goto enough; 693 - } 694 - 695 - if (tindex == tlast && 696 - (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) { 697 - page = xfs_probe_unwritten_page(mapping, 698 - tindex, iomapp, ioend, 699 - pg_offset, &bs, bbits); 700 - if (page) { 701 - nblocks += bs; 702 - atomic_add(bs, &ioend->io_remaining); 703 - xfs_convert_page(inode, page, iomapp, wbc, ioend, 704 - startio, all_bh); 705 - if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) 706 - goto enough; 707 - } 708 - } 709 - } 710 - 711 - enough: 712 - ioend->io_size = (xfs_off_t)nblocks << block_bits; 713 - ioend->io_offset = offset; 714 - xfs_finish_ioend(ioend); 715 - return 0; 716 - } 717 - 718 - STATIC void 719 - xfs_submit_page( 720 - struct page *page, 721 - struct 
writeback_control *wbc, 722 - struct buffer_head *bh_arr[], 723 - int bh_count, 724 - int probed_page, 725 - int clear_dirty) 726 - { 727 - struct buffer_head *bh; 728 - int i; 729 - 730 - BUG_ON(PageWriteback(page)); 731 - if (bh_count) 732 - set_page_writeback(page); 733 - if (clear_dirty) 734 - clear_page_dirty(page); 735 - unlock_page(page); 736 - 737 - if (bh_count) { 738 - for (i = 0; i < bh_count; i++) { 739 - bh = bh_arr[i]; 740 - mark_buffer_async_write(bh); 741 - if (buffer_unwritten(bh)) 742 - set_buffer_unwritten_io(bh); 743 - set_buffer_uptodate(bh); 744 - clear_buffer_dirty(bh); 745 - } 746 - 747 - for (i = 0; i < bh_count; i++) 748 - submit_bh(WRITE, bh_arr[i]); 749 - 750 - if (probed_page && clear_dirty) 751 - wbc->nr_to_write--; /* Wrote an "extra" page */ 752 - } 753 - } 754 - 755 457 /* 756 458 * Allocate & map buffers for page given the extent map. Write it out. 757 459 * except for the original page of a writepage, this is called on 758 460 * delalloc/unwritten pages only, for the original page it is possible 759 461 * that the page has no mapping at all. 
760 462 */ 761 - STATIC void 463 + STATIC int 762 464 xfs_convert_page( 763 465 struct inode *inode, 764 466 struct page *page, 765 467 xfs_iomap_t *iomapp, 468 + xfs_ioend_t **ioendp, 766 469 struct writeback_control *wbc, 767 470 void *private, 768 471 int startio, 769 472 int all_bh) 770 473 { 771 - struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; 474 + struct buffer_head *bh, *head; 772 475 xfs_iomap_t *mp = iomapp, *tmp; 773 - unsigned long offset, end_offset; 774 - int index = 0; 476 + unsigned long p_offset, end_offset; 477 + unsigned int type; 775 478 int bbits = inode->i_blkbits; 776 479 int len, page_dirty; 480 + int count = 0, done = 0, uptodate = 1; 777 481 778 482 end_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)); 779 483 ··· 655 621 end_offset = roundup(end_offset, len); 656 622 page_dirty = end_offset / len; 657 623 658 - offset = 0; 624 + p_offset = 0; 659 625 bh = head = page_buffers(page); 660 626 do { 661 - if (offset >= end_offset) 627 + if (p_offset >= end_offset) 662 628 break; 663 - if (!(PageUptodate(page) || buffer_uptodate(bh))) 629 + if (!buffer_uptodate(bh)) 630 + uptodate = 0; 631 + if (!(PageUptodate(page) || buffer_uptodate(bh))) { 632 + done = 1; 664 633 continue; 665 - if (buffer_mapped(bh) && all_bh && 666 - !(buffer_unwritten(bh) || buffer_delay(bh))) { 667 - if (startio) { 634 + } 635 + 636 + if (buffer_unwritten(bh)) 637 + type = IOMAP_UNWRITTEN; 638 + else if (buffer_delay(bh)) 639 + type = IOMAP_DELAY; 640 + else { 641 + type = 0; 642 + if (!(buffer_mapped(bh) && all_bh && startio)) { 643 + done = 1; 644 + } else if (startio) { 668 645 lock_buffer(bh); 669 - bh_arr[index++] = bh; 646 + xfs_add_to_ioend(inode, bh, p_offset, 647 + type, ioendp, done); 648 + count++; 670 649 page_dirty--; 671 650 } 672 651 continue; 673 652 } 674 - tmp = xfs_offset_to_map(page, mp, offset); 675 - if (!tmp) 653 + tmp = xfs_offset_to_map(page, mp, p_offset); 654 + if (!tmp) { 655 + done = 1; 676 656 continue; 657 + } 677 658 
ASSERT(!(tmp->iomap_flags & IOMAP_HOLE)); 678 659 ASSERT(!(tmp->iomap_flags & IOMAP_DELAY)); 679 660 680 - /* If this is a new unwritten extent buffer (i.e. one 681 - * that we haven't passed in private data for, we must 682 - * now map this buffer too. 683 - */ 684 - if (buffer_unwritten(bh) && !bh->b_end_io) { 685 - ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN); 686 - xfs_map_unwritten(inode, page, head, bh, offset, 687 - bbits, tmp, wbc, startio, all_bh); 688 - } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) { 689 - xfs_map_at_offset(page, bh, offset, bbits, tmp); 690 - if (buffer_unwritten(bh)) { 691 - set_buffer_unwritten_io(bh); 692 - bh->b_private = private; 693 - ASSERT(private); 694 - } 695 - } 661 + xfs_map_at_offset(page, bh, p_offset, bbits, tmp, *ioendp); 696 662 if (startio) { 697 - bh_arr[index++] = bh; 663 + xfs_add_to_ioend(inode, bh, p_offset, 664 + type, ioendp, done); 665 + count++; 698 666 } else { 699 667 set_buffer_dirty(bh); 700 668 unlock_buffer(bh); 701 669 mark_buffer_dirty(bh); 702 670 } 703 671 page_dirty--; 704 - } while (offset += len, (bh = bh->b_this_page) != head); 672 + } while (p_offset += len, (bh = bh->b_this_page) != head); 705 673 706 - if (startio && index) { 707 - xfs_submit_page(page, wbc, bh_arr, index, 1, !page_dirty); 708 - } else { 709 - unlock_page(page); 674 + if (uptodate && bh == head) 675 + SetPageUptodate(page); 676 + 677 + if (startio) { 678 + if (count) 679 + wbc->nr_to_write--; 680 + xfs_start_page_writeback(page, wbc, !page_dirty, count); 710 681 } 682 + 683 + return done; 711 684 } 712 685 713 686 /* ··· 726 685 struct inode *inode, 727 686 pgoff_t tindex, 728 687 xfs_iomap_t *iomapp, 688 + xfs_ioend_t **ioendp, 729 689 struct writeback_control *wbc, 730 690 int startio, 731 691 int all_bh, 732 692 pgoff_t tlast) 733 693 { 734 694 struct page *page; 695 + unsigned int type = (*ioendp)->io_type; 696 + int done; 735 697 736 - for (; tindex <= tlast; tindex++) { 737 - page = 
xfs_probe_delalloc_page(inode, tindex); 698 + for (done = 0; tindex <= tlast && !done; tindex++) { 699 + page = xfs_probe_delayed_page(inode, tindex, type); 738 700 if (!page) 739 701 break; 740 - xfs_convert_page(inode, page, iomapp, wbc, NULL, 741 - startio, all_bh); 702 + done = xfs_convert_page(inode, page, iomapp, ioendp, 703 + wbc, NULL, startio, all_bh); 742 704 } 743 705 } 744 706 ··· 772 728 int startio, 773 729 int unmapped) /* also implies page uptodate */ 774 730 { 775 - struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; 731 + struct buffer_head *bh, *head; 776 732 xfs_iomap_t *iomp, iomap; 733 + xfs_ioend_t *ioend = NULL, *iohead = NULL; 777 734 loff_t offset; 778 735 unsigned long p_offset = 0; 736 + unsigned int type; 779 737 __uint64_t end_offset; 780 738 pgoff_t end_index, last_index, tlast; 781 - int len, err, i, cnt = 0, uptodate = 1; 782 - int flags; 783 - int page_dirty; 739 + int flags, len, err, done = 1; 740 + int uptodate = 1; 741 + int page_dirty, count = 0, trylock_flag = 0; 784 742 785 743 /* wait for other IO threads? */ 786 - flags = (startio && wbc->sync_mode != WB_SYNC_NONE) ? 0 : BMAPI_TRYLOCK; 744 + if (startio && wbc->sync_mode != WB_SYNC_NONE) 745 + trylock_flag |= BMAPI_TRYLOCK; 787 746 788 747 /* Is this page beyond the end of the file? */ 789 748 offset = i_size_read(inode); ··· 801 754 } 802 755 } 803 756 804 - end_offset = min_t(unsigned long long, 805 - (loff_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); 806 - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; 807 - 808 757 /* 809 758 * page_dirty is initially a count of buffers on the page before 810 759 * EOF and is decrememted as we move each into a cleanable state. 811 - */ 760 + * 761 + * Derivation: 762 + * 763 + * End offset is the highest offset that this page should represent. 
764 + * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) 765 + * will evaluate non-zero and be less than PAGE_CACHE_SIZE and 766 + * hence give us the correct page_dirty count. On any other page, 767 + * it will be zero and in that case we need page_dirty to be the 768 + * count of buffers on the page. 769 + */ 770 + end_offset = min_t(unsigned long long, 771 + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); 812 772 len = 1 << inode->i_blkbits; 813 - p_offset = max(p_offset, PAGE_CACHE_SIZE); 814 - p_offset = roundup(p_offset, len); 773 + p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), 774 + PAGE_CACHE_SIZE); 775 + p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; 815 776 page_dirty = p_offset / len; 816 777 817 778 iomp = NULL; 818 - p_offset = 0; 819 779 bh = head = page_buffers(page); 780 + offset = page_offset(page); 781 + 782 + /* TODO: fix up "done" variable and iomap pointer (boolean) */ 783 + /* TODO: cleanup count and page_dirty */ 820 784 821 785 do { 822 786 if (offset >= end_offset) 823 787 break; 824 788 if (!buffer_uptodate(bh)) 825 789 uptodate = 0; 826 - if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) 790 + if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { 791 + done = 1; 827 792 continue; 793 + } 828 794 829 795 if (iomp) { 830 796 iomp = xfs_offset_to_map(page, &iomap, p_offset); 797 + done = (iomp == NULL); 831 798 } 832 799 833 800 /* 834 801 * First case, map an unwritten extent and prepare for 835 802 * extent state conversion transaction on completion. 
836 - */ 837 - if (buffer_unwritten(bh)) { 838 - if (!startio) 839 - continue; 840 - if (!iomp) { 841 - err = xfs_map_blocks(inode, offset, len, &iomap, 842 - BMAPI_WRITE|BMAPI_IGNSTATE); 843 - if (err) { 844 - goto error; 845 - } 846 - iomp = xfs_offset_to_map(page, &iomap, 847 - p_offset); 848 - } 849 - if (iomp) { 850 - if (!bh->b_end_io) { 851 - err = xfs_map_unwritten(inode, page, 852 - head, bh, p_offset, 853 - inode->i_blkbits, iomp, 854 - wbc, startio, unmapped); 855 - if (err) { 856 - goto error; 857 - } 858 - } else { 859 - set_bit(BH_Lock, &bh->b_state); 860 - } 861 - BUG_ON(!buffer_locked(bh)); 862 - bh_arr[cnt++] = bh; 863 - page_dirty--; 864 - } 865 - /* 803 + * 866 804 * Second case, allocate space for a delalloc buffer. 867 805 * We can return EAGAIN here in the release page case. 868 806 */ 869 - } else if (buffer_delay(bh)) { 807 + if (buffer_unwritten(bh) || buffer_delay(bh)) { 808 + if (buffer_unwritten(bh)) { 809 + type = IOMAP_UNWRITTEN; 810 + flags = BMAPI_WRITE|BMAPI_IGNSTATE; 811 + } else { 812 + type = IOMAP_DELAY; 813 + flags = BMAPI_ALLOCATE; 814 + if (!startio) 815 + flags |= trylock_flag; 816 + } 817 + 870 818 if (!iomp) { 819 + done = 1; 871 820 err = xfs_map_blocks(inode, offset, len, &iomap, 872 - BMAPI_ALLOCATE | flags); 873 - if (err) { 821 + flags); 822 + if (err) 874 823 goto error; 875 - } 876 824 iomp = xfs_offset_to_map(page, &iomap, 877 825 p_offset); 826 + done = (iomp == NULL); 878 827 } 879 828 if (iomp) { 880 829 xfs_map_at_offset(page, bh, p_offset, 881 - inode->i_blkbits, iomp); 830 + inode->i_blkbits, iomp, ioend); 882 831 if (startio) { 883 - bh_arr[cnt++] = bh; 832 + xfs_add_to_ioend(inode, bh, p_offset, 833 + type, &ioend, done); 884 834 } else { 885 835 set_buffer_dirty(bh); 886 836 unlock_buffer(bh); 887 837 mark_buffer_dirty(bh); 888 838 } 889 839 page_dirty--; 840 + count++; 841 + } else { 842 + done = 1; 890 843 } 891 844 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 892 845 (unmapped || startio)) 
{ 893 846 847 + type = 0; 894 848 if (!buffer_mapped(bh)) { 895 - int size; 896 849 897 850 /* 898 851 * Getting here implies an unmapped buffer ··· 900 853 * need to write the whole page out. 901 854 */ 902 855 if (!iomp) { 856 + int size; 857 + 903 858 size = xfs_probe_unmapped_cluster( 904 859 inode, page, bh, head); 905 860 err = xfs_map_blocks(inode, offset, ··· 912 863 } 913 864 iomp = xfs_offset_to_map(page, &iomap, 914 865 p_offset); 866 + done = (iomp == NULL); 915 867 } 916 868 if (iomp) { 917 - xfs_map_at_offset(page, 918 - bh, p_offset, 919 - inode->i_blkbits, iomp); 869 + xfs_map_at_offset(page, bh, p_offset, 870 + inode->i_blkbits, iomp, 871 + ioend); 920 872 if (startio) { 921 - bh_arr[cnt++] = bh; 873 + xfs_add_to_ioend(inode, 874 + bh, p_offset, type, 875 + &ioend, done); 922 876 } else { 923 877 set_buffer_dirty(bh); 924 878 unlock_buffer(bh); 925 879 mark_buffer_dirty(bh); 926 880 } 927 881 page_dirty--; 882 + count++; 883 + } else { 884 + done = 1; 928 885 } 929 886 } else if (startio) { 930 887 if (buffer_uptodate(bh) && 931 888 !test_and_set_bit(BH_Lock, &bh->b_state)) { 932 - bh_arr[cnt++] = bh; 889 + ASSERT(buffer_mapped(bh)); 890 + xfs_add_to_ioend(inode, 891 + bh, p_offset, type, 892 + &ioend, done); 933 893 page_dirty--; 894 + count++; 895 + } else { 896 + done = 1; 934 897 } 898 + } else { 899 + done = 1; 935 900 } 936 901 } 937 - } while (offset += len, p_offset += len, 938 - ((bh = bh->b_this_page) != head)); 902 + 903 + if (!iohead) 904 + iohead = ioend; 905 + 906 + } while (offset += len, ((bh = bh->b_this_page) != head)); 939 907 940 908 if (uptodate && bh == head) 941 909 SetPageUptodate(page); 942 910 943 - if (startio) { 944 - xfs_submit_page(page, wbc, bh_arr, cnt, 0, !page_dirty); 945 - } 911 + if (startio) 912 + xfs_start_page_writeback(page, wbc, 1, count); 946 913 947 - if (iomp) { 914 + if (ioend && iomp && !done) { 948 915 offset = (iomp->iomap_offset + iomp->iomap_bsize - 1) >> 949 916 PAGE_CACHE_SHIFT; 950 917 tlast = 
min_t(pgoff_t, offset, last_index); 951 - xfs_cluster_write(inode, page->index + 1, iomp, wbc, 952 - startio, unmapped, tlast); 918 + xfs_cluster_write(inode, page->index + 1, iomp, &ioend, 919 + wbc, startio, unmapped, tlast); 953 920 } 921 + 922 + if (iohead) 923 + xfs_submit_ioend(iohead); 954 924 955 925 return page_dirty; 956 926 957 927 error: 958 - for (i = 0; i < cnt; i++) { 959 - unlock_buffer(bh_arr[i]); 960 - } 928 + if (iohead) 929 + xfs_cancel_ioend(iohead); 961 930 962 931 /* 963 932 * If it's delalloc and we have nowhere to put it, ··· 983 916 * us to try again. 984 917 */ 985 918 if (err != -EAGAIN) { 986 - if (!unmapped) { 919 + if (!unmapped) 987 920 block_invalidatepage(page, 0); 988 - } 989 921 ClearPageUptodate(page); 990 922 } 991 923 return err; ··· 1160 1094 if (error) 1161 1095 return -error; 1162 1096 1163 - iocb->private = xfs_alloc_ioend(inode); 1097 + iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); 1164 1098 1165 1099 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1166 1100 iomap.iomap_target->bt_bdev,
+10
fs/xfs/linux-2.6/xfs_aops.h
··· 23 23 24 24 typedef void (*xfs_ioend_func_t)(void *); 25 25 26 + /* 27 + * xfs_ioend struct manages large extent writes for XFS. 28 + * It can manage several multi-page bio's at once. 29 + */ 26 30 typedef struct xfs_ioend { 31 + struct xfs_ioend *io_list; /* next ioend in chain */ 32 + unsigned int io_type; /* delalloc / unwritten */ 27 33 unsigned int io_uptodate; /* I/O status register */ 28 34 atomic_t io_remaining; /* hold count */ 29 35 struct vnode *io_vnode; /* file being written to */ 30 36 struct buffer_head *io_buffer_head;/* buffer linked list head */ 37 + struct buffer_head *io_buffer_tail;/* buffer linked list tail */ 31 38 size_t io_size; /* size of the extent */ 32 39 xfs_off_t io_offset; /* offset in the file */ 33 40 struct work_struct io_work; /* xfsdatad work queue */ 34 41 } xfs_ioend_t; 42 + 43 + extern struct address_space_operations linvfs_aops; 44 + extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 35 45 36 46 #endif /* __XFS_IOPS_H__ */
-5
fs/xfs/linux-2.6/xfs_iops.h
··· 26 26 extern struct file_operations linvfs_invis_file_operations; 27 27 extern struct file_operations linvfs_dir_operations; 28 28 29 - extern struct address_space_operations linvfs_aops; 30 - 31 - extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 32 - extern void linvfs_unwritten_done(struct buffer_head *, int); 33 - 34 29 extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *, 35 30 int, unsigned int, void __user *); 36 31
-4
fs/xfs/linux-2.6/xfs_linux.h
··· 110 110 * delalloc and these ondisk-uninitialised buffers. 111 111 */ 112 112 BUFFER_FNS(PrivateStart, unwritten); 113 - static inline void set_buffer_unwritten_io(struct buffer_head *bh) 114 - { 115 - bh->b_end_io = linvfs_unwritten_done; 116 - } 117 113 118 114 #define restricted_chown xfs_params.restrict_chown.val 119 115 #define irix_sgid_inherit xfs_params.sgid_inherit.val