fuse: allow splice to move pages

When splicing buffers to the fuse device with SPLICE_F_MOVE, try to
move pages from the pipe buffer into the page cache.  This allows
populating the fuse filesystem's cache without ever touching the page
contents, i.e. it enables zero-copy reads.
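
On the userspace side, a fuse server could exploit this roughly as
follows (a minimal sketch; fuse_fd, pipefd and send_reply_spliced are
illustrative names, not part of this patch):

#define _GNU_SOURCE
#include <fcntl.h>

/*
 * Send one complete reply (header + data), already assembled in a
 * pipe, to the open /dev/fuse descriptor.  SPLICE_F_MOVE asks the
 * kernel to steal the pipe pages instead of copying them.
 */
static ssize_t send_reply_spliced(int fuse_fd, int pipefd[2],
				  size_t reply_len)
{
	return splice(pipefd[0], NULL, fuse_fd, NULL, reply_len,
		      SPLICE_F_MOVE);
}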

The following steps are performed when trying to move a page into the
page cache (a condensed sketch follows the list):

- buf->ops->confirm() to make sure the new page is uptodate
- buf->ops->steal() to try to remove the new page from its previous place
- remove_from_page_cache() on the old page
- add_to_page_cache_locked() on the new page
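
Condensed, this is what fuse_try_move_page() in the diff below does
(fallback and error paths omitted):

	err = buf->ops->confirm(cs->pipe, buf);	  /* step 1 */
	if (buf->ops->steal(cs->pipe, buf) != 0)  /* step 2 */
		goto out_fallback;		  /* copy instead */
	newpage = buf->page;
	remove_from_page_cache(oldpage);	  /* step 3 */
	err = add_to_page_cache_locked(newpage, mapping, index,
				       GFP_KERNEL); /* step 4 */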

If any of the above steps fails (non-fatally), the code falls back to
copying the page.  In particular, ->steal() will fail if the page has
external references (other than the page cache and the pipe buffer).
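
For anonymous pipe buffers the generic steal implementation in
fs/pipe.c (shown here for reference) succeeds only when the pipe
buffer holds the sole reference to the page; the page-cache-backed
variant in fs/splice.c additionally tries to detach the page from its
old mapping first:

int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/* only the pipe buffer itself may hold a reference */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}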

Also, since the remove_from_page_cache() + add_to_page_cache_locked()
pair is non-atomic, the page cache may be repopulated between the two
calls, in which case add_to_page_cache_locked() will fail.  This could
be fixed by creating a new, atomic replace_page_cache_page() function.
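
A rough, untested sketch of what such a helper might look like
(hypothetical; memcg and accounting details glossed over), holding the
mapping's tree_lock across delete and insert so no lookup can observe
an empty slot:

static int replace_page_cache_page(struct page *old, struct page *new,
				   gfp_t gfp_mask)
{
	struct address_space *mapping = old->mapping;
	pgoff_t index = old->index;
	int err;

	err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (err)
		return err;

	page_cache_get(new);
	new->mapping = mapping;
	new->index = index;

	spin_lock_irq(&mapping->tree_lock);
	__remove_from_page_cache(old);	/* frees old's radix tree slot */
	err = radix_tree_insert(&mapping->page_tree, index, new);
	BUG_ON(err);	/* cannot fail: slot freed under the same lock */
	mapping->nrpages++;
	__inc_zone_page_state(new, NR_FILE_PAGES);
	spin_unlock_irq(&mapping->tree_lock);
	radix_tree_preload_end();

	page_cache_release(old);	/* the page cache's reference */
	return 0;
}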

fuse_readpages_end() needed to be reworked so that it works even if
page->mapping is NULL for some or all pages, which can happen if
add_to_page_cache_locked() failed.

A number of sanity checks were added to make sure stolen pages don't
have unexpected flags set, etc.  These could eventually be moved into
generic splice/steal code.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>

 3 files changed, 167 insertions(+), 15 deletions(-)

fs/fuse/dev.c: +145 -6

···
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/swap.h>
+#include <linux/splice.h>

 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
···
 	void *mapaddr;
 	void *buf;
 	unsigned len;
+	unsigned move_pages:1;
 };

 static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
···
 	return ncpy;
 }

+static int fuse_check_page(struct page *page)
+{
+	if (page_mapcount(page) ||
+	    page->mapping != NULL ||
+	    page_count(page) != 1 ||
+	    (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+	     ~(1 << PG_locked |
+	       1 << PG_referenced |
+	       1 << PG_uptodate |
+	       1 << PG_lru |
+	       1 << PG_active |
+	       1 << PG_reclaim))) {
+		printk(KERN_WARNING "fuse: trying to steal weird page\n");
+		printk(KERN_WARNING "  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+		return 1;
+	}
+	return 0;
+}
+
+static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+{
+	int err;
+	struct page *oldpage = *pagep;
+	struct page *newpage;
+	struct pipe_buffer *buf = cs->pipebufs;
+	struct address_space *mapping;
+	pgoff_t index;
+
+	unlock_request(cs->fc, cs->req);
+	fuse_copy_finish(cs);
+
+	err = buf->ops->confirm(cs->pipe, buf);
+	if (err)
+		return err;
+
+	BUG_ON(!cs->nr_segs);
+	cs->currbuf = buf;
+	cs->len = buf->len;
+	cs->pipebufs++;
+	cs->nr_segs--;
+
+	if (cs->len != PAGE_SIZE)
+		goto out_fallback;
+
+	if (buf->ops->steal(cs->pipe, buf) != 0)
+		goto out_fallback;
+
+	newpage = buf->page;
+
+	if (WARN_ON(!PageUptodate(newpage)))
+		return -EIO;
+
+	ClearPageMappedToDisk(newpage);
+
+	if (fuse_check_page(newpage) != 0)
+		goto out_fallback_unlock;
+
+	mapping = oldpage->mapping;
+	index = oldpage->index;
+
+	/*
+	 * This is a new and locked page, it shouldn't be mapped or
+	 * have any special flags on it
+	 */
+	if (WARN_ON(page_mapped(oldpage)))
+		goto out_fallback_unlock;
+	if (WARN_ON(page_has_private(oldpage)))
+		goto out_fallback_unlock;
+	if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+		goto out_fallback_unlock;
+	if (WARN_ON(PageMlocked(oldpage)))
+		goto out_fallback_unlock;
+
+	remove_from_page_cache(oldpage);
+	page_cache_release(oldpage);
+
+	err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
+	if (err) {
+		printk(KERN_WARNING "fuse_try_move_page: failed to add page");
+		goto out_fallback_unlock;
+	}
+	page_cache_get(newpage);
+
+	if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+		lru_cache_add_file(newpage);
+
+	err = 0;
+	spin_lock(&cs->fc->lock);
+	if (cs->req->aborted)
+		err = -ENOENT;
+	else
+		*pagep = newpage;
+	spin_unlock(&cs->fc->lock);
+
+	if (err) {
+		unlock_page(newpage);
+		page_cache_release(newpage);
+		return err;
+	}
+
+	unlock_page(oldpage);
+	page_cache_release(oldpage);
+	cs->len = 0;
+
+	return 0;
+
+out_fallback_unlock:
+	unlock_page(newpage);
+out_fallback:
+	cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+	cs->buf = cs->mapaddr + buf->offset;
+
+	err = lock_request(cs->fc, cs->req);
+	if (err)
+		return err;
+
+	return 1;
+}
+
 /*
  * Copy a page in the request to/from the userspace buffer.  Must be
  * done atomically
  */
-static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
+static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 			  unsigned offset, unsigned count, int zeroing)
 {
+	int err;
+	struct page *page = *pagep;
+
 	if (page && zeroing && count < PAGE_SIZE) {
 		void *mapaddr = kmap_atomic(page, KM_USER1);
 		memset(mapaddr, 0, PAGE_SIZE);
···
 	}
 	while (count) {
 		if (!cs->len) {
-			int err = fuse_copy_fill(cs);
-			if (err)
-				return err;
+			if (cs->move_pages && page &&
+			    offset == 0 && count == PAGE_SIZE) {
+				err = fuse_try_move_page(cs, pagep);
+				if (err <= 0)
+					return err;
+			} else {
+				err = fuse_copy_fill(cs);
+				if (err)
+					return err;
+			}
 		}
 		if (page) {
 			void *mapaddr = kmap_atomic(page, KM_USER1);
···
 	unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);

 	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
-		struct page *page = req->pages[i];
-		int err = fuse_copy_page(cs, page, offset, count, zeroing);
+		int err;
+
+		err = fuse_copy_page(cs, &req->pages[i], offset, count,
+				     zeroing);
 		if (err)
 			return err;
···
 	req->out.h = oh;
 	req->locked = 1;
 	cs->req = req;
+	if (!req->out.page_replace)
+		cs->move_pages = 0;
 	spin_unlock(&fc->lock);

 	err = copy_out_args(cs, &req->out, nbytes);
···
 	cs.pipebufs = bufs;
 	cs.nr_segs = nbuf;
 	cs.pipe = pipe;
+
+	if (flags & SPLICE_F_MOVE)
+		cs.move_pages = 1;

 	ret = fuse_dev_do_write(fc, &cs, len);
fs/fuse/file.c: +19 -9

···
 	int i;
 	size_t count = req->misc.read.in.size;
 	size_t num_read = req->out.args[0].size;
-	struct inode *inode = req->pages[0]->mapping->host;
+	struct address_space *mapping = NULL;

-	/*
-	 * Short read means EOF. If file size is larger, truncate it
-	 */
-	if (!req->out.h.error && num_read < count) {
-		loff_t pos = page_offset(req->pages[0]) + num_read;
-		fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
+	for (i = 0; mapping == NULL && i < req->num_pages; i++)
+		mapping = req->pages[i]->mapping;
+
+	if (mapping) {
+		struct inode *inode = mapping->host;
+
+		/*
+		 * Short read means EOF. If file size is larger, truncate it
+		 */
+		if (!req->out.h.error && num_read < count) {
+			loff_t pos;
+
+			pos = page_offset(req->pages[0]) + num_read;
+			fuse_read_update_size(inode, pos,
+					      req->misc.read.attr_ver);
+		}
+		fuse_invalidate_attr(inode); /* atime changed */
 	}
-
-	fuse_invalidate_attr(inode); /* atime changed */

 	for (i = 0; i < req->num_pages; i++) {
 		struct page *page = req->pages[i];
···

 	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
+	req->out.page_replace = 1;
 	fuse_read_fill(req, file, pos, count, FUSE_READ);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
 	if (fc->async_read) {
fs/fuse/fuse_i.h: +3 -0

···
 	/** Zero partially or not copied pages */
 	unsigned page_zeroing:1;

+	/** Pages may be replaced with new ones */
+	unsigned page_replace:1;
+
 	/** Number or arguments */
 	unsigned numargs;