vfs: pagecache usage optimization for pagesize!=blocksize

When we read some part of a file through the pagecache, if a page exists
at the corresponding index but is not uptodate, read IO is issued and the
page is brought uptodate.

I think this is good for a pagesize == blocksize environment, but there is
room for improvement in a pagesize != blocksize environment, because in
this case a page can have multiple buffers, and even if the page is not
uptodate, some of its buffers can be uptodate.

So I suggest that when all buffers which correspond to a part of a file
that we want to read are uptodate, use this pagecache and copy data from
this pagecache to user buffer even if a page is not uptodate. This can
reduce read IO and improve system throughput.

I wrote a benchmark program and measured the following results with it.

This benchmark does the following:

1: mount and open a test file.

2: create a 512MB file.

3: close a file and umount.

4: mount and again open a test file.

5: pwrite at random offsets 300000 times on the test file. Each offset is
aligned to the IO size (1024 bytes).

6: measure the time taken to pread at random offsets 100000 times on the test file.

The result was:
2.6.26
330 sec

2.6.26-patched
226 sec

Arch:i386
Filesystem:ext3
Blocksize:1024 bytes
Memory: 1GB

On ext3/4, a file is written through buffers/blocks. So mixed random
read/write workloads, or random-read-after-random-write workloads, are
optimized by this patch in a pagesize != blocksize environment. The test
result above shows this.

The benchmark program is as follows:

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>

/* Size in bytes of every read/write the benchmark issues. */
#define LEN 1024
/*
 * Number of LEN-byte writes used to create the test file (512MB in total).
 * The expansion is parenthesized so that LOOP behaves as a single operand
 * in any expression (e.g. "x / LOOP" or "x % LOOP").
 */
#define LOOP (1024*512) /* 512MB */

/* Report msg together with the current errno text, then abort the benchmark. */
static void die(const char *msg)
{
	perror(msg);
	exit(EXIT_FAILURE);
}

/*
 * Validate the result of a LEN-byte read/write call.  A negative result is
 * reported through errno; a short transfer is reported explicitly because
 * errno is not meaningful in that case.  Either way the benchmark stops,
 * since a timing obtained from failed I/O would be meaningless.
 */
static void check_io(ssize_t n, const char *what)
{
	if (n < 0)
		die(what);
	if (n != LEN) {
		fprintf(stderr, "%s: short transfer\n", what);
		exit(EXIT_FAILURE);
	}
}

/*
 * Benchmark driver:
 *   1. mount the test filesystem and create a LEN * LOOP byte file of zeroes,
 *   2. unmount and mount again before reopening the file,
 *   3. pwrite LEN bytes at 300000 random LEN-aligned offsets,
 *   4. time 100000 preads of LEN bytes at random LEN-aligned offsets and
 *      print the elapsed wall-clock seconds.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any mount or I/O error.
 */
int main(void)
{
	unsigned long i, offset, filesize;
	int fd;
	char buf[LEN];
	time_t t1, t2;

	if (mount("/dev/sda1", "/root/test1/", "ext3", 0, NULL) < 0)
		die("cannot mount");

	memset(buf, 0, LEN);
	/* open() requires an explicit mode argument whenever O_CREAT is given. */
	fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC, 0644);
	if (fd < 0)
		die("cannot open file");
	for (i = 0; i < LOOP; i++)
		check_io(write(fd, buf, LEN), "cannot write file");
	if (close(fd) < 0)
		die("cannot close file");

	/* Unmount and mount again before the random-access phases. */
	if (umount("/root/test1/") < 0)
		die("cannot umount");
	if (mount("/dev/sda1", "/root/test1/", "ext3", 0, NULL) < 0)
		die("cannot mount");

	fd = open("/root/test1/testfile", O_RDWR);
	if (fd < 0)
		die("cannot open file");

	filesize = (unsigned long)LEN * LOOP;

	/* Random LEN-aligned writes; LEN is a power of two, so masking aligns. */
	for (i = 0; i < 300000; i++) {
		offset = (random() % filesize) & (~(unsigned long)(LEN - 1));
		check_io(pwrite(fd, buf, LEN, offset), "cannot pwrite file");
	}

	printf("start test\n");
	time(&t1);
	for (i = 0; i < 100000; i++) {
		offset = (random() % filesize) & (~(unsigned long)(LEN - 1));
		check_io(pread(fd, buf, LEN, offset), "cannot pread file");
	}
	time(&t2);
	/* time_t has no portable printf conversion; cast to match %ld. */
	printf("%ld sec\n", (long)(t2 - t1));

	if (close(fd) < 0)
		die("cannot close file");
	if (umount("/root/test1/") < 0)
		die("cannot umount");

	return 0;
}

Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jan Kara <jack@ucw.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by Hisashi Hifumi and committed by Linus Torvalds 8ab22b9a d84a52f6

+167 -99
+46
fs/buffer.c
··· 2096 2096 EXPORT_SYMBOL(generic_write_end); 2097 2097 2098 2098 /* 2099 + * block_is_partially_uptodate checks whether buffers within a page are 2100 + * uptodate or not. 2101 + * 2102 + * Returns true if all buffers which correspond to a file portion 2103 + * we want to read are uptodate. 2104 + */ 2105 + int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, 2106 + unsigned long from) 2107 + { 2108 + struct inode *inode = page->mapping->host; 2109 + unsigned block_start, block_end, blocksize; 2110 + unsigned to; 2111 + struct buffer_head *bh, *head; 2112 + int ret = 1; 2113 + 2114 + if (!page_has_buffers(page)) 2115 + return 0; 2116 + 2117 + blocksize = 1 << inode->i_blkbits; 2118 + to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); 2119 + to = from + to; 2120 + if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) 2121 + return 0; 2122 + 2123 + head = page_buffers(page); 2124 + bh = head; 2125 + block_start = 0; 2126 + do { 2127 + block_end = block_start + blocksize; 2128 + if (block_end > from && block_start < to) { 2129 + if (!buffer_uptodate(bh)) { 2130 + ret = 0; 2131 + break; 2132 + } 2133 + if (block_end >= to) 2134 + break; 2135 + } 2136 + block_start = block_end; 2137 + bh = bh->b_this_page; 2138 + } while (bh != head); 2139 + 2140 + return ret; 2141 + } 2142 + EXPORT_SYMBOL(block_is_partially_uptodate); 2143 + 2144 + /* 2099 2145 * Generic "read page" function for block devices that have the normal 2100 2146 * get_block functionality. This is most of the block device filesystems. 2101 2147 * Reads the page asynchronously --- the unlock_buffer() and
+1
fs/ext2/inode.c
··· 791 791 .direct_IO = ext2_direct_IO, 792 792 .writepages = ext2_writepages, 793 793 .migratepage = buffer_migrate_page, 794 + .is_partially_uptodate = block_is_partially_uptodate, 794 795 }; 795 796 796 797 const struct address_space_operations ext2_aops_xip = {
+35 -32
fs/ext3/inode.c
··· 1767 1767 } 1768 1768 1769 1769 static const struct address_space_operations ext3_ordered_aops = { 1770 - .readpage = ext3_readpage, 1771 - .readpages = ext3_readpages, 1772 - .writepage = ext3_ordered_writepage, 1773 - .sync_page = block_sync_page, 1774 - .write_begin = ext3_write_begin, 1775 - .write_end = ext3_ordered_write_end, 1776 - .bmap = ext3_bmap, 1777 - .invalidatepage = ext3_invalidatepage, 1778 - .releasepage = ext3_releasepage, 1779 - .direct_IO = ext3_direct_IO, 1780 - .migratepage = buffer_migrate_page, 1770 + .readpage = ext3_readpage, 1771 + .readpages = ext3_readpages, 1772 + .writepage = ext3_ordered_writepage, 1773 + .sync_page = block_sync_page, 1774 + .write_begin = ext3_write_begin, 1775 + .write_end = ext3_ordered_write_end, 1776 + .bmap = ext3_bmap, 1777 + .invalidatepage = ext3_invalidatepage, 1778 + .releasepage = ext3_releasepage, 1779 + .direct_IO = ext3_direct_IO, 1780 + .migratepage = buffer_migrate_page, 1781 + .is_partially_uptodate = block_is_partially_uptodate, 1781 1782 }; 1782 1783 1783 1784 static const struct address_space_operations ext3_writeback_aops = { 1784 - .readpage = ext3_readpage, 1785 - .readpages = ext3_readpages, 1786 - .writepage = ext3_writeback_writepage, 1787 - .sync_page = block_sync_page, 1788 - .write_begin = ext3_write_begin, 1789 - .write_end = ext3_writeback_write_end, 1790 - .bmap = ext3_bmap, 1791 - .invalidatepage = ext3_invalidatepage, 1792 - .releasepage = ext3_releasepage, 1793 - .direct_IO = ext3_direct_IO, 1794 - .migratepage = buffer_migrate_page, 1785 + .readpage = ext3_readpage, 1786 + .readpages = ext3_readpages, 1787 + .writepage = ext3_writeback_writepage, 1788 + .sync_page = block_sync_page, 1789 + .write_begin = ext3_write_begin, 1790 + .write_end = ext3_writeback_write_end, 1791 + .bmap = ext3_bmap, 1792 + .invalidatepage = ext3_invalidatepage, 1793 + .releasepage = ext3_releasepage, 1794 + .direct_IO = ext3_direct_IO, 1795 + .migratepage = buffer_migrate_page, 1796 + 
.is_partially_uptodate = block_is_partially_uptodate, 1795 1797 }; 1796 1798 1797 1799 static const struct address_space_operations ext3_journalled_aops = { 1798 - .readpage = ext3_readpage, 1799 - .readpages = ext3_readpages, 1800 - .writepage = ext3_journalled_writepage, 1801 - .sync_page = block_sync_page, 1802 - .write_begin = ext3_write_begin, 1803 - .write_end = ext3_journalled_write_end, 1804 - .set_page_dirty = ext3_journalled_set_page_dirty, 1805 - .bmap = ext3_bmap, 1806 - .invalidatepage = ext3_invalidatepage, 1807 - .releasepage = ext3_releasepage, 1800 + .readpage = ext3_readpage, 1801 + .readpages = ext3_readpages, 1802 + .writepage = ext3_journalled_writepage, 1803 + .sync_page = block_sync_page, 1804 + .write_begin = ext3_write_begin, 1805 + .write_end = ext3_journalled_write_end, 1806 + .set_page_dirty = ext3_journalled_set_page_dirty, 1807 + .bmap = ext3_bmap, 1808 + .invalidatepage = ext3_invalidatepage, 1809 + .releasepage = ext3_releasepage, 1810 + .is_partially_uptodate = block_is_partially_uptodate, 1808 1811 }; 1809 1812 1810 1813 void ext3_set_aops(struct inode *inode)
+48 -44
fs/ext4/inode.c
··· 2806 2806 } 2807 2807 2808 2808 static const struct address_space_operations ext4_ordered_aops = { 2809 - .readpage = ext4_readpage, 2810 - .readpages = ext4_readpages, 2811 - .writepage = ext4_normal_writepage, 2812 - .sync_page = block_sync_page, 2813 - .write_begin = ext4_write_begin, 2814 - .write_end = ext4_ordered_write_end, 2815 - .bmap = ext4_bmap, 2816 - .invalidatepage = ext4_invalidatepage, 2817 - .releasepage = ext4_releasepage, 2818 - .direct_IO = ext4_direct_IO, 2819 - .migratepage = buffer_migrate_page, 2809 + .readpage = ext4_readpage, 2810 + .readpages = ext4_readpages, 2811 + .writepage = ext4_normal_writepage, 2812 + .sync_page = block_sync_page, 2813 + .write_begin = ext4_write_begin, 2814 + .write_end = ext4_ordered_write_end, 2815 + .bmap = ext4_bmap, 2816 + .invalidatepage = ext4_invalidatepage, 2817 + .releasepage = ext4_releasepage, 2818 + .direct_IO = ext4_direct_IO, 2819 + .migratepage = buffer_migrate_page, 2820 + .is_partially_uptodate = block_is_partially_uptodate, 2820 2821 }; 2821 2822 2822 2823 static const struct address_space_operations ext4_writeback_aops = { 2823 - .readpage = ext4_readpage, 2824 - .readpages = ext4_readpages, 2825 - .writepage = ext4_normal_writepage, 2826 - .sync_page = block_sync_page, 2827 - .write_begin = ext4_write_begin, 2828 - .write_end = ext4_writeback_write_end, 2829 - .bmap = ext4_bmap, 2830 - .invalidatepage = ext4_invalidatepage, 2831 - .releasepage = ext4_releasepage, 2832 - .direct_IO = ext4_direct_IO, 2833 - .migratepage = buffer_migrate_page, 2824 + .readpage = ext4_readpage, 2825 + .readpages = ext4_readpages, 2826 + .writepage = ext4_normal_writepage, 2827 + .sync_page = block_sync_page, 2828 + .write_begin = ext4_write_begin, 2829 + .write_end = ext4_writeback_write_end, 2830 + .bmap = ext4_bmap, 2831 + .invalidatepage = ext4_invalidatepage, 2832 + .releasepage = ext4_releasepage, 2833 + .direct_IO = ext4_direct_IO, 2834 + .migratepage = buffer_migrate_page, 2835 + .is_partially_uptodate 
= block_is_partially_uptodate, 2834 2836 }; 2835 2837 2836 2838 static const struct address_space_operations ext4_journalled_aops = { 2837 - .readpage = ext4_readpage, 2838 - .readpages = ext4_readpages, 2839 - .writepage = ext4_journalled_writepage, 2840 - .sync_page = block_sync_page, 2841 - .write_begin = ext4_write_begin, 2842 - .write_end = ext4_journalled_write_end, 2843 - .set_page_dirty = ext4_journalled_set_page_dirty, 2844 - .bmap = ext4_bmap, 2845 - .invalidatepage = ext4_invalidatepage, 2846 - .releasepage = ext4_releasepage, 2839 + .readpage = ext4_readpage, 2840 + .readpages = ext4_readpages, 2841 + .writepage = ext4_journalled_writepage, 2842 + .sync_page = block_sync_page, 2843 + .write_begin = ext4_write_begin, 2844 + .write_end = ext4_journalled_write_end, 2845 + .set_page_dirty = ext4_journalled_set_page_dirty, 2846 + .bmap = ext4_bmap, 2847 + .invalidatepage = ext4_invalidatepage, 2848 + .releasepage = ext4_releasepage, 2849 + .is_partially_uptodate = block_is_partially_uptodate, 2847 2850 }; 2848 2851 2849 2852 static const struct address_space_operations ext4_da_aops = { 2850 - .readpage = ext4_readpage, 2851 - .readpages = ext4_readpages, 2852 - .writepage = ext4_da_writepage, 2853 - .writepages = ext4_da_writepages, 2854 - .sync_page = block_sync_page, 2855 - .write_begin = ext4_da_write_begin, 2856 - .write_end = ext4_da_write_end, 2857 - .bmap = ext4_bmap, 2858 - .invalidatepage = ext4_da_invalidatepage, 2859 - .releasepage = ext4_releasepage, 2860 - .direct_IO = ext4_direct_IO, 2861 - .migratepage = buffer_migrate_page, 2853 + .readpage = ext4_readpage, 2854 + .readpages = ext4_readpages, 2855 + .writepage = ext4_da_writepage, 2856 + .writepages = ext4_da_writepages, 2857 + .sync_page = block_sync_page, 2858 + .write_begin = ext4_da_write_begin, 2859 + .write_end = ext4_da_write_end, 2860 + .bmap = ext4_bmap, 2861 + .invalidatepage = ext4_da_invalidatepage, 2862 + .releasepage = ext4_releasepage, 2863 + .direct_IO = ext4_direct_IO, 2864 + 
.migratepage = buffer_migrate_page, 2865 + .is_partially_uptodate = block_is_partially_uptodate, 2862 2866 }; 2863 2867 2864 2868 void ext4_set_aops(struct inode *inode)
+2
include/linux/buffer_head.h
··· 205 205 int block_write_full_page(struct page *page, get_block_t *get_block, 206 206 struct writeback_control *wbc); 207 207 int block_read_full_page(struct page*, get_block_t*); 208 + int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, 209 + unsigned long from); 208 210 int block_write_begin(struct file *, struct address_space *, 209 211 loff_t, unsigned, unsigned, 210 212 struct page **, void **, get_block_t*);
+23 -21
include/linux/fs.h
··· 443 443 return i->count; 444 444 } 445 445 446 + /* 447 + * "descriptor" for what we're up to with a read. 448 + * This allows us to use the same read code yet 449 + * have multiple different users of the data that 450 + * we read from a file. 451 + * 452 + * The simplest case just copies the data to user 453 + * mode. 454 + */ 455 + typedef struct { 456 + size_t written; 457 + size_t count; 458 + union { 459 + char __user *buf; 460 + void *data; 461 + } arg; 462 + int error; 463 + } read_descriptor_t; 464 + 465 + typedef int (*read_actor_t)(read_descriptor_t *, struct page *, 466 + unsigned long, unsigned long); 446 467 447 468 struct address_space_operations { 448 469 int (*writepage)(struct page *page, struct writeback_control *wbc); ··· 505 484 int (*migratepage) (struct address_space *, 506 485 struct page *, struct page *); 507 486 int (*launder_page) (struct page *); 487 + int (*is_partially_uptodate) (struct page *, read_descriptor_t *, 488 + unsigned long); 508 489 }; 509 490 510 491 /* ··· 1220 1197 int (*getgeo)(struct block_device *, struct hd_geometry *); 1221 1198 struct module *owner; 1222 1199 }; 1223 - 1224 - /* 1225 - * "descriptor" for what we're up to with a read. 1226 - * This allows us to use the same read code yet 1227 - * have multiple different users of the data that 1228 - * we read from a file. 1229 - * 1230 - * The simplest case just copies the data to user 1231 - * mode. 1232 - */ 1233 - typedef struct { 1234 - size_t written; 1235 - size_t count; 1236 - union { 1237 - char __user * buf; 1238 - void *data; 1239 - } arg; 1240 - int error; 1241 - } read_descriptor_t; 1242 - 1243 - typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); 1244 1200 1245 1201 /* These macros are for out of kernel modules to test that 1246 1202 * the kernel supports the unlocked_ioctl and compat_ioctl
+12 -2
mm/filemap.c
··· 1023 1023 ra, filp, page, 1024 1024 index, last_index - index); 1025 1025 } 1026 - if (!PageUptodate(page)) 1027 - goto page_not_up_to_date; 1026 + if (!PageUptodate(page)) { 1027 + if (inode->i_blkbits == PAGE_CACHE_SHIFT || 1028 + !mapping->a_ops->is_partially_uptodate) 1029 + goto page_not_up_to_date; 1030 + if (TestSetPageLocked(page)) 1031 + goto page_not_up_to_date; 1032 + if (!mapping->a_ops->is_partially_uptodate(page, 1033 + desc, offset)) 1034 + goto page_not_up_to_date_locked; 1035 + unlock_page(page); 1036 + } 1028 1037 page_ok: 1029 1038 /* 1030 1039 * i_size must be checked after we know the page is Uptodate. ··· 1103 1094 if (lock_page_killable(page)) 1104 1095 goto readpage_eio; 1105 1096 1097 + page_not_up_to_date_locked: 1106 1098 /* Did it get truncated before we got the lock? */ 1107 1099 if (!page->mapping) { 1108 1100 unlock_page(page);