Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fs-verity: implement readahead of Merkle tree pages

When fs-verity verifies data pages, currently it reads each Merkle tree
page synchronously using read_mapping_page().

As a result, when the Merkle tree pages aren't already cached, fs-verity
causes an extra 4 KiB I/O request for every 512 KiB of data (assuming
that the Merkle tree uses SHA-256 and 4 KiB blocks). This results in
more I/O requests and performance loss than is strictly necessary.

Therefore, implement readahead of the Merkle tree pages.

For simplicity, we take advantage of the fact that the kernel already
does readahead of the file's *data*, just like it does for any other
file. Due to this, we don't really need a separate readahead state
(struct file_ra_state) just for the Merkle tree, but rather we just need
to piggy-back on the existing data readahead requests.

We also only really need to bother with the first level of the Merkle
tree, since the usual fan-out factor is 128, so normally over 99% of
Merkle tree I/O requests are for the first level.

Therefore, make fsverity_verify_bio() enable readahead of the first
Merkle tree level, for up to 1/4 the number of pages in the bio, when it
sees that the REQ_RAHEAD flag is set on the bio. The readahead size is
then passed down to ->read_merkle_tree_page() for the filesystem to
(optionally) implement if it sees that the requested page is uncached.

While we're at it, also make build_merkle_tree_level() set the Merkle
tree readahead size, since it's easy to do there.

However, for now don't set the readahead size in fsverity_verify_page(),
since currently it's only used to verify holes on ext4 and f2fs, and it
would need parameters added to know how much to read ahead.

This patch significantly improves fs-verity sequential read performance.
Some quick benchmarks with 'cat'-ing a 250MB file after dropping caches:

On an ARM64 phone (using sha256-ce):
Before: 217 MB/s
After: 263 MB/s
(compare to sha256sum of non-verity file: 357 MB/s)

In an x86_64 VM (using sha256-avx2):
Before: 173 MB/s
After: 215 MB/s
(compare to sha256sum of non-verity file: 223 MB/s)

Link: https://lore.kernel.org/r/20200106205533.137005-1-ebiggers@kernel.org
Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>

+139 -15
+46 -3
fs/ext4/verity.c
··· 342 342 return desc_size; 343 343 } 344 344 345 - static struct page *ext4_read_merkle_tree_page(struct inode *inode, 346 - pgoff_t index) 345 + /* 346 + * Prefetch some pages from the file's Merkle tree. 347 + * 348 + * This is basically a stripped-down version of __do_page_cache_readahead() 349 + * which works on pages past i_size. 350 + */ 351 + static void ext4_merkle_tree_readahead(struct address_space *mapping, 352 + pgoff_t start_index, unsigned long count) 347 353 { 354 + LIST_HEAD(pages); 355 + unsigned int nr_pages = 0; 356 + struct page *page; 357 + pgoff_t index; 358 + struct blk_plug plug; 359 + 360 + for (index = start_index; index < start_index + count; index++) { 361 + page = xa_load(&mapping->i_pages, index); 362 + if (!page || xa_is_value(page)) { 363 + page = __page_cache_alloc(readahead_gfp_mask(mapping)); 364 + if (!page) 365 + break; 366 + page->index = index; 367 + list_add(&page->lru, &pages); 368 + nr_pages++; 369 + } 370 + } 371 + blk_start_plug(&plug); 372 + ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true); 373 + blk_finish_plug(&plug); 374 + } 375 + 376 + static struct page *ext4_read_merkle_tree_page(struct inode *inode, 377 + pgoff_t index, 378 + unsigned long num_ra_pages) 379 + { 380 + struct page *page; 381 + 348 382 index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; 349 383 350 - return read_mapping_page(inode->i_mapping, index, NULL); 384 + page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); 385 + if (!page || !PageUptodate(page)) { 386 + if (page) 387 + put_page(page); 388 + else if (num_ra_pages > 1) 389 + ext4_merkle_tree_readahead(inode->i_mapping, index, 390 + num_ra_pages); 391 + page = read_mapping_page(inode->i_mapping, index, NULL); 392 + } 393 + return page; 351 394 } 352 395 353 396 static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
+1 -1
fs/f2fs/data.c
··· 1881 1881 * use ->readpage() or do the necessary surgery to decouple ->readpages() 1882 1882 * from read-ahead. 1883 1883 */ 1884 - static int f2fs_mpage_readpages(struct address_space *mapping, 1884 + int f2fs_mpage_readpages(struct address_space *mapping, 1885 1885 struct list_head *pages, struct page *page, 1886 1886 unsigned nr_pages, bool is_readahead) 1887 1887 {
+3
fs/f2fs/f2fs.h
··· 3229 3229 int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); 3230 3230 int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); 3231 3231 int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); 3232 + int f2fs_mpage_readpages(struct address_space *mapping, 3233 + struct list_head *pages, struct page *page, 3234 + unsigned nr_pages, bool is_readahead); 3232 3235 struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, 3233 3236 int op_flags, bool for_write); 3234 3237 struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
+46 -3
fs/f2fs/verity.c
··· 222 222 return size; 223 223 } 224 224 225 - static struct page *f2fs_read_merkle_tree_page(struct inode *inode, 226 - pgoff_t index) 225 + /* 226 + * Prefetch some pages from the file's Merkle tree. 227 + * 228 + * This is basically a stripped-down version of __do_page_cache_readahead() 229 + * which works on pages past i_size. 230 + */ 231 + static void f2fs_merkle_tree_readahead(struct address_space *mapping, 232 + pgoff_t start_index, unsigned long count) 227 233 { 234 + LIST_HEAD(pages); 235 + unsigned int nr_pages = 0; 236 + struct page *page; 237 + pgoff_t index; 238 + struct blk_plug plug; 239 + 240 + for (index = start_index; index < start_index + count; index++) { 241 + page = xa_load(&mapping->i_pages, index); 242 + if (!page || xa_is_value(page)) { 243 + page = __page_cache_alloc(readahead_gfp_mask(mapping)); 244 + if (!page) 245 + break; 246 + page->index = index; 247 + list_add(&page->lru, &pages); 248 + nr_pages++; 249 + } 250 + } 251 + blk_start_plug(&plug); 252 + f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true); 253 + blk_finish_plug(&plug); 254 + } 255 + 256 + static struct page *f2fs_read_merkle_tree_page(struct inode *inode, 257 + pgoff_t index, 258 + unsigned long num_ra_pages) 259 + { 260 + struct page *page; 261 + 228 262 index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; 229 263 230 - return read_mapping_page(inode->i_mapping, index, NULL); 264 + page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); 265 + if (!page || !PageUptodate(page)) { 266 + if (page) 267 + put_page(page); 268 + else if (num_ra_pages > 1) 269 + f2fs_merkle_tree_readahead(inode->i_mapping, index, 270 + num_ra_pages); 271 + page = read_mapping_page(inode->i_mapping, index, NULL); 272 + } 273 + return page; 231 274 } 232 275 233 276 static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
+7 -1
fs/verity/enable.c
··· 8 8 #include "fsverity_private.h" 9 9 10 10 #include <crypto/hash.h> 11 + #include <linux/backing-dev.h> 11 12 #include <linux/mount.h> 12 13 #include <linux/pagemap.h> 13 14 #include <linux/sched/signal.h> ··· 87 86 return err; 88 87 } 89 88 } else { 89 + unsigned long num_ra_pages = 90 + min_t(unsigned long, num_blocks_to_hash - i, 91 + inode->i_sb->s_bdi->io_pages); 92 + 90 93 /* Non-leaf: hashing hash block from level below */ 91 94 src_page = vops->read_merkle_tree_page(inode, 92 - params->level_start[level - 1] + i); 95 + params->level_start[level - 1] + i, 96 + num_ra_pages); 93 97 if (IS_ERR(src_page)) { 94 98 err = PTR_ERR(src_page); 95 99 fsverity_err(inode,
+1
fs/verity/fsverity_private.h
··· 50 50 unsigned int log_arity; /* log2(hashes_per_block) */ 51 51 unsigned int num_levels; /* number of levels in Merkle tree */ 52 52 u64 tree_size; /* Merkle tree size in bytes */ 53 + unsigned long level0_blocks; /* number of blocks in tree level 0 */ 53 54 54 55 /* 55 56 * Starting block index for each tree level, ordered from leaf level (0)
+1
fs/verity/open.c
··· 102 102 /* temporarily using level_start[] to store blocks in level */ 103 103 params->level_start[params->num_levels++] = blocks; 104 104 } 105 + params->level0_blocks = params->level_start[0]; 105 106 106 107 /* Compute the starting block of each level */ 107 108 offset = 0;
+28 -6
fs/verity/verify.c
··· 84 84 * Return: true if the page is valid, else false. 85 85 */ 86 86 static bool verify_page(struct inode *inode, const struct fsverity_info *vi, 87 - struct ahash_request *req, struct page *data_page) 87 + struct ahash_request *req, struct page *data_page, 88 + unsigned long level0_ra_pages) 88 89 { 89 90 const struct merkle_tree_params *params = &vi->tree_params; 90 91 const unsigned int hsize = params->digest_size; ··· 118 117 pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n", 119 118 level, hindex, hoffset); 120 119 121 - hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, 122 - hindex); 120 + hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hindex, 121 + level == 0 ? level0_ra_pages : 0); 123 122 if (IS_ERR(hpage)) { 124 123 err = PTR_ERR(hpage); 125 124 fsverity_err(inode, ··· 196 195 if (unlikely(!req)) 197 196 return false; 198 197 199 - valid = verify_page(inode, vi, req, page); 198 + valid = verify_page(inode, vi, req, page, 0); 200 199 201 200 ahash_request_free(req); 202 201 ··· 223 222 { 224 223 struct inode *inode = bio_first_page_all(bio)->mapping->host; 225 224 const struct fsverity_info *vi = inode->i_verity_info; 225 + const struct merkle_tree_params *params = &vi->tree_params; 226 226 struct ahash_request *req; 227 227 struct bio_vec *bv; 228 228 struct bvec_iter_all iter_all; 229 + unsigned long max_ra_pages = 0; 229 230 230 - req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS); 231 + req = ahash_request_alloc(params->hash_alg->tfm, GFP_NOFS); 231 232 if (unlikely(!req)) { 232 233 bio_for_each_segment_all(bv, bio, iter_all) 233 234 SetPageError(bv->bv_page); 234 235 return; 235 236 } 236 237 238 + if (bio->bi_opf & REQ_RAHEAD) { 239 + /* 240 + * If this bio is for data readahead, then we also do readahead 241 + * of the first (largest) level of the Merkle tree. Namely, 242 + * when a Merkle tree page is read, we also try to piggy-back on 243 + * some additional pages -- up to 1/4 the number of data pages. 
244 + * 245 + * This improves sequential read performance, as it greatly 246 + * reduces the number of I/O requests made to the Merkle tree. 247 + */ 248 + bio_for_each_segment_all(bv, bio, iter_all) 249 + max_ra_pages++; 250 + max_ra_pages /= 4; 251 + } 252 + 237 253 bio_for_each_segment_all(bv, bio, iter_all) { 238 254 struct page *page = bv->bv_page; 255 + unsigned long level0_index = page->index >> params->log_arity; 256 + unsigned long level0_ra_pages = 257 + min(max_ra_pages, params->level0_blocks - level0_index); 239 258 240 - if (!PageError(page) && !verify_page(inode, vi, req, page)) 259 + if (!PageError(page) && 260 + !verify_page(inode, vi, req, page, level0_ra_pages)) 241 261 SetPageError(page); 242 262 } 243 263
+6 -1
include/linux/fsverity.h
··· 77 77 * 78 78 * @inode: the inode 79 79 * @index: 0-based index of the page within the Merkle tree 80 + * @num_ra_pages: The number of Merkle tree pages that should be 81 + * prefetched starting at @index if the page at @index 82 + * isn't already cached. Implementations may ignore this 83 + * argument; it's only a performance optimization. 80 84 * 81 85 * This can be called at any time on an open verity file, as well as 82 86 * between ->begin_enable_verity() and ->end_enable_verity(). It may be ··· 91 87 * Return: the page on success, ERR_PTR() on failure 92 88 */ 93 89 struct page *(*read_merkle_tree_page)(struct inode *inode, 94 - pgoff_t index); 90 + pgoff_t index, 91 + unsigned long num_ra_pages); 95 92 96 93 /** 97 94 * Write a Merkle tree block to the given inode.