Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'iomap-5.3-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull iomap split/cleanup from Darrick Wong:
"As promised, here's the second part of the iomap merge for 5.3, in
which we break up iomap.c into smaller files grouped by functional
area so that it'll be easier in the long run to maintain cohesiveness
of code units and to review incoming patches. There are no functional
changes and fs/iomap.c split cleanly.

Summary:

- Regroup the fs/iomap.c code by major functional area so that we can
start development for 5.4 from a more stable base"

* tag 'iomap-5.3-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
iomap: move internal declarations into fs/iomap/
iomap: move the main iteration code into a separate file
iomap: move the buffered IO code into a separate file
iomap: move the direct IO code into a separate file
iomap: move the SEEK_HOLE code into a separate file
iomap: move the file mapping reporting code into a separate file
iomap: move the swapfile code into a separate file
iomap: start moving code to fs/iomap/
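
Because the split is purely mechanical, the exported entry points keep their
existing signatures and filesystems that call them are untouched. As a rough,
hypothetical sketch of the caller side (the examplefs_* names and ops table
are illustrative only, not part of this merge):

	#include <linux/fs.h>
	#include <linux/iomap.h>

	/* hypothetical ops table supplied by the filesystem */
	extern const struct iomap_ops examplefs_iomap_ops;

	/* buffered read path: unchanged by the code movement */
	static int examplefs_readpage(struct file *unused, struct page *page)
	{
		return iomap_readpage(page, &examplefs_iomap_ops);
	}

	/* buffered write path; direct I/O callers use iomap_dio_rw() instead */
	static ssize_t examplefs_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		return iomap_file_buffered_write(iocb, from, &examplefs_iomap_ops);
	}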

+2277 -2217
+1
MAINTAINERS
···
 8415  8415   T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git
 8416  8416   S: Supported
 8417  8417   F: fs/iomap.c
       8418 + F: fs/iomap/
 8418  8419   F: include/linux/iomap.h
 8419  8420
 8420  8421   IOMMU DRIVERS
+1 -1
fs/Makefile
···
   52    52   obj-$(CONFIG_SYSCTL)     += drop_caches.o
   53    53
   54    54   obj-$(CONFIG_FHANDLE)    += fhandle.o
   55       - obj-$(CONFIG_FS_IOMAP)   += iomap.o
         55 + obj-y                    += iomap/
   56    56
   57    57   obj-y                    += quota/
   58    58
-1
fs/dax.c
···
   26    26   #include <linux/mmu_notifier.h>
   27    27   #include <linux/iomap.h>
   28    28   #include <asm/pgalloc.h>
   29       - #include "internal.h"
   30    29
   31    30   #define CREATE_TRACE_POINTS
   32    31   #include <trace/events/fs_dax.h>
-10
fs/internal.h
···
  185   185   extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
  186   186                  unsigned long arg);
  187   187
  188       - /*
  189       -  * iomap support:
  190       -  */
  191       - typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
  192       -                void *data, struct iomap *iomap);
  193       -
  194       - loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
  195       -                unsigned flags, const struct iomap_ops *ops, void *data,
  196       -                iomap_actor_t actor);
  197       -
  198   188   /* direct-io.c: */
  199   189   int sb_init_dio_done_wq(struct super_block *sb);
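
The declarations removed here are not dropped: per the first patch in the list
above, iomap_actor_t and iomap_apply() move into fs/iomap/ itself, since
nothing outside the iomap code uses them. Every iomap_apply() caller in the
deleted fs/iomap.c below follows the same loop shape; a condensed sketch (the
example_* names are stand-ins, not kernel symbols):

	/*
	 * The actor is handed one contiguous mapping at a time and returns how
	 * many bytes it consumed (or a negative errno); the walker loops until
	 * the whole range has been covered.
	 */
	static loff_t example_actor(struct inode *inode, loff_t pos, loff_t length,
			void *data, struct iomap *iomap)
	{
		/* operate on [pos, pos + length) as described by *iomap */
		return length;
	}

	static int example_walk(struct inode *inode, loff_t pos, loff_t len,
			const struct iomap_ops *ops)
	{
		while (len > 0) {
			loff_t ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
					ops, NULL, example_actor);
			if (ret <= 0)
				return ret;
			pos += ret;
			len -= ret;
		}
		return 0;
	}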
-2205
fs/iomap.c
···
    1       - // SPDX-License-Identifier: GPL-2.0
    2       - /*
    3       -  * Copyright (C) 2010 Red Hat, Inc.
    4       -  * Copyright (c) 2016-2018 Christoph Hellwig.
    5       -  */
··· (this hunk deletes fs/iomap.c in its entirety, roughly 2,200 further lines:
    the includes, iomap_apply() and its actor machinery, the buffered read and
    write paths, the releasepage/invalidatepage and page migration hooks,
    iomap_page_mkwrite, FIEMAP reporting, SEEK_HOLE/SEEK_DATA support, the
    direct I/O implementation around iomap_dio_rw(), and the swapfile
    activation code; per the patch titles above, the code moves unchanged into
    the new files under fs/iomap/)
The logical 2018 - * offset within the file is irrelevant since the swapfile code maps logical 2019 - * page numbers of the swap device to the physical page-aligned extents. 2020 - */ 2021 - static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) 2022 - { 2023 - struct iomap *iomap = &isi->iomap; 2024 - unsigned long nr_pages; 2025 - uint64_t first_ppage; 2026 - uint64_t first_ppage_reported; 2027 - uint64_t next_ppage; 2028 - int error; 2029 - 2030 - /* 2031 - * Round the start up and the end down so that the physical 2032 - * extent aligns to a page boundary. 2033 - */ 2034 - first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; 2035 - next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> 2036 - PAGE_SHIFT; 2037 - 2038 - /* Skip too-short physical extents. */ 2039 - if (first_ppage >= next_ppage) 2040 - return 0; 2041 - nr_pages = next_ppage - first_ppage; 2042 - 2043 - /* 2044 - * Calculate how much swap space we're adding; the first page contains 2045 - * the swap header and doesn't count. The mm still wants that first 2046 - * page fed to add_swap_extent, however. 2047 - */ 2048 - first_ppage_reported = first_ppage; 2049 - if (iomap->offset == 0) 2050 - first_ppage_reported++; 2051 - if (isi->lowest_ppage > first_ppage_reported) 2052 - isi->lowest_ppage = first_ppage_reported; 2053 - if (isi->highest_ppage < (next_ppage - 1)) 2054 - isi->highest_ppage = next_ppage - 1; 2055 - 2056 - /* Add extent, set up for the next call. */ 2057 - error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); 2058 - if (error < 0) 2059 - return error; 2060 - isi->nr_extents += error; 2061 - isi->nr_pages += nr_pages; 2062 - return 0; 2063 - } 2064 - 2065 - /* 2066 - * Accumulate iomaps for this swap file. We have to accumulate iomaps because 2067 - * swap only cares about contiguous page-aligned physical extents and makes no 2068 - * distinction between written and unwritten extents. 2069 - */ 2070 - static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, 2071 - loff_t count, void *data, struct iomap *iomap) 2072 - { 2073 - struct iomap_swapfile_info *isi = data; 2074 - int error; 2075 - 2076 - switch (iomap->type) { 2077 - case IOMAP_MAPPED: 2078 - case IOMAP_UNWRITTEN: 2079 - /* Only real or unwritten extents. */ 2080 - break; 2081 - case IOMAP_INLINE: 2082 - /* No inline data. */ 2083 - pr_err("swapon: file is inline\n"); 2084 - return -EINVAL; 2085 - default: 2086 - pr_err("swapon: file has unallocated extents\n"); 2087 - return -EINVAL; 2088 - } 2089 - 2090 - /* No uncommitted metadata or shared blocks. */ 2091 - if (iomap->flags & IOMAP_F_DIRTY) { 2092 - pr_err("swapon: file is not committed\n"); 2093 - return -EINVAL; 2094 - } 2095 - if (iomap->flags & IOMAP_F_SHARED) { 2096 - pr_err("swapon: file has shared extents\n"); 2097 - return -EINVAL; 2098 - } 2099 - 2100 - /* Only one bdev per swap file. */ 2101 - if (iomap->bdev != isi->sis->bdev) { 2102 - pr_err("swapon: file is on multiple devices\n"); 2103 - return -EINVAL; 2104 - } 2105 - 2106 - if (isi->iomap.length == 0) { 2107 - /* No accumulated extent, so just store it. */ 2108 - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); 2109 - } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { 2110 - /* Append this to the accumulated extent. */ 2111 - isi->iomap.length += iomap->length; 2112 - } else { 2113 - /* Otherwise, add the retained iomap and store this one. 
*/ 2114 - error = iomap_swapfile_add_extent(isi); 2115 - if (error) 2116 - return error; 2117 - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); 2118 - } 2119 - return count; 2120 - } 2121 - 2122 - /* 2123 - * Iterate a swap file's iomaps to construct physical extents that can be 2124 - * passed to the swapfile subsystem. 2125 - */ 2126 - int iomap_swapfile_activate(struct swap_info_struct *sis, 2127 - struct file *swap_file, sector_t *pagespan, 2128 - const struct iomap_ops *ops) 2129 - { 2130 - struct iomap_swapfile_info isi = { 2131 - .sis = sis, 2132 - .lowest_ppage = (sector_t)-1ULL, 2133 - }; 2134 - struct address_space *mapping = swap_file->f_mapping; 2135 - struct inode *inode = mapping->host; 2136 - loff_t pos = 0; 2137 - loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); 2138 - loff_t ret; 2139 - 2140 - /* 2141 - * Persist all file mapping metadata so that we won't have any 2142 - * IOMAP_F_DIRTY iomaps. 2143 - */ 2144 - ret = vfs_fsync(swap_file, 1); 2145 - if (ret) 2146 - return ret; 2147 - 2148 - while (len > 0) { 2149 - ret = iomap_apply(inode, pos, len, IOMAP_REPORT, 2150 - ops, &isi, iomap_swapfile_activate_actor); 2151 - if (ret <= 0) 2152 - return ret; 2153 - 2154 - pos += ret; 2155 - len -= ret; 2156 - } 2157 - 2158 - if (isi.iomap.length) { 2159 - ret = iomap_swapfile_add_extent(&isi); 2160 - if (ret) 2161 - return ret; 2162 - } 2163 - 2164 - *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; 2165 - sis->max = isi.nr_pages; 2166 - sis->pages = isi.nr_pages - 1; 2167 - sis->highest_bit = isi.nr_pages - 1; 2168 - return isi.nr_extents; 2169 - } 2170 - EXPORT_SYMBOL_GPL(iomap_swapfile_activate); 2171 - #endif /* CONFIG_SWAP */ 2172 - 2173 - static loff_t 2174 - iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, 2175 - void *data, struct iomap *iomap) 2176 - { 2177 - sector_t *bno = data, addr; 2178 - 2179 - if (iomap->type == IOMAP_MAPPED) { 2180 - addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; 2181 - if (addr > INT_MAX) 2182 - WARN(1, "would truncate bmap result\n"); 2183 - else 2184 - *bno = addr; 2185 - } 2186 - return 0; 2187 - } 2188 - 2189 - /* legacy ->bmap interface. 0 is the error return (!) */ 2190 - sector_t 2191 - iomap_bmap(struct address_space *mapping, sector_t bno, 2192 - const struct iomap_ops *ops) 2193 - { 2194 - struct inode *inode = mapping->host; 2195 - loff_t pos = bno << inode->i_blkbits; 2196 - unsigned blocksize = i_blocksize(inode); 2197 - 2198 - if (filemap_write_and_wait(mapping)) 2199 - return 0; 2200 - 2201 - bno = 0; 2202 - iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); 2203 - return bno; 2204 - } 2205 - EXPORT_SYMBOL_GPL(iomap_bmap);
+15
fs/iomap/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0-or-later 2 + # 3 + # Copyright (c) 2019 Oracle. 4 + # All Rights Reserved. 5 + # 6 + obj-$(CONFIG_FS_IOMAP) += iomap.o 7 + 8 + iomap-y += \ 9 + apply.o \ 10 + buffered-io.o \ 11 + direct-io.o \ 12 + fiemap.o \ 13 + seek.o 14 + 15 + iomap-$(CONFIG_SWAP) += swapfile.o
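The split keeps a single composite iomap.o object, still gated by CONFIG_FS_IOMAP, so filesystems pull the code in exactly as before: by selecting the option from their own Kconfig. As a hedged illustration only (MYFS_FS is a made-up symbol; in-tree users such as XFS select FS_IOMAP the same way):

config MYFS_FS
	tristate "Hypothetical filesystem built on the iomap infrastructure"
	select FS_IOMAP
	help
	  Example only. Selecting FS_IOMAP is what causes the objects listed
	  in the Makefile above to be built, now from fs/iomap/ rather than
	  the old monolithic fs/iomap.c.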
+74
fs/iomap/apply.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2010 Red Hat, Inc. 4 + * Copyright (c) 2016-2018 Christoph Hellwig. 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + 11 + /* 12 + * Execute a iomap write on a segment of the mapping that spans a 13 + * contiguous range of pages that have identical block mapping state. 14 + * 15 + * This avoids the need to map pages individually, do individual allocations 16 + * for each page and most importantly avoid the need for filesystem specific 17 + * locking per page. Instead, all the operations are amortised over the entire 18 + * range of pages. It is assumed that the filesystems will lock whatever 19 + * resources they require in the iomap_begin call, and release them in the 20 + * iomap_end call. 21 + */ 22 + loff_t 23 + iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, 24 + const struct iomap_ops *ops, void *data, iomap_actor_t actor) 25 + { 26 + struct iomap iomap = { 0 }; 27 + loff_t written = 0, ret; 28 + 29 + /* 30 + * Need to map a range from start position for length bytes. This can 31 + * span multiple pages - it is only guaranteed to return a range of a 32 + * single type of pages (e.g. all into a hole, all mapped or all 33 + * unwritten). Failure at this point has nothing to undo. 34 + * 35 + * If allocation is required for this range, reserve the space now so 36 + * that the allocation is guaranteed to succeed later on. Once we copy 37 + * the data into the page cache pages, then we cannot fail otherwise we 38 + * expose transient stale data. If the reserve fails, we can safely 39 + * back out at this point as there is nothing to undo. 40 + */ 41 + ret = ops->iomap_begin(inode, pos, length, flags, &iomap); 42 + if (ret) 43 + return ret; 44 + if (WARN_ON(iomap.offset > pos)) 45 + return -EIO; 46 + if (WARN_ON(iomap.length == 0)) 47 + return -EIO; 48 + 49 + /* 50 + * Cut down the length to the one actually provided by the filesystem, 51 + * as it might not be able to give us the whole size that we requested. 52 + */ 53 + if (iomap.offset + iomap.length < pos + length) 54 + length = iomap.offset + iomap.length - pos; 55 + 56 + /* 57 + * Now that we have guaranteed that the space allocation will succeed. 58 + * we can do the copy-in page by page without having to worry about 59 + * failures exposing transient data. 60 + */ 61 + written = actor(inode, pos, length, data, &iomap); 62 + 63 + /* 64 + * Now the data has been copied, commit the range we've copied. This 65 + * should not fail unless the filesystem has had a fatal error. 66 + */ 67 + if (ops->iomap_end) { 68 + ret = ops->iomap_end(inode, pos, length, 69 + written > 0 ? written : 0, 70 + flags, &iomap); 71 + } 72 + 73 + return written ? written : ret; 74 + }
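fs/iomap/apply.c now carries only the begin/actor/end driver shown above. To make that calling convention concrete, here is a minimal, non-authoritative sketch (count_mapped_bytes, count_mapped_actor and myfs_iomap_ops are invented names, and the iomap_apply()/iomap_actor_t declarations are assumed to be in scope) of an actor looped through iomap_apply() the same way the in-tree callers in this series do:

#include <linux/fs.h>
#include <linux/iomap.h>

extern const struct iomap_ops myfs_iomap_ops;	/* hypothetical */

/* Actor: called once per extent; returns how many bytes it consumed. */
static loff_t
count_mapped_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	loff_t *mapped = data;

	if (iomap->type == IOMAP_MAPPED)
		*mapped += length;
	return length;
}

/* Driver: keep applying until the range is exhausted or an error occurs. */
static loff_t count_mapped_bytes(struct inode *inode, loff_t pos, loff_t len)
{
	loff_t mapped = 0, ret;

	while (len > 0) {
		ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
				&myfs_iomap_ops, &mapped, count_mapped_actor);
		if (ret <= 0)
			return ret;
		pos += ret;
		len -= ret;
	}
	return mapped;
}

The actor's return value advances the outer loop, which is why every actor in this patch returns the number of bytes it actually handled rather than an absolute offset.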
+1073
fs/iomap/buffered-io.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2010 Red Hat, Inc. 4 + * Copyright (c) 2016-2018 Christoph Hellwig. 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + #include <linux/pagemap.h> 11 + #include <linux/uio.h> 12 + #include <linux/buffer_head.h> 13 + #include <linux/dax.h> 14 + #include <linux/writeback.h> 15 + #include <linux/swap.h> 16 + #include <linux/bio.h> 17 + #include <linux/sched/signal.h> 18 + #include <linux/migrate.h> 19 + 20 + #include "../internal.h" 21 + 22 + static struct iomap_page * 23 + iomap_page_create(struct inode *inode, struct page *page) 24 + { 25 + struct iomap_page *iop = to_iomap_page(page); 26 + 27 + if (iop || i_blocksize(inode) == PAGE_SIZE) 28 + return iop; 29 + 30 + iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); 31 + atomic_set(&iop->read_count, 0); 32 + atomic_set(&iop->write_count, 0); 33 + bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); 34 + 35 + /* 36 + * migrate_page_move_mapping() assumes that pages with private data have 37 + * their count elevated by 1. 38 + */ 39 + get_page(page); 40 + set_page_private(page, (unsigned long)iop); 41 + SetPagePrivate(page); 42 + return iop; 43 + } 44 + 45 + static void 46 + iomap_page_release(struct page *page) 47 + { 48 + struct iomap_page *iop = to_iomap_page(page); 49 + 50 + if (!iop) 51 + return; 52 + WARN_ON_ONCE(atomic_read(&iop->read_count)); 53 + WARN_ON_ONCE(atomic_read(&iop->write_count)); 54 + ClearPagePrivate(page); 55 + set_page_private(page, 0); 56 + put_page(page); 57 + kfree(iop); 58 + } 59 + 60 + /* 61 + * Calculate the range inside the page that we actually need to read. 62 + */ 63 + static void 64 + iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, 65 + loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) 66 + { 67 + loff_t orig_pos = *pos; 68 + loff_t isize = i_size_read(inode); 69 + unsigned block_bits = inode->i_blkbits; 70 + unsigned block_size = (1 << block_bits); 71 + unsigned poff = offset_in_page(*pos); 72 + unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); 73 + unsigned first = poff >> block_bits; 74 + unsigned last = (poff + plen - 1) >> block_bits; 75 + 76 + /* 77 + * If the block size is smaller than the page size we need to check the 78 + * per-block uptodate status and adjust the offset and length if needed 79 + * to avoid reading in already uptodate ranges. 80 + */ 81 + if (iop) { 82 + unsigned int i; 83 + 84 + /* move forward for each leading block marked uptodate */ 85 + for (i = first; i <= last; i++) { 86 + if (!test_bit(i, iop->uptodate)) 87 + break; 88 + *pos += block_size; 89 + poff += block_size; 90 + plen -= block_size; 91 + first++; 92 + } 93 + 94 + /* truncate len if we find any trailing uptodate block(s) */ 95 + for ( ; i <= last; i++) { 96 + if (test_bit(i, iop->uptodate)) { 97 + plen -= (last - i + 1) * block_size; 98 + last = i - 1; 99 + break; 100 + } 101 + } 102 + } 103 + 104 + /* 105 + * If the extent spans the block that contains the i_size we need to 106 + * handle both halves separately so that we properly zero data in the 107 + * page cache for blocks that are entirely outside of i_size. 
108 + */ 109 + if (orig_pos <= isize && orig_pos + length > isize) { 110 + unsigned end = offset_in_page(isize - 1) >> block_bits; 111 + 112 + if (first <= end && last > end) 113 + plen -= (last - end) * block_size; 114 + } 115 + 116 + *offp = poff; 117 + *lenp = plen; 118 + } 119 + 120 + static void 121 + iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) 122 + { 123 + struct iomap_page *iop = to_iomap_page(page); 124 + struct inode *inode = page->mapping->host; 125 + unsigned first = off >> inode->i_blkbits; 126 + unsigned last = (off + len - 1) >> inode->i_blkbits; 127 + unsigned int i; 128 + bool uptodate = true; 129 + 130 + if (iop) { 131 + for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) { 132 + if (i >= first && i <= last) 133 + set_bit(i, iop->uptodate); 134 + else if (!test_bit(i, iop->uptodate)) 135 + uptodate = false; 136 + } 137 + } 138 + 139 + if (uptodate && !PageError(page)) 140 + SetPageUptodate(page); 141 + } 142 + 143 + static void 144 + iomap_read_finish(struct iomap_page *iop, struct page *page) 145 + { 146 + if (!iop || atomic_dec_and_test(&iop->read_count)) 147 + unlock_page(page); 148 + } 149 + 150 + static void 151 + iomap_read_page_end_io(struct bio_vec *bvec, int error) 152 + { 153 + struct page *page = bvec->bv_page; 154 + struct iomap_page *iop = to_iomap_page(page); 155 + 156 + if (unlikely(error)) { 157 + ClearPageUptodate(page); 158 + SetPageError(page); 159 + } else { 160 + iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); 161 + } 162 + 163 + iomap_read_finish(iop, page); 164 + } 165 + 166 + static void 167 + iomap_read_end_io(struct bio *bio) 168 + { 169 + int error = blk_status_to_errno(bio->bi_status); 170 + struct bio_vec *bvec; 171 + struct bvec_iter_all iter_all; 172 + 173 + bio_for_each_segment_all(bvec, bio, iter_all) 174 + iomap_read_page_end_io(bvec, error); 175 + bio_put(bio); 176 + } 177 + 178 + struct iomap_readpage_ctx { 179 + struct page *cur_page; 180 + bool cur_page_in_bio; 181 + bool is_readahead; 182 + struct bio *bio; 183 + struct list_head *pages; 184 + }; 185 + 186 + static void 187 + iomap_read_inline_data(struct inode *inode, struct page *page, 188 + struct iomap *iomap) 189 + { 190 + size_t size = i_size_read(inode); 191 + void *addr; 192 + 193 + if (PageUptodate(page)) 194 + return; 195 + 196 + BUG_ON(page->index); 197 + BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); 198 + 199 + addr = kmap_atomic(page); 200 + memcpy(addr, iomap->inline_data, size); 201 + memset(addr + size, 0, PAGE_SIZE - size); 202 + kunmap_atomic(addr); 203 + SetPageUptodate(page); 204 + } 205 + 206 + static loff_t 207 + iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 208 + struct iomap *iomap) 209 + { 210 + struct iomap_readpage_ctx *ctx = data; 211 + struct page *page = ctx->cur_page; 212 + struct iomap_page *iop = iomap_page_create(inode, page); 213 + bool same_page = false, is_contig = false; 214 + loff_t orig_pos = pos; 215 + unsigned poff, plen; 216 + sector_t sector; 217 + 218 + if (iomap->type == IOMAP_INLINE) { 219 + WARN_ON_ONCE(pos); 220 + iomap_read_inline_data(inode, page, iomap); 221 + return PAGE_SIZE; 222 + } 223 + 224 + /* zero post-eof blocks as the page may be mapped */ 225 + iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); 226 + if (plen == 0) 227 + goto done; 228 + 229 + if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) { 230 + zero_user(page, poff, plen); 231 + iomap_set_range_uptodate(page, poff, plen); 232 + goto done; 233 + } 
234 + 235 + ctx->cur_page_in_bio = true; 236 + 237 + /* 238 + * Try to merge into a previous segment if we can. 239 + */ 240 + sector = iomap_sector(iomap, pos); 241 + if (ctx->bio && bio_end_sector(ctx->bio) == sector) 242 + is_contig = true; 243 + 244 + if (is_contig && 245 + __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) { 246 + if (!same_page && iop) 247 + atomic_inc(&iop->read_count); 248 + goto done; 249 + } 250 + 251 + /* 252 + * If we start a new segment we need to increase the read count, and we 253 + * need to do so before submitting any previous full bio to make sure 254 + * that we don't prematurely unlock the page. 255 + */ 256 + if (iop) 257 + atomic_inc(&iop->read_count); 258 + 259 + if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { 260 + gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); 261 + int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; 262 + 263 + if (ctx->bio) 264 + submit_bio(ctx->bio); 265 + 266 + if (ctx->is_readahead) /* same as readahead_gfp_mask */ 267 + gfp |= __GFP_NORETRY | __GFP_NOWARN; 268 + ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); 269 + ctx->bio->bi_opf = REQ_OP_READ; 270 + if (ctx->is_readahead) 271 + ctx->bio->bi_opf |= REQ_RAHEAD; 272 + ctx->bio->bi_iter.bi_sector = sector; 273 + bio_set_dev(ctx->bio, iomap->bdev); 274 + ctx->bio->bi_end_io = iomap_read_end_io; 275 + } 276 + 277 + bio_add_page(ctx->bio, page, plen, poff); 278 + done: 279 + /* 280 + * Move the caller beyond our range so that it keeps making progress. 281 + * For that we have to include any leading non-uptodate ranges, but 282 + * we can skip trailing ones as they will be handled in the next 283 + * iteration. 284 + */ 285 + return pos - orig_pos + plen; 286 + } 287 + 288 + int 289 + iomap_readpage(struct page *page, const struct iomap_ops *ops) 290 + { 291 + struct iomap_readpage_ctx ctx = { .cur_page = page }; 292 + struct inode *inode = page->mapping->host; 293 + unsigned poff; 294 + loff_t ret; 295 + 296 + for (poff = 0; poff < PAGE_SIZE; poff += ret) { 297 + ret = iomap_apply(inode, page_offset(page) + poff, 298 + PAGE_SIZE - poff, 0, ops, &ctx, 299 + iomap_readpage_actor); 300 + if (ret <= 0) { 301 + WARN_ON_ONCE(ret == 0); 302 + SetPageError(page); 303 + break; 304 + } 305 + } 306 + 307 + if (ctx.bio) { 308 + submit_bio(ctx.bio); 309 + WARN_ON_ONCE(!ctx.cur_page_in_bio); 310 + } else { 311 + WARN_ON_ONCE(ctx.cur_page_in_bio); 312 + unlock_page(page); 313 + } 314 + 315 + /* 316 + * Just like mpage_readpages and block_read_full_page we always 317 + * return 0 and just mark the page as PageError on errors. This 318 + * should be cleaned up all through the stack eventually. 319 + */ 320 + return 0; 321 + } 322 + EXPORT_SYMBOL_GPL(iomap_readpage); 323 + 324 + static struct page * 325 + iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, 326 + loff_t length, loff_t *done) 327 + { 328 + while (!list_empty(pages)) { 329 + struct page *page = lru_to_page(pages); 330 + 331 + if (page_offset(page) >= (u64)pos + length) 332 + break; 333 + 334 + list_del(&page->lru); 335 + if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, 336 + GFP_NOFS)) 337 + return page; 338 + 339 + /* 340 + * If we already have a page in the page cache at index we are 341 + * done. Upper layers don't care if it is uptodate after the 342 + * readpages call itself as every page gets checked again once 343 + * actually needed. 
344 + */ 345 + *done += PAGE_SIZE; 346 + put_page(page); 347 + } 348 + 349 + return NULL; 350 + } 351 + 352 + static loff_t 353 + iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, 354 + void *data, struct iomap *iomap) 355 + { 356 + struct iomap_readpage_ctx *ctx = data; 357 + loff_t done, ret; 358 + 359 + for (done = 0; done < length; done += ret) { 360 + if (ctx->cur_page && offset_in_page(pos + done) == 0) { 361 + if (!ctx->cur_page_in_bio) 362 + unlock_page(ctx->cur_page); 363 + put_page(ctx->cur_page); 364 + ctx->cur_page = NULL; 365 + } 366 + if (!ctx->cur_page) { 367 + ctx->cur_page = iomap_next_page(inode, ctx->pages, 368 + pos, length, &done); 369 + if (!ctx->cur_page) 370 + break; 371 + ctx->cur_page_in_bio = false; 372 + } 373 + ret = iomap_readpage_actor(inode, pos + done, length - done, 374 + ctx, iomap); 375 + } 376 + 377 + return done; 378 + } 379 + 380 + int 381 + iomap_readpages(struct address_space *mapping, struct list_head *pages, 382 + unsigned nr_pages, const struct iomap_ops *ops) 383 + { 384 + struct iomap_readpage_ctx ctx = { 385 + .pages = pages, 386 + .is_readahead = true, 387 + }; 388 + loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); 389 + loff_t last = page_offset(list_entry(pages->next, struct page, lru)); 390 + loff_t length = last - pos + PAGE_SIZE, ret = 0; 391 + 392 + while (length > 0) { 393 + ret = iomap_apply(mapping->host, pos, length, 0, ops, 394 + &ctx, iomap_readpages_actor); 395 + if (ret <= 0) { 396 + WARN_ON_ONCE(ret == 0); 397 + goto done; 398 + } 399 + pos += ret; 400 + length -= ret; 401 + } 402 + ret = 0; 403 + done: 404 + if (ctx.bio) 405 + submit_bio(ctx.bio); 406 + if (ctx.cur_page) { 407 + if (!ctx.cur_page_in_bio) 408 + unlock_page(ctx.cur_page); 409 + put_page(ctx.cur_page); 410 + } 411 + 412 + /* 413 + * Check that we didn't lose a page due to the arcance calling 414 + * conventions.. 415 + */ 416 + WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); 417 + return ret; 418 + } 419 + EXPORT_SYMBOL_GPL(iomap_readpages); 420 + 421 + /* 422 + * iomap_is_partially_uptodate checks whether blocks within a page are 423 + * uptodate or not. 424 + * 425 + * Returns true if all blocks which correspond to a file portion 426 + * we want to read within the page are uptodate. 427 + */ 428 + int 429 + iomap_is_partially_uptodate(struct page *page, unsigned long from, 430 + unsigned long count) 431 + { 432 + struct iomap_page *iop = to_iomap_page(page); 433 + struct inode *inode = page->mapping->host; 434 + unsigned len, first, last; 435 + unsigned i; 436 + 437 + /* Limit range to one page */ 438 + len = min_t(unsigned, PAGE_SIZE - from, count); 439 + 440 + /* First and last blocks in range within page */ 441 + first = from >> inode->i_blkbits; 442 + last = (from + len - 1) >> inode->i_blkbits; 443 + 444 + if (iop) { 445 + for (i = first; i <= last; i++) 446 + if (!test_bit(i, iop->uptodate)) 447 + return 0; 448 + return 1; 449 + } 450 + 451 + return 0; 452 + } 453 + EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); 454 + 455 + int 456 + iomap_releasepage(struct page *page, gfp_t gfp_mask) 457 + { 458 + /* 459 + * mm accommodates an old ext3 case where clean pages might not have had 460 + * the dirty bit cleared. Thus, it can send actual dirty pages to 461 + * ->releasepage() via shrink_active_list(), skip those here. 
462 + */ 463 + if (PageDirty(page) || PageWriteback(page)) 464 + return 0; 465 + iomap_page_release(page); 466 + return 1; 467 + } 468 + EXPORT_SYMBOL_GPL(iomap_releasepage); 469 + 470 + void 471 + iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) 472 + { 473 + /* 474 + * If we are invalidating the entire page, clear the dirty state from it 475 + * and release it to avoid unnecessary buildup of the LRU. 476 + */ 477 + if (offset == 0 && len == PAGE_SIZE) { 478 + WARN_ON_ONCE(PageWriteback(page)); 479 + cancel_dirty_page(page); 480 + iomap_page_release(page); 481 + } 482 + } 483 + EXPORT_SYMBOL_GPL(iomap_invalidatepage); 484 + 485 + #ifdef CONFIG_MIGRATION 486 + int 487 + iomap_migrate_page(struct address_space *mapping, struct page *newpage, 488 + struct page *page, enum migrate_mode mode) 489 + { 490 + int ret; 491 + 492 + ret = migrate_page_move_mapping(mapping, newpage, page, 0); 493 + if (ret != MIGRATEPAGE_SUCCESS) 494 + return ret; 495 + 496 + if (page_has_private(page)) { 497 + ClearPagePrivate(page); 498 + get_page(newpage); 499 + set_page_private(newpage, page_private(page)); 500 + set_page_private(page, 0); 501 + put_page(page); 502 + SetPagePrivate(newpage); 503 + } 504 + 505 + if (mode != MIGRATE_SYNC_NO_COPY) 506 + migrate_page_copy(newpage, page); 507 + else 508 + migrate_page_states(newpage, page); 509 + return MIGRATEPAGE_SUCCESS; 510 + } 511 + EXPORT_SYMBOL_GPL(iomap_migrate_page); 512 + #endif /* CONFIG_MIGRATION */ 513 + 514 + static void 515 + iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) 516 + { 517 + loff_t i_size = i_size_read(inode); 518 + 519 + /* 520 + * Only truncate newly allocated pages beyoned EOF, even if the 521 + * write started inside the existing inode size. 522 + */ 523 + if (pos + len > i_size) 524 + truncate_pagecache_range(inode, max(pos, i_size), pos + len); 525 + } 526 + 527 + static int 528 + iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page, 529 + unsigned poff, unsigned plen, unsigned from, unsigned to, 530 + struct iomap *iomap) 531 + { 532 + struct bio_vec bvec; 533 + struct bio bio; 534 + 535 + if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) { 536 + zero_user_segments(page, poff, from, to, poff + plen); 537 + iomap_set_range_uptodate(page, poff, plen); 538 + return 0; 539 + } 540 + 541 + bio_init(&bio, &bvec, 1); 542 + bio.bi_opf = REQ_OP_READ; 543 + bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); 544 + bio_set_dev(&bio, iomap->bdev); 545 + __bio_add_page(&bio, page, plen, poff); 546 + return submit_bio_wait(&bio); 547 + } 548 + 549 + static int 550 + __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, 551 + struct page *page, struct iomap *iomap) 552 + { 553 + struct iomap_page *iop = iomap_page_create(inode, page); 554 + loff_t block_size = i_blocksize(inode); 555 + loff_t block_start = pos & ~(block_size - 1); 556 + loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); 557 + unsigned from = offset_in_page(pos), to = from + len, poff, plen; 558 + int status = 0; 559 + 560 + if (PageUptodate(page)) 561 + return 0; 562 + 563 + do { 564 + iomap_adjust_read_range(inode, iop, &block_start, 565 + block_end - block_start, &poff, &plen); 566 + if (plen == 0) 567 + break; 568 + 569 + if ((from > poff && from < poff + plen) || 570 + (to > poff && to < poff + plen)) { 571 + status = iomap_read_page_sync(inode, block_start, page, 572 + poff, plen, from, to, iomap); 573 + if (status) 574 + break; 575 + } 576 + 577 + 
} while ((block_start += plen) < block_end); 578 + 579 + return status; 580 + } 581 + 582 + static int 583 + iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, 584 + struct page **pagep, struct iomap *iomap) 585 + { 586 + const struct iomap_page_ops *page_ops = iomap->page_ops; 587 + pgoff_t index = pos >> PAGE_SHIFT; 588 + struct page *page; 589 + int status = 0; 590 + 591 + BUG_ON(pos + len > iomap->offset + iomap->length); 592 + 593 + if (fatal_signal_pending(current)) 594 + return -EINTR; 595 + 596 + if (page_ops && page_ops->page_prepare) { 597 + status = page_ops->page_prepare(inode, pos, len, iomap); 598 + if (status) 599 + return status; 600 + } 601 + 602 + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); 603 + if (!page) { 604 + status = -ENOMEM; 605 + goto out_no_page; 606 + } 607 + 608 + if (iomap->type == IOMAP_INLINE) 609 + iomap_read_inline_data(inode, page, iomap); 610 + else if (iomap->flags & IOMAP_F_BUFFER_HEAD) 611 + status = __block_write_begin_int(page, pos, len, NULL, iomap); 612 + else 613 + status = __iomap_write_begin(inode, pos, len, page, iomap); 614 + 615 + if (unlikely(status)) 616 + goto out_unlock; 617 + 618 + *pagep = page; 619 + return 0; 620 + 621 + out_unlock: 622 + unlock_page(page); 623 + put_page(page); 624 + iomap_write_failed(inode, pos, len); 625 + 626 + out_no_page: 627 + if (page_ops && page_ops->page_done) 628 + page_ops->page_done(inode, pos, 0, NULL, iomap); 629 + return status; 630 + } 631 + 632 + int 633 + iomap_set_page_dirty(struct page *page) 634 + { 635 + struct address_space *mapping = page_mapping(page); 636 + int newly_dirty; 637 + 638 + if (unlikely(!mapping)) 639 + return !TestSetPageDirty(page); 640 + 641 + /* 642 + * Lock out page->mem_cgroup migration to keep PageDirty 643 + * synchronized with per-memcg dirty page counters. 644 + */ 645 + lock_page_memcg(page); 646 + newly_dirty = !TestSetPageDirty(page); 647 + if (newly_dirty) 648 + __set_page_dirty(page, mapping, 0); 649 + unlock_page_memcg(page); 650 + 651 + if (newly_dirty) 652 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 653 + return newly_dirty; 654 + } 655 + EXPORT_SYMBOL_GPL(iomap_set_page_dirty); 656 + 657 + static int 658 + __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, 659 + unsigned copied, struct page *page, struct iomap *iomap) 660 + { 661 + flush_dcache_page(page); 662 + 663 + /* 664 + * The blocks that were entirely written will now be uptodate, so we 665 + * don't have to worry about a readpage reading them and overwriting a 666 + * partial write. However if we have encountered a short write and only 667 + * partially written into a block, it will not be marked uptodate, so a 668 + * readpage might come in and destroy our partial write. 669 + * 670 + * Do the simplest thing, and just treat any short write to a non 671 + * uptodate page as a zero-length write, and force the caller to redo 672 + * the whole thing. 
673 + */ 674 + if (unlikely(copied < len && !PageUptodate(page))) 675 + return 0; 676 + iomap_set_range_uptodate(page, offset_in_page(pos), len); 677 + iomap_set_page_dirty(page); 678 + return copied; 679 + } 680 + 681 + static int 682 + iomap_write_end_inline(struct inode *inode, struct page *page, 683 + struct iomap *iomap, loff_t pos, unsigned copied) 684 + { 685 + void *addr; 686 + 687 + WARN_ON_ONCE(!PageUptodate(page)); 688 + BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); 689 + 690 + addr = kmap_atomic(page); 691 + memcpy(iomap->inline_data + pos, addr + pos, copied); 692 + kunmap_atomic(addr); 693 + 694 + mark_inode_dirty(inode); 695 + return copied; 696 + } 697 + 698 + static int 699 + iomap_write_end(struct inode *inode, loff_t pos, unsigned len, 700 + unsigned copied, struct page *page, struct iomap *iomap) 701 + { 702 + const struct iomap_page_ops *page_ops = iomap->page_ops; 703 + loff_t old_size = inode->i_size; 704 + int ret; 705 + 706 + if (iomap->type == IOMAP_INLINE) { 707 + ret = iomap_write_end_inline(inode, page, iomap, pos, copied); 708 + } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) { 709 + ret = block_write_end(NULL, inode->i_mapping, pos, len, copied, 710 + page, NULL); 711 + } else { 712 + ret = __iomap_write_end(inode, pos, len, copied, page, iomap); 713 + } 714 + 715 + /* 716 + * Update the in-memory inode size after copying the data into the page 717 + * cache. It's up to the file system to write the updated size to disk, 718 + * preferably after I/O completion so that no stale data is exposed. 719 + */ 720 + if (pos + ret > old_size) { 721 + i_size_write(inode, pos + ret); 722 + iomap->flags |= IOMAP_F_SIZE_CHANGED; 723 + } 724 + unlock_page(page); 725 + 726 + if (old_size < pos) 727 + pagecache_isize_extended(inode, old_size, pos); 728 + if (page_ops && page_ops->page_done) 729 + page_ops->page_done(inode, pos, ret, page, iomap); 730 + put_page(page); 731 + 732 + if (ret < len) 733 + iomap_write_failed(inode, pos, len); 734 + return ret; 735 + } 736 + 737 + static loff_t 738 + iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 739 + struct iomap *iomap) 740 + { 741 + struct iov_iter *i = data; 742 + long status = 0; 743 + ssize_t written = 0; 744 + unsigned int flags = AOP_FLAG_NOFS; 745 + 746 + do { 747 + struct page *page; 748 + unsigned long offset; /* Offset into pagecache page */ 749 + unsigned long bytes; /* Bytes to write to page */ 750 + size_t copied; /* Bytes copied from user */ 751 + 752 + offset = offset_in_page(pos); 753 + bytes = min_t(unsigned long, PAGE_SIZE - offset, 754 + iov_iter_count(i)); 755 + again: 756 + if (bytes > length) 757 + bytes = length; 758 + 759 + /* 760 + * Bring in the user page that we will copy from _first_. 761 + * Otherwise there's a nasty deadlock on copying from the 762 + * same page as we're writing to, without it being marked 763 + * up-to-date. 764 + * 765 + * Not only is this an optimisation, but it is also required 766 + * to check that the address is actually valid, when atomic 767 + * usercopies are used, below. 
768 + */ 769 + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 770 + status = -EFAULT; 771 + break; 772 + } 773 + 774 + status = iomap_write_begin(inode, pos, bytes, flags, &page, 775 + iomap); 776 + if (unlikely(status)) 777 + break; 778 + 779 + if (mapping_writably_mapped(inode->i_mapping)) 780 + flush_dcache_page(page); 781 + 782 + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 783 + 784 + flush_dcache_page(page); 785 + 786 + status = iomap_write_end(inode, pos, bytes, copied, page, 787 + iomap); 788 + if (unlikely(status < 0)) 789 + break; 790 + copied = status; 791 + 792 + cond_resched(); 793 + 794 + iov_iter_advance(i, copied); 795 + if (unlikely(copied == 0)) { 796 + /* 797 + * If we were unable to copy any data at all, we must 798 + * fall back to a single segment length write. 799 + * 800 + * If we didn't fallback here, we could livelock 801 + * because not all segments in the iov can be copied at 802 + * once without a pagefault. 803 + */ 804 + bytes = min_t(unsigned long, PAGE_SIZE - offset, 805 + iov_iter_single_seg_count(i)); 806 + goto again; 807 + } 808 + pos += copied; 809 + written += copied; 810 + length -= copied; 811 + 812 + balance_dirty_pages_ratelimited(inode->i_mapping); 813 + } while (iov_iter_count(i) && length); 814 + 815 + return written ? written : status; 816 + } 817 + 818 + ssize_t 819 + iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, 820 + const struct iomap_ops *ops) 821 + { 822 + struct inode *inode = iocb->ki_filp->f_mapping->host; 823 + loff_t pos = iocb->ki_pos, ret = 0, written = 0; 824 + 825 + while (iov_iter_count(iter)) { 826 + ret = iomap_apply(inode, pos, iov_iter_count(iter), 827 + IOMAP_WRITE, ops, iter, iomap_write_actor); 828 + if (ret <= 0) 829 + break; 830 + pos += ret; 831 + written += ret; 832 + } 833 + 834 + return written ? 
written : ret; 835 + } 836 + EXPORT_SYMBOL_GPL(iomap_file_buffered_write); 837 + 838 + static struct page * 839 + __iomap_read_page(struct inode *inode, loff_t offset) 840 + { 841 + struct address_space *mapping = inode->i_mapping; 842 + struct page *page; 843 + 844 + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); 845 + if (IS_ERR(page)) 846 + return page; 847 + if (!PageUptodate(page)) { 848 + put_page(page); 849 + return ERR_PTR(-EIO); 850 + } 851 + return page; 852 + } 853 + 854 + static loff_t 855 + iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 856 + struct iomap *iomap) 857 + { 858 + long status = 0; 859 + ssize_t written = 0; 860 + 861 + do { 862 + struct page *page, *rpage; 863 + unsigned long offset; /* Offset into pagecache page */ 864 + unsigned long bytes; /* Bytes to write to page */ 865 + 866 + offset = offset_in_page(pos); 867 + bytes = min_t(loff_t, PAGE_SIZE - offset, length); 868 + 869 + rpage = __iomap_read_page(inode, pos); 870 + if (IS_ERR(rpage)) 871 + return PTR_ERR(rpage); 872 + 873 + status = iomap_write_begin(inode, pos, bytes, 874 + AOP_FLAG_NOFS, &page, iomap); 875 + put_page(rpage); 876 + if (unlikely(status)) 877 + return status; 878 + 879 + WARN_ON_ONCE(!PageUptodate(page)); 880 + 881 + status = iomap_write_end(inode, pos, bytes, bytes, page, iomap); 882 + if (unlikely(status <= 0)) { 883 + if (WARN_ON_ONCE(status == 0)) 884 + return -EIO; 885 + return status; 886 + } 887 + 888 + cond_resched(); 889 + 890 + pos += status; 891 + written += status; 892 + length -= status; 893 + 894 + balance_dirty_pages_ratelimited(inode->i_mapping); 895 + } while (length); 896 + 897 + return written; 898 + } 899 + 900 + int 901 + iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, 902 + const struct iomap_ops *ops) 903 + { 904 + loff_t ret; 905 + 906 + while (len) { 907 + ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, 908 + iomap_dirty_actor); 909 + if (ret <= 0) 910 + return ret; 911 + pos += ret; 912 + len -= ret; 913 + } 914 + 915 + return 0; 916 + } 917 + EXPORT_SYMBOL_GPL(iomap_file_dirty); 918 + 919 + static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, 920 + unsigned bytes, struct iomap *iomap) 921 + { 922 + struct page *page; 923 + int status; 924 + 925 + status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page, 926 + iomap); 927 + if (status) 928 + return status; 929 + 930 + zero_user(page, offset, bytes); 931 + mark_page_accessed(page); 932 + 933 + return iomap_write_end(inode, pos, bytes, bytes, page, iomap); 934 + } 935 + 936 + static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, 937 + struct iomap *iomap) 938 + { 939 + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, 940 + iomap_sector(iomap, pos & PAGE_MASK), offset, bytes); 941 + } 942 + 943 + static loff_t 944 + iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, 945 + void *data, struct iomap *iomap) 946 + { 947 + bool *did_zero = data; 948 + loff_t written = 0; 949 + int status; 950 + 951 + /* already zeroed? we're done. 
*/ 952 + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) 953 + return count; 954 + 955 + do { 956 + unsigned offset, bytes; 957 + 958 + offset = offset_in_page(pos); 959 + bytes = min_t(loff_t, PAGE_SIZE - offset, count); 960 + 961 + if (IS_DAX(inode)) 962 + status = iomap_dax_zero(pos, offset, bytes, iomap); 963 + else 964 + status = iomap_zero(inode, pos, offset, bytes, iomap); 965 + if (status < 0) 966 + return status; 967 + 968 + pos += bytes; 969 + count -= bytes; 970 + written += bytes; 971 + if (did_zero) 972 + *did_zero = true; 973 + } while (count > 0); 974 + 975 + return written; 976 + } 977 + 978 + int 979 + iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, 980 + const struct iomap_ops *ops) 981 + { 982 + loff_t ret; 983 + 984 + while (len > 0) { 985 + ret = iomap_apply(inode, pos, len, IOMAP_ZERO, 986 + ops, did_zero, iomap_zero_range_actor); 987 + if (ret <= 0) 988 + return ret; 989 + 990 + pos += ret; 991 + len -= ret; 992 + } 993 + 994 + return 0; 995 + } 996 + EXPORT_SYMBOL_GPL(iomap_zero_range); 997 + 998 + int 999 + iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 1000 + const struct iomap_ops *ops) 1001 + { 1002 + unsigned int blocksize = i_blocksize(inode); 1003 + unsigned int off = pos & (blocksize - 1); 1004 + 1005 + /* Block boundary? Nothing to do */ 1006 + if (!off) 1007 + return 0; 1008 + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); 1009 + } 1010 + EXPORT_SYMBOL_GPL(iomap_truncate_page); 1011 + 1012 + static loff_t 1013 + iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, 1014 + void *data, struct iomap *iomap) 1015 + { 1016 + struct page *page = data; 1017 + int ret; 1018 + 1019 + if (iomap->flags & IOMAP_F_BUFFER_HEAD) { 1020 + ret = __block_write_begin_int(page, pos, length, NULL, iomap); 1021 + if (ret) 1022 + return ret; 1023 + block_commit_write(page, 0, length); 1024 + } else { 1025 + WARN_ON_ONCE(!PageUptodate(page)); 1026 + iomap_page_create(inode, page); 1027 + set_page_dirty(page); 1028 + } 1029 + 1030 + return length; 1031 + } 1032 + 1033 + vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) 1034 + { 1035 + struct page *page = vmf->page; 1036 + struct inode *inode = file_inode(vmf->vma->vm_file); 1037 + unsigned long length; 1038 + loff_t offset, size; 1039 + ssize_t ret; 1040 + 1041 + lock_page(page); 1042 + size = i_size_read(inode); 1043 + if ((page->mapping != inode->i_mapping) || 1044 + (page_offset(page) > size)) { 1045 + /* We overload EFAULT to mean page got truncated */ 1046 + ret = -EFAULT; 1047 + goto out_unlock; 1048 + } 1049 + 1050 + /* page is wholly or partially inside EOF */ 1051 + if (((page->index + 1) << PAGE_SHIFT) > size) 1052 + length = offset_in_page(size); 1053 + else 1054 + length = PAGE_SIZE; 1055 + 1056 + offset = page_offset(page); 1057 + while (length > 0) { 1058 + ret = iomap_apply(inode, offset, length, 1059 + IOMAP_WRITE | IOMAP_FAULT, ops, page, 1060 + iomap_page_mkwrite_actor); 1061 + if (unlikely(ret <= 0)) 1062 + goto out_unlock; 1063 + offset += ret; 1064 + length -= ret; 1065 + } 1066 + 1067 + wait_for_stable_page(page); 1068 + return VM_FAULT_LOCKED; 1069 + out_unlock: 1070 + unlock_page(page); 1071 + return block_page_mkwrite_return(ret); 1072 + } 1073 + EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
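The exports above are the complete page-cache side of iomap after the split. For orientation, this is roughly how a filesystem routes its address_space_operations at them; a non-authoritative sketch in which every myfs_* identifier (including myfs_iomap_ops) is hypothetical:

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>

extern const struct iomap_ops myfs_iomap_ops;	/* hypothetical */

/* ->readpage / ->readpages forward to the iomap helpers with our ops. */
static int myfs_readpage(struct file *unused, struct page *page)
{
	return iomap_readpage(page, &myfs_iomap_ops);
}

static int myfs_readpages(struct file *unused, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	return iomap_readpages(mapping, pages, nr_pages, &myfs_iomap_ops);
}

static const struct address_space_operations myfs_aops = {
	.readpage		= myfs_readpage,
	.readpages		= myfs_readpages,
	.set_page_dirty		= iomap_set_page_dirty,
	.releasepage		= iomap_releasepage,
	.invalidatepage		= iomap_invalidatepage,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
#ifdef CONFIG_MIGRATION
	.migratepage		= iomap_migrate_page,
#endif
	/* writeback (->writepage/->writepages) stays filesystem-specific here */
};

Buffered writes enter this file through iomap_file_buffered_write() from the filesystem's ->write_iter, and write faults through iomap_page_mkwrite(), with the caller holding whatever locks its ->iomap_begin/->iomap_end callbacks rely on.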
+562
fs/iomap/direct-io.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2010 Red Hat, Inc. 4 + * Copyright (c) 2016-2018 Christoph Hellwig. 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + #include <linux/backing-dev.h> 11 + #include <linux/uio.h> 12 + #include <linux/task_io_accounting_ops.h> 13 + 14 + #include "../internal.h" 15 + 16 + /* 17 + * Private flags for iomap_dio, must not overlap with the public ones in 18 + * iomap.h: 19 + */ 20 + #define IOMAP_DIO_WRITE_FUA (1 << 28) 21 + #define IOMAP_DIO_NEED_SYNC (1 << 29) 22 + #define IOMAP_DIO_WRITE (1 << 30) 23 + #define IOMAP_DIO_DIRTY (1 << 31) 24 + 25 + struct iomap_dio { 26 + struct kiocb *iocb; 27 + iomap_dio_end_io_t *end_io; 28 + loff_t i_size; 29 + loff_t size; 30 + atomic_t ref; 31 + unsigned flags; 32 + int error; 33 + bool wait_for_completion; 34 + 35 + union { 36 + /* used during submission and for synchronous completion: */ 37 + struct { 38 + struct iov_iter *iter; 39 + struct task_struct *waiter; 40 + struct request_queue *last_queue; 41 + blk_qc_t cookie; 42 + } submit; 43 + 44 + /* used for aio completion: */ 45 + struct { 46 + struct work_struct work; 47 + } aio; 48 + }; 49 + }; 50 + 51 + int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) 52 + { 53 + struct request_queue *q = READ_ONCE(kiocb->private); 54 + 55 + if (!q) 56 + return 0; 57 + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); 58 + } 59 + EXPORT_SYMBOL_GPL(iomap_dio_iopoll); 60 + 61 + static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, 62 + struct bio *bio) 63 + { 64 + atomic_inc(&dio->ref); 65 + 66 + if (dio->iocb->ki_flags & IOCB_HIPRI) 67 + bio_set_polled(bio, dio->iocb); 68 + 69 + dio->submit.last_queue = bdev_get_queue(iomap->bdev); 70 + dio->submit.cookie = submit_bio(bio); 71 + } 72 + 73 + static ssize_t iomap_dio_complete(struct iomap_dio *dio) 74 + { 75 + struct kiocb *iocb = dio->iocb; 76 + struct inode *inode = file_inode(iocb->ki_filp); 77 + loff_t offset = iocb->ki_pos; 78 + ssize_t ret; 79 + 80 + if (dio->end_io) { 81 + ret = dio->end_io(iocb, 82 + dio->error ? dio->error : dio->size, 83 + dio->flags); 84 + } else { 85 + ret = dio->error; 86 + } 87 + 88 + if (likely(!ret)) { 89 + ret = dio->size; 90 + /* check for short read */ 91 + if (offset + ret > dio->i_size && 92 + !(dio->flags & IOMAP_DIO_WRITE)) 93 + ret = dio->i_size - offset; 94 + iocb->ki_pos += ret; 95 + } 96 + 97 + /* 98 + * Try again to invalidate clean pages which might have been cached by 99 + * non-direct readahead, or faulted in by get_user_pages() if the source 100 + * of the write was an mmap'ed region of the file we're writing. Either 101 + * one is a pretty crazy thing to do, so we don't support it 100%. If 102 + * this invalidation fails, tough, the write still worked... 103 + * 104 + * And this page cache invalidation has to be after dio->end_io(), as 105 + * some filesystems convert unwritten extents to real allocations in 106 + * end_io() when necessary, otherwise a racing buffer read would cache 107 + * zeros from unwritten extents. 
108 + */ 109 + if (!dio->error && 110 + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { 111 + int err; 112 + err = invalidate_inode_pages2_range(inode->i_mapping, 113 + offset >> PAGE_SHIFT, 114 + (offset + dio->size - 1) >> PAGE_SHIFT); 115 + if (err) 116 + dio_warn_stale_pagecache(iocb->ki_filp); 117 + } 118 + 119 + /* 120 + * If this is a DSYNC write, make sure we push it to stable storage now 121 + * that we've written data. 122 + */ 123 + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) 124 + ret = generic_write_sync(iocb, ret); 125 + 126 + inode_dio_end(file_inode(iocb->ki_filp)); 127 + kfree(dio); 128 + 129 + return ret; 130 + } 131 + 132 + static void iomap_dio_complete_work(struct work_struct *work) 133 + { 134 + struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); 135 + struct kiocb *iocb = dio->iocb; 136 + 137 + iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); 138 + } 139 + 140 + /* 141 + * Set an error in the dio if none is set yet. We have to use cmpxchg 142 + * as the submission context and the completion context(s) can race to 143 + * update the error. 144 + */ 145 + static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) 146 + { 147 + cmpxchg(&dio->error, 0, ret); 148 + } 149 + 150 + static void iomap_dio_bio_end_io(struct bio *bio) 151 + { 152 + struct iomap_dio *dio = bio->bi_private; 153 + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); 154 + 155 + if (bio->bi_status) 156 + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); 157 + 158 + if (atomic_dec_and_test(&dio->ref)) { 159 + if (dio->wait_for_completion) { 160 + struct task_struct *waiter = dio->submit.waiter; 161 + WRITE_ONCE(dio->submit.waiter, NULL); 162 + blk_wake_io_task(waiter); 163 + } else if (dio->flags & IOMAP_DIO_WRITE) { 164 + struct inode *inode = file_inode(dio->iocb->ki_filp); 165 + 166 + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); 167 + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); 168 + } else { 169 + iomap_dio_complete_work(&dio->aio.work); 170 + } 171 + } 172 + 173 + if (should_dirty) { 174 + bio_check_pages_dirty(bio); 175 + } else { 176 + bio_release_pages(bio, false); 177 + bio_put(bio); 178 + } 179 + } 180 + 181 + static void 182 + iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, 183 + unsigned len) 184 + { 185 + struct page *page = ZERO_PAGE(0); 186 + int flags = REQ_SYNC | REQ_IDLE; 187 + struct bio *bio; 188 + 189 + bio = bio_alloc(GFP_KERNEL, 1); 190 + bio_set_dev(bio, iomap->bdev); 191 + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); 192 + bio->bi_private = dio; 193 + bio->bi_end_io = iomap_dio_bio_end_io; 194 + 195 + get_page(page); 196 + __bio_add_page(bio, page, len, 0); 197 + bio_set_op_attrs(bio, REQ_OP_WRITE, flags); 198 + iomap_dio_submit_bio(dio, iomap, bio); 199 + } 200 + 201 + static loff_t 202 + iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, 203 + struct iomap_dio *dio, struct iomap *iomap) 204 + { 205 + unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); 206 + unsigned int fs_block_size = i_blocksize(inode), pad; 207 + unsigned int align = iov_iter_alignment(dio->submit.iter); 208 + struct iov_iter iter; 209 + struct bio *bio; 210 + bool need_zeroout = false; 211 + bool use_fua = false; 212 + int nr_pages, ret = 0; 213 + size_t copied = 0; 214 + 215 + if ((pos | length | align) & ((1 << blkbits) - 1)) 216 + return -EINVAL; 217 + 218 + if (iomap->type == IOMAP_UNWRITTEN) { 219 + dio->flags |= IOMAP_DIO_UNWRITTEN; 220 + 
need_zeroout = true; 221 + } 222 + 223 + if (iomap->flags & IOMAP_F_SHARED) 224 + dio->flags |= IOMAP_DIO_COW; 225 + 226 + if (iomap->flags & IOMAP_F_NEW) { 227 + need_zeroout = true; 228 + } else if (iomap->type == IOMAP_MAPPED) { 229 + /* 230 + * Use a FUA write if we need datasync semantics, this is a pure 231 + * data IO that doesn't require any metadata updates (including 232 + * after IO completion such as unwritten extent conversion) and 233 + * the underlying device supports FUA. This allows us to avoid 234 + * cache flushes on IO completion. 235 + */ 236 + if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && 237 + (dio->flags & IOMAP_DIO_WRITE_FUA) && 238 + blk_queue_fua(bdev_get_queue(iomap->bdev))) 239 + use_fua = true; 240 + } 241 + 242 + /* 243 + * Operate on a partial iter trimmed to the extent we were called for. 244 + * We'll update the iter in the dio once we're done with this extent. 245 + */ 246 + iter = *dio->submit.iter; 247 + iov_iter_truncate(&iter, length); 248 + 249 + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); 250 + if (nr_pages <= 0) 251 + return nr_pages; 252 + 253 + if (need_zeroout) { 254 + /* zero out from the start of the block to the write offset */ 255 + pad = pos & (fs_block_size - 1); 256 + if (pad) 257 + iomap_dio_zero(dio, iomap, pos - pad, pad); 258 + } 259 + 260 + do { 261 + size_t n; 262 + if (dio->error) { 263 + iov_iter_revert(dio->submit.iter, copied); 264 + return 0; 265 + } 266 + 267 + bio = bio_alloc(GFP_KERNEL, nr_pages); 268 + bio_set_dev(bio, iomap->bdev); 269 + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); 270 + bio->bi_write_hint = dio->iocb->ki_hint; 271 + bio->bi_ioprio = dio->iocb->ki_ioprio; 272 + bio->bi_private = dio; 273 + bio->bi_end_io = iomap_dio_bio_end_io; 274 + 275 + ret = bio_iov_iter_get_pages(bio, &iter); 276 + if (unlikely(ret)) { 277 + /* 278 + * We have to stop part way through an IO. We must fall 279 + * through to the sub-block tail zeroing here, otherwise 280 + * this short IO may expose stale data in the tail of 281 + * the block we haven't written data to. 282 + */ 283 + bio_put(bio); 284 + goto zero_tail; 285 + } 286 + 287 + n = bio->bi_iter.bi_size; 288 + if (dio->flags & IOMAP_DIO_WRITE) { 289 + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; 290 + if (use_fua) 291 + bio->bi_opf |= REQ_FUA; 292 + else 293 + dio->flags &= ~IOMAP_DIO_WRITE_FUA; 294 + task_io_account_write(n); 295 + } else { 296 + bio->bi_opf = REQ_OP_READ; 297 + if (dio->flags & IOMAP_DIO_DIRTY) 298 + bio_set_pages_dirty(bio); 299 + } 300 + 301 + iov_iter_advance(dio->submit.iter, n); 302 + 303 + dio->size += n; 304 + pos += n; 305 + copied += n; 306 + 307 + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); 308 + iomap_dio_submit_bio(dio, iomap, bio); 309 + } while (nr_pages); 310 + 311 + /* 312 + * We need to zeroout the tail of a sub-block write if the extent type 313 + * requires zeroing or the write extends beyond EOF. If we don't zero 314 + * the block tail in the latter case, we can expose stale data via mmap 315 + * reads of the EOF block. 316 + */ 317 + zero_tail: 318 + if (need_zeroout || 319 + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { 320 + /* zero out from the end of the write to the end of the block */ 321 + pad = pos & (fs_block_size - 1); 322 + if (pad) 323 + iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); 324 + } 325 + return copied ? 
copied : ret; 326 + } 327 + 328 + static loff_t 329 + iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio) 330 + { 331 + length = iov_iter_zero(length, dio->submit.iter); 332 + dio->size += length; 333 + return length; 334 + } 335 + 336 + static loff_t 337 + iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, 338 + struct iomap_dio *dio, struct iomap *iomap) 339 + { 340 + struct iov_iter *iter = dio->submit.iter; 341 + size_t copied; 342 + 343 + BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); 344 + 345 + if (dio->flags & IOMAP_DIO_WRITE) { 346 + loff_t size = inode->i_size; 347 + 348 + if (pos > size) 349 + memset(iomap->inline_data + size, 0, pos - size); 350 + copied = copy_from_iter(iomap->inline_data + pos, length, iter); 351 + if (copied) { 352 + if (pos + copied > size) 353 + i_size_write(inode, pos + copied); 354 + mark_inode_dirty(inode); 355 + } 356 + } else { 357 + copied = copy_to_iter(iomap->inline_data + pos, length, iter); 358 + } 359 + dio->size += copied; 360 + return copied; 361 + } 362 + 363 + static loff_t 364 + iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, 365 + void *data, struct iomap *iomap) 366 + { 367 + struct iomap_dio *dio = data; 368 + 369 + switch (iomap->type) { 370 + case IOMAP_HOLE: 371 + if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) 372 + return -EIO; 373 + return iomap_dio_hole_actor(length, dio); 374 + case IOMAP_UNWRITTEN: 375 + if (!(dio->flags & IOMAP_DIO_WRITE)) 376 + return iomap_dio_hole_actor(length, dio); 377 + return iomap_dio_bio_actor(inode, pos, length, dio, iomap); 378 + case IOMAP_MAPPED: 379 + return iomap_dio_bio_actor(inode, pos, length, dio, iomap); 380 + case IOMAP_INLINE: 381 + return iomap_dio_inline_actor(inode, pos, length, dio, iomap); 382 + default: 383 + WARN_ON_ONCE(1); 384 + return -EIO; 385 + } 386 + } 387 + 388 + /* 389 + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO 390 + * is being issued as AIO or not. This allows us to optimise pure data writes 391 + * to use REQ_FUA rather than requiring generic_write_sync() to issue a 392 + * REQ_FLUSH post write. This is slightly tricky because a single request here 393 + * can be mapped into multiple disjoint IOs and only a subset of the IOs issued 394 + * may be pure data writes. In that case, we still need to do a full data sync 395 + * completion. 
396 + */ 397 + ssize_t 398 + iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 399 + const struct iomap_ops *ops, iomap_dio_end_io_t end_io) 400 + { 401 + struct address_space *mapping = iocb->ki_filp->f_mapping; 402 + struct inode *inode = file_inode(iocb->ki_filp); 403 + size_t count = iov_iter_count(iter); 404 + loff_t pos = iocb->ki_pos, start = pos; 405 + loff_t end = iocb->ki_pos + count - 1, ret = 0; 406 + unsigned int flags = IOMAP_DIRECT; 407 + bool wait_for_completion = is_sync_kiocb(iocb); 408 + struct blk_plug plug; 409 + struct iomap_dio *dio; 410 + 411 + lockdep_assert_held(&inode->i_rwsem); 412 + 413 + if (!count) 414 + return 0; 415 + 416 + dio = kmalloc(sizeof(*dio), GFP_KERNEL); 417 + if (!dio) 418 + return -ENOMEM; 419 + 420 + dio->iocb = iocb; 421 + atomic_set(&dio->ref, 1); 422 + dio->size = 0; 423 + dio->i_size = i_size_read(inode); 424 + dio->end_io = end_io; 425 + dio->error = 0; 426 + dio->flags = 0; 427 + 428 + dio->submit.iter = iter; 429 + dio->submit.waiter = current; 430 + dio->submit.cookie = BLK_QC_T_NONE; 431 + dio->submit.last_queue = NULL; 432 + 433 + if (iov_iter_rw(iter) == READ) { 434 + if (pos >= dio->i_size) 435 + goto out_free_dio; 436 + 437 + if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) 438 + dio->flags |= IOMAP_DIO_DIRTY; 439 + } else { 440 + flags |= IOMAP_WRITE; 441 + dio->flags |= IOMAP_DIO_WRITE; 442 + 443 + /* for data sync or sync, we need sync completion processing */ 444 + if (iocb->ki_flags & IOCB_DSYNC) 445 + dio->flags |= IOMAP_DIO_NEED_SYNC; 446 + 447 + /* 448 + * For datasync only writes, we optimistically try using FUA for 449 + * this IO. Any non-FUA write that occurs will clear this flag, 450 + * hence we know before completion whether a cache flush is 451 + * necessary. 452 + */ 453 + if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) 454 + dio->flags |= IOMAP_DIO_WRITE_FUA; 455 + } 456 + 457 + if (iocb->ki_flags & IOCB_NOWAIT) { 458 + if (filemap_range_has_page(mapping, start, end)) { 459 + ret = -EAGAIN; 460 + goto out_free_dio; 461 + } 462 + flags |= IOMAP_NOWAIT; 463 + } 464 + 465 + ret = filemap_write_and_wait_range(mapping, start, end); 466 + if (ret) 467 + goto out_free_dio; 468 + 469 + /* 470 + * Try to invalidate cache pages for the range we're direct 471 + * writing. If this invalidation fails, tough, the write will 472 + * still work, but racing two incompatible write paths is a 473 + * pretty crazy thing to do, so we don't support it 100%. 
474 + */ 475 + ret = invalidate_inode_pages2_range(mapping, 476 + start >> PAGE_SHIFT, end >> PAGE_SHIFT); 477 + if (ret) 478 + dio_warn_stale_pagecache(iocb->ki_filp); 479 + ret = 0; 480 + 481 + if (iov_iter_rw(iter) == WRITE && !wait_for_completion && 482 + !inode->i_sb->s_dio_done_wq) { 483 + ret = sb_init_dio_done_wq(inode->i_sb); 484 + if (ret < 0) 485 + goto out_free_dio; 486 + } 487 + 488 + inode_dio_begin(inode); 489 + 490 + blk_start_plug(&plug); 491 + do { 492 + ret = iomap_apply(inode, pos, count, flags, ops, dio, 493 + iomap_dio_actor); 494 + if (ret <= 0) { 495 + /* magic error code to fall back to buffered I/O */ 496 + if (ret == -ENOTBLK) { 497 + wait_for_completion = true; 498 + ret = 0; 499 + } 500 + break; 501 + } 502 + pos += ret; 503 + 504 + if (iov_iter_rw(iter) == READ && pos >= dio->i_size) 505 + break; 506 + } while ((count = iov_iter_count(iter)) > 0); 507 + blk_finish_plug(&plug); 508 + 509 + if (ret < 0) 510 + iomap_dio_set_error(dio, ret); 511 + 512 + /* 513 + * If all the writes we issued were FUA, we don't need to flush the 514 + * cache on IO completion. Clear the sync flag for this case. 515 + */ 516 + if (dio->flags & IOMAP_DIO_WRITE_FUA) 517 + dio->flags &= ~IOMAP_DIO_NEED_SYNC; 518 + 519 + WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie); 520 + WRITE_ONCE(iocb->private, dio->submit.last_queue); 521 + 522 + /* 523 + * We are about to drop our additional submission reference, which 524 + * might be the last reference to the dio. There are three three 525 + * different ways we can progress here: 526 + * 527 + * (a) If this is the last reference we will always complete and free 528 + * the dio ourselves. 529 + * (b) If this is not the last reference, and we serve an asynchronous 530 + * iocb, we must never touch the dio after the decrement, the 531 + * I/O completion handler will complete and free it. 532 + * (c) If this is not the last reference, but we serve a synchronous 533 + * iocb, the I/O completion handler will wake us up on the drop 534 + * of the final reference, and we will complete and free it here 535 + * after we got woken by the I/O completion handler. 536 + */ 537 + dio->wait_for_completion = wait_for_completion; 538 + if (!atomic_dec_and_test(&dio->ref)) { 539 + if (!wait_for_completion) 540 + return -EIOCBQUEUED; 541 + 542 + for (;;) { 543 + set_current_state(TASK_UNINTERRUPTIBLE); 544 + if (!READ_ONCE(dio->submit.waiter)) 545 + break; 546 + 547 + if (!(iocb->ki_flags & IOCB_HIPRI) || 548 + !dio->submit.last_queue || 549 + !blk_poll(dio->submit.last_queue, 550 + dio->submit.cookie, true)) 551 + io_schedule(); 552 + } 553 + __set_current_state(TASK_RUNNING); 554 + } 555 + 556 + return iomap_dio_complete(dio); 557 + 558 + out_free_dio: 559 + kfree(dio); 560 + return ret; 561 + } 562 + EXPORT_SYMBOL_GPL(iomap_dio_rw);
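iomap_dio_rw() is the entry point a filesystem wires into its ->read_iter/->write_iter methods once it can describe its extents through a struct iomap_ops. A minimal sketch for a hypothetical filesystem ("myfs"; myfs_iomap_ops stands in for the real iomap_begin/iomap_end implementation), showing the i_rwsem locking that the lockdep_assert_held() above expects from the caller:

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uio.h>

/* Hypothetical: the filesystem's real iomap_begin/iomap_end live elsewhere. */
extern const struct iomap_ops myfs_iomap_ops;

static ssize_t myfs_file_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/* iomap_dio_rw() asserts that the caller holds i_rwsem. */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	/* Reads need no private completion hook, so pass NULL for end_io. */
	ret = iomap_dio_rw(iocb, to, &myfs_iomap_ops, NULL);

	inode_unlock_shared(inode);
	return ret;
}

For an asynchronous iocb the call typically returns -EIOCBQUEUED and the work queued by iomap_dio_bio_end_io() completes the request later; synchronous callers are woken and finish in iomap_dio_complete() as described in the reference-counting comment above.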
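On the submission side, iomap_dio_bio_actor() rejects any request whose file position, length, or buffer alignment is not a multiple of the device's logical block size (the "(pos | length | align)" check), which userspace observes as EINVAL on misaligned O_DIRECT I/O. A small illustrative program (the file name and the conservative 4096-byte alignment are arbitrary choices, not requirements of the API):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd;

	/* 4096 covers the logical block size of most devices; a misaligned
	 * buffer, offset, or length makes the direct I/O path fail with EINVAL. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 'x', 4096);

	fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	if (pwrite(fd, buf, 4096, 0) < 0)
		perror("pwrite");	/* EINVAL here usually means misalignment */

	close(fd);
	free(buf);
	return 0;
}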
+144
fs/iomap/fiemap.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2016-2018 Christoph Hellwig. 4 + */ 5 + #include <linux/module.h> 6 + #include <linux/compiler.h> 7 + #include <linux/fs.h> 8 + #include <linux/iomap.h> 9 + 10 + struct fiemap_ctx { 11 + struct fiemap_extent_info *fi; 12 + struct iomap prev; 13 + }; 14 + 15 + static int iomap_to_fiemap(struct fiemap_extent_info *fi, 16 + struct iomap *iomap, u32 flags) 17 + { 18 + switch (iomap->type) { 19 + case IOMAP_HOLE: 20 + /* skip holes */ 21 + return 0; 22 + case IOMAP_DELALLOC: 23 + flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; 24 + break; 25 + case IOMAP_MAPPED: 26 + break; 27 + case IOMAP_UNWRITTEN: 28 + flags |= FIEMAP_EXTENT_UNWRITTEN; 29 + break; 30 + case IOMAP_INLINE: 31 + flags |= FIEMAP_EXTENT_DATA_INLINE; 32 + break; 33 + } 34 + 35 + if (iomap->flags & IOMAP_F_MERGED) 36 + flags |= FIEMAP_EXTENT_MERGED; 37 + if (iomap->flags & IOMAP_F_SHARED) 38 + flags |= FIEMAP_EXTENT_SHARED; 39 + 40 + return fiemap_fill_next_extent(fi, iomap->offset, 41 + iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, 42 + iomap->length, flags); 43 + } 44 + 45 + static loff_t 46 + iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 47 + struct iomap *iomap) 48 + { 49 + struct fiemap_ctx *ctx = data; 50 + loff_t ret = length; 51 + 52 + if (iomap->type == IOMAP_HOLE) 53 + return length; 54 + 55 + ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); 56 + ctx->prev = *iomap; 57 + switch (ret) { 58 + case 0: /* success */ 59 + return length; 60 + case 1: /* extent array full */ 61 + return 0; 62 + default: 63 + return ret; 64 + } 65 + } 66 + 67 + int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, 68 + loff_t start, loff_t len, const struct iomap_ops *ops) 69 + { 70 + struct fiemap_ctx ctx; 71 + loff_t ret; 72 + 73 + memset(&ctx, 0, sizeof(ctx)); 74 + ctx.fi = fi; 75 + ctx.prev.type = IOMAP_HOLE; 76 + 77 + ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); 78 + if (ret) 79 + return ret; 80 + 81 + if (fi->fi_flags & FIEMAP_FLAG_SYNC) { 82 + ret = filemap_write_and_wait(inode->i_mapping); 83 + if (ret) 84 + return ret; 85 + } 86 + 87 + while (len > 0) { 88 + ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, 89 + iomap_fiemap_actor); 90 + /* inode with no (attribute) mapping will give ENOENT */ 91 + if (ret == -ENOENT) 92 + break; 93 + if (ret < 0) 94 + return ret; 95 + if (ret == 0) 96 + break; 97 + 98 + start += ret; 99 + len -= ret; 100 + } 101 + 102 + if (ctx.prev.type != IOMAP_HOLE) { 103 + ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); 104 + if (ret < 0) 105 + return ret; 106 + } 107 + 108 + return 0; 109 + } 110 + EXPORT_SYMBOL_GPL(iomap_fiemap); 111 + 112 + static loff_t 113 + iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, 114 + void *data, struct iomap *iomap) 115 + { 116 + sector_t *bno = data, addr; 117 + 118 + if (iomap->type == IOMAP_MAPPED) { 119 + addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; 120 + if (addr > INT_MAX) 121 + WARN(1, "would truncate bmap result\n"); 122 + else 123 + *bno = addr; 124 + } 125 + return 0; 126 + } 127 + 128 + /* legacy ->bmap interface. 0 is the error return (!) 
*/ 129 + sector_t 130 + iomap_bmap(struct address_space *mapping, sector_t bno, 131 + const struct iomap_ops *ops) 132 + { 133 + struct inode *inode = mapping->host; 134 + loff_t pos = bno << inode->i_blkbits; 135 + unsigned blocksize = i_blocksize(inode); 136 + 137 + if (filemap_write_and_wait(mapping)) 138 + return 0; 139 + 140 + bno = 0; 141 + iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); 142 + return bno; 143 + } 144 + EXPORT_SYMBOL_GPL(iomap_bmap);
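iomap_fiemap() backs the FS_IOC_FIEMAP ioctl, so the extent data produced by the actor above can be inspected from userspace roughly as follows (a sketch with minimal error handling; the 32-extent buffer size is an arbitrary choice):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i, count = 32;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty data first */
	fm->fm_extent_count = count;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) { perror("FS_IOC_FIEMAP"); return 1; }

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu physical %llu length %llu flags 0x%x\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length, fe->fe_flags);
	}
	close(fd);
	free(fm);
	return 0;
}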
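The legacy interface served by iomap_bmap() is the FIBMAP ioctl, which works in units of the filesystem block size, requires CAP_SYS_RAWIO, and can only report block numbers that fit in an int, hence the truncation warning and the "0 is the error return" quirk above. For example (a sketch, querying file block 0):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FIBMAP, FIGETBSZ */

int main(int argc, char **argv)
{
	int fd, block = 0, bsz;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	if (ioctl(fd, FIGETBSZ, &bsz) < 0) { perror("FIGETBSZ"); return 1; }
	/* block is an index in fs-block units; FIBMAP rewrites it with the
	 * physical block number, or 0 for a hole (or any error). */
	if (ioctl(fd, FIBMAP, &block) < 0) { perror("FIBMAP"); return 1; }

	printf("block size %d, file block 0 maps to physical block %d\n",
	       bsz, block);
	close(fd);
	return 0;
}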
+212
fs/iomap/seek.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2017 Red Hat, Inc. 4 + * Copyright (c) 2018 Christoph Hellwig. 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + #include <linux/pagemap.h> 11 + #include <linux/pagevec.h> 12 + 13 + /* 14 + * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. 15 + * Returns true if found and updates @lastoff to the offset in file. 16 + */ 17 + static bool 18 + page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, 19 + int whence) 20 + { 21 + const struct address_space_operations *ops = inode->i_mapping->a_ops; 22 + unsigned int bsize = i_blocksize(inode), off; 23 + bool seek_data = whence == SEEK_DATA; 24 + loff_t poff = page_offset(page); 25 + 26 + if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) 27 + return false; 28 + 29 + if (*lastoff < poff) { 30 + /* 31 + * Last offset smaller than the start of the page means we found 32 + * a hole: 33 + */ 34 + if (whence == SEEK_HOLE) 35 + return true; 36 + *lastoff = poff; 37 + } 38 + 39 + /* 40 + * Just check the page unless we can and should check block ranges: 41 + */ 42 + if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) 43 + return PageUptodate(page) == seek_data; 44 + 45 + lock_page(page); 46 + if (unlikely(page->mapping != inode->i_mapping)) 47 + goto out_unlock_not_found; 48 + 49 + for (off = 0; off < PAGE_SIZE; off += bsize) { 50 + if (offset_in_page(*lastoff) >= off + bsize) 51 + continue; 52 + if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { 53 + unlock_page(page); 54 + return true; 55 + } 56 + *lastoff = poff + off + bsize; 57 + } 58 + 59 + out_unlock_not_found: 60 + unlock_page(page); 61 + return false; 62 + } 63 + 64 + /* 65 + * Seek for SEEK_DATA / SEEK_HOLE in the page cache. 66 + * 67 + * Within unwritten extents, the page cache determines which parts are holes 68 + * and which are data: uptodate buffer heads count as data; everything else 69 + * counts as a hole. 70 + * 71 + * Returns the resulting offset on successs, and -ENOENT otherwise. 72 + */ 73 + static loff_t 74 + page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, 75 + int whence) 76 + { 77 + pgoff_t index = offset >> PAGE_SHIFT; 78 + pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); 79 + loff_t lastoff = offset; 80 + struct pagevec pvec; 81 + 82 + if (length <= 0) 83 + return -ENOENT; 84 + 85 + pagevec_init(&pvec); 86 + 87 + do { 88 + unsigned nr_pages, i; 89 + 90 + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, 91 + end - 1); 92 + if (nr_pages == 0) 93 + break; 94 + 95 + for (i = 0; i < nr_pages; i++) { 96 + struct page *page = pvec.pages[i]; 97 + 98 + if (page_seek_hole_data(inode, page, &lastoff, whence)) 99 + goto check_range; 100 + lastoff = page_offset(page) + PAGE_SIZE; 101 + } 102 + pagevec_release(&pvec); 103 + } while (index < end); 104 + 105 + /* When no page at lastoff and we are not done, we found a hole. 
*/ 106 + if (whence != SEEK_HOLE) 107 + goto not_found; 108 + 109 + check_range: 110 + if (lastoff < offset + length) 111 + goto out; 112 + not_found: 113 + lastoff = -ENOENT; 114 + out: 115 + pagevec_release(&pvec); 116 + return lastoff; 117 + } 118 + 119 + 120 + static loff_t 121 + iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, 122 + void *data, struct iomap *iomap) 123 + { 124 + switch (iomap->type) { 125 + case IOMAP_UNWRITTEN: 126 + offset = page_cache_seek_hole_data(inode, offset, length, 127 + SEEK_HOLE); 128 + if (offset < 0) 129 + return length; 130 + /* fall through */ 131 + case IOMAP_HOLE: 132 + *(loff_t *)data = offset; 133 + return 0; 134 + default: 135 + return length; 136 + } 137 + } 138 + 139 + loff_t 140 + iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) 141 + { 142 + loff_t size = i_size_read(inode); 143 + loff_t length = size - offset; 144 + loff_t ret; 145 + 146 + /* Nothing to be found before or beyond the end of the file. */ 147 + if (offset < 0 || offset >= size) 148 + return -ENXIO; 149 + 150 + while (length > 0) { 151 + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, 152 + &offset, iomap_seek_hole_actor); 153 + if (ret < 0) 154 + return ret; 155 + if (ret == 0) 156 + break; 157 + 158 + offset += ret; 159 + length -= ret; 160 + } 161 + 162 + return offset; 163 + } 164 + EXPORT_SYMBOL_GPL(iomap_seek_hole); 165 + 166 + static loff_t 167 + iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, 168 + void *data, struct iomap *iomap) 169 + { 170 + switch (iomap->type) { 171 + case IOMAP_HOLE: 172 + return length; 173 + case IOMAP_UNWRITTEN: 174 + offset = page_cache_seek_hole_data(inode, offset, length, 175 + SEEK_DATA); 176 + if (offset < 0) 177 + return length; 178 + /*FALLTHRU*/ 179 + default: 180 + *(loff_t *)data = offset; 181 + return 0; 182 + } 183 + } 184 + 185 + loff_t 186 + iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops) 187 + { 188 + loff_t size = i_size_read(inode); 189 + loff_t length = size - offset; 190 + loff_t ret; 191 + 192 + /* Nothing to be found before or beyond the end of the file. */ 193 + if (offset < 0 || offset >= size) 194 + return -ENXIO; 195 + 196 + while (length > 0) { 197 + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, 198 + &offset, iomap_seek_data_actor); 199 + if (ret < 0) 200 + return ret; 201 + if (ret == 0) 202 + break; 203 + 204 + offset += ret; 205 + length -= ret; 206 + } 207 + 208 + if (length <= 0) 209 + return -ENXIO; 210 + return offset; 211 + } 212 + EXPORT_SYMBOL_GPL(iomap_seek_data);
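These helpers implement SEEK_HOLE/SEEK_DATA for iomap-based filesystems, including data that so far exists only in the page cache over unwritten extents. From userspace the behaviour looks like this (a sketch; filesystems are allowed to round the returned offsets up to their allocation granularity, and the file name is arbitrary):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	off_t hole, data;
	int fd = open("sparse", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) { perror("open"); return 1; }

	/* 1 MiB hole followed by a single byte of data */
	if (ftruncate(fd, 1 << 20) < 0) { perror("ftruncate"); return 1; }
	if (pwrite(fd, "x", 1, 1 << 20) < 0) { perror("pwrite"); return 1; }

	data = lseek(fd, 0, SEEK_DATA);		/* first data at or after 0 */
	hole = lseek(fd, data, SEEK_HOLE);	/* next hole at or after the data */
	printf("data at %lld, next hole at %lld\n",
	       (long long)data, (long long)hole);

	/* ENXIO means there is no data (or hole) beyond the given offset. */
	if (lseek(fd, hole, SEEK_DATA) < 0 && errno == ENXIO)
		printf("no data past %lld\n", (long long)hole);

	close(fd);
	return 0;
}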
+178
fs/iomap/swapfile.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2018 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + #include <linux/swap.h> 11 + 12 + /* Swapfile activation */ 13 + 14 + struct iomap_swapfile_info { 15 + struct iomap iomap; /* accumulated iomap */ 16 + struct swap_info_struct *sis; 17 + uint64_t lowest_ppage; /* lowest physical addr seen (pages) */ 18 + uint64_t highest_ppage; /* highest physical addr seen (pages) */ 19 + unsigned long nr_pages; /* number of pages collected */ 20 + int nr_extents; /* extent count */ 21 + }; 22 + 23 + /* 24 + * Collect physical extents for this swap file. Physical extents reported to 25 + * the swap code must be trimmed to align to a page boundary. The logical 26 + * offset within the file is irrelevant since the swapfile code maps logical 27 + * page numbers of the swap device to the physical page-aligned extents. 28 + */ 29 + static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) 30 + { 31 + struct iomap *iomap = &isi->iomap; 32 + unsigned long nr_pages; 33 + uint64_t first_ppage; 34 + uint64_t first_ppage_reported; 35 + uint64_t next_ppage; 36 + int error; 37 + 38 + /* 39 + * Round the start up and the end down so that the physical 40 + * extent aligns to a page boundary. 41 + */ 42 + first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; 43 + next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> 44 + PAGE_SHIFT; 45 + 46 + /* Skip too-short physical extents. */ 47 + if (first_ppage >= next_ppage) 48 + return 0; 49 + nr_pages = next_ppage - first_ppage; 50 + 51 + /* 52 + * Calculate how much swap space we're adding; the first page contains 53 + * the swap header and doesn't count. The mm still wants that first 54 + * page fed to add_swap_extent, however. 55 + */ 56 + first_ppage_reported = first_ppage; 57 + if (iomap->offset == 0) 58 + first_ppage_reported++; 59 + if (isi->lowest_ppage > first_ppage_reported) 60 + isi->lowest_ppage = first_ppage_reported; 61 + if (isi->highest_ppage < (next_ppage - 1)) 62 + isi->highest_ppage = next_ppage - 1; 63 + 64 + /* Add extent, set up for the next call. */ 65 + error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); 66 + if (error < 0) 67 + return error; 68 + isi->nr_extents += error; 69 + isi->nr_pages += nr_pages; 70 + return 0; 71 + } 72 + 73 + /* 74 + * Accumulate iomaps for this swap file. We have to accumulate iomaps because 75 + * swap only cares about contiguous page-aligned physical extents and makes no 76 + * distinction between written and unwritten extents. 77 + */ 78 + static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, 79 + loff_t count, void *data, struct iomap *iomap) 80 + { 81 + struct iomap_swapfile_info *isi = data; 82 + int error; 83 + 84 + switch (iomap->type) { 85 + case IOMAP_MAPPED: 86 + case IOMAP_UNWRITTEN: 87 + /* Only real or unwritten extents. */ 88 + break; 89 + case IOMAP_INLINE: 90 + /* No inline data. */ 91 + pr_err("swapon: file is inline\n"); 92 + return -EINVAL; 93 + default: 94 + pr_err("swapon: file has unallocated extents\n"); 95 + return -EINVAL; 96 + } 97 + 98 + /* No uncommitted metadata or shared blocks. 
*/ 99 + if (iomap->flags & IOMAP_F_DIRTY) { 100 + pr_err("swapon: file is not committed\n"); 101 + return -EINVAL; 102 + } 103 + if (iomap->flags & IOMAP_F_SHARED) { 104 + pr_err("swapon: file has shared extents\n"); 105 + return -EINVAL; 106 + } 107 + 108 + /* Only one bdev per swap file. */ 109 + if (iomap->bdev != isi->sis->bdev) { 110 + pr_err("swapon: file is on multiple devices\n"); 111 + return -EINVAL; 112 + } 113 + 114 + if (isi->iomap.length == 0) { 115 + /* No accumulated extent, so just store it. */ 116 + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); 117 + } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { 118 + /* Append this to the accumulated extent. */ 119 + isi->iomap.length += iomap->length; 120 + } else { 121 + /* Otherwise, add the retained iomap and store this one. */ 122 + error = iomap_swapfile_add_extent(isi); 123 + if (error) 124 + return error; 125 + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); 126 + } 127 + return count; 128 + } 129 + 130 + /* 131 + * Iterate a swap file's iomaps to construct physical extents that can be 132 + * passed to the swapfile subsystem. 133 + */ 134 + int iomap_swapfile_activate(struct swap_info_struct *sis, 135 + struct file *swap_file, sector_t *pagespan, 136 + const struct iomap_ops *ops) 137 + { 138 + struct iomap_swapfile_info isi = { 139 + .sis = sis, 140 + .lowest_ppage = (sector_t)-1ULL, 141 + }; 142 + struct address_space *mapping = swap_file->f_mapping; 143 + struct inode *inode = mapping->host; 144 + loff_t pos = 0; 145 + loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); 146 + loff_t ret; 147 + 148 + /* 149 + * Persist all file mapping metadata so that we won't have any 150 + * IOMAP_F_DIRTY iomaps. 151 + */ 152 + ret = vfs_fsync(swap_file, 1); 153 + if (ret) 154 + return ret; 155 + 156 + while (len > 0) { 157 + ret = iomap_apply(inode, pos, len, IOMAP_REPORT, 158 + ops, &isi, iomap_swapfile_activate_actor); 159 + if (ret <= 0) 160 + return ret; 161 + 162 + pos += ret; 163 + len -= ret; 164 + } 165 + 166 + if (isi.iomap.length) { 167 + ret = iomap_swapfile_add_extent(&isi); 168 + if (ret) 169 + return ret; 170 + } 171 + 172 + *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; 173 + sis->max = isi.nr_pages; 174 + sis->pages = isi.nr_pages - 1; 175 + sis->highest_bit = isi.nr_pages - 1; 176 + return isi.nr_extents; 177 + } 178 + EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
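A filesystem opts into this by pointing its address_space_operations ->swap_activate method at a thin wrapper. A sketch for a hypothetical "myfs" (myfs_iomap_ops is again assumed to exist elsewhere); note that sis->bdev must be filled in before the call, because the actor above rejects extents that live on any other device:

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/swap.h>

extern const struct iomap_ops myfs_iomap_ops;	/* hypothetical */

static int myfs_swap_activate(struct swap_info_struct *sis,
			      struct file *swap_file, sector_t *span)
{
	struct inode *inode = file_inode(swap_file);

	/* Tell the swap code up front which block device backs this file. */
	sis->bdev = inode->i_sb->s_bdev;
	return iomap_swapfile_activate(sis, swap_file, span, &myfs_iomap_ops);
}

static const struct address_space_operations myfs_aops = {
	/* ... readpage, writepages, etc. ... */
	.swap_activate	= myfs_swap_activate,
};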
+17
include/linux/iomap.h
··· 7 7 #include <linux/mm.h> 8 8 #include <linux/types.h> 9 9 #include <linux/mm_types.h> 10 + #include <linux/blkdev.h> 10 11 11 12 struct address_space; 12 13 struct fiemap_extent_info; ··· 70 69 const struct iomap_page_ops *page_ops; 71 70 }; 72 71 72 + static inline sector_t 73 + iomap_sector(struct iomap *iomap, loff_t pos) 74 + { 75 + return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; 76 + } 77 + 73 78 /* 74 79 * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare 75 80 * and page_done will be called for each page written to. This only applies to ··· 121 114 int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length, 122 115 ssize_t written, unsigned flags, struct iomap *iomap); 123 116 }; 117 + 118 + /* 119 + * Main iomap iterator function. 120 + */ 121 + typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, 122 + void *data, struct iomap *iomap); 123 + 124 + loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, 125 + unsigned flags, const struct iomap_ops *ops, void *data, 126 + iomap_actor_t actor); 124 127 125 128 /* 126 129 * Structure allocate for each page when block size < PAGE_SIZE to track
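With iomap_apply() and the iomap_actor_t typedef now visible through the header, each of the new files under fs/iomap/ follows the same shape: a small actor plus a loop that keeps calling iomap_apply() until the range is consumed or the actor stops the walk. A hypothetical example of that calling convention (the names are illustrative only, not kernel APIs), counting how many bytes of a range are backed by written extents:

#include <linux/fs.h>
#include <linux/iomap.h>

static loff_t myfs_count_mapped_actor(struct inode *inode, loff_t pos,
		loff_t length, void *data, struct iomap *iomap)
{
	loff_t *bytes = data;

	if (iomap->type == IOMAP_MAPPED)
		*bytes += length;

	/* Returning the length consumed lets the caller advance; returning 0
	 * stops the iteration, and a negative value reports an error. */
	return length;
}

static loff_t myfs_count_mapped(struct inode *inode, loff_t pos, loff_t length,
		const struct iomap_ops *ops)
{
	loff_t bytes = 0, ret;

	while (length > 0) {
		ret = iomap_apply(inode, pos, length, IOMAP_REPORT, ops,
				  &bytes, myfs_count_mapped_actor);
		if (ret < 0)
			return ret;
		if (ret == 0)
			break;
		pos += ret;
		length -= ret;
	}
	return bytes;
}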