Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'iomap-5.3-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull iomap split/cleanup from Darrick Wong:
"As promised, here's the second part of the iomap merge for 5.3, in
which we break up iomap.c into smaller files grouped by functional
area so that it'll be easier in the long run to maintain cohesiveness
of code units and to review incoming patches. There are no functional
changes and fs/iomap.c split cleanly.

Summary:

- Regroup the fs/iomap.c code by major functional area so that we can
start development for 5.4 from a more stable base"

* tag 'iomap-5.3-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
iomap: move internal declarations into fs/iomap/
iomap: move the main iteration code into a separate file
iomap: move the buffered IO code into a separate file
iomap: move the direct IO code into a separate file
iomap: move the SEEK_HOLE code into a separate file
iomap: move the file mapping reporting code into a separate file
iomap: move the swapfile code into a separate file
iomap: start moving code to fs/iomap/
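
Because the split is purely mechanical, the exported entry points keep their
existing signatures and filesystems that call them are untouched. As a rough,
hypothetical sketch of the caller side (the examplefs_* names and ops table
are illustrative only, not part of this merge):

	#include <linux/fs.h>
	#include <linux/iomap.h>

	/* hypothetical ops table supplied by the filesystem */
	extern const struct iomap_ops examplefs_iomap_ops;

	/* buffered read path: unchanged by the code movement */
	static int examplefs_readpage(struct file *unused, struct page *page)
	{
		return iomap_readpage(page, &examplefs_iomap_ops);
	}

	/* buffered write path; direct I/O callers use iomap_dio_rw() instead */
	static ssize_t examplefs_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		return iomap_file_buffered_write(iocb, from, &examplefs_iomap_ops);
	}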

+2277 -2217
+1
MAINTAINERS
···
 8415  8415   T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git
 8416  8416   S: Supported
 8417  8417   F: fs/iomap.c
       8418 + F: fs/iomap/
 8418  8419   F: include/linux/iomap.h
 8419  8420
 8420  8421   IOMMU DRIVERS
+1 -1
fs/Makefile
···
   52    52   obj-$(CONFIG_SYSCTL)     += drop_caches.o
   53    53
   54    54   obj-$(CONFIG_FHANDLE)    += fhandle.o
   55       - obj-$(CONFIG_FS_IOMAP)   += iomap.o
         55 + obj-y                    += iomap/
   56    56
   57    57   obj-y                    += quota/
   58    58
-1
fs/dax.c
···
   26    26   #include <linux/mmu_notifier.h>
   27    27   #include <linux/iomap.h>
   28    28   #include <asm/pgalloc.h>
   29       - #include "internal.h"
   30    29
   31    30   #define CREATE_TRACE_POINTS
   32    31   #include <trace/events/fs_dax.h>
-10
fs/internal.h
···
  185   185   extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
  186   186                  unsigned long arg);
  187   187
  188       - /*
  189       -  * iomap support:
  190       -  */
  191       - typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
  192       -                void *data, struct iomap *iomap);
  193       -
  194       - loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
  195       -                unsigned flags, const struct iomap_ops *ops, void *data,
  196       -                iomap_actor_t actor);
  197       -
  198   188   /* direct-io.c: */
  199   189   int sb_init_dio_done_wq(struct super_block *sb);
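
The declarations removed here are not dropped: per the first patch in the list
above, iomap_actor_t and iomap_apply() move into fs/iomap/ itself, since
nothing outside the iomap code uses them. Every iomap_apply() caller in the
deleted fs/iomap.c below follows the same loop shape; a condensed sketch (the
example_* names are stand-ins, not kernel symbols):

	/*
	 * The actor is handed one contiguous mapping at a time and returns how
	 * many bytes it consumed (or a negative errno); the walker loops until
	 * the whole range has been covered.
	 */
	static loff_t example_actor(struct inode *inode, loff_t pos, loff_t length,
			void *data, struct iomap *iomap)
	{
		/* operate on [pos, pos + length) as described by *iomap */
		return length;
	}

	static int example_walk(struct inode *inode, loff_t pos, loff_t len,
			const struct iomap_ops *ops)
	{
		while (len > 0) {
			loff_t ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
					ops, NULL, example_actor);
			if (ret <= 0)
				return ret;
			pos += ret;
			len -= ret;
		}
		return 0;
	}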
-2205
fs/iomap.c
···
    1       - // SPDX-License-Identifier: GPL-2.0
    2       - /*
    3       -  * Copyright (C) 2010 Red Hat, Inc.
    4       -  * Copyright (c) 2016-2018 Christoph Hellwig.
    5       -  */
··· (this hunk deletes fs/iomap.c in its entirety, roughly 2,200 further lines:
    the includes, iomap_apply() and its actor machinery, the buffered read and
    write paths, the releasepage/invalidatepage and page migration hooks,
    iomap_page_mkwrite, FIEMAP reporting, SEEK_HOLE/SEEK_DATA support, the
    direct I/O implementation around iomap_dio_rw(), and the swapfile
    activation code; per the patch titles above, the code moves unchanged into
    the new files under fs/iomap/)
The logical 2018 - * offset within the file is irrelevant since the swapfile code maps logical 2019 - * page numbers of the swap device to the physical page-aligned extents. 2020 - */ 2021 - static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) 2022 - { 2023 - struct iomap *iomap = &isi->iomap; 2024 - unsigned long nr_pages; 2025 - uint64_t first_ppage; 2026 - uint64_t first_ppage_reported; 2027 - uint64_t next_ppage; 2028 - int error; 2029 - 2030 - /* 2031 - * Round the start up and the end down so that the physical 2032 - * extent aligns to a page boundary. 2033 - */ 2034 - first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; 2035 - next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> 2036 - PAGE_SHIFT; 2037 - 2038 - /* Skip too-short physical extents. */ 2039 - if (first_ppage >= next_ppage) 2040 - return 0; 2041 - nr_pages = next_ppage - first_ppage; 2042 - 2043 - /* 2044 - * Calculate how much swap space we're adding; the first page contains 2045 - * the swap header and doesn't count. The mm still wants that first 2046 - * page fed to add_swap_extent, however. 2047 - */ 2048 - first_ppage_reported = first_ppage; 2049 - if (iomap->offset == 0) 2050 - first_ppage_reported++; 2051 - if (isi->lowest_ppage > first_ppage_reported) 2052 - isi->lowest_ppage = first_ppage_reported; 2053 - if (isi->highest_ppage < (next_ppage - 1)) 2054 - isi->highest_ppage = next_ppage - 1; 2055 - 2056 - /* Add extent, set up for the next call. */ 2057 - error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); 2058 - if (error < 0) 2059 - return error; 2060 - isi->nr_extents += error; 2061 - isi->nr_pages += nr_pages; 2062 - return 0; 2063 - } 2064 - 2065 - /* 2066 - * Accumulate iomaps for this swap file. We have to accumulate iomaps because 2067 - * swap only cares about contiguous page-aligned physical extents and makes no 2068 - * distinction between written and unwritten extents. 2069 - */ 2070 - static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, 2071 - loff_t count, void *data, struct iomap *iomap) 2072 - { 2073 - struct iomap_swapfile_info *isi = data; 2074 - int error; 2075 - 2076 - switch (iomap->type) { 2077 - case IOMAP_MAPPED: 2078 - case IOMAP_UNWRITTEN: 2079 - /* Only real or unwritten extents. */ 2080 - break; 2081 - case IOMAP_INLINE: 2082 - /* No inline data. */ 2083 - pr_err("swapon: file is inline\n"); 2084 - return -EINVAL; 2085 - default: 2086 - pr_err("swapon: file has unallocated extents\n"); 2087 - return -EINVAL; 2088 - } 2089 - 2090 - /* No uncommitted metadata or shared blocks. */ 2091 - if (iomap->flags & IOMAP_F_DIRTY) { 2092 - pr_err("swapon: file is not committed\n"); 2093 - return -EINVAL; 2094 - } 2095 - if (iomap->flags & IOMAP_F_SHARED) { 2096 - pr_err("swapon: file has shared extents\n"); 2097 - return -EINVAL; 2098 - } 2099 - 2100 - /* Only one bdev per swap file. */ 2101 - if (iomap->bdev != isi->sis->bdev) { 2102 - pr_err("swapon: file is on multiple devices\n"); 2103 - return -EINVAL; 2104 - } 2105 - 2106 - if (isi->iomap.length == 0) { 2107 - /* No accumulated extent, so just store it. */ 2108 - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); 2109 - } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { 2110 - /* Append this to the accumulated extent. */ 2111 - isi->iomap.length += iomap->length; 2112 - } else { 2113 - /* Otherwise, add the retained iomap and store this one. 
*/ 2114 - error = iomap_swapfile_add_extent(isi); 2115 - if (error) 2116 - return error; 2117 - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); 2118 - } 2119 - return count; 2120 - } 2121 - 2122 - /* 2123 - * Iterate a swap file's iomaps to construct physical extents that can be 2124 - * passed to the swapfile subsystem. 2125 - */ 2126 - int iomap_swapfile_activate(struct swap_info_struct *sis, 2127 - struct file *swap_file, sector_t *pagespan, 2128 - const struct iomap_ops *ops) 2129 - { 2130 - struct iomap_swapfile_info isi = { 2131 - .sis = sis, 2132 - .lowest_ppage = (sector_t)-1ULL, 2133 - }; 2134 - struct address_space *mapping = swap_file->f_mapping; 2135 - struct inode *inode = mapping->host; 2136 - loff_t pos = 0; 2137 - loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); 2138 - loff_t ret; 2139 - 2140 - /* 2141 - * Persist all file mapping metadata so that we won't have any 2142 - * IOMAP_F_DIRTY iomaps. 2143 - */ 2144 - ret = vfs_fsync(swap_file, 1); 2145 - if (ret) 2146 - return ret; 2147 - 2148 - while (len > 0) { 2149 - ret = iomap_apply(inode, pos, len, IOMAP_REPORT, 2150 - ops, &isi, iomap_swapfile_activate_actor); 2151 - if (ret <= 0) 2152 - return ret; 2153 - 2154 - pos += ret; 2155 - len -= ret; 2156 - } 2157 - 2158 - if (isi.iomap.length) { 2159 - ret = iomap_swapfile_add_extent(&isi); 2160 - if (ret) 2161 - return ret; 2162 - } 2163 - 2164 - *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; 2165 - sis->max = isi.nr_pages; 2166 - sis->pages = isi.nr_pages - 1; 2167 - sis->highest_bit = isi.nr_pages - 1; 2168 - return isi.nr_extents; 2169 - } 2170 - EXPORT_SYMBOL_GPL(iomap_swapfile_activate); 2171 - #endif /* CONFIG_SWAP */ 2172 - 2173 - static loff_t 2174 - iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, 2175 - void *data, struct iomap *iomap) 2176 - { 2177 - sector_t *bno = data, addr; 2178 - 2179 - if (iomap->type == IOMAP_MAPPED) { 2180 - addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; 2181 - if (addr > INT_MAX) 2182 - WARN(1, "would truncate bmap result\n"); 2183 - else 2184 - *bno = addr; 2185 - } 2186 - return 0; 2187 - } 2188 - 2189 - /* legacy ->bmap interface. 0 is the error return (!) */ 2190 - sector_t 2191 - iomap_bmap(struct address_space *mapping, sector_t bno, 2192 - const struct iomap_ops *ops) 2193 - { 2194 - struct inode *inode = mapping->host; 2195 - loff_t pos = bno << inode->i_blkbits; 2196 - unsigned blocksize = i_blocksize(inode); 2197 - 2198 - if (filemap_write_and_wait(mapping)) 2199 - return 0; 2200 - 2201 - bno = 0; 2202 - iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); 2203 - return bno; 2204 - } 2205 - EXPORT_SYMBOL_GPL(iomap_bmap);
+15
fs/iomap/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0-or-later 2 + # 3 + # Copyright (c) 2019 Oracle. 4 + # All Rights Reserved. 5 + # 6 + obj-$(CONFIG_FS_IOMAP) += iomap.o 7 + 8 + iomap-y += \ 9 + apply.o \ 10 + buffered-io.o \ 11 + direct-io.o \ 12 + fiemap.o \ 13 + seek.o 14 + 15 + iomap-$(CONFIG_SWAP) += swapfile.o
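The split keeps a single composite iomap.o object, still gated by CONFIG_FS_IOMAP, so filesystems pull the code in exactly as before: by selecting the option from their own Kconfig. As a hedged illustration only (MYFS_FS is a made-up symbol; in-tree users such as XFS select FS_IOMAP the same way):

config MYFS_FS
	tristate "Hypothetical filesystem built on the iomap infrastructure"
	select FS_IOMAP
	help
	  Example only. Selecting FS_IOMAP is what causes the objects listed
	  in the Makefile above to be built, now from fs/iomap/ rather than
	  the old monolithic fs/iomap.c.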
+74
fs/iomap/apply.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2010 Red Hat, Inc. 4 + * Copyright (c) 2016-2018 Christoph Hellwig. 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + 11 + /* 12 + * Execute a iomap write on a segment of the mapping that spans a 13 + * contiguous range of pages that have identical block mapping state. 14 + * 15 + * This avoids the need to map pages individually, do individual allocations 16 + * for each page and most importantly avoid the need for filesystem specific 17 + * locking per page. Instead, all the operations are amortised over the entire 18 + * range of pages. It is assumed that the filesystems will lock whatever 19 + * resources they require in the iomap_begin call, and release them in the 20 + * iomap_end call. 21 + */ 22 + loff_t 23 + iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, 24 + const struct iomap_ops *ops, void *data, iomap_actor_t actor) 25 + { 26 + struct iomap iomap = { 0 }; 27 + loff_t written = 0, ret; 28 + 29 + /* 30 + * Need to map a range from start position for length bytes. This can 31 + * span multiple pages - it is only guaranteed to return a range of a 32 + * single type of pages (e.g. all into a hole, all mapped or all 33 + * unwritten). Failure at this point has nothing to undo. 34 + * 35 + * If allocation is required for this range, reserve the space now so 36 + * that the allocation is guaranteed to succeed later on. Once we copy 37 + * the data into the page cache pages, then we cannot fail otherwise we 38 + * expose transient stale data. If the reserve fails, we can safely 39 + * back out at this point as there is nothing to undo. 40 + */ 41 + ret = ops->iomap_begin(inode, pos, length, flags, &iomap); 42 + if (ret) 43 + return ret; 44 + if (WARN_ON(iomap.offset > pos)) 45 + return -EIO; 46 + if (WARN_ON(iomap.length == 0)) 47 + return -EIO; 48 + 49 + /* 50 + * Cut down the length to the one actually provided by the filesystem, 51 + * as it might not be able to give us the whole size that we requested. 52 + */ 53 + if (iomap.offset + iomap.length < pos + length) 54 + length = iomap.offset + iomap.length - pos; 55 + 56 + /* 57 + * Now that we have guaranteed that the space allocation will succeed. 58 + * we can do the copy-in page by page without having to worry about 59 + * failures exposing transient data. 60 + */ 61 + written = actor(inode, pos, length, data, &iomap); 62 + 63 + /* 64 + * Now the data has been copied, commit the range we've copied. This 65 + * should not fail unless the filesystem has had a fatal error. 66 + */ 67 + if (ops->iomap_end) { 68 + ret = ops->iomap_end(inode, pos, length, 69 + written > 0 ? written : 0, 70 + flags, &iomap); 71 + } 72 + 73 + return written ? written : ret; 74 + }
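fs/iomap/apply.c now carries only the begin/actor/end driver shown above. To make that calling convention concrete, here is a minimal, non-authoritative sketch (count_mapped_bytes, count_mapped_actor and myfs_iomap_ops are invented names, and the iomap_apply()/iomap_actor_t declarations are assumed to be in scope) of an actor looped through iomap_apply() the same way the in-tree callers in this series do:

#include <linux/fs.h>
#include <linux/iomap.h>

extern const struct iomap_ops myfs_iomap_ops;	/* hypothetical */

/* Actor: called once per extent; returns how many bytes it consumed. */
static loff_t
count_mapped_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	loff_t *mapped = data;

	if (iomap->type == IOMAP_MAPPED)
		*mapped += length;
	return length;
}

/* Driver: keep applying until the range is exhausted or an error occurs. */
static loff_t count_mapped_bytes(struct inode *inode, loff_t pos, loff_t len)
{
	loff_t mapped = 0, ret;

	while (len > 0) {
		ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
				&myfs_iomap_ops, &mapped, count_mapped_actor);
		if (ret <= 0)
			return ret;
		pos += ret;
		len -= ret;
	}
	return mapped;
}

The actor's return value advances the outer loop, which is why every actor in this patch returns the number of bytes it actually handled rather than an absolute offset.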
+1073
fs/iomap/buffered-io.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2010 Red Hat, Inc. 4 + * Copyright (c) 2016-2018 Christoph Hellwig. 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + #include <linux/pagemap.h> 11 + #include <linux/uio.h> 12 + #include <linux/buffer_head.h> 13 + #include <linux/dax.h> 14 + #include <linux/writeback.h> 15 + #include <linux/swap.h> 16 + #include <linux/bio.h> 17 + #include <linux/sched/signal.h> 18 + #include <linux/migrate.h> 19 + 20 + #include "../internal.h" 21 + 22 + static struct iomap_page * 23 + iomap_page_create(struct inode *inode, struct page *page) 24 + { 25 + struct iomap_page *iop = to_iomap_page(page); 26 + 27 + if (iop || i_blocksize(inode) == PAGE_SIZE) 28 + return iop; 29 + 30 + iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); 31 + atomic_set(&iop->read_count, 0); 32 + atomic_set(&iop->write_count, 0); 33 + bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); 34 + 35 + /* 36 + * migrate_page_move_mapping() assumes that pages with private data have 37 + * their count elevated by 1. 38 + */ 39 + get_page(page); 40 + set_page_private(page, (unsigned long)iop); 41 + SetPagePrivate(page); 42 + return iop; 43 + } 44 + 45 + static void 46 + iomap_page_release(struct page *page) 47 + { 48 + struct iomap_page *iop = to_iomap_page(page); 49 + 50 + if (!iop) 51 + return; 52 + WARN_ON_ONCE(atomic_read(&iop->read_count)); 53 + WARN_ON_ONCE(atomic_read(&iop->write_count)); 54 + ClearPagePrivate(page); 55 + set_page_private(page, 0); 56 + put_page(page); 57 + kfree(iop); 58 + } 59 + 60 + /* 61 + * Calculate the range inside the page that we actually need to read. 62 + */ 63 + static void 64 + iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, 65 + loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) 66 + { 67 + loff_t orig_pos = *pos; 68 + loff_t isize = i_size_read(inode); 69 + unsigned block_bits = inode->i_blkbits; 70 + unsigned block_size = (1 << block_bits); 71 + unsigned poff = offset_in_page(*pos); 72 + unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); 73 + unsigned first = poff >> block_bits; 74 + unsigned last = (poff + plen - 1) >> block_bits; 75 + 76 + /* 77 + * If the block size is smaller than the page size we need to check the 78 + * per-block uptodate status and adjust the offset and length if needed 79 + * to avoid reading in already uptodate ranges. 80 + */ 81 + if (iop) { 82 + unsigned int i; 83 + 84 + /* move forward for each leading block marked uptodate */ 85 + for (i = first; i <= last; i++) { 86 + if (!test_bit(i, iop->uptodate)) 87 + break; 88 + *pos += block_size; 89 + poff += block_size; 90 + plen -= block_size; 91 + first++; 92 + } 93 + 94 + /* truncate len if we find any trailing uptodate block(s) */ 95 + for ( ; i <= last; i++) { 96 + if (test_bit(i, iop->uptodate)) { 97 + plen -= (last - i + 1) * block_size; 98 + last = i - 1; 99 + break; 100 + } 101 + } 102 + } 103 + 104 + /* 105 + * If the extent spans the block that contains the i_size we need to 106 + * handle both halves separately so that we properly zero data in the 107 + * page cache for blocks that are entirely outside of i_size. 
108 + */ 109 + if (orig_pos <= isize && orig_pos + length > isize) { 110 + unsigned end = offset_in_page(isize - 1) >> block_bits; 111 + 112 + if (first <= end && last > end) 113 + plen -= (last - end) * block_size; 114 + } 115 + 116 + *offp = poff; 117 + *lenp = plen; 118 + } 119 + 120 + static void 121 + iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) 122 + { 123 + struct iomap_page *iop = to_iomap_page(page); 124 + struct inode *inode = page->mapping->host; 125 + unsigned first = off >> inode->i_blkbits; 126 + unsigned last = (off + len - 1) >> inode->i_blkbits; 127 + unsigned int i; 128 + bool uptodate = true; 129 + 130 + if (iop) { 131 + for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) { 132 + if (i >= first && i <= last) 133 + set_bit(i, iop->uptodate); 134 + else if (!test_bit(i, iop->uptodate)) 135 + uptodate = false; 136 + } 137 + } 138 + 139 + if (uptodate && !PageError(page)) 140 + SetPageUptodate(page); 141 + } 142 + 143 + static void 144 + iomap_read_finish(struct iomap_page *iop, struct page *page) 145 + { 146 + if (!iop || atomic_dec_and_test(&iop->read_count)) 147 + unlock_page(page); 148 + } 149 + 150 + static void 151 + iomap_read_page_end_io(struct bio_vec *bvec, int error) 152 + { 153 + struct page *page = bvec->bv_page; 154 + struct iomap_page *iop = to_iomap_page(page); 155 + 156 + if (unlikely(error)) { 157 + ClearPageUptodate(page); 158 + SetPageError(page); 159 + } else { 160 + iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); 161 + } 162 + 163 + iomap_read_finish(iop, page); 164 + } 165 + 166 + static void 167 + iomap_read_end_io(struct bio *bio) 168 + { 169 + int error = blk_status_to_errno(bio->bi_status); 170 + struct bio_vec *bvec; 171 + struct bvec_iter_all iter_all; 172 + 173 + bio_for_each_segment_all(bvec, bio, iter_all) 174 + iomap_read_page_end_io(bvec, error); 175 + bio_put(bio); 176 + } 177 + 178 + struct iomap_readpage_ctx { 179 + struct page *cur_page; 180 + bool cur_page_in_bio; 181 + bool is_readahead; 182 + struct bio *bio; 183 + struct list_head *pages; 184 + }; 185 + 186 + static void 187 + iomap_read_inline_data(struct inode *inode, struct page *page, 188 + struct iomap *iomap) 189 + { 190 + size_t size = i_size_read(inode); 191 + void *addr; 192 + 193 + if (PageUptodate(page)) 194 + return; 195 + 196 + BUG_ON(page->index); 197 + BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); 198 + 199 + addr = kmap_atomic(page); 200 + memcpy(addr, iomap->inline_data, size); 201 + memset(addr + size, 0, PAGE_SIZE - size); 202 + kunmap_atomic(addr); 203 + SetPageUptodate(page); 204 + } 205 + 206 + static loff_t 207 + iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 208 + struct iomap *iomap) 209 + { 210 + struct iomap_readpage_ctx *ctx = data; 211 + struct page *page = ctx->cur_page; 212 + struct iomap_page *iop = iomap_page_create(inode, page); 213 + bool same_page = false, is_contig = false; 214 + loff_t orig_pos = pos; 215 + unsigned poff, plen; 216 + sector_t sector; 217 + 218 + if (iomap->type == IOMAP_INLINE) { 219 + WARN_ON_ONCE(pos); 220 + iomap_read_inline_data(inode, page, iomap); 221 + return PAGE_SIZE; 222 + } 223 + 224 + /* zero post-eof blocks as the page may be mapped */ 225 + iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); 226 + if (plen == 0) 227 + goto done; 228 + 229 + if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) { 230 + zero_user(page, poff, plen); 231 + iomap_set_range_uptodate(page, poff, plen); 232 + goto done; 233 + } 
234 + 235 + ctx->cur_page_in_bio = true; 236 + 237 + /* 238 + * Try to merge into a previous segment if we can. 239 + */ 240 + sector = iomap_sector(iomap, pos); 241 + if (ctx->bio && bio_end_sector(ctx->bio) == sector) 242 + is_contig = true; 243 + 244 + if (is_contig && 245 + __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) { 246 + if (!same_page && iop) 247 + atomic_inc(&iop->read_count); 248 + goto done; 249 + } 250 + 251 + /* 252 + * If we start a new segment we need to increase the read count, and we 253 + * need to do so before submitting any previous full bio to make sure 254 + * that we don't prematurely unlock the page. 255 + */ 256 + if (iop) 257 + atomic_inc(&iop->read_count); 258 + 259 + if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { 260 + gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); 261 + int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; 262 + 263 + if (ctx->bio) 264 + submit_bio(ctx->bio); 265 + 266 + if (ctx->is_readahead) /* same as readahead_gfp_mask */ 267 + gfp |= __GFP_NORETRY | __GFP_NOWARN; 268 + ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); 269 + ctx->bio->bi_opf = REQ_OP_READ; 270 + if (ctx->is_readahead) 271 + ctx->bio->bi_opf |= REQ_RAHEAD; 272 + ctx->bio->bi_iter.bi_sector = sector; 273 + bio_set_dev(ctx->bio, iomap->bdev); 274 + ctx->bio->bi_end_io = iomap_read_end_io; 275 + } 276 + 277 + bio_add_page(ctx->bio, page, plen, poff); 278 + done: 279 + /* 280 + * Move the caller beyond our range so that it keeps making progress. 281 + * For that we have to include any leading non-uptodate ranges, but 282 + * we can skip trailing ones as they will be handled in the next 283 + * iteration. 284 + */ 285 + return pos - orig_pos + plen; 286 + } 287 + 288 + int 289 + iomap_readpage(struct page *page, const struct iomap_ops *ops) 290 + { 291 + struct iomap_readpage_ctx ctx = { .cur_page = page }; 292 + struct inode *inode = page->mapping->host; 293 + unsigned poff; 294 + loff_t ret; 295 + 296 + for (poff = 0; poff < PAGE_SIZE; poff += ret) { 297 + ret = iomap_apply(inode, page_offset(page) + poff, 298 + PAGE_SIZE - poff, 0, ops, &ctx, 299 + iomap_readpage_actor); 300 + if (ret <= 0) { 301 + WARN_ON_ONCE(ret == 0); 302 + SetPageError(page); 303 + break; 304 + } 305 + } 306 + 307 + if (ctx.bio) { 308 + submit_bio(ctx.bio); 309 + WARN_ON_ONCE(!ctx.cur_page_in_bio); 310 + } else { 311 + WARN_ON_ONCE(ctx.cur_page_in_bio); 312 + unlock_page(page); 313 + } 314 + 315 + /* 316 + * Just like mpage_readpages and block_read_full_page we always 317 + * return 0 and just mark the page as PageError on errors. This 318 + * should be cleaned up all through the stack eventually. 319 + */ 320 + return 0; 321 + } 322 + EXPORT_SYMBOL_GPL(iomap_readpage); 323 + 324 + static struct page * 325 + iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, 326 + loff_t length, loff_t *done) 327 + { 328 + while (!list_empty(pages)) { 329 + struct page *page = lru_to_page(pages); 330 + 331 + if (page_offset(page) >= (u64)pos + length) 332 + break; 333 + 334 + list_del(&page->lru); 335 + if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, 336 + GFP_NOFS)) 337 + return page; 338 + 339 + /* 340 + * If we already have a page in the page cache at index we are 341 + * done. Upper layers don't care if it is uptodate after the 342 + * readpages call itself as every page gets checked again once 343 + * actually needed. 
344 + */ 345 + *done += PAGE_SIZE; 346 + put_page(page); 347 + } 348 + 349 + return NULL; 350 + } 351 + 352 + static loff_t 353 + iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, 354 + void *data, struct iomap *iomap) 355 + { 356 + struct iomap_readpage_ctx *ctx = data; 357 + loff_t done, ret; 358 + 359 + for (done = 0; done < length; done += ret) { 360 + if (ctx->cur_page && offset_in_page(pos + done) == 0) { 361 + if (!ctx->cur_page_in_bio) 362 + unlock_page(ctx->cur_page); 363 + put_page(ctx->cur_page); 364 + ctx->cur_page = NULL; 365 + } 366 + if (!ctx->cur_page) { 367 + ctx->cur_page = iomap_next_page(inode, ctx->pages, 368 + pos, length, &done); 369 + if (!ctx->cur_page) 370 + break; 371 + ctx->cur_page_in_bio = false; 372 + } 373 + ret = iomap_readpage_actor(inode, pos + done, length - done, 374 + ctx, iomap); 375 + } 376 + 377 + return done; 378 + } 379 + 380 + int 381 + iomap_readpages(struct address_space *mapping, struct list_head *pages, 382 + unsigned nr_pages, const struct iomap_ops *ops) 383 + { 384 + struct iomap_readpage_ctx ctx = { 385 + .pages = pages, 386 + .is_readahead = true, 387 + }; 388 + loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); 389 + loff_t last = page_offset(list_entry(pages->next, struct page, lru)); 390 + loff_t length = last - pos + PAGE_SIZE, ret = 0; 391 + 392 + while (length > 0) { 393 + ret = iomap_apply(mapping->host, pos, length, 0, ops, 394 + &ctx, iomap_readpages_actor); 395 + if (ret <= 0) { 396 + WARN_ON_ONCE(ret == 0); 397 + goto done; 398 + } 399 + pos += ret; 400 + length -= ret; 401 + } 402 + ret = 0; 403 + done: 404 + if (ctx.bio) 405 + submit_bio(ctx.bio); 406 + if (ctx.cur_page) { 407 + if (!ctx.cur_page_in_bio) 408 + unlock_page(ctx.cur_page); 409 + put_page(ctx.cur_page); 410 + } 411 + 412 + /* 413 + * Check that we didn't lose a page due to the arcance calling 414 + * conventions.. 415 + */ 416 + WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); 417 + return ret; 418 + } 419 + EXPORT_SYMBOL_GPL(iomap_readpages); 420 + 421 + /* 422 + * iomap_is_partially_uptodate checks whether blocks within a page are 423 + * uptodate or not. 424 + * 425 + * Returns true if all blocks which correspond to a file portion 426 + * we want to read within the page are uptodate. 427 + */ 428 + int 429 + iomap_is_partially_uptodate(struct page *page, unsigned long from, 430 + unsigned long count) 431 + { 432 + struct iomap_page *iop = to_iomap_page(page); 433 + struct inode *inode = page->mapping->host; 434 + unsigned len, first, last; 435 + unsigned i; 436 + 437 + /* Limit range to one page */ 438 + len = min_t(unsigned, PAGE_SIZE - from, count); 439 + 440 + /* First and last blocks in range within page */ 441 + first = from >> inode->i_blkbits; 442 + last = (from + len - 1) >> inode->i_blkbits; 443 + 444 + if (iop) { 445 + for (i = first; i <= last; i++) 446 + if (!test_bit(i, iop->uptodate)) 447 + return 0; 448 + return 1; 449 + } 450 + 451 + return 0; 452 + } 453 + EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); 454 + 455 + int 456 + iomap_releasepage(struct page *page, gfp_t gfp_mask) 457 + { 458 + /* 459 + * mm accommodates an old ext3 case where clean pages might not have had 460 + * the dirty bit cleared. Thus, it can send actual dirty pages to 461 + * ->releasepage() via shrink_active_list(), skip those here. 
462 + */ 463 + if (PageDirty(page) || PageWriteback(page)) 464 + return 0; 465 + iomap_page_release(page); 466 + return 1; 467 + } 468 + EXPORT_SYMBOL_GPL(iomap_releasepage); 469 + 470 + void 471 + iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) 472 + { 473 + /* 474 + * If we are invalidating the entire page, clear the dirty state from it 475 + * and release it to avoid unnecessary buildup of the LRU. 476 + */ 477 + if (offset == 0 && len == PAGE_SIZE) { 478 + WARN_ON_ONCE(PageWriteback(page)); 479 + cancel_dirty_page(page); 480 + iomap_page_release(page); 481 + } 482 + } 483 + EXPORT_SYMBOL_GPL(iomap_invalidatepage); 484 + 485 + #ifdef CONFIG_MIGRATION 486 + int 487 + iomap_migrate_page(struct address_space *mapping, struct page *newpage, 488 + struct page *page, enum migrate_mode mode) 489 + { 490 + int ret; 491 + 492 + ret = migrate_page_move_mapping(mapping, newpage, page, 0); 493 + if (ret != MIGRATEPAGE_SUCCESS) 494 + return ret; 495 + 496 + if (page_has_private(page)) { 497 + ClearPagePrivate(page); 498 + get_page(newpage); 499 + set_page_private(newpage, page_private(page)); 500 + set_page_private(page, 0); 501 + put_page(page); 502 + SetPagePrivate(newpage); 503 + } 504 + 505 + if (mode != MIGRATE_SYNC_NO_COPY) 506 + migrate_page_copy(newpage, page); 507 + else 508 + migrate_page_states(newpage, page); 509 + return MIGRATEPAGE_SUCCESS; 510 + } 511 + EXPORT_SYMBOL_GPL(iomap_migrate_page); 512 + #endif /* CONFIG_MIGRATION */ 513 + 514 + static void 515 + iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) 516 + { 517 + loff_t i_size = i_size_read(inode); 518 + 519 + /* 520 + * Only truncate newly allocated pages beyoned EOF, even if the 521 + * write started inside the existing inode size. 522 + */ 523 + if (pos + len > i_size) 524 + truncate_pagecache_range(inode, max(pos, i_size), pos + len); 525 + } 526 + 527 + static int 528 + iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page, 529 + unsigned poff, unsigned plen, unsigned from, unsigned to, 530 + struct iomap *iomap) 531 + { 532 + struct bio_vec bvec; 533 + struct bio bio; 534 + 535 + if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) { 536 + zero_user_segments(page, poff, from, to, poff + plen); 537 + iomap_set_range_uptodate(page, poff, plen); 538 + return 0; 539 + } 540 + 541 + bio_init(&bio, &bvec, 1); 542 + bio.bi_opf = REQ_OP_READ; 543 + bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); 544 + bio_set_dev(&bio, iomap->bdev); 545 + __bio_add_page(&bio, page, plen, poff); 546 + return submit_bio_wait(&bio); 547 + } 548 + 549 + static int 550 + __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, 551 + struct page *page, struct iomap *iomap) 552 + { 553 + struct iomap_page *iop = iomap_page_create(inode, page); 554 + loff_t block_size = i_blocksize(inode); 555 + loff_t block_start = pos & ~(block_size - 1); 556 + loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); 557 + unsigned from = offset_in_page(pos), to = from + len, poff, plen; 558 + int status = 0; 559 + 560 + if (PageUptodate(page)) 561 + return 0; 562 + 563 + do { 564 + iomap_adjust_read_range(inode, iop, &block_start, 565 + block_end - block_start, &poff, &plen); 566 + if (plen == 0) 567 + break; 568 + 569 + if ((from > poff && from < poff + plen) || 570 + (to > poff && to < poff + plen)) { 571 + status = iomap_read_page_sync(inode, block_start, page, 572 + poff, plen, from, to, iomap); 573 + if (status) 574 + break; 575 + } 576 + 577 + 
} while ((block_start += plen) < block_end); 578 + 579 + return status; 580 + } 581 + 582 + static int 583 + iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, 584 + struct page **pagep, struct iomap *iomap) 585 + { 586 + const struct iomap_page_ops *page_ops = iomap->page_ops; 587 + pgoff_t index = pos >> PAGE_SHIFT; 588 + struct page *page; 589 + int status = 0; 590 + 591 + BUG_ON(pos + len > iomap->offset + iomap->length); 592 + 593 + if (fatal_signal_pending(current)) 594 + return -EINTR; 595 + 596 + if (page_ops && page_ops->page_prepare) { 597 + status = page_ops->page_prepare(inode, pos, len, iomap); 598 + if (status) 599 + return status; 600 + } 601 + 602 + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); 603 + if (!page) { 604 + status = -ENOMEM; 605 + goto out_no_page; 606 + } 607 + 608 + if (iomap->type == IOMAP_INLINE) 609 + iomap_read_inline_data(inode, page, iomap); 610 + else if (iomap->flags & IOMAP_F_BUFFER_HEAD) 611 + status = __block_write_begin_int(page, pos, len, NULL, iomap); 612 + else 613 + status = __iomap_write_begin(inode, pos, len, page, iomap); 614 + 615 + if (unlikely(status)) 616 + goto out_unlock; 617 + 618 + *pagep = page; 619 + return 0; 620 + 621 + out_unlock: 622 + unlock_page(page); 623 + put_page(page); 624 + iomap_write_failed(inode, pos, len); 625 + 626 + out_no_page: 627 + if (page_ops && page_ops->page_done) 628 + page_ops->page_done(inode, pos, 0, NULL, iomap); 629 + return status; 630 + } 631 + 632 + int 633 + iomap_set_page_dirty(struct page *page) 634 + { 635 + struct address_space *mapping = page_mapping(page); 636 + int newly_dirty; 637 + 638 + if (unlikely(!mapping)) 639 + return !TestSetPageDirty(page); 640 + 641 + /* 642 + * Lock out page->mem_cgroup migration to keep PageDirty 643 + * synchronized with per-memcg dirty page counters. 644 + */ 645 + lock_page_memcg(page); 646 + newly_dirty = !TestSetPageDirty(page); 647 + if (newly_dirty) 648 + __set_page_dirty(page, mapping, 0); 649 + unlock_page_memcg(page); 650 + 651 + if (newly_dirty) 652 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 653 + return newly_dirty; 654 + } 655 + EXPORT_SYMBOL_GPL(iomap_set_page_dirty); 656 + 657 + static int 658 + __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, 659 + unsigned copied, struct page *page, struct iomap *iomap) 660 + { 661 + flush_dcache_page(page); 662 + 663 + /* 664 + * The blocks that were entirely written will now be uptodate, so we 665 + * don't have to worry about a readpage reading them and overwriting a 666 + * partial write. However if we have encountered a short write and only 667 + * partially written into a block, it will not be marked uptodate, so a 668 + * readpage might come in and destroy our partial write. 669 + * 670 + * Do the simplest thing, and just treat any short write to a non 671 + * uptodate page as a zero-length write, and force the caller to redo 672 + * the whole thing. 
673 + */ 674 + if (unlikely(copied < len && !PageUptodate(page))) 675 + return 0; 676 + iomap_set_range_uptodate(page, offset_in_page(pos), len); 677 + iomap_set_page_dirty(page); 678 + return copied; 679 + } 680 + 681 + static int 682 + iomap_write_end_inline(struct inode *inode, struct page *page, 683 + struct iomap *iomap, loff_t pos, unsigned copied) 684 + { 685 + void *addr; 686 + 687 + WARN_ON_ONCE(!PageUptodate(page)); 688 + BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); 689 + 690 + addr = kmap_atomic(page); 691 + memcpy(iomap->inline_data + pos, addr + pos, copied); 692 + kunmap_atomic(addr); 693 + 694 + mark_inode_dirty(inode); 695 + return copied; 696 + } 697 + 698 + static int 699 + iomap_write_end(struct inode *inode, loff_t pos, unsigned len, 700 + unsigned copied, struct page *page, struct iomap *iomap) 701 + { 702 + const struct iomap_page_ops *page_ops = iomap->page_ops; 703 + loff_t old_size = inode->i_size; 704 + int ret; 705 + 706 + if (iomap->type == IOMAP_INLINE) { 707 + ret = iomap_write_end_inline(inode, page, iomap, pos, copied); 708 + } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) { 709 + ret = block_write_end(NULL, inode->i_mapping, pos, len, copied, 710 + page, NULL); 711 + } else { 712 + ret = __iomap_write_end(inode, pos, len, copied, page, iomap); 713 + } 714 + 715 + /* 716 + * Update the in-memory inode size after copying the data into the page 717 + * cache. It's up to the file system to write the updated size to disk, 718 + * preferably after I/O completion so that no stale data is exposed. 719 + */ 720 + if (pos + ret > old_size) { 721 + i_size_write(inode, pos + ret); 722 + iomap->flags |= IOMAP_F_SIZE_CHANGED; 723 + } 724 + unlock_page(page); 725 + 726 + if (old_size < pos) 727 + pagecache_isize_extended(inode, old_size, pos); 728 + if (page_ops && page_ops->page_done) 729 + page_ops->page_done(inode, pos, ret, page, iomap); 730 + put_page(page); 731 + 732 + if (ret < len) 733 + iomap_write_failed(inode, pos, len); 734 + return ret; 735 + } 736 + 737 + static loff_t 738 + iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 739 + struct iomap *iomap) 740 + { 741 + struct iov_iter *i = data; 742 + long status = 0; 743 + ssize_t written = 0; 744 + unsigned int flags = AOP_FLAG_NOFS; 745 + 746 + do { 747 + struct page *page; 748 + unsigned long offset; /* Offset into pagecache page */ 749 + unsigned long bytes; /* Bytes to write to page */ 750 + size_t copied; /* Bytes copied from user */ 751 + 752 + offset = offset_in_page(pos); 753 + bytes = min_t(unsigned long, PAGE_SIZE - offset, 754 + iov_iter_count(i)); 755 + again: 756 + if (bytes > length) 757 + bytes = length; 758 + 759 + /* 760 + * Bring in the user page that we will copy from _first_. 761 + * Otherwise there's a nasty deadlock on copying from the 762 + * same page as we're writing to, without it being marked 763 + * up-to-date. 764 + * 765 + * Not only is this an optimisation, but it is also required 766 + * to check that the address is actually valid, when atomic 767 + * usercopies are used, below. 
768 + */ 769 + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 770 + status = -EFAULT; 771 + break; 772 + } 773 + 774 + status = iomap_write_begin(inode, pos, bytes, flags, &page, 775 + iomap); 776 + if (unlikely(status)) 777 + break; 778 + 779 + if (mapping_writably_mapped(inode->i_mapping)) 780 + flush_dcache_page(page); 781 + 782 + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 783 + 784 + flush_dcache_page(page); 785 + 786 + status = iomap_write_end(inode, pos, bytes, copied, page, 787 + iomap); 788 + if (unlikely(status < 0)) 789 + break; 790 + copied = status; 791 + 792 + cond_resched(); 793 + 794 + iov_iter_advance(i, copied); 795 + if (unlikely(copied == 0)) { 796 + /* 797 + * If we were unable to copy any data at all, we must 798 + * fall back to a single segment length write. 799 + * 800 + * If we didn't fallback here, we could livelock 801 + * because not all segments in the iov can be copied at 802 + * once without a pagefault. 803 + */ 804 + bytes = min_t(unsigned long, PAGE_SIZE - offset, 805 + iov_iter_single_seg_count(i)); 806 + goto again; 807 + } 808 + pos += copied; 809 + written += copied; 810 + length -= copied; 811 + 812 + balance_dirty_pages_ratelimited(inode->i_mapping); 813 + } while (iov_iter_count(i) && length); 814 + 815 + return written ? written : status; 816 + } 817 + 818 + ssize_t 819 + iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, 820 + const struct iomap_ops *ops) 821 + { 822 + struct inode *inode = iocb->ki_filp->f_mapping->host; 823 + loff_t pos = iocb->ki_pos, ret = 0, written = 0; 824 + 825 + while (iov_iter_count(iter)) { 826 + ret = iomap_apply(inode, pos, iov_iter_count(iter), 827 + IOMAP_WRITE, ops, iter, iomap_write_actor); 828 + if (ret <= 0) 829 + break; 830 + pos += ret; 831 + written += ret; 832 + } 833 + 834 + return written ? 
written : ret; 835 + } 836 + EXPORT_SYMBOL_GPL(iomap_file_buffered_write); 837 + 838 + static struct page * 839 + __iomap_read_page(struct inode *inode, loff_t offset) 840 + { 841 + struct address_space *mapping = inode->i_mapping; 842 + struct page *page; 843 + 844 + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); 845 + if (IS_ERR(page)) 846 + return page; 847 + if (!PageUptodate(page)) { 848 + put_page(page); 849 + return ERR_PTR(-EIO); 850 + } 851 + return page; 852 + } 853 + 854 + static loff_t 855 + iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 856 + struct iomap *iomap) 857 + { 858 + long status = 0; 859 + ssize_t written = 0; 860 + 861 + do { 862 + struct page *page, *rpage; 863 + unsigned long offset; /* Offset into pagecache page */ 864 + unsigned long bytes; /* Bytes to write to page */ 865 + 866 + offset = offset_in_page(pos); 867 + bytes = min_t(loff_t, PAGE_SIZE - offset, length); 868 + 869 + rpage = __iomap_read_page(inode, pos); 870 + if (IS_ERR(rpage)) 871 + return PTR_ERR(rpage); 872 + 873 + status = iomap_write_begin(inode, pos, bytes, 874 + AOP_FLAG_NOFS, &page, iomap); 875 + put_page(rpage); 876 + if (unlikely(status)) 877 + return status; 878 + 879 + WARN_ON_ONCE(!PageUptodate(page)); 880 + 881 + status = iomap_write_end(inode, pos, bytes, bytes, page, iomap); 882 + if (unlikely(status <= 0)) { 883 + if (WARN_ON_ONCE(status == 0)) 884 + return -EIO; 885 + return status; 886 + } 887 + 888 + cond_resched(); 889 + 890 + pos += status; 891 + written += status; 892 + length -= status; 893 + 894 + balance_dirty_pages_ratelimited(inode->i_mapping); 895 + } while (length); 896 + 897 + return written; 898 + } 899 + 900 + int 901 + iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, 902 + const struct iomap_ops *ops) 903 + { 904 + loff_t ret; 905 + 906 + while (len) { 907 + ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, 908 + iomap_dirty_actor); 909 + if (ret <= 0) 910 + return ret; 911 + pos += ret; 912 + len -= ret; 913 + } 914 + 915 + return 0; 916 + } 917 + EXPORT_SYMBOL_GPL(iomap_file_dirty); 918 + 919 + static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, 920 + unsigned bytes, struct iomap *iomap) 921 + { 922 + struct page *page; 923 + int status; 924 + 925 + status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page, 926 + iomap); 927 + if (status) 928 + return status; 929 + 930 + zero_user(page, offset, bytes); 931 + mark_page_accessed(page); 932 + 933 + return iomap_write_end(inode, pos, bytes, bytes, page, iomap); 934 + } 935 + 936 + static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, 937 + struct iomap *iomap) 938 + { 939 + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, 940 + iomap_sector(iomap, pos & PAGE_MASK), offset, bytes); 941 + } 942 + 943 + static loff_t 944 + iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, 945 + void *data, struct iomap *iomap) 946 + { 947 + bool *did_zero = data; 948 + loff_t written = 0; 949 + int status; 950 + 951 + /* already zeroed? we're done. 
*/ 952 + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) 953 + return count; 954 + 955 + do { 956 + unsigned offset, bytes; 957 + 958 + offset = offset_in_page(pos); 959 + bytes = min_t(loff_t, PAGE_SIZE - offset, count); 960 + 961 + if (IS_DAX(inode)) 962 + status = iomap_dax_zero(pos, offset, bytes, iomap); 963 + else 964 + status = iomap_zero(inode, pos, offset, bytes, iomap); 965 + if (status < 0) 966 + return status; 967 + 968 + pos += bytes; 969 + count -= bytes; 970 + written += bytes; 971 + if (did_zero) 972 + *did_zero = true; 973 + } while (count > 0); 974 + 975 + return written; 976 + } 977 + 978 + int 979 + iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, 980 + const struct iomap_ops *ops) 981 + { 982 + loff_t ret; 983 + 984 + while (len > 0) { 985 + ret = iomap_apply(inode, pos, len, IOMAP_ZERO, 986 + ops, did_zero, iomap_zero_range_actor); 987 + if (ret <= 0) 988 + return ret; 989 + 990 + pos += ret; 991 + len -= ret; 992 + } 993 + 994 + return 0; 995 + } 996 + EXPORT_SYMBOL_GPL(iomap_zero_range); 997 + 998 + int 999 + iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 1000 + const struct iomap_ops *ops) 1001 + { 1002 + unsigned int blocksize = i_blocksize(inode); 1003 + unsigned int off = pos & (blocksize - 1); 1004 + 1005 + /* Block boundary? Nothing to do */ 1006 + if (!off) 1007 + return 0; 1008 + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); 1009 + } 1010 + EXPORT_SYMBOL_GPL(iomap_truncate_page); 1011 + 1012 + static loff_t 1013 + iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, 1014 + void *data, struct iomap *iomap) 1015 + { 1016 + struct page *page = data; 1017 + int ret; 1018 + 1019 + if (iomap->flags & IOMAP_F_BUFFER_HEAD) { 1020 + ret = __block_write_begin_int(page, pos, length, NULL, iomap); 1021 + if (ret) 1022 + return ret; 1023 + block_commit_write(page, 0, length); 1024 + } else { 1025 + WARN_ON_ONCE(!PageUptodate(page)); 1026 + iomap_page_create(inode, page); 1027 + set_page_dirty(page); 1028 + } 1029 + 1030 + return length; 1031 + } 1032 + 1033 + vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) 1034 + { 1035 + struct page *page = vmf->page; 1036 + struct inode *inode = file_inode(vmf->vma->vm_file); 1037 + unsigned long length; 1038 + loff_t offset, size; 1039 + ssize_t ret; 1040 + 1041 + lock_page(page); 1042 + size = i_size_read(inode); 1043 + if ((page->mapping != inode->i_mapping) || 1044 + (page_offset(page) > size)) { 1045 + /* We overload EFAULT to mean page got truncated */ 1046 + ret = -EFAULT; 1047 + goto out_unlock; 1048 + } 1049 + 1050 + /* page is wholly or partially inside EOF */ 1051 + if (((page->index + 1) << PAGE_SHIFT) > size) 1052 + length = offset_in_page(size); 1053 + else 1054 + length = PAGE_SIZE; 1055 + 1056 + offset = page_offset(page); 1057 + while (length > 0) { 1058 + ret = iomap_apply(inode, offset, length, 1059 + IOMAP_WRITE | IOMAP_FAULT, ops, page, 1060 + iomap_page_mkwrite_actor); 1061 + if (unlikely(ret <= 0)) 1062 + goto out_unlock; 1063 + offset += ret; 1064 + length -= ret; 1065 + } 1066 + 1067 + wait_for_stable_page(page); 1068 + return VM_FAULT_LOCKED; 1069 + out_unlock: 1070 + unlock_page(page); 1071 + return block_page_mkwrite_return(ret); 1072 + } 1073 + EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
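The exports above are the complete page-cache side of iomap after the split. For orientation, this is roughly how a filesystem routes its address_space_operations at them; a non-authoritative sketch in which every myfs_* identifier (including myfs_iomap_ops) is hypothetical:

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>

extern const struct iomap_ops myfs_iomap_ops;	/* hypothetical */

/* ->readpage / ->readpages forward to the iomap helpers with our ops. */
static int myfs_readpage(struct file *unused, struct page *page)
{
	return iomap_readpage(page, &myfs_iomap_ops);
}

static int myfs_readpages(struct file *unused, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	return iomap_readpages(mapping, pages, nr_pages, &myfs_iomap_ops);
}

static const struct address_space_operations myfs_aops = {
	.readpage		= myfs_readpage,
	.readpages		= myfs_readpages,
	.set_page_dirty		= iomap_set_page_dirty,
	.releasepage		= iomap_releasepage,
	.invalidatepage		= iomap_invalidatepage,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
#ifdef CONFIG_MIGRATION
	.migratepage		= iomap_migrate_page,
#endif
	/* writeback (->writepage/->writepages) stays filesystem-specific here */
};

Buffered writes enter this file through iomap_file_buffered_write() from the filesystem's ->write_iter, and write faults through iomap_page_mkwrite(), with the caller holding whatever locks its ->iomap_begin/->iomap_end callbacks rely on.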
+562
fs/iomap/direct-io.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2010 Red Hat, Inc. 4 + * Copyright (c) 2016-2018 Christoph Hellwig. 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + #include <linux/backing-dev.h> 11 + #include <linux/uio.h> 12 + #include <linux/task_io_accounting_ops.h> 13 + 14 + #include "../internal.h" 15 + 16 + /* 17 + * Private flags for iomap_dio, must not overlap with the public ones in 18 + * iomap.h: 19 + */ 20 + #define IOMAP_DIO_WRITE_FUA (1 << 28) 21 + #define IOMAP_DIO_NEED_SYNC (1 << 29) 22 + #define IOMAP_DIO_WRITE (1 << 30) 23 + #define IOMAP_DIO_DIRTY (1 << 31) 24 + 25 + struct iomap_dio { 26 + struct kiocb *iocb; 27 + iomap_dio_end_io_t *end_io; 28 + loff_t i_size; 29 + loff_t size; 30 + atomic_t ref; 31 + unsigned flags; 32 + int error; 33 + bool wait_for_completion; 34 + 35 + union { 36 + /* used during submission and for synchronous completion: */ 37 + struct { 38 + struct iov_iter *iter; 39 + struct task_struct *waiter; 40 + struct request_queue *last_queue; 41 + blk_qc_t cookie; 42 + } submit; 43 + 44 + /* used for aio completion: */ 45 + struct { 46 + struct work_struct work; 47 + } aio; 48 + }; 49 + }; 50 + 51 + int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) 52 + { 53 + struct request_queue *q = READ_ONCE(kiocb->private); 54 + 55 + if (!q) 56 + return 0; 57 + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); 58 + } 59 + EXPORT_SYMBOL_GPL(iomap_dio_iopoll); 60 + 61 + static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, 62 + struct bio *bio) 63 + { 64 + atomic_inc(&dio->ref); 65 + 66 + if (dio->iocb->ki_flags & IOCB_HIPRI) 67 + bio_set_polled(bio, dio->iocb); 68 + 69 + dio->submit.last_queue = bdev_get_queue(iomap->bdev); 70 + dio->submit.cookie = submit_bio(bio); 71 + } 72 + 73 + static ssize_t iomap_dio_complete(struct iomap_dio *dio) 74 + { 75 + struct kiocb *iocb = dio->iocb; 76 + struct inode *inode = file_inode(iocb->ki_filp); 77 + loff_t offset = iocb->ki_pos; 78 + ssize_t ret; 79 + 80 + if (dio->end_io) { 81 + ret = dio->end_io(iocb, 82 + dio->error ? dio->error : dio->size, 83 + dio->flags); 84 + } else { 85 + ret = dio->error; 86 + } 87 + 88 + if (likely(!ret)) { 89 + ret = dio->size; 90 + /* check for short read */ 91 + if (offset + ret > dio->i_size && 92 + !(dio->flags & IOMAP_DIO_WRITE)) 93 + ret = dio->i_size - offset; 94 + iocb->ki_pos += ret; 95 + } 96 + 97 + /* 98 + * Try again to invalidate clean pages which might have been cached by 99 + * non-direct readahead, or faulted in by get_user_pages() if the source 100 + * of the write was an mmap'ed region of the file we're writing. Either 101 + * one is a pretty crazy thing to do, so we don't support it 100%. If 102 + * this invalidation fails, tough, the write still worked... 103 + * 104 + * And this page cache invalidation has to be after dio->end_io(), as 105 + * some filesystems convert unwritten extents to real allocations in 106 + * end_io() when necessary, otherwise a racing buffer read would cache 107 + * zeros from unwritten extents. 
108 + */ 109 + if (!dio->error && 110 + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { 111 + int err; 112 + err = invalidate_inode_pages2_range(inode->i_mapping, 113 + offset >> PAGE_SHIFT, 114 + (offset + dio->size - 1) >> PAGE_SHIFT); 115 + if (err) 116 + dio_warn_stale_pagecache(iocb->ki_filp); 117 + } 118 + 119 + /* 120 + * If this is a DSYNC write, make sure we push it to stable storage now 121 + * that we've written data. 122 + */ 123 + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) 124 + ret = generic_write_sync(iocb, ret); 125 + 126 + inode_dio_end(file_inode(iocb->ki_filp)); 127 + kfree(dio); 128 + 129 + return ret; 130 + } 131 + 132 + static void iomap_dio_complete_work(struct work_struct *work) 133 + { 134 + struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); 135 + struct kiocb *iocb = dio->iocb; 136 + 137 + iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); 138 + } 139 + 140 + /* 141 + * Set an error in the dio if none is set yet. We have to use cmpxchg 142 + * as the submission context and the completion context(s) can race to 143 + * update the error. 144 + */ 145 + static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) 146 + { 147 + cmpxchg(&dio->error, 0, ret); 148 + } 149 + 150 + static void iomap_dio_bio_end_io(struct bio *bio) 151 + { 152 + struct iomap_dio *dio = bio->bi_private; 153 + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); 154 + 155 + if (bio->bi_status) 156 + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); 157 + 158 + if (atomic_dec_and_test(&dio->ref)) { 159 + if (dio->wait_for_completion) { 160 + struct task_struct *waiter = dio->submit.waiter; 161 + WRITE_ONCE(dio->submit.waiter, NULL); 162 + blk_wake_io_task(waiter); 163 + } else if (dio->flags & IOMAP_DIO_WRITE) { 164 + struct inode *inode = file_inode(dio->iocb->ki_filp); 165 + 166 + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); 167 + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); 168 + } else { 169 + iomap_dio_complete_work(&dio->aio.work); 170 + } 171 + } 172 + 173 + if (should_dirty) { 174 + bio_check_pages_dirty(bio); 175 + } else { 176 + bio_release_pages(bio, false); 177 + bio_put(bio); 178 + } 179 + } 180 + 181 + static void 182 + iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, 183 + unsigned len) 184 + { 185 + struct page *page = ZERO_PAGE(0); 186 + int flags = REQ_SYNC | REQ_IDLE; 187 + struct bio *bio; 188 + 189 + bio = bio_alloc(GFP_KERNEL, 1); 190 + bio_set_dev(bio, iomap->bdev); 191 + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); 192 + bio->bi_private = dio; 193 + bio->bi_end_io = iomap_dio_bio_end_io; 194 + 195 + get_page(page); 196 + __bio_add_page(bio, page, len, 0); 197 + bio_set_op_attrs(bio, REQ_OP_WRITE, flags); 198 + iomap_dio_submit_bio(dio, iomap, bio); 199 + } 200 + 201 + static loff_t 202 + iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, 203 + struct iomap_dio *dio, struct iomap *iomap) 204 + { 205 + unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); 206 + unsigned int fs_block_size = i_blocksize(inode), pad; 207 + unsigned int align = iov_iter_alignment(dio->submit.iter); 208 + struct iov_iter iter; 209 + struct bio *bio; 210 + bool need_zeroout = false; 211 + bool use_fua = false; 212 + int nr_pages, ret = 0; 213 + size_t copied = 0; 214 + 215 + if ((pos | length | align) & ((1 << blkbits) - 1)) 216 + return -EINVAL; 217 + 218 + if (iomap->type == IOMAP_UNWRITTEN) { 219 + dio->flags |= IOMAP_DIO_UNWRITTEN; 220 + 
need_zeroout = true; 221 + } 222 + 223 + if (iomap->flags & IOMAP_F_SHARED) 224 + dio->flags |= IOMAP_DIO_COW; 225 + 226 + if (iomap->flags & IOMAP_F_NEW) { 227 + need_zeroout = true; 228 + } else if (iomap->type == IOMAP_MAPPED) { 229 + /* 230 + * Use a FUA write if we need datasync semantics, this is a pure 231 + * data IO that doesn't require any metadata updates (including 232 + * after IO completion such as unwritten extent conversion) and 233 + * the underlying device supports FUA. This allows us to avoid 234 + * cache flushes on IO completion. 235 + */ 236 + if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && 237 + (dio->flags & IOMAP_DIO_WRITE_FUA) && 238 + blk_queue_fua(bdev_get_queue(iomap->bdev))) 239 + use_fua = true; 240 + } 241 + 242 + /* 243 + * Operate on a partial iter trimmed to the extent we were called for. 244 + * We'll update the iter in the dio once we're done with this extent. 245 + */ 246 + iter = *dio->submit.iter; 247 + iov_iter_truncate(&iter, length); 248 + 249 + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); 250 + if (nr_pages <= 0) 251 + return nr_pages; 252 + 253 + if (need_zeroout) { 254 + /* zero out from the start of the block to the write offset */ 255 + pad = pos & (fs_block_size - 1); 256 + if (pad) 257 + iomap_dio_zero(dio, iomap, pos - pad, pad); 258 + } 259 + 260 + do { 261 + size_t n; 262 + if (dio->error) { 263 + iov_iter_revert(dio->submit.iter, copied); 264 + return 0; 265 + } 266 + 267 + bio = bio_alloc(GFP_KERNEL, nr_pages); 268 + bio_set_dev(bio, iomap->bdev); 269 + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); 270 + bio->bi_write_hint = dio->iocb->ki_hint; 271 + bio->bi_ioprio = dio->iocb->ki_ioprio; 272 + bio->bi_private = dio; 273 + bio->bi_end_io = iomap_dio_bio_end_io; 274 + 275 + ret = bio_iov_iter_get_pages(bio, &iter); 276 + if (unlikely(ret)) { 277 + /* 278 + * We have to stop part way through an IO. We must fall 279 + * through to the sub-block tail zeroing here, otherwise 280 + * this short IO may expose stale data in the tail of 281 + * the block we haven't written data to. 282 + */ 283 + bio_put(bio); 284 + goto zero_tail; 285 + } 286 + 287 + n = bio->bi_iter.bi_size; 288 + if (dio->flags & IOMAP_DIO_WRITE) { 289 + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; 290 + if (use_fua) 291 + bio->bi_opf |= REQ_FUA; 292 + else 293 + dio->flags &= ~IOMAP_DIO_WRITE_FUA; 294 + task_io_account_write(n); 295 + } else { 296 + bio->bi_opf = REQ_OP_READ; 297 + if (dio->flags & IOMAP_DIO_DIRTY) 298 + bio_set_pages_dirty(bio); 299 + } 300 + 301 + iov_iter_advance(dio->submit.iter, n); 302 + 303 + dio->size += n; 304 + pos += n; 305 + copied += n; 306 + 307 + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); 308 + iomap_dio_submit_bio(dio, iomap, bio); 309 + } while (nr_pages); 310 + 311 + /* 312 + * We need to zeroout the tail of a sub-block write if the extent type 313 + * requires zeroing or the write extends beyond EOF. If we don't zero 314 + * the block tail in the latter case, we can expose stale data via mmap 315 + * reads of the EOF block. 316 + */ 317 + zero_tail: 318 + if (need_zeroout || 319 + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { 320 + /* zero out from the end of the write to the end of the block */ 321 + pad = pos & (fs_block_size - 1); 322 + if (pad) 323 + iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); 324 + } 325 + return copied ? 
copied : ret; 326 + } 327 + 328 + static loff_t 329 + iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio) 330 + { 331 + length = iov_iter_zero(length, dio->submit.iter); 332 + dio->size += length; 333 + return length; 334 + } 335 + 336 + static loff_t 337 + iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, 338 + struct iomap_dio *dio, struct iomap *iomap) 339 + { 340 + struct iov_iter *iter = dio->submit.iter; 341 + size_t copied; 342 + 343 + BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); 344 + 345 + if (dio->flags & IOMAP_DIO_WRITE) { 346 + loff_t size = inode->i_size; 347 + 348 + if (pos > size) 349 + memset(iomap->inline_data + size, 0, pos - size); 350 + copied = copy_from_iter(iomap->inline_data + pos, length, iter); 351 + if (copied) { 352 + if (pos + copied > size) 353 + i_size_write(inode, pos + copied); 354 + mark_inode_dirty(inode); 355 + } 356 + } else { 357 + copied = copy_to_iter(iomap->inline_data + pos, length, iter); 358 + } 359 + dio->size += copied; 360 + return copied; 361 + } 362 + 363 + static loff_t 364 + iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, 365 + void *data, struct iomap *iomap) 366 + { 367 + struct iomap_dio *dio = data; 368 + 369 + switch (iomap->type) { 370 + case IOMAP_HOLE: 371 + if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) 372 + return -EIO; 373 + return iomap_dio_hole_actor(length, dio); 374 + case IOMAP_UNWRITTEN: 375 + if (!(dio->flags & IOMAP_DIO_WRITE)) 376 + return iomap_dio_hole_actor(length, dio); 377 + return iomap_dio_bio_actor(inode, pos, length, dio, iomap); 378 + case IOMAP_MAPPED: 379 + return iomap_dio_bio_actor(inode, pos, length, dio, iomap); 380 + case IOMAP_INLINE: 381 + return iomap_dio_inline_actor(inode, pos, length, dio, iomap); 382 + default: 383 + WARN_ON_ONCE(1); 384 + return -EIO; 385 + } 386 + } 387 + 388 + /* 389 + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO 390 + * is being issued as AIO or not. This allows us to optimise pure data writes 391 + * to use REQ_FUA rather than requiring generic_write_sync() to issue a 392 + * REQ_FLUSH post write. This is slightly tricky because a single request here 393 + * can be mapped into multiple disjoint IOs and only a subset of the IOs issued 394 + * may be pure data writes. In that case, we still need to do a full data sync 395 + * completion. 
396 + */ 397 + ssize_t 398 + iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 399 + const struct iomap_ops *ops, iomap_dio_end_io_t end_io) 400 + { 401 + struct address_space *mapping = iocb->ki_filp->f_mapping; 402 + struct inode *inode = file_inode(iocb->ki_filp); 403 + size_t count = iov_iter_count(iter); 404 + loff_t pos = iocb->ki_pos, start = pos; 405 + loff_t end = iocb->ki_pos + count - 1, ret = 0; 406 + unsigned int flags = IOMAP_DIRECT; 407 + bool wait_for_completion = is_sync_kiocb(iocb); 408 + struct blk_plug plug; 409 + struct iomap_dio *dio; 410 + 411 + lockdep_assert_held(&inode->i_rwsem); 412 + 413 + if (!count) 414 + return 0; 415 + 416 + dio = kmalloc(sizeof(*dio), GFP_KERNEL); 417 + if (!dio) 418 + return -ENOMEM; 419 + 420 + dio->iocb = iocb; 421 + atomic_set(&dio->ref, 1); 422 + dio->size = 0; 423 + dio->i_size = i_size_read(inode); 424 + dio->end_io = end_io; 425 + dio->error = 0; 426 + dio->flags = 0; 427 + 428 + dio->submit.iter = iter; 429 + dio->submit.waiter = current; 430 + dio->submit.cookie = BLK_QC_T_NONE; 431 + dio->submit.last_queue = NULL; 432 + 433 + if (iov_iter_rw(iter) == READ) { 434 + if (pos >= dio->i_size) 435 + goto out_free_dio; 436 + 437 + if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) 438 + dio->flags |= IOMAP_DIO_DIRTY; 439 + } else { 440 + flags |= IOMAP_WRITE; 441 + dio->flags |= IOMAP_DIO_WRITE; 442 + 443 + /* for data sync or sync, we need sync completion processing */ 444 + if (iocb->ki_flags & IOCB_DSYNC) 445 + dio->flags |= IOMAP_DIO_NEED_SYNC; 446 + 447 + /* 448 + * For datasync only writes, we optimistically try using FUA for 449 + * this IO. Any non-FUA write that occurs will clear this flag, 450 + * hence we know before completion whether a cache flush is 451 + * necessary. 452 + */ 453 + if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) 454 + dio->flags |= IOMAP_DIO_WRITE_FUA; 455 + } 456 + 457 + if (iocb->ki_flags & IOCB_NOWAIT) { 458 + if (filemap_range_has_page(mapping, start, end)) { 459 + ret = -EAGAIN; 460 + goto out_free_dio; 461 + } 462 + flags |= IOMAP_NOWAIT; 463 + } 464 + 465 + ret = filemap_write_and_wait_range(mapping, start, end); 466 + if (ret) 467 + goto out_free_dio; 468 + 469 + /* 470 + * Try to invalidate cache pages for the range we're direct 471 + * writing. If this invalidation fails, tough, the write will 472 + * still work, but racing two incompatible write paths is a 473 + * pretty crazy thing to do, so we don't support it 100%. 
474 + */ 475 + ret = invalidate_inode_pages2_range(mapping, 476 + start >> PAGE_SHIFT, end >> PAGE_SHIFT); 477 + if (ret) 478 + dio_warn_stale_pagecache(iocb->ki_filp); 479 + ret = 0; 480 + 481 + if (iov_iter_rw(iter) == WRITE && !wait_for_completion && 482 + !inode->i_sb->s_dio_done_wq) { 483 + ret = sb_init_dio_done_wq(inode->i_sb); 484 + if (ret < 0) 485 + goto out_free_dio; 486 + } 487 + 488 + inode_dio_begin(inode); 489 + 490 + blk_start_plug(&plug); 491 + do { 492 + ret = iomap_apply(inode, pos, count, flags, ops, dio, 493 + iomap_dio_actor); 494 + if (ret <= 0) { 495 + /* magic error code to fall back to buffered I/O */ 496 + if (ret == -ENOTBLK) { 497 + wait_for_completion = true; 498 + ret = 0; 499 + } 500 + break; 501 + } 502 + pos += ret; 503 + 504 + if (iov_iter_rw(iter) == READ && pos >= dio->i_size) 505 + break; 506 + } while ((count = iov_iter_count(iter)) > 0); 507 + blk_finish_plug(&plug); 508 + 509 + if (ret < 0) 510 + iomap_dio_set_error(dio, ret); 511 + 512 + /* 513 + * If all the writes we issued were FUA, we don't need to flush the 514 + * cache on IO completion. Clear the sync flag for this case. 515 + */ 516 + if (dio->flags & IOMAP_DIO_WRITE_FUA) 517 + dio->flags &= ~IOMAP_DIO_NEED_SYNC; 518 + 519 + WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie); 520 + WRITE_ONCE(iocb->private, dio->submit.last_queue); 521 + 522 + /* 523 + * We are about to drop our additional submission reference, which 524 + * might be the last reference to the dio. There are three three 525 + * different ways we can progress here: 526 + * 527 + * (a) If this is the last reference we will always complete and free 528 + * the dio ourselves. 529 + * (b) If this is not the last reference, and we serve an asynchronous 530 + * iocb, we must never touch the dio after the decrement, the 531 + * I/O completion handler will complete and free it. 532 + * (c) If this is not the last reference, but we serve a synchronous 533 + * iocb, the I/O completion handler will wake us up on the drop 534 + * of the final reference, and we will complete and free it here 535 + * after we got woken by the I/O completion handler. 536 + */ 537 + dio->wait_for_completion = wait_for_completion; 538 + if (!atomic_dec_and_test(&dio->ref)) { 539 + if (!wait_for_completion) 540 + return -EIOCBQUEUED; 541 + 542 + for (;;) { 543 + set_current_state(TASK_UNINTERRUPTIBLE); 544 + if (!READ_ONCE(dio->submit.waiter)) 545 + break; 546 + 547 + if (!(iocb->ki_flags & IOCB_HIPRI) || 548 + !dio->submit.last_queue || 549 + !blk_poll(dio->submit.last_queue, 550 + dio->submit.cookie, true)) 551 + io_schedule(); 552 + } 553 + __set_current_state(TASK_RUNNING); 554 + } 555 + 556 + return iomap_dio_complete(dio); 557 + 558 + out_free_dio: 559 + kfree(dio); 560 + return ret; 561 + } 562 + EXPORT_SYMBOL_GPL(iomap_dio_rw);
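iomap_dio_rw() is the entry point a filesystem wires into its ->read_iter/->write_iter methods once it can describe its extents through a struct iomap_ops. A minimal sketch for a hypothetical filesystem ("myfs"; myfs_iomap_ops stands in for the real iomap_begin/iomap_end implementation), showing the i_rwsem locking that the lockdep_assert_held() above expects from the caller:

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uio.h>

/* Hypothetical: the filesystem's real iomap_begin/iomap_end live elsewhere. */
extern const struct iomap_ops myfs_iomap_ops;

static ssize_t myfs_file_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/* iomap_dio_rw() asserts that the caller holds i_rwsem. */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	/* Reads need no private completion hook, so pass NULL for end_io. */
	ret = iomap_dio_rw(iocb, to, &myfs_iomap_ops, NULL);

	inode_unlock_shared(inode);
	return ret;
}

For an asynchronous iocb the call typically returns -EIOCBQUEUED and the work queued by iomap_dio_bio_end_io() completes the request later; synchronous callers are woken and finish in iomap_dio_complete() as described in the reference-counting comment above.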
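On the submission side, iomap_dio_bio_actor() rejects any request whose file position, length, or buffer alignment is not a multiple of the device's logical block size (the "(pos | length | align)" check), which userspace observes as EINVAL on misaligned O_DIRECT I/O. A small illustrative program (the file name and the conservative 4096-byte alignment are arbitrary choices, not requirements of the API):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd;

	/* 4096 covers the logical block size of most devices; a misaligned
	 * buffer, offset, or length makes the direct I/O path fail with EINVAL. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 'x', 4096);

	fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	if (pwrite(fd, buf, 4096, 0) < 0)
		perror("pwrite");	/* EINVAL here usually means misalignment */

	close(fd);
	free(buf);
	return 0;
}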
+144
fs/iomap/fiemap.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2016-2018 Christoph Hellwig. 4 + */ 5 + #include <linux/module.h> 6 + #include <linux/compiler.h> 7 + #include <linux/fs.h> 8 + #include <linux/iomap.h> 9 + 10 + struct fiemap_ctx { 11 + struct fiemap_extent_info *fi; 12 + struct iomap prev; 13 + }; 14 + 15 + static int iomap_to_fiemap(struct fiemap_extent_info *fi, 16 + struct iomap *iomap, u32 flags) 17 + { 18 + switch (iomap->type) { 19 + case IOMAP_HOLE: 20 + /* skip holes */ 21 + return 0; 22 + case IOMAP_DELALLOC: 23 + flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; 24 + break; 25 + case IOMAP_MAPPED: 26 + break; 27 + case IOMAP_UNWRITTEN: 28 + flags |= FIEMAP_EXTENT_UNWRITTEN; 29 + break; 30 + case IOMAP_INLINE: 31 + flags |= FIEMAP_EXTENT_DATA_INLINE; 32 + break; 33 + } 34 + 35 + if (iomap->flags & IOMAP_F_MERGED) 36 + flags |= FIEMAP_EXTENT_MERGED; 37 + if (iomap->flags & IOMAP_F_SHARED) 38 + flags |= FIEMAP_EXTENT_SHARED; 39 + 40 + return fiemap_fill_next_extent(fi, iomap->offset, 41 + iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, 42 + iomap->length, flags); 43 + } 44 + 45 + static loff_t 46 + iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 47 + struct iomap *iomap) 48 + { 49 + struct fiemap_ctx *ctx = data; 50 + loff_t ret = length; 51 + 52 + if (iomap->type == IOMAP_HOLE) 53 + return length; 54 + 55 + ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); 56 + ctx->prev = *iomap; 57 + switch (ret) { 58 + case 0: /* success */ 59 + return length; 60 + case 1: /* extent array full */ 61 + return 0; 62 + default: 63 + return ret; 64 + } 65 + } 66 + 67 + int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, 68 + loff_t start, loff_t len, const struct iomap_ops *ops) 69 + { 70 + struct fiemap_ctx ctx; 71 + loff_t ret; 72 + 73 + memset(&ctx, 0, sizeof(ctx)); 74 + ctx.fi = fi; 75 + ctx.prev.type = IOMAP_HOLE; 76 + 77 + ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); 78 + if (ret) 79 + return ret; 80 + 81 + if (fi->fi_flags & FIEMAP_FLAG_SYNC) { 82 + ret = filemap_write_and_wait(inode->i_mapping); 83 + if (ret) 84 + return ret; 85 + } 86 + 87 + while (len > 0) { 88 + ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, 89 + iomap_fiemap_actor); 90 + /* inode with no (attribute) mapping will give ENOENT */ 91 + if (ret == -ENOENT) 92 + break; 93 + if (ret < 0) 94 + return ret; 95 + if (ret == 0) 96 + break; 97 + 98 + start += ret; 99 + len -= ret; 100 + } 101 + 102 + if (ctx.prev.type != IOMAP_HOLE) { 103 + ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); 104 + if (ret < 0) 105 + return ret; 106 + } 107 + 108 + return 0; 109 + } 110 + EXPORT_SYMBOL_GPL(iomap_fiemap); 111 + 112 + static loff_t 113 + iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, 114 + void *data, struct iomap *iomap) 115 + { 116 + sector_t *bno = data, addr; 117 + 118 + if (iomap->type == IOMAP_MAPPED) { 119 + addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; 120 + if (addr > INT_MAX) 121 + WARN(1, "would truncate bmap result\n"); 122 + else 123 + *bno = addr; 124 + } 125 + return 0; 126 + } 127 + 128 + /* legacy ->bmap interface. 0 is the error return (!) 
*/ 129 + sector_t 130 + iomap_bmap(struct address_space *mapping, sector_t bno, 131 + const struct iomap_ops *ops) 132 + { 133 + struct inode *inode = mapping->host; 134 + loff_t pos = bno << inode->i_blkbits; 135 + unsigned blocksize = i_blocksize(inode); 136 + 137 + if (filemap_write_and_wait(mapping)) 138 + return 0; 139 + 140 + bno = 0; 141 + iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); 142 + return bno; 143 + } 144 + EXPORT_SYMBOL_GPL(iomap_bmap);
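iomap_fiemap() backs the FS_IOC_FIEMAP ioctl, so the extent data produced by the actor above can be inspected from userspace roughly as follows (a sketch with minimal error handling; the 32-extent buffer size is an arbitrary choice):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i, count = 32;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty data first */
	fm->fm_extent_count = count;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) { perror("FS_IOC_FIEMAP"); return 1; }

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu physical %llu length %llu flags 0x%x\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length, fe->fe_flags);
	}
	close(fd);
	free(fm);
	return 0;
}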
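The legacy interface served by iomap_bmap() is the FIBMAP ioctl, which works in units of the filesystem block size, requires CAP_SYS_RAWIO, and can only report block numbers that fit in an int, hence the truncation warning and the "0 is the error return" quirk above. For example (a sketch, querying file block 0):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FIBMAP, FIGETBSZ */

int main(int argc, char **argv)
{
	int fd, block = 0, bsz;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	if (ioctl(fd, FIGETBSZ, &bsz) < 0) { perror("FIGETBSZ"); return 1; }
	/* block is an index in fs-block units; FIBMAP rewrites it with the
	 * physical block number, or 0 for a hole (or any error). */
	if (ioctl(fd, FIBMAP, &block) < 0) { perror("FIBMAP"); return 1; }

	printf("block size %d, file block 0 maps to physical block %d\n",
	       bsz, block);
	close(fd);
	return 0;
}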
+212
fs/iomap/seek.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2017 Red Hat, Inc. 4 + * Copyright (c) 2018 Christoph Hellwig. 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + #include <linux/pagemap.h> 11 + #include <linux/pagevec.h> 12 + 13 + /* 14 + * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. 15 + * Returns true if found and updates @lastoff to the offset in file. 16 + */ 17 + static bool 18 + page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, 19 + int whence) 20 + { 21 + const struct address_space_operations *ops = inode->i_mapping->a_ops; 22 + unsigned int bsize = i_blocksize(inode), off; 23 + bool seek_data = whence == SEEK_DATA; 24 + loff_t poff = page_offset(page); 25 + 26 + if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) 27 + return false; 28 + 29 + if (*lastoff < poff) { 30 + /* 31 + * Last offset smaller than the start of the page means we found 32 + * a hole: 33 + */ 34 + if (whence == SEEK_HOLE) 35 + return true; 36 + *lastoff = poff; 37 + } 38 + 39 + /* 40 + * Just check the page unless we can and should check block ranges: 41 + */ 42 + if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) 43 + return PageUptodate(page) == seek_data; 44 + 45 + lock_page(page); 46 + if (unlikely(page->mapping != inode->i_mapping)) 47 + goto out_unlock_not_found; 48 + 49 + for (off = 0; off < PAGE_SIZE; off += bsize) { 50 + if (offset_in_page(*lastoff) >= off + bsize) 51 + continue; 52 + if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { 53 + unlock_page(page); 54 + return true; 55 + } 56 + *lastoff = poff + off + bsize; 57 + } 58 + 59 + out_unlock_not_found: 60 + unlock_page(page); 61 + return false; 62 + } 63 + 64 + /* 65 + * Seek for SEEK_DATA / SEEK_HOLE in the page cache. 66 + * 67 + * Within unwritten extents, the page cache determines which parts are holes 68 + * and which are data: uptodate buffer heads count as data; everything else 69 + * counts as a hole. 70 + * 71 + * Returns the resulting offset on successs, and -ENOENT otherwise. 72 + */ 73 + static loff_t 74 + page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, 75 + int whence) 76 + { 77 + pgoff_t index = offset >> PAGE_SHIFT; 78 + pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); 79 + loff_t lastoff = offset; 80 + struct pagevec pvec; 81 + 82 + if (length <= 0) 83 + return -ENOENT; 84 + 85 + pagevec_init(&pvec); 86 + 87 + do { 88 + unsigned nr_pages, i; 89 + 90 + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, 91 + end - 1); 92 + if (nr_pages == 0) 93 + break; 94 + 95 + for (i = 0; i < nr_pages; i++) { 96 + struct page *page = pvec.pages[i]; 97 + 98 + if (page_seek_hole_data(inode, page, &lastoff, whence)) 99 + goto check_range; 100 + lastoff = page_offset(page) + PAGE_SIZE; 101 + } 102 + pagevec_release(&pvec); 103 + } while (index < end); 104 + 105 + /* When no page at lastoff and we are not done, we found a hole. 
*/ 106 + if (whence != SEEK_HOLE) 107 + goto not_found; 108 + 109 + check_range: 110 + if (lastoff < offset + length) 111 + goto out; 112 + not_found: 113 + lastoff = -ENOENT; 114 + out: 115 + pagevec_release(&pvec); 116 + return lastoff; 117 + } 118 + 119 + 120 + static loff_t 121 + iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, 122 + void *data, struct iomap *iomap) 123 + { 124 + switch (iomap->type) { 125 + case IOMAP_UNWRITTEN: 126 + offset = page_cache_seek_hole_data(inode, offset, length, 127 + SEEK_HOLE); 128 + if (offset < 0) 129 + return length; 130 + /* fall through */ 131 + case IOMAP_HOLE: 132 + *(loff_t *)data = offset; 133 + return 0; 134 + default: 135 + return length; 136 + } 137 + } 138 + 139 + loff_t 140 + iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) 141 + { 142 + loff_t size = i_size_read(inode); 143 + loff_t length = size - offset; 144 + loff_t ret; 145 + 146 + /* Nothing to be found before or beyond the end of the file. */ 147 + if (offset < 0 || offset >= size) 148 + return -ENXIO; 149 + 150 + while (length > 0) { 151 + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, 152 + &offset, iomap_seek_hole_actor); 153 + if (ret < 0) 154 + return ret; 155 + if (ret == 0) 156 + break; 157 + 158 + offset += ret; 159 + length -= ret; 160 + } 161 + 162 + return offset; 163 + } 164 + EXPORT_SYMBOL_GPL(iomap_seek_hole); 165 + 166 + static loff_t 167 + iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, 168 + void *data, struct iomap *iomap) 169 + { 170 + switch (iomap->type) { 171 + case IOMAP_HOLE: 172 + return length; 173 + case IOMAP_UNWRITTEN: 174 + offset = page_cache_seek_hole_data(inode, offset, length, 175 + SEEK_DATA); 176 + if (offset < 0) 177 + return length; 178 + /*FALLTHRU*/ 179 + default: 180 + *(loff_t *)data = offset; 181 + return 0; 182 + } 183 + } 184 + 185 + loff_t 186 + iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops) 187 + { 188 + loff_t size = i_size_read(inode); 189 + loff_t length = size - offset; 190 + loff_t ret; 191 + 192 + /* Nothing to be found before or beyond the end of the file. */ 193 + if (offset < 0 || offset >= size) 194 + return -ENXIO; 195 + 196 + while (length > 0) { 197 + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, 198 + &offset, iomap_seek_data_actor); 199 + if (ret < 0) 200 + return ret; 201 + if (ret == 0) 202 + break; 203 + 204 + offset += ret; 205 + length -= ret; 206 + } 207 + 208 + if (length <= 0) 209 + return -ENXIO; 210 + return offset; 211 + } 212 + EXPORT_SYMBOL_GPL(iomap_seek_data);
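These helpers implement SEEK_HOLE/SEEK_DATA for iomap-based filesystems, including data that so far exists only in the page cache over unwritten extents. From userspace the behaviour looks like this (a sketch; filesystems are allowed to round the returned offsets up to their allocation granularity, and the file name is arbitrary):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	off_t hole, data;
	int fd = open("sparse", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) { perror("open"); return 1; }

	/* 1 MiB hole followed by a single byte of data */
	if (ftruncate(fd, 1 << 20) < 0) { perror("ftruncate"); return 1; }
	if (pwrite(fd, "x", 1, 1 << 20) < 0) { perror("pwrite"); return 1; }

	data = lseek(fd, 0, SEEK_DATA);		/* first data at or after 0 */
	hole = lseek(fd, data, SEEK_HOLE);	/* next hole at or after the data */
	printf("data at %lld, next hole at %lld\n",
	       (long long)data, (long long)hole);

	/* ENXIO means there is no data (or hole) beyond the given offset. */
	if (lseek(fd, hole, SEEK_DATA) < 0 && errno == ENXIO)
		printf("no data past %lld\n", (long long)hole);

	close(fd);
	return 0;
}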
+178
fs/iomap/swapfile.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2018 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 + */ 6 + #include <linux/module.h> 7 + #include <linux/compiler.h> 8 + #include <linux/fs.h> 9 + #include <linux/iomap.h> 10 + #include <linux/swap.h> 11 + 12 + /* Swapfile activation */ 13 + 14 + struct iomap_swapfile_info { 15 + struct iomap iomap; /* accumulated iomap */ 16 + struct swap_info_struct *sis; 17 + uint64_t lowest_ppage; /* lowest physical addr seen (pages) */ 18 + uint64_t highest_ppage; /* highest physical addr seen (pages) */ 19 + unsigned long nr_pages; /* number of pages collected */ 20 + int nr_extents; /* extent count */ 21 + }; 22 + 23 + /* 24 + * Collect physical extents for this swap file. Physical extents reported to 25 + * the swap code must be trimmed to align to a page boundary. The logical 26 + * offset within the file is irrelevant since the swapfile code maps logical 27 + * page numbers of the swap device to the physical page-aligned extents. 28 + */ 29 + static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) 30 + { 31 + struct iomap *iomap = &isi->iomap; 32 + unsigned long nr_pages; 33 + uint64_t first_ppage; 34 + uint64_t first_ppage_reported; 35 + uint64_t next_ppage; 36 + int error; 37 + 38 + /* 39 + * Round the start up and the end down so that the physical 40 + * extent aligns to a page boundary. 41 + */ 42 + first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; 43 + next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> 44 + PAGE_SHIFT; 45 + 46 + /* Skip too-short physical extents. */ 47 + if (first_ppage >= next_ppage) 48 + return 0; 49 + nr_pages = next_ppage - first_ppage; 50 + 51 + /* 52 + * Calculate how much swap space we're adding; the first page contains 53 + * the swap header and doesn't count. The mm still wants that first 54 + * page fed to add_swap_extent, however. 55 + */ 56 + first_ppage_reported = first_ppage; 57 + if (iomap->offset == 0) 58 + first_ppage_reported++; 59 + if (isi->lowest_ppage > first_ppage_reported) 60 + isi->lowest_ppage = first_ppage_reported; 61 + if (isi->highest_ppage < (next_ppage - 1)) 62 + isi->highest_ppage = next_ppage - 1; 63 + 64 + /* Add extent, set up for the next call. */ 65 + error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); 66 + if (error < 0) 67 + return error; 68 + isi->nr_extents += error; 69 + isi->nr_pages += nr_pages; 70 + return 0; 71 + } 72 + 73 + /* 74 + * Accumulate iomaps for this swap file. We have to accumulate iomaps because 75 + * swap only cares about contiguous page-aligned physical extents and makes no 76 + * distinction between written and unwritten extents. 77 + */ 78 + static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, 79 + loff_t count, void *data, struct iomap *iomap) 80 + { 81 + struct iomap_swapfile_info *isi = data; 82 + int error; 83 + 84 + switch (iomap->type) { 85 + case IOMAP_MAPPED: 86 + case IOMAP_UNWRITTEN: 87 + /* Only real or unwritten extents. */ 88 + break; 89 + case IOMAP_INLINE: 90 + /* No inline data. */ 91 + pr_err("swapon: file is inline\n"); 92 + return -EINVAL; 93 + default: 94 + pr_err("swapon: file has unallocated extents\n"); 95 + return -EINVAL; 96 + } 97 + 98 + /* No uncommitted metadata or shared blocks. 
*/ 99 + if (iomap->flags & IOMAP_F_DIRTY) { 100 + pr_err("swapon: file is not committed\n"); 101 + return -EINVAL; 102 + } 103 + if (iomap->flags & IOMAP_F_SHARED) { 104 + pr_err("swapon: file has shared extents\n"); 105 + return -EINVAL; 106 + } 107 + 108 + /* Only one bdev per swap file. */ 109 + if (iomap->bdev != isi->sis->bdev) { 110 + pr_err("swapon: file is on multiple devices\n"); 111 + return -EINVAL; 112 + } 113 + 114 + if (isi->iomap.length == 0) { 115 + /* No accumulated extent, so just store it. */ 116 + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); 117 + } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { 118 + /* Append this to the accumulated extent. */ 119 + isi->iomap.length += iomap->length; 120 + } else { 121 + /* Otherwise, add the retained iomap and store this one. */ 122 + error = iomap_swapfile_add_extent(isi); 123 + if (error) 124 + return error; 125 + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); 126 + } 127 + return count; 128 + } 129 + 130 + /* 131 + * Iterate a swap file's iomaps to construct physical extents that can be 132 + * passed to the swapfile subsystem. 133 + */ 134 + int iomap_swapfile_activate(struct swap_info_struct *sis, 135 + struct file *swap_file, sector_t *pagespan, 136 + const struct iomap_ops *ops) 137 + { 138 + struct iomap_swapfile_info isi = { 139 + .sis = sis, 140 + .lowest_ppage = (sector_t)-1ULL, 141 + }; 142 + struct address_space *mapping = swap_file->f_mapping; 143 + struct inode *inode = mapping->host; 144 + loff_t pos = 0; 145 + loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); 146 + loff_t ret; 147 + 148 + /* 149 + * Persist all file mapping metadata so that we won't have any 150 + * IOMAP_F_DIRTY iomaps. 151 + */ 152 + ret = vfs_fsync(swap_file, 1); 153 + if (ret) 154 + return ret; 155 + 156 + while (len > 0) { 157 + ret = iomap_apply(inode, pos, len, IOMAP_REPORT, 158 + ops, &isi, iomap_swapfile_activate_actor); 159 + if (ret <= 0) 160 + return ret; 161 + 162 + pos += ret; 163 + len -= ret; 164 + } 165 + 166 + if (isi.iomap.length) { 167 + ret = iomap_swapfile_add_extent(&isi); 168 + if (ret) 169 + return ret; 170 + } 171 + 172 + *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; 173 + sis->max = isi.nr_pages; 174 + sis->pages = isi.nr_pages - 1; 175 + sis->highest_bit = isi.nr_pages - 1; 176 + return isi.nr_extents; 177 + } 178 + EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
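A filesystem opts into this by pointing its address_space_operations ->swap_activate method at a thin wrapper. A sketch for a hypothetical "myfs" (myfs_iomap_ops is again assumed to exist elsewhere); note that sis->bdev must be filled in before the call, because the actor above rejects extents that live on any other device:

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/swap.h>

extern const struct iomap_ops myfs_iomap_ops;	/* hypothetical */

static int myfs_swap_activate(struct swap_info_struct *sis,
			      struct file *swap_file, sector_t *span)
{
	struct inode *inode = file_inode(swap_file);

	/* Tell the swap code up front which block device backs this file. */
	sis->bdev = inode->i_sb->s_bdev;
	return iomap_swapfile_activate(sis, swap_file, span, &myfs_iomap_ops);
}

static const struct address_space_operations myfs_aops = {
	/* ... readpage, writepages, etc. ... */
	.swap_activate	= myfs_swap_activate,
};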
+17
include/linux/iomap.h
··· 7 7 #include <linux/mm.h> 8 8 #include <linux/types.h> 9 9 #include <linux/mm_types.h> 10 + #include <linux/blkdev.h> 10 11 11 12 struct address_space; 12 13 struct fiemap_extent_info; ··· 70 69 const struct iomap_page_ops *page_ops; 71 70 }; 72 71 72 + static inline sector_t 73 + iomap_sector(struct iomap *iomap, loff_t pos) 74 + { 75 + return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; 76 + } 77 + 73 78 /* 74 79 * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare 75 80 * and page_done will be called for each page written to. This only applies to ··· 121 114 int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length, 122 115 ssize_t written, unsigned flags, struct iomap *iomap); 123 116 }; 117 + 118 + /* 119 + * Main iomap iterator function. 120 + */ 121 + typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, 122 + void *data, struct iomap *iomap); 123 + 124 + loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, 125 + unsigned flags, const struct iomap_ops *ops, void *data, 126 + iomap_actor_t actor); 124 127 125 128 /* 126 129 * Structure allocate for each page when block size < PAGE_SIZE to track
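With iomap_apply() and the iomap_actor_t typedef now visible through the header, each of the new files under fs/iomap/ follows the same shape: a small actor plus a loop that keeps calling iomap_apply() until the range is consumed or the actor stops the walk. A hypothetical example of that calling convention (the names are illustrative only, not kernel APIs), counting how many bytes of a range are backed by written extents:

#include <linux/fs.h>
#include <linux/iomap.h>

static loff_t myfs_count_mapped_actor(struct inode *inode, loff_t pos,
		loff_t length, void *data, struct iomap *iomap)
{
	loff_t *bytes = data;

	if (iomap->type == IOMAP_MAPPED)
		*bytes += length;

	/* Returning the length consumed lets the caller advance; returning 0
	 * stops the iteration, and a negative value reports an error. */
	return length;
}

static loff_t myfs_count_mapped(struct inode *inode, loff_t pos, loff_t length,
		const struct iomap_ops *ops)
{
	loff_t bytes = 0, ret;

	while (length > 0) {
		ret = iomap_apply(inode, pos, length, IOMAP_REPORT, ops,
				  &bytes, myfs_count_mapped_actor);
		if (ret < 0)
			return ret;
		if (ret == 0)
			break;
		pos += ret;
		length -= ret;
	}
	return bytes;
}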