Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
fs/iomap.c at v4.16 (1091 lines, 26 kB)
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include <linux/sched/signal.h>

#include "internal.h"

/*
 * Execute an iomap write on a segment of the mapping that spans a
 * contiguous range of pages that have identical block mapping state.
 *
 * This avoids the need to map pages individually, do individual allocations
 * for each page and most importantly avoid the need for filesystem specific
 * locking per page. Instead, all the operations are amortised over the entire
 * range of pages. It is assumed that the filesystems will lock whatever
 * resources they require in the iomap_begin call, and release them in the
 * iomap_end call.
 */
loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
	struct iomap iomap = { 0 };
	loff_t written = 0, ret;

	/*
	 * Need to map a range from start position for length bytes. This can
	 * span multiple pages - it is only guaranteed to return a range of a
	 * single type of pages (e.g. all into a hole, all mapped or all
	 * unwritten). Failure at this point has nothing to undo.
	 *
	 * If allocation is required for this range, reserve the space now so
	 * that the allocation is guaranteed to succeed later on. Once we copy
	 * the data into the page cache pages, then we cannot fail otherwise we
	 * expose transient stale data. If the reserve fails, we can safely
	 * back out at this point as there is nothing to undo.
	 */
	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
	if (ret)
		return ret;
	if (WARN_ON(iomap.offset > pos))
		return -EIO;
	if (WARN_ON(iomap.length == 0))
		return -EIO;

	/*
	 * Cut down the length to the one actually provided by the filesystem,
	 * as it might not be able to give us the whole size that we requested.
	 */
	if (iomap.offset + iomap.length < pos + length)
		length = iomap.offset + iomap.length - pos;

	/*
	 * Now that we have guaranteed that the space allocation will succeed,
	 * we can do the copy-in page by page without having to worry about
	 * failures exposing transient data.
	 */
	written = actor(inode, pos, length, data, &iomap);

	/*
	 * Now the data has been copied, commit the range we've copied. This
	 * should not fail unless the filesystem has had a fatal error.
	 */
	if (ops->iomap_end) {
		ret = ops->iomap_end(inode, pos, length,
				     written > 0 ? written : 0,
				     flags, &iomap);
	}

	return written ? written : ret;
}
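/*
 * Illustrative sketch, not part of the original file: the minimal shape of
 * an actor as iomap_apply() drives it. iomap_apply() guarantees that
 * [pos, pos + length) lies entirely within the single extent returned by
 * ->iomap_begin(), so an actor handles exactly one mapping state at a time
 * and returns how many bytes it consumed (or a negative errno). All names
 * below are hypothetical.
 */
static loff_t
example_count_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	loff_t *total = data;	/* cursor passed in through iomap_apply() */

	/* Consume the whole extent; a positive return advances the caller. */
	*total += length;
	return length;
}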
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
}

static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, struct iomap *iomap)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	struct page *page;
	int status = 0;

	BUG_ON(pos + len > iomap->offset + iomap->length);

	if (fatal_signal_pending(current))
		return -EINTR;

	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin_int(page, pos, len, NULL, iomap);
	if (unlikely(status)) {
		unlock_page(page);
		put_page(page);
		page = NULL;

		iomap_write_failed(inode, pos, len);
	}

	*pagep = page;
	return status;
}

static int
iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
		unsigned copied, struct page *page)
{
	int ret;

	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
			copied, page, NULL);
	if (ret < len)
		iomap_write_failed(inode, pos, len);
	return ret;
}

static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *i = data;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = AOP_FLAG_NOFS;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));
again:
		if (bytes > length)
			bytes = length;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(inode, pos, bytes, flags, &page,
				iomap);
		if (unlikely(status))
			break;

		if (mapping_writably_mapped(inode->i_mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);

		flush_dcache_page(page);

		status = iomap_write_end(inode, pos, bytes, copied, page);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fall back here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;
		length -= copied;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (iov_iter_count(i) && length);

	return written ? written : status;
}

ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, written = 0;

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter),
				IOMAP_WRITE, ops, iter, iomap_write_actor);
		if (ret <= 0)
			break;
		pos += ret;
		written += ret;
	}

	return written ? written : ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
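/*
 * Worked example for the loop in iomap_write_actor() above (illustrative):
 * with 4 KiB pages, a write starting at pos = 5000 has
 * offset = 5000 & 4095 = 904 within its page, so at most
 * PAGE_SIZE - 904 = 3192 bytes can be copied before the loop crosses into
 * the next page and recomputes offset and bytes.
 */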
static struct page *
__iomap_read_page(struct inode *inode, loff_t offset)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;

	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
	if (IS_ERR(page))
		return page;
	if (!PageUptodate(page)) {
		put_page(page);
		return ERR_PTR(-EIO);
	}
	return page;
}

static loff_t
iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	long status = 0;
	ssize_t written = 0;

	do {
		struct page *page, *rpage;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(loff_t, PAGE_SIZE - offset, length);

		rpage = __iomap_read_page(inode, pos);
		if (IS_ERR(rpage))
			return PTR_ERR(rpage);

		status = iomap_write_begin(inode, pos, bytes,
					   AOP_FLAG_NOFS, &page, iomap);
		put_page(rpage);
		if (unlikely(status))
			return status;

		WARN_ON_ONCE(!PageUptodate(page));

		status = iomap_write_end(inode, pos, bytes, bytes, page);
		if (unlikely(status <= 0)) {
			if (WARN_ON_ONCE(status == 0))
				return -EIO;
			return status;
		}

		cond_resched();

		pos += status;
		written += status;
		length -= status;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (length);

	return written;
}

int
iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	loff_t ret;

	while (len) {
		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
				iomap_dirty_actor);
		if (ret <= 0)
			return ret;
		pos += ret;
		len -= ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_file_dirty);

static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
		unsigned bytes, struct iomap *iomap)
{
	struct page *page;
	int status;

	status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
				   iomap);
	if (status)
		return status;

	zero_user(page, offset, bytes);
	mark_page_accessed(page);

	return iomap_write_end(inode, pos, bytes, bytes, page);
}

static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
		struct iomap *iomap)
{
	sector_t sector = (iomap->addr +
			   (pos & PAGE_MASK) - iomap->offset) >> 9;

	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
			offset, bytes);
}

static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
		void *data, struct iomap *iomap)
{
	bool *did_zero = data;
	loff_t written = 0;
	int status;

	/* already zeroed?  we're done. */
	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
		return count;

	do {
		unsigned offset, bytes;

		offset = pos & (PAGE_SIZE - 1); /* Within page */
		bytes = min_t(loff_t, PAGE_SIZE - offset, count);

		if (IS_DAX(inode))
			status = iomap_dax_zero(pos, offset, bytes, iomap);
		else
			status = iomap_zero(inode, pos, offset, bytes, iomap);
		if (status < 0)
			return status;

		pos += bytes;
		count -= bytes;
		written += bytes;
		if (did_zero)
			*did_zero = true;
	} while (count > 0);

	return written;
}

int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{
	loff_t ret;

	while (len > 0) {
		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
				ops, did_zero, iomap_zero_range_actor);
		if (ret <= 0)
			return ret;

		pos += ret;
		len -= ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);

int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
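/*
 * Worked example for iomap_truncate_page() above (illustrative): with a
 * 4096-byte block size and a new EOF at pos = 6000,
 * off = 6000 & 4095 = 1904, so the remaining 4096 - 1904 = 2192 bytes of
 * the final block are zeroed rather than leaving stale data past the new
 * end of file.
 */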
static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct page *page = data;
	int ret;

	ret = __block_write_begin_int(page, pos, length, NULL, iomap);
	if (ret)
		return ret;

	block_commit_write(page, 0, length);
	return length;
}

int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	unsigned long length;
	loff_t offset, size;
	ssize_t ret;

	lock_page(page);
	size = i_size_read(inode);
	if ((page->mapping != inode->i_mapping) ||
	    (page_offset(page) > size)) {
		/* We overload EFAULT to mean page got truncated */
		ret = -EFAULT;
		goto out_unlock;
	}

	/* page is wholly or partially inside EOF */
	if (((page->index + 1) << PAGE_SHIFT) > size)
		length = size & ~PAGE_MASK;
	else
		length = PAGE_SIZE;

	offset = page_offset(page);
	while (length > 0) {
		ret = iomap_apply(inode, offset, length,
				IOMAP_WRITE | IOMAP_FAULT, ops, page,
				iomap_page_mkwrite_actor);
		if (unlikely(ret <= 0))
			goto out_unlock;
		offset += ret;
		length -= ret;
	}

	set_page_dirty(page);
	wait_for_stable_page(page);
	return VM_FAULT_LOCKED;
out_unlock:
	unlock_page(page);
	return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
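/*
 * Illustrative sketch, not part of the original file: a filesystem wires
 * iomap_page_mkwrite() up through its vm_operations_struct, roughly
 * (all names below are hypothetical):
 *
 *	static int example_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		return iomap_page_mkwrite(vmf, &example_iomap_ops);
 *	}
 *
 *	static const struct vm_operations_struct example_vm_ops = {
 *		.fault		= filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= example_page_mkwrite,
 *	};
 */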
struct fiemap_ctx {
	struct fiemap_extent_info *fi;
	struct iomap prev;
};

static int iomap_to_fiemap(struct fiemap_extent_info *fi,
		struct iomap *iomap, u32 flags)
{
	switch (iomap->type) {
	case IOMAP_HOLE:
		/* skip holes */
		return 0;
	case IOMAP_DELALLOC:
		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
		break;
	case IOMAP_UNWRITTEN:
		flags |= FIEMAP_EXTENT_UNWRITTEN;
		break;
	case IOMAP_MAPPED:
		break;
	}

	if (iomap->flags & IOMAP_F_MERGED)
		flags |= FIEMAP_EXTENT_MERGED;
	if (iomap->flags & IOMAP_F_SHARED)
		flags |= FIEMAP_EXTENT_SHARED;
	if (iomap->flags & IOMAP_F_DATA_INLINE)
		flags |= FIEMAP_EXTENT_DATA_INLINE;

	return fiemap_fill_next_extent(fi, iomap->offset,
			iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
			iomap->length, flags);
}

static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct fiemap_ctx *ctx = data;
	loff_t ret = length;

	if (iomap->type == IOMAP_HOLE)
		return length;

	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
	ctx->prev = *iomap;
	switch (ret) {
	case 0:		/* success */
		return length;
	case 1:		/* extent array full */
		return 0;
	default:
		return ret;
	}
}

int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		loff_t start, loff_t len, const struct iomap_ops *ops)
{
	struct fiemap_ctx ctx;
	loff_t ret;

	memset(&ctx, 0, sizeof(ctx));
	ctx.fi = fi;
	ctx.prev.type = IOMAP_HOLE;

	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = filemap_write_and_wait(inode->i_mapping);
		if (ret)
			return ret;
	}

	while (len > 0) {
		ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
				iomap_fiemap_actor);
		/* inode with no (attribute) mapping will give ENOENT */
		if (ret == -ENOENT)
			break;
		if (ret < 0)
			return ret;
		if (ret == 0)
			break;

		start += ret;
		len -= ret;
	}

	if (ctx.prev.type != IOMAP_HOLE) {
		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
		if (ret < 0)
			return ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
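/*
 * Illustrative note, not part of the original file: iomap_fiemap() emits
 * extents one iteration behind through ctx.prev on purpose. Only once the
 * walk ends does it know that the extent still held in ctx.prev is the
 * final one, so that extent can be flushed with FIEMAP_EXTENT_LAST set.
 */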
static loff_t
iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
		      void *data, struct iomap *iomap)
{
	switch (iomap->type) {
	case IOMAP_UNWRITTEN:
		offset = page_cache_seek_hole_data(inode, offset, length,
						   SEEK_HOLE);
		if (offset < 0)
			return length;
		/* fall through */
	case IOMAP_HOLE:
		*(loff_t *)data = offset;
		return 0;
	default:
		return length;
	}
}

loff_t
iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
{
	loff_t size = i_size_read(inode);
	loff_t length = size - offset;
	loff_t ret;

	/* Nothing to be found before or beyond the end of the file. */
	if (offset < 0 || offset >= size)
		return -ENXIO;

	while (length > 0) {
		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
				  &offset, iomap_seek_hole_actor);
		if (ret < 0)
			return ret;
		if (ret == 0)
			break;

		offset += ret;
		length -= ret;
	}

	return offset;
}
EXPORT_SYMBOL_GPL(iomap_seek_hole);

static loff_t
iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
		      void *data, struct iomap *iomap)
{
	switch (iomap->type) {
	case IOMAP_HOLE:
		return length;
	case IOMAP_UNWRITTEN:
		offset = page_cache_seek_hole_data(inode, offset, length,
						   SEEK_DATA);
		if (offset < 0)
			return length;
		/*FALLTHRU*/
	default:
		*(loff_t *)data = offset;
		return 0;
	}
}

loff_t
iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
{
	loff_t size = i_size_read(inode);
	loff_t length = size - offset;
	loff_t ret;

	/* Nothing to be found before or beyond the end of the file. */
	if (offset < 0 || offset >= size)
		return -ENXIO;

	while (length > 0) {
		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
				  &offset, iomap_seek_data_actor);
		if (ret < 0)
			return ret;
		if (ret == 0)
			break;

		offset += ret;
		length -= ret;
	}

	if (length <= 0)
		return -ENXIO;
	return offset;
}
EXPORT_SYMBOL_GPL(iomap_seek_data);
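/*
 * Illustrative sketch, not part of the original file: an ->llseek
 * implementation dispatches to the helpers above for the extent-aware
 * whence values, roughly (hypothetical ops name):
 *
 *	switch (whence) {
 *	case SEEK_HOLE:
 *		offset = iomap_seek_hole(inode, offset, &example_iomap_ops);
 *		break;
 *	case SEEK_DATA:
 *		offset = iomap_seek_data(inode, offset, &example_iomap_ops);
 *		break;
 *	default:
 *		return generic_file_llseek(file, offset, whence);
 *	}
 *	if (offset < 0)
 *		return offset;
 *	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 */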
/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_WRITE		(1 << 30)
#define IOMAP_DIO_DIRTY		(1 << 31)

struct iomap_dio {
	struct kiocb		*iocb;
	iomap_dio_end_io_t	*end_io;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
			struct request_queue	*last_queue;
			blk_qc_t		cookie;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};

static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	struct kiocb *iocb = dio->iocb;
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	ssize_t ret;

	if (dio->end_io) {
		ret = dio->end_io(iocb,
				dio->error ? dio->error : dio->size,
				dio->flags);
	} else {
		ret = dio->error;
	}

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - offset;
		iocb->ki_pos += ret;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the
	 * source of the write was an mmap'ed region of the file we're writing.
	 * Either one is a pretty crazy thing to do, so we don't support it
	 * 100%.  If this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after dio->end_io(), as
	 * some filesystems convert unwritten extents to real allocations in
	 * end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (!dio->error &&
	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
		int err;
		err = invalidate_inode_pages2_range(inode->i_mapping,
				offset >> PAGE_SHIFT,
				(offset + dio->size - 1) >> PAGE_SHIFT);
		if (err)
			dio_warn_stale_pagecache(iocb->ki_filp);
	}

	inode_dio_end(file_inode(iocb->ki_filp));
	kfree(dio);

	return ret;
}

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;
	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
	ssize_t ret;

	ret = iomap_dio_complete(dio);
	if (is_write && ret > 0)
		ret = generic_write_sync(iocb, ret);
	iocb->ki_complete(iocb, ret, 0);
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

static void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

	if (atomic_dec_and_test(&dio->ref)) {
		if (is_sync_kiocb(dio->iocb)) {
			struct task_struct *waiter = dio->submit.waiter;

			WRITE_ONCE(dio->submit.waiter, NULL);
			wake_up_process(waiter);
		} else if (dio->flags & IOMAP_DIO_WRITE) {
			struct inode *inode = file_inode(dio->iocb->ki_filp);

			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		} else {
			iomap_dio_complete_work(&dio->aio.work);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		struct bio_vec *bvec;
		int i;

		bio_for_each_segment_all(bvec, bio, i)
			put_page(bvec->bv_page);
		bio_put(bio);
	}
}
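/*
 * Illustrative note, not part of the original file: dio->ref follows a
 * simple scheme. iomap_dio_rw() takes one reference for the submission
 * path and each submitted bio takes another, so whichever context drops
 * the count to zero (the last bio completion above, or submission itself
 * once all bios have already finished) is the one that completes the dio.
 */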
static blk_qc_t
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
		unsigned len)
{
	struct page *page = ZERO_PAGE(0);
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, 1);
	bio_set_dev(bio, iomap->bdev);
	bio->bi_iter.bi_sector =
		(iomap->addr + pos - iomap->offset) >> 9;
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	get_page(page);
	if (bio_add_page(bio, page, len, 0) != len)
		BUG();
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);

	atomic_inc(&dio->ref);
	return submit_bio(bio);
}

static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct iomap_dio *dio = data;
	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned int fs_block_size = i_blocksize(inode), pad;
	unsigned int align = iov_iter_alignment(dio->submit.iter);
	struct iov_iter iter;
	struct bio *bio;
	bool need_zeroout = false;
	int nr_pages, ret;
	size_t copied = 0;

	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		/*FALLTHRU*/
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE)) {
			length = iov_iter_zero(length, dio->submit.iter);
			dio->size += length;
			return length;
		}
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
		break;
	case IOMAP_MAPPED:
		if (iomap->flags & IOMAP_F_SHARED)
			dio->flags |= IOMAP_DIO_COW;
		if (iomap->flags & IOMAP_F_NEW)
			need_zeroout = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}

	/*
	 * Operate on a partial iter trimmed to the extent we were called for.
	 * We'll update the iter in the dio once we're done with this extent.
	 */
	iter = *dio->submit.iter;
	iov_iter_truncate(&iter, length);

	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
	if (nr_pages <= 0)
		return nr_pages;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}

	do {
		size_t n;
		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			return 0;
		}

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio_set_dev(bio, iomap->bdev);
		bio->bi_iter.bi_sector =
			(iomap->addr + pos - iomap->offset) >> 9;
		bio->bi_write_hint = dio->iocb->ki_hint;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, &iter);
		if (unlikely(ret)) {
			bio_put(bio);
			return copied ? copied : ret;
		}

		n = bio->bi_iter.bi_size;
		if (dio->flags & IOMAP_DIO_WRITE) {
			bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
			task_io_account_write(n);
		} else {
			bio_set_op_attrs(bio, REQ_OP_READ, 0);
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		iov_iter_advance(dio->submit.iter, n);

		dio->size += n;
		pos += n;
		copied += n;

		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);

		atomic_inc(&dio->ref);

		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
		dio->submit.cookie = submit_bio(bio);
	} while (nr_pages);

	if (need_zeroout) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}
	return copied;
}
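/*
 * Worked example for the zeroout above (illustrative): with a 4096-byte
 * filesystem block and a sector-aligned write of 2048 bytes at pos 1024
 * into a newly allocated block, the leading pad is 1024, so [0, 1024) is
 * zeroed before the data; after submission pos = 3072, so the trailing
 * pad zeroes [3072, 4096) and the whole block ends up initialised.
 */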
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos, start = pos;
	loff_t end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->end_io = end_io;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	if (is_sync_kiocb(iocb)) {
		dio->submit.waiter = current;
		dio->submit.cookie = BLK_QC_T_NONE;
		dio->submit.last_queue = NULL;
	}

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter->type == ITER_IOVEC)
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		dio->flags |= IOMAP_DIO_WRITE;
		flags |= IOMAP_WRITE;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (filemap_range_has_page(mapping, start, end)) {
			ret = -EAGAIN;
			goto out_free_dio;
		}
		flags |= IOMAP_NOWAIT;
	}

	ret = filemap_write_and_wait_range(mapping, start, end);
	if (ret)
		goto out_free_dio;

	/*
	 * Try to invalidate cache pages for the range we're direct
	 * writing.  If this invalidation fails, tough, the write will
	 * still work, but racing two incompatible write paths is a
	 * pretty crazy thing to do, so we don't support it 100%.
	 */
	ret = invalidate_inode_pages2_range(mapping,
			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
	if (ret)
		dio_warn_stale_pagecache(iocb->ki_filp);
	ret = 0;

	if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
	    !inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			goto out_free_dio;
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK)
				ret = 0;
			break;
		}
		pos += ret;

		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
			break;
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	if (!atomic_dec_and_test(&dio->ref)) {
		if (!is_sync_kiocb(iocb))
			return -EIOCBQUEUED;

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_poll(dio->submit.last_queue,
					dio->submit.cookie))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	ret = iomap_dio_complete(dio);

	return ret;

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
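/*
 * Illustrative sketch, not part of the original file: a filesystem's
 * ->read_iter calls iomap_dio_rw() with the inode lock held, roughly
 * (hypothetical ops name):
 *
 *	if (iocb->ki_flags & IOCB_DIRECT) {
 *		inode_lock_shared(inode);
 *		ret = iomap_dio_rw(iocb, to, &example_iomap_ops, NULL);
 *		inode_unlock_shared(inode);
 *		return ret;
 *	}
 *
 * A write path does the same with the exclusive lock held and may pass an
 * end_io callback so unwritten extents can be converted at completion.
 */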