Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v4.9-rc6 585 lines 14 kB view raw
1/* 2 * Copyright (C) 2010 Red Hat, Inc. 3 * Copyright (c) 2016 Christoph Hellwig. 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 */ 14#include <linux/module.h> 15#include <linux/compiler.h> 16#include <linux/fs.h> 17#include <linux/iomap.h> 18#include <linux/uaccess.h> 19#include <linux/gfp.h> 20#include <linux/mm.h> 21#include <linux/swap.h> 22#include <linux/pagemap.h> 23#include <linux/file.h> 24#include <linux/uio.h> 25#include <linux/backing-dev.h> 26#include <linux/buffer_head.h> 27#include <linux/dax.h> 28#include "internal.h" 29 30/* 31 * Execute a iomap write on a segment of the mapping that spans a 32 * contiguous range of pages that have identical block mapping state. 33 * 34 * This avoids the need to map pages individually, do individual allocations 35 * for each page and most importantly avoid the need for filesystem specific 36 * locking per page. Instead, all the operations are amortised over the entire 37 * range of pages. It is assumed that the filesystems will lock whatever 38 * resources they require in the iomap_begin call, and release them in the 39 * iomap_end call. 40 */ 41loff_t 42iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, 43 struct iomap_ops *ops, void *data, iomap_actor_t actor) 44{ 45 struct iomap iomap = { 0 }; 46 loff_t written = 0, ret; 47 48 /* 49 * Need to map a range from start position for length bytes. This can 50 * span multiple pages - it is only guaranteed to return a range of a 51 * single type of pages (e.g. all into a hole, all mapped or all 52 * unwritten). Failure at this point has nothing to undo. 53 * 54 * If allocation is required for this range, reserve the space now so 55 * that the allocation is guaranteed to succeed later on. Once we copy 56 * the data into the page cache pages, then we cannot fail otherwise we 57 * expose transient stale data. If the reserve fails, we can safely 58 * back out at this point as there is nothing to undo. 59 */ 60 ret = ops->iomap_begin(inode, pos, length, flags, &iomap); 61 if (ret) 62 return ret; 63 if (WARN_ON(iomap.offset > pos)) 64 return -EIO; 65 66 /* 67 * Cut down the length to the one actually provided by the filesystem, 68 * as it might not be able to give us the whole size that we requested. 69 */ 70 if (iomap.offset + iomap.length < pos + length) 71 length = iomap.offset + iomap.length - pos; 72 73 /* 74 * Now that we have guaranteed that the space allocation will succeed. 75 * we can do the copy-in page by page without having to worry about 76 * failures exposing transient data. 77 */ 78 written = actor(inode, pos, length, data, &iomap); 79 80 /* 81 * Now the data has been copied, commit the range we've copied. This 82 * should not fail unless the filesystem has had a fatal error. 83 */ 84 if (ops->iomap_end) { 85 ret = ops->iomap_end(inode, pos, length, 86 written > 0 ? written : 0, 87 flags, &iomap); 88 } 89 90 return written ? written : ret; 91} 92 93static void 94iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) 95{ 96 loff_t i_size = i_size_read(inode); 97 98 /* 99 * Only truncate newly allocated pages beyoned EOF, even if the 100 * write started inside the existing inode size. 101 */ 102 if (pos + len > i_size) 103 truncate_pagecache_range(inode, max(pos, i_size), pos + len); 104} 105 106static int 107iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, 108 struct page **pagep, struct iomap *iomap) 109{ 110 pgoff_t index = pos >> PAGE_SHIFT; 111 struct page *page; 112 int status = 0; 113 114 BUG_ON(pos + len > iomap->offset + iomap->length); 115 116 page = grab_cache_page_write_begin(inode->i_mapping, index, flags); 117 if (!page) 118 return -ENOMEM; 119 120 status = __block_write_begin_int(page, pos, len, NULL, iomap); 121 if (unlikely(status)) { 122 unlock_page(page); 123 put_page(page); 124 page = NULL; 125 126 iomap_write_failed(inode, pos, len); 127 } 128 129 *pagep = page; 130 return status; 131} 132 133static int 134iomap_write_end(struct inode *inode, loff_t pos, unsigned len, 135 unsigned copied, struct page *page) 136{ 137 int ret; 138 139 ret = generic_write_end(NULL, inode->i_mapping, pos, len, 140 copied, page, NULL); 141 if (ret < len) 142 iomap_write_failed(inode, pos, len); 143 return ret; 144} 145 146static loff_t 147iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 148 struct iomap *iomap) 149{ 150 struct iov_iter *i = data; 151 long status = 0; 152 ssize_t written = 0; 153 unsigned int flags = AOP_FLAG_NOFS; 154 155 /* 156 * Copies from kernel address space cannot fail (NFSD is a big user). 157 */ 158 if (!iter_is_iovec(i)) 159 flags |= AOP_FLAG_UNINTERRUPTIBLE; 160 161 do { 162 struct page *page; 163 unsigned long offset; /* Offset into pagecache page */ 164 unsigned long bytes; /* Bytes to write to page */ 165 size_t copied; /* Bytes copied from user */ 166 167 offset = (pos & (PAGE_SIZE - 1)); 168 bytes = min_t(unsigned long, PAGE_SIZE - offset, 169 iov_iter_count(i)); 170again: 171 if (bytes > length) 172 bytes = length; 173 174 /* 175 * Bring in the user page that we will copy from _first_. 176 * Otherwise there's a nasty deadlock on copying from the 177 * same page as we're writing to, without it being marked 178 * up-to-date. 179 * 180 * Not only is this an optimisation, but it is also required 181 * to check that the address is actually valid, when atomic 182 * usercopies are used, below. 183 */ 184 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 185 status = -EFAULT; 186 break; 187 } 188 189 status = iomap_write_begin(inode, pos, bytes, flags, &page, 190 iomap); 191 if (unlikely(status)) 192 break; 193 194 if (mapping_writably_mapped(inode->i_mapping)) 195 flush_dcache_page(page); 196 197 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 198 199 flush_dcache_page(page); 200 201 status = iomap_write_end(inode, pos, bytes, copied, page); 202 if (unlikely(status < 0)) 203 break; 204 copied = status; 205 206 cond_resched(); 207 208 iov_iter_advance(i, copied); 209 if (unlikely(copied == 0)) { 210 /* 211 * If we were unable to copy any data at all, we must 212 * fall back to a single segment length write. 213 * 214 * If we didn't fallback here, we could livelock 215 * because not all segments in the iov can be copied at 216 * once without a pagefault. 217 */ 218 bytes = min_t(unsigned long, PAGE_SIZE - offset, 219 iov_iter_single_seg_count(i)); 220 goto again; 221 } 222 pos += copied; 223 written += copied; 224 length -= copied; 225 226 balance_dirty_pages_ratelimited(inode->i_mapping); 227 } while (iov_iter_count(i) && length); 228 229 return written ? written : status; 230} 231 232ssize_t 233iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, 234 struct iomap_ops *ops) 235{ 236 struct inode *inode = iocb->ki_filp->f_mapping->host; 237 loff_t pos = iocb->ki_pos, ret = 0, written = 0; 238 239 while (iov_iter_count(iter)) { 240 ret = iomap_apply(inode, pos, iov_iter_count(iter), 241 IOMAP_WRITE, ops, iter, iomap_write_actor); 242 if (ret <= 0) 243 break; 244 pos += ret; 245 written += ret; 246 } 247 248 return written ? written : ret; 249} 250EXPORT_SYMBOL_GPL(iomap_file_buffered_write); 251 252static struct page * 253__iomap_read_page(struct inode *inode, loff_t offset) 254{ 255 struct address_space *mapping = inode->i_mapping; 256 struct page *page; 257 258 page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); 259 if (IS_ERR(page)) 260 return page; 261 if (!PageUptodate(page)) { 262 put_page(page); 263 return ERR_PTR(-EIO); 264 } 265 return page; 266} 267 268static loff_t 269iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 270 struct iomap *iomap) 271{ 272 long status = 0; 273 ssize_t written = 0; 274 275 do { 276 struct page *page, *rpage; 277 unsigned long offset; /* Offset into pagecache page */ 278 unsigned long bytes; /* Bytes to write to page */ 279 280 offset = (pos & (PAGE_SIZE - 1)); 281 bytes = min_t(unsigned long, PAGE_SIZE - offset, length); 282 283 rpage = __iomap_read_page(inode, pos); 284 if (IS_ERR(rpage)) 285 return PTR_ERR(rpage); 286 287 status = iomap_write_begin(inode, pos, bytes, 288 AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, 289 &page, iomap); 290 put_page(rpage); 291 if (unlikely(status)) 292 return status; 293 294 WARN_ON_ONCE(!PageUptodate(page)); 295 296 status = iomap_write_end(inode, pos, bytes, bytes, page); 297 if (unlikely(status <= 0)) { 298 if (WARN_ON_ONCE(status == 0)) 299 return -EIO; 300 return status; 301 } 302 303 cond_resched(); 304 305 pos += status; 306 written += status; 307 length -= status; 308 309 balance_dirty_pages_ratelimited(inode->i_mapping); 310 } while (length); 311 312 return written; 313} 314 315int 316iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, 317 struct iomap_ops *ops) 318{ 319 loff_t ret; 320 321 while (len) { 322 ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, 323 iomap_dirty_actor); 324 if (ret <= 0) 325 return ret; 326 pos += ret; 327 len -= ret; 328 } 329 330 return 0; 331} 332EXPORT_SYMBOL_GPL(iomap_file_dirty); 333 334static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, 335 unsigned bytes, struct iomap *iomap) 336{ 337 struct page *page; 338 int status; 339 340 status = iomap_write_begin(inode, pos, bytes, 341 AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap); 342 if (status) 343 return status; 344 345 zero_user(page, offset, bytes); 346 mark_page_accessed(page); 347 348 return iomap_write_end(inode, pos, bytes, bytes, page); 349} 350 351static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, 352 struct iomap *iomap) 353{ 354 sector_t sector = iomap->blkno + 355 (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); 356 357 return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); 358} 359 360static loff_t 361iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, 362 void *data, struct iomap *iomap) 363{ 364 bool *did_zero = data; 365 loff_t written = 0; 366 int status; 367 368 /* already zeroed? we're done. */ 369 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) 370 return count; 371 372 do { 373 unsigned offset, bytes; 374 375 offset = pos & (PAGE_SIZE - 1); /* Within page */ 376 bytes = min_t(unsigned, PAGE_SIZE - offset, count); 377 378 if (IS_DAX(inode)) 379 status = iomap_dax_zero(pos, offset, bytes, iomap); 380 else 381 status = iomap_zero(inode, pos, offset, bytes, iomap); 382 if (status < 0) 383 return status; 384 385 pos += bytes; 386 count -= bytes; 387 written += bytes; 388 if (did_zero) 389 *did_zero = true; 390 } while (count > 0); 391 392 return written; 393} 394 395int 396iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, 397 struct iomap_ops *ops) 398{ 399 loff_t ret; 400 401 while (len > 0) { 402 ret = iomap_apply(inode, pos, len, IOMAP_ZERO, 403 ops, did_zero, iomap_zero_range_actor); 404 if (ret <= 0) 405 return ret; 406 407 pos += ret; 408 len -= ret; 409 } 410 411 return 0; 412} 413EXPORT_SYMBOL_GPL(iomap_zero_range); 414 415int 416iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 417 struct iomap_ops *ops) 418{ 419 unsigned blocksize = (1 << inode->i_blkbits); 420 unsigned off = pos & (blocksize - 1); 421 422 /* Block boundary? Nothing to do */ 423 if (!off) 424 return 0; 425 return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); 426} 427EXPORT_SYMBOL_GPL(iomap_truncate_page); 428 429static loff_t 430iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, 431 void *data, struct iomap *iomap) 432{ 433 struct page *page = data; 434 int ret; 435 436 ret = __block_write_begin_int(page, pos, length, NULL, iomap); 437 if (ret) 438 return ret; 439 440 block_commit_write(page, 0, length); 441 return length; 442} 443 444int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 445 struct iomap_ops *ops) 446{ 447 struct page *page = vmf->page; 448 struct inode *inode = file_inode(vma->vm_file); 449 unsigned long length; 450 loff_t offset, size; 451 ssize_t ret; 452 453 lock_page(page); 454 size = i_size_read(inode); 455 if ((page->mapping != inode->i_mapping) || 456 (page_offset(page) > size)) { 457 /* We overload EFAULT to mean page got truncated */ 458 ret = -EFAULT; 459 goto out_unlock; 460 } 461 462 /* page is wholly or partially inside EOF */ 463 if (((page->index + 1) << PAGE_SHIFT) > size) 464 length = size & ~PAGE_MASK; 465 else 466 length = PAGE_SIZE; 467 468 offset = page_offset(page); 469 while (length > 0) { 470 ret = iomap_apply(inode, offset, length, IOMAP_WRITE, 471 ops, page, iomap_page_mkwrite_actor); 472 if (unlikely(ret <= 0)) 473 goto out_unlock; 474 offset += ret; 475 length -= ret; 476 } 477 478 set_page_dirty(page); 479 wait_for_stable_page(page); 480 return 0; 481out_unlock: 482 unlock_page(page); 483 return ret; 484} 485EXPORT_SYMBOL_GPL(iomap_page_mkwrite); 486 487struct fiemap_ctx { 488 struct fiemap_extent_info *fi; 489 struct iomap prev; 490}; 491 492static int iomap_to_fiemap(struct fiemap_extent_info *fi, 493 struct iomap *iomap, u32 flags) 494{ 495 switch (iomap->type) { 496 case IOMAP_HOLE: 497 /* skip holes */ 498 return 0; 499 case IOMAP_DELALLOC: 500 flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; 501 break; 502 case IOMAP_UNWRITTEN: 503 flags |= FIEMAP_EXTENT_UNWRITTEN; 504 break; 505 case IOMAP_MAPPED: 506 break; 507 } 508 509 if (iomap->flags & IOMAP_F_MERGED) 510 flags |= FIEMAP_EXTENT_MERGED; 511 if (iomap->flags & IOMAP_F_SHARED) 512 flags |= FIEMAP_EXTENT_SHARED; 513 514 return fiemap_fill_next_extent(fi, iomap->offset, 515 iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, 516 iomap->length, flags); 517 518} 519 520static loff_t 521iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 522 struct iomap *iomap) 523{ 524 struct fiemap_ctx *ctx = data; 525 loff_t ret = length; 526 527 if (iomap->type == IOMAP_HOLE) 528 return length; 529 530 ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); 531 ctx->prev = *iomap; 532 switch (ret) { 533 case 0: /* success */ 534 return length; 535 case 1: /* extent array full */ 536 return 0; 537 default: 538 return ret; 539 } 540} 541 542int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, 543 loff_t start, loff_t len, struct iomap_ops *ops) 544{ 545 struct fiemap_ctx ctx; 546 loff_t ret; 547 548 memset(&ctx, 0, sizeof(ctx)); 549 ctx.fi = fi; 550 ctx.prev.type = IOMAP_HOLE; 551 552 ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); 553 if (ret) 554 return ret; 555 556 if (fi->fi_flags & FIEMAP_FLAG_SYNC) { 557 ret = filemap_write_and_wait(inode->i_mapping); 558 if (ret) 559 return ret; 560 } 561 562 while (len > 0) { 563 ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, 564 iomap_fiemap_actor); 565 /* inode with no (attribute) mapping will give ENOENT */ 566 if (ret == -ENOENT) 567 break; 568 if (ret < 0) 569 return ret; 570 if (ret == 0) 571 break; 572 573 start += ret; 574 len -= ret; 575 } 576 577 if (ctx.prev.type != IOMAP_HOLE) { 578 ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); 579 if (ret < 0) 580 return ret; 581 } 582 583 return 0; 584} 585EXPORT_SYMBOL_GPL(iomap_fiemap);