Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
fs/iomap.c at v4.8-rc6

/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include "internal.h"

typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
		void *data, struct iomap *iomap);

/*
 * Execute an iomap write on a segment of the mapping that spans a
 * contiguous range of pages that have identical block mapping state.
 *
 * This avoids the need to map pages individually, do individual allocations
 * for each page and most importantly avoid the need for filesystem specific
 * locking per page. Instead, all the operations are amortised over the entire
 * range of pages. It is assumed that the filesystems will lock whatever
 * resources they require in the iomap_begin call, and release them in the
 * iomap_end call.
 */
static loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
		struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
	struct iomap iomap = { 0 };
	loff_t written = 0, ret;

	/*
	 * Need to map a range from start position for length bytes. This can
	 * span multiple pages - it is only guaranteed to return a range of a
	 * single type of pages (e.g. all into a hole, all mapped or all
	 * unwritten). Failure at this point has nothing to undo.
	 *
	 * If allocation is required for this range, reserve the space now so
	 * that the allocation is guaranteed to succeed later on. Once we copy
	 * the data into the page cache pages, then we cannot fail otherwise we
	 * expose transient stale data. If the reserve fails, we can safely
	 * back out at this point as there is nothing to undo.
	 */
	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
	if (ret)
		return ret;
	if (WARN_ON(iomap.offset > pos))
		return -EIO;

	/*
	 * Cut down the length to the one actually provided by the filesystem,
	 * as it might not be able to give us the whole size that we requested.
	 */
	if (iomap.offset + iomap.length < pos + length)
		length = iomap.offset + iomap.length - pos;

	/*
	 * Now that we have guaranteed that the space allocation will succeed,
	 * we can do the copy-in page by page without having to worry about
	 * failures exposing transient data.
	 */
	written = actor(inode, pos, length, data, &iomap);

	/*
	 * Now the data has been copied, commit the range we've copied. This
	 * should not fail unless the filesystem has had a fatal error.
	 */
	if (ops->iomap_end) {
		ret = ops->iomap_end(inode, pos, length,
				     written > 0 ? written : 0,
				     flags, &iomap);
	}

	return written ? written : ret;
}
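
/*
 * [Editor's illustrative sketch, not part of the original file.]
 * A minimal, hypothetical iomap_ops implementation showing the contract that
 * iomap_apply() relies on: ->iomap_begin() reports one contiguous mapping
 * that covers (part of) the requested range, and ->iomap_end() releases
 * whatever ->iomap_begin() reserved.  All "example_*" names are assumptions;
 * a real filesystem would perform block reservation/allocation in
 * ->iomap_begin() so that the copy-in phase cannot fail for lack of space.
 */
static int example_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap *iomap)
{
	/* Pretend the whole range is already allocated, mapped 1:1 to disk. */
	iomap->type = IOMAP_MAPPED;
	iomap->offset = pos;
	iomap->length = length;
	iomap->blkno = pos >> 9;		/* blkno is in 512-byte sectors */
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static int example_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned flags, struct iomap *iomap)
{
	/* Nothing was reserved in ->iomap_begin(), so nothing to release. */
	return 0;
}

static struct iomap_ops example_iomap_ops = {
	.iomap_begin	= example_iomap_begin,
	.iomap_end	= example_iomap_end,
};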

static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
}

static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, struct iomap *iomap)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	struct page *page;
	int status = 0;

	BUG_ON(pos + len > iomap->offset + iomap->length);

	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin_int(page, pos, len, NULL, iomap);
	if (unlikely(status)) {
		unlock_page(page);
		put_page(page);
		page = NULL;

		iomap_write_failed(inode, pos, len);
	}

	*pagep = page;
	return status;
}

static int
iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
		unsigned copied, struct page *page)
{
	int ret;

	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
			copied, page, NULL);
	if (ret < len)
		iomap_write_failed(inode, pos, len);
	return ret;
}

static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *i = data;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = AOP_FLAG_NOFS;

	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (!iter_is_iovec(i))
		flags |= AOP_FLAG_UNINTERRUPTIBLE;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));
again:
		if (bytes > length)
			bytes = length;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(inode, pos, bytes, flags, &page,
				iomap);
		if (unlikely(status))
			break;

		if (mapping_writably_mapped(inode->i_mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);

		flush_dcache_page(page);

		status = iomap_write_end(inode, pos, bytes, copied, page);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fall back here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;
		length -= copied;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (iov_iter_count(i) && length);

	return written ? written : status;
}

ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
		struct iomap_ops *ops)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, written = 0;

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter),
				IOMAP_WRITE, ops, iter, iomap_write_actor);
		if (ret <= 0)
			break;
		pos += ret;
		written += ret;
	}

	return written ? written : ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
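
/*
 * [Editor's illustrative sketch, not part of the original file.]
 * A minimal, hypothetical ->write_iter() showing how a filesystem might drive
 * iomap_file_buffered_write().  Freeze protection, direct I/O and the
 * filesystem's own locking rules are omitted; "example_file_write_iter" and
 * "example_iomap_ops" (from the sketch above) are assumed names.
 */
static ssize_t example_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from, &example_iomap_ops);
	inode_unlock(inode);

	/* For O_(D)SYNC writes, flush the range we just dirtied. */
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}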

static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
		unsigned bytes, struct iomap *iomap)
{
	struct page *page;
	int status;

	status = iomap_write_begin(inode, pos, bytes,
			AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
	if (status)
		return status;

	zero_user(page, offset, bytes);
	mark_page_accessed(page);

	return iomap_write_end(inode, pos, bytes, bytes, page);
}

static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
		struct iomap *iomap)
{
	sector_t sector = iomap->blkno +
		(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);

	return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
}

static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
		void *data, struct iomap *iomap)
{
	bool *did_zero = data;
	loff_t written = 0;
	int status;

	/* already zeroed?  we're done. */
	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
		return count;

	do {
		unsigned offset, bytes;

		offset = pos & (PAGE_SIZE - 1); /* Within page */
		bytes = min_t(unsigned, PAGE_SIZE - offset, count);

		if (IS_DAX(inode))
			status = iomap_dax_zero(pos, offset, bytes, iomap);
		else
			status = iomap_zero(inode, pos, offset, bytes, iomap);
		if (status < 0)
			return status;

		pos += bytes;
		count -= bytes;
		written += bytes;
		if (did_zero)
			*did_zero = true;
	} while (count > 0);

	return written;
}

int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		struct iomap_ops *ops)
{
	loff_t ret;

	while (len > 0) {
		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
				ops, did_zero, iomap_zero_range_actor);
		if (ret <= 0)
			return ret;

		pos += ret;
		len -= ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);

int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		struct iomap_ops *ops)
{
	unsigned blocksize = (1 << inode->i_blkbits);
	unsigned off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
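
/*
 * [Editor's illustrative sketch, not part of the original file.]
 * A hypothetical helper showing the typical caller of iomap_truncate_page():
 * a shrinking truncate zeroes the tail of the block at the new EOF so stale
 * data is not exposed.  "example_truncate_tail" is an assumed name, and any
 * transaction/locking a real filesystem needs is omitted.
 */
static int example_truncate_tail(struct inode *inode, loff_t newsize)
{
	bool did_zero = false;

	/* Zero from newsize to the end of its block through the page cache or DAX. */
	return iomap_truncate_page(inode, newsize, &did_zero, &example_iomap_ops);
}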

static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct page *page = data;
	int ret;

	ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
					NULL, iomap);
	if (ret)
		return ret;

	block_commit_write(page, 0, length);
	return length;
}

int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
		struct iomap_ops *ops)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vma->vm_file);
	unsigned long length;
	loff_t offset, size;
	ssize_t ret;

	lock_page(page);
	size = i_size_read(inode);
	if ((page->mapping != inode->i_mapping) ||
	    (page_offset(page) > size)) {
		/* We overload EFAULT to mean page got truncated */
		ret = -EFAULT;
		goto out_unlock;
	}

	/* page is wholly or partially inside EOF */
	if (((page->index + 1) << PAGE_SHIFT) > size)
		length = size & ~PAGE_MASK;
	else
		length = PAGE_SIZE;

	offset = page_offset(page);
	while (length > 0) {
		ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
				ops, page, iomap_page_mkwrite_actor);
		if (unlikely(ret <= 0))
			goto out_unlock;
		offset += ret;
		length -= ret;
	}

	set_page_dirty(page);
	wait_for_stable_page(page);
	return 0;
out_unlock:
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
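
/*
 * [Editor's illustrative sketch, not part of the original file.]
 * Wiring iomap_page_mkwrite() into a vm_operations_struct so that writable
 * mmap faults allocate blocks through the same iomap_ops as buffered writes.
 * Names prefixed with "example_" are assumptions; filesystem locking is
 * omitted.
 */
static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	int ret;

	sb_start_pagefault(inode->i_sb);
	ret = iomap_page_mkwrite(vma, vmf, &example_iomap_ops);
	sb_end_pagefault(inode->i_sb);

	/* The VM expects VM_FAULT_* codes, not errnos. */
	return block_page_mkwrite_return(ret);
}

static const struct vm_operations_struct example_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= example_page_mkwrite,
};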

struct fiemap_ctx {
	struct fiemap_extent_info *fi;
	struct iomap prev;
};

static int iomap_to_fiemap(struct fiemap_extent_info *fi,
		struct iomap *iomap, u32 flags)
{
	switch (iomap->type) {
	case IOMAP_HOLE:
		/* skip holes */
		return 0;
	case IOMAP_DELALLOC:
		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
		break;
	case IOMAP_UNWRITTEN:
		flags |= FIEMAP_EXTENT_UNWRITTEN;
		break;
	case IOMAP_MAPPED:
		break;
	}

	if (iomap->flags & IOMAP_F_MERGED)
		flags |= FIEMAP_EXTENT_MERGED;

	return fiemap_fill_next_extent(fi, iomap->offset,
			iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9 : 0,
			iomap->length, flags);
}

static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct fiemap_ctx *ctx = data;
	loff_t ret = length;

	if (iomap->type == IOMAP_HOLE)
		return length;

	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
	ctx->prev = *iomap;
	switch (ret) {
	case 0:		/* success */
		return length;
	case 1:		/* extent array full */
		return 0;
	default:
		return ret;
	}
}

int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		loff_t start, loff_t len, struct iomap_ops *ops)
{
	struct fiemap_ctx ctx;
	loff_t ret;

	memset(&ctx, 0, sizeof(ctx));
	ctx.fi = fi;
	ctx.prev.type = IOMAP_HOLE;

	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = filemap_write_and_wait(inode->i_mapping);
		if (ret)
			return ret;
	}

	while (len > 0) {
		ret = iomap_apply(inode, start, len, 0, ops, &ctx,
				iomap_fiemap_actor);
		/* inode with no (attribute) mapping will give ENOENT */
		if (ret == -ENOENT)
			break;
		if (ret < 0)
			return ret;
		if (ret == 0)
			break;

		start += ret;
		len -= ret;
	}

	if (ctx.prev.type != IOMAP_HOLE) {
		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
		if (ret < 0)
			return ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
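
/*
 * [Editor's illustrative sketch, not part of the original file.]
 * A hypothetical inode_operations ->fiemap handler built on iomap_fiemap(),
 * reporting extents through the same iomap_ops used elsewhere in these
 * sketches.  "example_fiemap" and "example_iomap_ops" are assumed names.
 */
static int example_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		u64 start, u64 len)
{
	return iomap_fiemap(inode, fi, start, len, &example_iomap_ops);
}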