fs/iomap.c at v4.9-rc6 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / iomap.c
at v4.9-rc6 585 lines 14 kB view raw
wrap content
  1/*
  2 * Copyright (C) 2010 Red Hat, Inc.
  3 * Copyright (c) 2016 Christoph Hellwig.
  4 *
  5 * This program is free software; you can redistribute it and/or modify it
  6 * under the terms and conditions of the GNU General Public License,
  7 * version 2, as published by the Free Software Foundation.
  8 *
  9 * This program is distributed in the hope it will be useful, but WITHOUT
 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 12 * more details.
 13 */
 14#include <linux/module.h>
 15#include <linux/compiler.h>
 16#include <linux/fs.h>
 17#include <linux/iomap.h>
 18#include <linux/uaccess.h>
 19#include <linux/gfp.h>
 20#include <linux/mm.h>
 21#include <linux/swap.h>
 22#include <linux/pagemap.h>
 23#include <linux/file.h>
 24#include <linux/uio.h>
 25#include <linux/backing-dev.h>
 26#include <linux/buffer_head.h>
 27#include <linux/dax.h>
 28#include "internal.h"
 29
 30/*
 31 * Execute a iomap write on a segment of the mapping that spans a
 32 * contiguous range of pages that have identical block mapping state.
 33 *
 34 * This avoids the need to map pages individually, do individual allocations
 35 * for each page and most importantly avoid the need for filesystem specific
 36 * locking per page. Instead, all the operations are amortised over the entire
 37 * range of pages. It is assumed that the filesystems will lock whatever
 38 * resources they require in the iomap_begin call, and release them in the
 39 * iomap_end call.
 40 */
 41loff_t
 42iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 43		struct iomap_ops *ops, void *data, iomap_actor_t actor)
 44{
 45	struct iomap iomap = { 0 };
 46	loff_t written = 0, ret;
 47
 48	/*
 49	 * Need to map a range from start position for length bytes. This can
 50	 * span multiple pages - it is only guaranteed to return a range of a
 51	 * single type of pages (e.g. all into a hole, all mapped or all
 52	 * unwritten). Failure at this point has nothing to undo.
 53	 *
 54	 * If allocation is required for this range, reserve the space now so
 55	 * that the allocation is guaranteed to succeed later on. Once we copy
 56	 * the data into the page cache pages, then we cannot fail otherwise we
 57	 * expose transient stale data. If the reserve fails, we can safely
 58	 * back out at this point as there is nothing to undo.
 59	 */
 60	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
 61	if (ret)
 62		return ret;
 63	if (WARN_ON(iomap.offset > pos))
 64		return -EIO;
 65
 66	/*
 67	 * Cut down the length to the one actually provided by the filesystem,
 68	 * as it might not be able to give us the whole size that we requested.
 69	 */
 70	if (iomap.offset + iomap.length < pos + length)
 71		length = iomap.offset + iomap.length - pos;
 72
 73	/*
 74	 * Now that we have guaranteed that the space allocation will succeed.
 75	 * we can do the copy-in page by page without having to worry about
 76	 * failures exposing transient data.
 77	 */
 78	written = actor(inode, pos, length, data, &iomap);
 79
 80	/*
 81	 * Now the data has been copied, commit the range we've copied.  This
 82	 * should not fail unless the filesystem has had a fatal error.
 83	 */
 84	if (ops->iomap_end) {
 85		ret = ops->iomap_end(inode, pos, length,
 86				     written > 0 ? written : 0,
 87				     flags, &iomap);
 88	}
 89
 90	return written ? written : ret;
 91}
 92
 93static void
 94iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 95{
 96	loff_t i_size = i_size_read(inode);
 97
 98	/*
 99	 * Only truncate newly allocated pages beyoned EOF, even if the
100	 * write started inside the existing inode size.
101	 */
102	if (pos + len > i_size)
103		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
104}
105
106static int
107iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
108		struct page **pagep, struct iomap *iomap)
109{
110	pgoff_t index = pos >> PAGE_SHIFT;
111	struct page *page;
112	int status = 0;
113
114	BUG_ON(pos + len > iomap->offset + iomap->length);
115
116	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
117	if (!page)
118		return -ENOMEM;
119
120	status = __block_write_begin_int(page, pos, len, NULL, iomap);
121	if (unlikely(status)) {
122		unlock_page(page);
123		put_page(page);
124		page = NULL;
125
126		iomap_write_failed(inode, pos, len);
127	}
128
129	*pagep = page;
130	return status;
131}
132
133static int
134iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
135		unsigned copied, struct page *page)
136{
137	int ret;
138
139	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
140			copied, page, NULL);
141	if (ret < len)
142		iomap_write_failed(inode, pos, len);
143	return ret;
144}
145
/*
 * iomap_apply() actor for buffered writes: copy data from the iov_iter
 * passed in @data into the page cache, one page at a time, for at most
 * @length bytes starting at @pos.  All pages touched are covered by @iomap.
 *
 * Returns the number of bytes written if any were written; otherwise the
 * last status (0 or a negative errno).
 */
static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *i = data;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = AOP_FLAG_NOFS;

	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (!iter_is_iovec(i))
		flags |= AOP_FLAG_UNINTERRUPTIBLE;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */

		/* Never cross a page boundary in a single iteration. */
		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));
again:
		/* Also never go past the end of the range we were given. */
		if (bytes > length)
			bytes = length;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(inode, pos, bytes, flags, &page,
				iomap);
		if (unlikely(status))
			break;

		if (mapping_writably_mapped(inode->i_mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);

		flush_dcache_page(page);

		/*
		 * iomap_write_end() reports how much was actually accepted;
		 * from here on that value, not the raw copy count, is what
		 * we account for.
		 */
		status = iomap_write_end(inode, pos, bytes, copied, page);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;
		length -= copied;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (iov_iter_count(i) && length);

	/* Partial progress wins over a later error. */
	return written ? written : status;
}
231
232ssize_t
233iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
234		struct iomap_ops *ops)
235{
236	struct inode *inode = iocb->ki_filp->f_mapping->host;
237	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
238
239	while (iov_iter_count(iter)) {
240		ret = iomap_apply(inode, pos, iov_iter_count(iter),
241				IOMAP_WRITE, ops, iter, iomap_write_actor);
242		if (ret <= 0)
243			break;
244		pos += ret;
245		written += ret;
246	}
247
248	return written ? written : ret;
249}
250EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
251
252static struct page *
253__iomap_read_page(struct inode *inode, loff_t offset)
254{
255	struct address_space *mapping = inode->i_mapping;
256	struct page *page;
257
258	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
259	if (IS_ERR(page))
260		return page;
261	if (!PageUptodate(page)) {
262		put_page(page);
263		return ERR_PTR(-EIO);
264	}
265	return page;
266}
267
268static loff_t
269iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
270		struct iomap *iomap)
271{
272	long status = 0;
273	ssize_t written = 0;
274
275	do {
276		struct page *page, *rpage;
277		unsigned long offset;	/* Offset into pagecache page */
278		unsigned long bytes;	/* Bytes to write to page */
279
280		offset = (pos & (PAGE_SIZE - 1));
281		bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
282
283		rpage = __iomap_read_page(inode, pos);
284		if (IS_ERR(rpage))
285			return PTR_ERR(rpage);
286
287		status = iomap_write_begin(inode, pos, bytes,
288				AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
289				&page, iomap);
290		put_page(rpage);
291		if (unlikely(status))
292			return status;
293
294		WARN_ON_ONCE(!PageUptodate(page));
295
296		status = iomap_write_end(inode, pos, bytes, bytes, page);
297		if (unlikely(status <= 0)) {
298			if (WARN_ON_ONCE(status == 0))
299				return -EIO;
300			return status;
301		}
302
303		cond_resched();
304
305		pos += status;
306		written += status;
307		length -= status;
308
309		balance_dirty_pages_ratelimited(inode->i_mapping);
310	} while (length);
311
312	return written;
313}
314
315int
316iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
317		struct iomap_ops *ops)
318{
319	loff_t ret;
320
321	while (len) {
322		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
323				iomap_dirty_actor);
324		if (ret <= 0)
325			return ret;
326		pos += ret;
327		len -= ret;
328	}
329
330	return 0;
331}
332EXPORT_SYMBOL_GPL(iomap_file_dirty);
333
334static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
335		unsigned bytes, struct iomap *iomap)
336{
337	struct page *page;
338	int status;
339
340	status = iomap_write_begin(inode, pos, bytes,
341			AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
342	if (status)
343		return status;
344
345	zero_user(page, offset, bytes);
346	mark_page_accessed(page);
347
348	return iomap_write_end(inode, pos, bytes, bytes, page);
349}
350
/*
 * Zero @bytes bytes at @offset within the page containing @pos by calling
 * __dax_zero_page_range() on the mapping's block device.
 *
 * Returns the result of __dax_zero_page_range().
 */
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
		struct iomap *iomap)
{
	/*
	 * Sector of the start of the page containing @pos: round @pos down
	 * to a page boundary, take its byte distance from the start of the
	 * mapping, convert that to 512-byte units (>> 9) and add it to the
	 * mapping's starting block number.
	 */
	sector_t sector = iomap->blkno +
		(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);

	return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
}
359
360static loff_t
361iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
362		void *data, struct iomap *iomap)
363{
364	bool *did_zero = data;
365	loff_t written = 0;
366	int status;
367
368	/* already zeroed?  we're done. */
369	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
370	    	return count;
371
372	do {
373		unsigned offset, bytes;
374
375		offset = pos & (PAGE_SIZE - 1); /* Within page */
376		bytes = min_t(unsigned, PAGE_SIZE - offset, count);
377
378		if (IS_DAX(inode))
379			status = iomap_dax_zero(pos, offset, bytes, iomap);
380		else
381			status = iomap_zero(inode, pos, offset, bytes, iomap);
382		if (status < 0)
383			return status;
384
385		pos += bytes;
386		count -= bytes;
387		written += bytes;
388		if (did_zero)
389			*did_zero = true;
390	} while (count > 0);
391
392	return written;
393}
394
395int
396iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
397		struct iomap_ops *ops)
398{
399	loff_t ret;
400
401	while (len > 0) {
402		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
403				ops, did_zero, iomap_zero_range_actor);
404		if (ret <= 0)
405			return ret;
406
407		pos += ret;
408		len -= ret;
409	}
410
411	return 0;
412}
413EXPORT_SYMBOL_GPL(iomap_zero_range);
414
415int
416iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
417		struct iomap_ops *ops)
418{
419	unsigned blocksize = (1 << inode->i_blkbits);
420	unsigned off = pos & (blocksize - 1);
421
422	/* Block boundary? Nothing to do */
423	if (!off)
424		return 0;
425	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
426}
427EXPORT_SYMBOL_GPL(iomap_truncate_page);
428
429static loff_t
430iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
431		void *data, struct iomap *iomap)
432{
433	struct page *page = data;
434	int ret;
435
436	ret = __block_write_begin_int(page, pos, length, NULL, iomap);
437	if (ret)
438		return ret;
439
440	block_commit_write(page, 0, length);
441	return length;
442}
443
444int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
445		struct iomap_ops *ops)
446{
447	struct page *page = vmf->page;
448	struct inode *inode = file_inode(vma->vm_file);
449	unsigned long length;
450	loff_t offset, size;
451	ssize_t ret;
452
453	lock_page(page);
454	size = i_size_read(inode);
455	if ((page->mapping != inode->i_mapping) ||
456	    (page_offset(page) > size)) {
457		/* We overload EFAULT to mean page got truncated */
458		ret = -EFAULT;
459		goto out_unlock;
460	}
461
462	/* page is wholly or partially inside EOF */
463	if (((page->index + 1) << PAGE_SHIFT) > size)
464		length = size & ~PAGE_MASK;
465	else
466		length = PAGE_SIZE;
467
468	offset = page_offset(page);
469	while (length > 0) {
470		ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
471				ops, page, iomap_page_mkwrite_actor);
472		if (unlikely(ret <= 0))
473			goto out_unlock;
474		offset += ret;
475		length -= ret;
476	}
477
478	set_page_dirty(page);
479	wait_for_stable_page(page);
480	return 0;
481out_unlock:
482	unlock_page(page);
483	return ret;
484}
485EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
486
/*
 * State threaded through iomap_fiemap() via the actor's opaque data pointer.
 */
struct fiemap_ctx {
	/* destination that iomap_to_fiemap() fills extents into */
	struct fiemap_extent_info *fi;
	/*
	 * Most recent non-hole mapping seen by the actor that has not been
	 * reported yet; its type is IOMAP_HOLE while there is none.
	 */
	struct iomap prev;
};
491
492static int iomap_to_fiemap(struct fiemap_extent_info *fi,
493		struct iomap *iomap, u32 flags)
494{
495	switch (iomap->type) {
496	case IOMAP_HOLE:
497		/* skip holes */
498		return 0;
499	case IOMAP_DELALLOC:
500		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
501		break;
502	case IOMAP_UNWRITTEN:
503		flags |= FIEMAP_EXTENT_UNWRITTEN;
504		break;
505	case IOMAP_MAPPED:
506		break;
507	}
508
509	if (iomap->flags & IOMAP_F_MERGED)
510		flags |= FIEMAP_EXTENT_MERGED;
511	if (iomap->flags & IOMAP_F_SHARED)
512		flags |= FIEMAP_EXTENT_SHARED;
513
514	return fiemap_fill_next_extent(fi, iomap->offset,
515			iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
516			iomap->length, flags);
517
518}
519
520static loff_t
521iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
522		struct iomap *iomap)
523{
524	struct fiemap_ctx *ctx = data;
525	loff_t ret = length;
526
527	if (iomap->type == IOMAP_HOLE)
528		return length;
529
530	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
531	ctx->prev = *iomap;
532	switch (ret) {
533	case 0:		/* success */
534		return length;
535	case 1:		/* extent array full */
536		return 0;
537	default:
538		return ret;
539	}
540}
541
542int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
543		loff_t start, loff_t len, struct iomap_ops *ops)
544{
545	struct fiemap_ctx ctx;
546	loff_t ret;
547
548	memset(&ctx, 0, sizeof(ctx));
549	ctx.fi = fi;
550	ctx.prev.type = IOMAP_HOLE;
551
552	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
553	if (ret)
554		return ret;
555
556	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
557		ret = filemap_write_and_wait(inode->i_mapping);
558		if (ret)
559			return ret;
560	}
561
562	while (len > 0) {
563		ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
564				iomap_fiemap_actor);
565		/* inode with no (attribute) mapping will give ENOENT */
566		if (ret == -ENOENT)
567			break;
568		if (ret < 0)
569			return ret;
570		if (ret == 0)
571			break;
572
573		start += ret;
574		len -= ret;
575	}
576
577	if (ctx.prev.type != IOMAP_HOLE) {
578		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
579		if (ret < 0)
580			return ret;
581	}
582
583	return 0;
584}
585EXPORT_SYMBOL_GPL(iomap_fiemap);
Configure Feed

Configure Feed