fs/xfs/xfs_pnfs.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / xfs / xfs_pnfs.c
at master 8.7 kB view raw
  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Copyright (c) 2014 Christoph Hellwig.
  4 */
  5#include "xfs.h"
  6#include "xfs_shared.h"
  7#include "xfs_format.h"
  8#include "xfs_log_format.h"
  9#include "xfs_trans_resv.h"
 10#include "xfs_mount.h"
 11#include "xfs_inode.h"
 12#include "xfs_trans.h"
 13#include "xfs_bmap.h"
 14#include "xfs_iomap.h"
 15#include "xfs_pnfs.h"
 16
 17/*
 18 * Ensure that we do not have any outstanding pNFS layouts that can be used by
 19 * clients to directly read from or write to this inode.  This must be called
 20 * before every operation that can remove blocks from the extent map.
 21 * Additionally we call it during the write operation, where aren't concerned
 22 * about exposing unallocated blocks but just want to provide basic
 23 * synchronization between a local writer and pNFS clients.  mmap writes would
 24 * also benefit from this sort of synchronization, but due to the tricky locking
 25 * rules in the page fault path we don't bother.
 26 */
 27int
 28xfs_break_leased_layouts(
 29	struct inode		*inode,
 30	uint			*iolock,
 31	bool			*did_unlock)
 32{
 33	struct xfs_inode	*ip = XFS_I(inode);
 34	int			error;
 35
 36	while ((error = break_layout(inode, false)) == -EWOULDBLOCK) {
 37		xfs_iunlock(ip, *iolock);
 38		*did_unlock = true;
 39		error = break_layout(inode, true);
 40		*iolock &= ~XFS_IOLOCK_SHARED;
 41		*iolock |= XFS_IOLOCK_EXCL;
 42		xfs_ilock(ip, *iolock);
 43	}
 44
 45	return error;
 46}
 47
 48/*
 49 * Get a unique ID including its location so that the client can identify
 50 * the exported device.
 51 */
 52int
 53xfs_fs_get_uuid(
 54	struct super_block	*sb,
 55	u8			*buf,
 56	u32			*len,
 57	u64			*offset)
 58{
 59	struct xfs_mount	*mp = XFS_M(sb);
 60
 61	if (*len < sizeof(uuid_t))
 62		return -EINVAL;
 63
 64	memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
 65	*len = sizeof(uuid_t);
 66	*offset = offsetof(struct xfs_dsb, sb_uuid);
 67	return 0;
 68}
 69
 70/*
 71 * We cannot use file based VFS helpers such as file_modified() to update
 72 * inode state as we modify the data/metadata in the inode here. Hence we have
 73 * to open code the timestamp updates and SUID/SGID stripping. We also need
 74 * to set the inode prealloc flag to ensure that the extents we allocate are not
 75 * removed if the inode is reclaimed from memory before xfs_fs_block_commit()
 76 * is from the client to indicate that data has been written and the file size
 77 * can be extended.
 78 */
 79static int
 80xfs_fs_map_update_inode(
 81	struct xfs_inode	*ip)
 82{
 83	struct xfs_trans	*tp;
 84	int			error;
 85
 86	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
 87			0, 0, 0, &tp);
 88	if (error)
 89		return error;
 90
 91	xfs_ilock(ip, XFS_ILOCK_EXCL);
 92	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 93
 94	VFS_I(ip)->i_mode &= ~S_ISUID;
 95	if (VFS_I(ip)->i_mode & S_IXGRP)
 96		VFS_I(ip)->i_mode &= ~S_ISGID;
 97	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 98	ip->i_diflags |= XFS_DIFLAG_PREALLOC;
 99
100	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
101	return xfs_trans_commit(tp);
102}
103
104/*
105 * Get a layout for the pNFS client.
106 */
107int
108xfs_fs_map_blocks(
109	struct inode		*inode,
110	loff_t			offset,
111	u64			length,
112	struct iomap		*iomap,
113	bool			write,
114	u32			*device_generation)
115{
116	struct xfs_inode	*ip = XFS_I(inode);
117	struct xfs_mount	*mp = ip->i_mount;
118	struct xfs_bmbt_irec	imap;
119	xfs_fileoff_t		offset_fsb, end_fsb;
120	loff_t			limit;
121	int			bmapi_flags = XFS_BMAPI_ENTIRE;
122	int			nimaps = 1;
123	uint			lock_flags;
124	int			error = 0;
125	u64			seq;
126
127	if (xfs_is_shutdown(mp))
128		return -EIO;
129
130	/*
131	 * We can't export inodes residing on the realtime device.  The realtime
132	 * device doesn't have a UUID to identify it, so the client has no way
133	 * to find it.
134	 */
135	if (XFS_IS_REALTIME_INODE(ip))
136		return -ENXIO;
137
138	/*
139	 * The pNFS block layout spec actually supports reflink like
140	 * functionality, but the Linux pNFS server doesn't implement it yet.
141	 */
142	if (xfs_is_reflink_inode(ip))
143		return -ENXIO;
144
145	/*
146	 * Lock out any other I/O before we flush and invalidate the pagecache,
147	 * and then hand out a layout to the remote system.  This is very
148	 * similar to direct I/O, except that the synchronization is much more
149	 * complicated.  See the comment near xfs_break_leased_layouts
150	 * for a detailed explanation.
151	 */
152	xfs_ilock(ip, XFS_IOLOCK_EXCL);
153
154	error = -EINVAL;
155	limit = mp->m_super->s_maxbytes;
156	if (!write)
157		limit = max(limit, round_up(i_size_read(inode),
158				     inode->i_sb->s_blocksize));
159	if (offset > limit)
160		goto out_unlock;
161	if (offset > limit - length)
162		length = limit - offset;
163
164	error = filemap_write_and_wait(inode->i_mapping);
165	if (error)
166		goto out_unlock;
167	error = invalidate_inode_pages2(inode->i_mapping);
168	if (WARN_ON_ONCE(error))
169		goto out_unlock;
170
171	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
172	offset_fsb = XFS_B_TO_FSBT(mp, offset);
173
174	lock_flags = xfs_ilock_data_map_shared(ip);
175	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
176				&imap, &nimaps, bmapi_flags);
177	seq = xfs_iomap_inode_sequence(ip, 0);
178
179	ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
180
181	if (!error && write &&
182	    (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) {
183		if (offset + length > XFS_ISIZE(ip))
184			end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
185		else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
186			end_fsb = min(end_fsb, imap.br_startoff +
187					       imap.br_blockcount);
188		xfs_iunlock(ip, lock_flags);
189
190		error = xfs_iomap_write_direct(ip, offset_fsb,
191				end_fsb - offset_fsb, 0, &imap, &seq);
192		if (error)
193			goto out_unlock;
194
195		/*
196		 * Ensure the next transaction is committed synchronously so
197		 * that the blocks allocated and handed out to the client are
198		 * guaranteed to be present even after a server crash.
199		 */
200		error = xfs_fs_map_update_inode(ip);
201		if (!error)
202			error = xfs_log_force_inode(ip);
203		if (error)
204			goto out_unlock;
205
206	} else {
207		xfs_iunlock(ip, lock_flags);
208	}
209	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
210
211	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
212	*device_generation = mp->m_generation;
213	return error;
214out_unlock:
215	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
216	return error;
217}
218
219/*
220 * Ensure the size update falls into a valid allocated block.
221 */
222static int
223xfs_pnfs_validate_isize(
224	struct xfs_inode	*ip,
225	xfs_off_t		isize)
226{
227	struct xfs_bmbt_irec	imap;
228	int			nimaps = 1;
229	int			error = 0;
230
231	xfs_ilock(ip, XFS_ILOCK_SHARED);
232	error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
233				&imap, &nimaps, 0);
234	xfs_iunlock(ip, XFS_ILOCK_SHARED);
235	if (error)
236		return error;
237
238	if (imap.br_startblock == HOLESTARTBLOCK ||
239	    imap.br_startblock == DELAYSTARTBLOCK ||
240	    imap.br_state == XFS_EXT_UNWRITTEN)
241		return -EIO;
242	return 0;
243}
244
245/*
246 * Make sure the blocks described by maps are stable on disk.  This includes
247 * converting any unwritten extents, flushing the disk cache and updating the
248 * time stamps.
249 *
250 * Note that we rely on the caller to always send us a timestamp update so that
251 * we always commit a transaction here.  If that stops being true we will have
252 * to manually flush the cache here similar to what the fsync code path does
253 * for datasyncs on files that have no dirty metadata.
254 */
255int
256xfs_fs_commit_blocks(
257	struct inode		*inode,
258	struct iomap		*maps,
259	int			nr_maps,
260	struct iattr		*iattr)
261{
262	struct xfs_inode	*ip = XFS_I(inode);
263	struct xfs_mount	*mp = ip->i_mount;
264	struct xfs_trans	*tp;
265	bool			update_isize = false;
266	int			error, i;
267	loff_t			size;
268
269	ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
270
271	xfs_ilock(ip, XFS_IOLOCK_EXCL);
272
273	size = i_size_read(inode);
274	if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
275		update_isize = true;
276		size = iattr->ia_size;
277	}
278
279	for (i = 0; i < nr_maps; i++) {
280		u64 start, length, end;
281
282		start = maps[i].offset;
283		if (start > size)
284			continue;
285
286		end = start + maps[i].length;
287		if (end > size)
288			end = size;
289
290		length = end - start;
291		if (!length)
292			continue;
293
294		/*
295		 * Make sure reads through the pagecache see the new data.
296		 */
297		error = invalidate_inode_pages2_range(inode->i_mapping,
298					start >> PAGE_SHIFT,
299					(end - 1) >> PAGE_SHIFT);
300		WARN_ON_ONCE(error);
301
302		error = xfs_iomap_write_unwritten(ip, start, length, false);
303		if (error)
304			goto out_drop_iolock;
305	}
306
307	if (update_isize) {
308		error = xfs_pnfs_validate_isize(ip, size);
309		if (error)
310			goto out_drop_iolock;
311	}
312
313	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
314	if (error)
315		goto out_drop_iolock;
316
317	xfs_ilock(ip, XFS_ILOCK_EXCL);
318	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
319	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
320
321	ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
322	setattr_copy(&nop_mnt_idmap, inode, iattr);
323	if (update_isize) {
324		i_size_write(inode, iattr->ia_size);
325		ip->i_disk_size = iattr->ia_size;
326	}
327
328	xfs_trans_set_sync(tp);
329	error = xfs_trans_commit(tp);
330
331out_drop_iolock:
332	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
333	return error;
334}