// SPDX-License-Identifier: LGPL-2.1
/*
 * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
 * Written by Takashi Sato <t-sato@yk.jp.nec.com>
 *            Akira Fujita <a-fujita@rs.jp.nec.com>
 */

#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

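/*
 * Per-iteration move context: ext4_move_extents() fills in the mapped
 * range of the origin file and the matching donor start block, and
 * mext_move_extent() consumes one such context per moved batch.
 */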
struct mext_data {
	struct inode *orig_inode;	/* Origin file inode */
	struct inode *donor_inode;	/* Donor file inode */
	struct ext4_map_blocks orig_map;/* Origin file's move mapping */
	ext4_lblk_t donor_lblk;		/* Start block of the donor file */
};

/**
 * ext4_double_down_write_data_sem() - write lock two inodes' i_data_sem
 * @first: inode to be locked
 * @second: inode to be locked
 *
 * Acquire the write locks of i_data_sem of the two inodes, always in
 * inode address order so that concurrent callers cannot deadlock.
 */
void
ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
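	/*
	 * down_write_nested(..., I_DATA_SEM_OTHER) annotates the second
	 * lock with a lockdep subclass so that holding two i_data_sem
	 * write locks at once is not reported as a self-deadlock.
	 */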
	if (first < second) {
		down_write(&EXT4_I(first)->i_data_sem);
		down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER);
	} else {
		down_write(&EXT4_I(second)->i_data_sem);
		down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
	}
}

/**
 * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
 *
 * @orig_inode: original inode structure whose lock is released first
 * @donor_inode: donor inode structure whose lock is released second
 * Release the write locks of i_data_sem of two inodes (orig and donor).
 */
void
ext4_double_up_write_data_sem(struct inode *orig_inode,
			      struct inode *donor_inode)
{
	up_write(&EXT4_I(orig_inode)->i_data_sem);
	up_write(&EXT4_I(donor_inode)->i_data_sem);
}

/* Grab and lock a folio on both @inode1 and @inode2, in inode order. */
static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
				  pgoff_t index1, pgoff_t index2, size_t len,
				  struct folio *folio[2])
{
	struct address_space *mapping[2];
	unsigned int flags;
	fgf_t fgp_flags = FGP_WRITEBEGIN;

	BUG_ON(!inode1 || !inode2);
	if (inode1 < inode2) {
		mapping[0] = inode1->i_mapping;
		mapping[1] = inode2->i_mapping;
	} else {
		swap(index1, index2);
		mapping[0] = inode2->i_mapping;
		mapping[1] = inode1->i_mapping;
	}

	flags = memalloc_nofs_save();
	fgp_flags |= fgf_set_order(len);
	folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags,
			mapping_gfp_mask(mapping[0]));
	if (IS_ERR(folio[0])) {
		memalloc_nofs_restore(flags);
		return PTR_ERR(folio[0]);
	}

	folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags,
			mapping_gfp_mask(mapping[1]));
	memalloc_nofs_restore(flags);
	if (IS_ERR(folio[1])) {
		folio_unlock(folio[0]);
		folio_put(folio[0]);
		return PTR_ERR(folio[1]);
	}
	/*
	 * __filemap_get_folio() may not wait on a folio's writeback if
	 * the BDI does not demand it. But it is reasonable to be very
	 * conservative here and explicitly wait on the folio's writeback.
	 */
	folio_wait_writeback(folio[0]);
	folio_wait_writeback(folio[1]);
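	/*
	 * The folios were looked up in mapping (address) order; swap them
	 * back so that folio[0] always corresponds to @inode1 for the
	 * caller, regardless of which inode had the lower address.
	 */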
	if (inode1 > inode2)
		swap(folio[0], folio[1]);

	return 0;
}

static void mext_folio_double_unlock(struct folio *folio[2])
{
	folio_unlock(folio[0]);
	folio_put(folio[0]);
	folio_unlock(folio[1]);
	folio_put(folio[1]);
}

/* Force folio buffers uptodate w/o dropping folio's lock */
static int mext_folio_mkuptodate(struct folio *folio, size_t from, size_t to)
{
	struct inode *inode = folio->mapping->host;
	sector_t block;
	struct buffer_head *bh, *head;
	unsigned int blocksize, block_start, block_end;
	int nr = 0;
	bool partial = false;

	BUG_ON(!folio_test_locked(folio));
	BUG_ON(folio_test_writeback(folio));

	if (folio_test_uptodate(folio))
		return 0;

	blocksize = i_blocksize(inode);
	head = folio_buffers(folio);
	if (!head)
		head = create_empty_buffers(folio, blocksize, 0);

	block = folio_pos(folio) >> inode->i_blkbits;
	block_end = 0;
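	/*
	 * First pass: walk the buffers in the requested range, map any
	 * unmapped ones, zero those that sit in holes, and submit reads
	 * for buffers that are mapped but not yet uptodate.
	 */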
	bh = head;
	do {
		block_start = block_end;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = true;
			continue;
		}
		if (buffer_uptodate(bh))
			continue;
		if (!buffer_mapped(bh)) {
			int err = ext4_get_block(inode, block, bh, 0);
			if (err)
				return err;
			if (!buffer_mapped(bh)) {
				folio_zero_range(folio, block_start, blocksize);
				set_buffer_uptodate(bh);
				continue;
			}
		}
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {
			unlock_buffer(bh);
			continue;
		}
		ext4_read_bh_nowait(bh, 0, NULL, false);
		nr++;
	} while (block++, (bh = bh->b_this_page) != head);

	/* No I/O required */
	if (!nr)
		goto out;

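	/*
	 * Second pass: wait for the reads submitted above and fail with
	 * -EIO if any buffer in the range is still not uptodate.
	 */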
	bh = head;
	do {
		if (bh_offset(bh) + blocksize <= from)
			continue;
		if (bh_offset(bh) >= to)
			break;
		wait_on_buffer(bh);
		if (buffer_uptodate(bh))
			continue;
		return -EIO;
	} while ((bh = bh->b_this_page) != head);
out:
	if (!partial)
		folio_mark_uptodate(folio);
	return 0;
}

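/*
 * MEXT_SKIP_EXTENT: the donor range is a hole or delalloc, nothing to move.
 * MEXT_MOVE_EXTENT: both ranges are unwritten, swap extents without copying.
 * MEXT_COPY_DATA:   swap extents and copy data through the page cache.
 */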
enum mext_move_type {MEXT_SKIP_EXTENT, MEXT_MOVE_EXTENT, MEXT_COPY_DATA};

/*
 * Begin moving extents between the origin inode and the donor inode:
 * lock one folio for each inode and re-check the mapping status of the
 * candidate moving extent. On success the folios are returned locked
 * and must be released with mext_folio_double_unlock().
 */
static int mext_move_begin(struct mext_data *mext, struct folio *folio[2],
			   enum mext_move_type *move_type)
{
	struct inode *orig_inode = mext->orig_inode;
	struct inode *donor_inode = mext->donor_inode;
	unsigned int blkbits = orig_inode->i_blkbits;
	struct ext4_map_blocks donor_map = {0};
	loff_t orig_pos, donor_pos;
	size_t move_len;
	int ret;

	orig_pos = ((loff_t)mext->orig_map.m_lblk) << blkbits;
	donor_pos = ((loff_t)mext->donor_lblk) << blkbits;
	ret = mext_folio_double_lock(orig_inode, donor_inode,
			orig_pos >> PAGE_SHIFT, donor_pos >> PAGE_SHIFT,
			((size_t)mext->orig_map.m_len) << blkbits, folio);
	if (ret)
		return ret;

	/*
	 * Check the origin inode's mapping information again under the
	 * folio lock: we do not hold i_data_sem the whole time, so the
	 * mapping may have changed under a concurrent writeback.
	 */
	if (mext->orig_map.m_seq != READ_ONCE(EXT4_I(orig_inode)->i_es_seq)) {
		ret = -ESTALE;
		goto error;
	}

	/* Adjust the moving length according to the shorter folio. */
	move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos,
			folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos);
	move_len >>= blkbits;
	if (move_len < mext->orig_map.m_len)
		mext->orig_map.m_len = move_len;

	donor_map.m_lblk = mext->donor_lblk;
	donor_map.m_len = mext->orig_map.m_len;
	donor_map.m_flags = 0;
	ret = ext4_map_blocks(NULL, donor_inode, &donor_map, 0);
	if (ret < 0)
		goto error;

	/* Adjust the moving length according to the donor mapping length. */
	mext->orig_map.m_len = donor_map.m_len;

	/* Skip moving if the donor range is a hole or a delalloc extent. */
	if (!(donor_map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)))
		*move_type = MEXT_SKIP_EXTENT;
	/* If both mapping ranges are unwritten, no need to copy data. */
	else if ((mext->orig_map.m_flags & EXT4_MAP_UNWRITTEN) &&
		 (donor_map.m_flags & EXT4_MAP_UNWRITTEN))
		*move_type = MEXT_MOVE_EXTENT;
	else
		*move_type = MEXT_COPY_DATA;

	return 0;
error:
	mext_folio_double_unlock(folio);
	return ret;
}

/*
 * Re-map the original inode's buffers to the newly moved blocks and
 * commit the entire written range.
 */
static int mext_folio_mkwrite(struct inode *inode, struct folio *folio,
			      size_t from, size_t to)
{
	unsigned int blocksize = i_blocksize(inode);
	struct buffer_head *bh, *head;
	size_t block_start, block_end;
	sector_t block;
	int ret;

	head = folio_buffers(folio);
	if (!head)
		head = create_empty_buffers(folio, blocksize, 0);

	block = folio_pos(folio) >> inode->i_blkbits;
	block_end = 0;
	bh = head;
	do {
		block_start = block_end;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to)
			continue;

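		/*
		 * create == 0 should be sufficient here: the donor's
		 * blocks were already swapped into this inode, so the
		 * mapping only needs to be re-read into the buffer head.
		 */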
		ret = ext4_get_block(inode, block, bh, 0);
		if (ret)
			return ret;
	} while (block++, (bh = bh->b_this_page) != head);

	block_commit_write(folio, from, to);
	return 0;
}

/*
 * Save the data of the original inode's blocks, replace one folio-size
 * aligned original extent with one (possibly partial) donor extent, and
 * then write the saved data out to the newly mapped original blocks.
 * The replaced block count is passed out through @m_len. Return 0 on
 * success, and an error code otherwise.
 */
static int mext_move_extent(struct mext_data *mext, u64 *m_len)
{
	struct inode *orig_inode = mext->orig_inode;
	struct inode *donor_inode = mext->donor_inode;
	struct ext4_map_blocks *orig_map = &mext->orig_map;
	unsigned int blkbits = orig_inode->i_blkbits;
	struct folio *folio[2] = {NULL, NULL};
	loff_t from, length;
	enum mext_move_type move_type = 0;
	handle_t *handle;
	u64 r_len = 0;
	unsigned int credits;
	int ret, ret2;

	*m_len = 0;
	trace_ext4_move_extent_enter(orig_inode, orig_map, donor_inode,
				     mext->donor_lblk);
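	/*
	 * Journal credits: one ext4_chunk_trans_extent() worth per inode,
	 * since the extent trees of both the origin and the donor inode
	 * may be modified by the swap.
	 */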
	credits = ext4_chunk_trans_extent(orig_inode, 0) * 2;
	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	ret = mext_move_begin(mext, folio, &move_type);
	if (ret)
		goto stop_handle;

	if (move_type == MEXT_SKIP_EXTENT)
		goto unlock;

	/*
	 * Copy the data. First, read the original inode data into the page
	 * cache. Then, release the existing mapping relationships and swap
	 * the extent. Finally, re-establish the new mapping relationships
	 * and dirty the page cache.
	 */
	if (move_type == MEXT_COPY_DATA) {
		from = offset_in_folio(folio[0],
				((loff_t)orig_map->m_lblk) << blkbits);
		length = ((loff_t)orig_map->m_len) << blkbits;

		ret = mext_folio_mkuptodate(folio[0], from, from + length);
		if (ret)
			goto unlock;
	}

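	/*
	 * Drop the buffers attached to both folios so that the extents
	 * can be swapped underneath them; if either folio's buffers are
	 * still pinned (e.g. by the journal), back off with -EBUSY and
	 * let the caller retry.
	 */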
	if (!filemap_release_folio(folio[0], 0) ||
	    !filemap_release_folio(folio[1], 0)) {
		ret = -EBUSY;
		goto unlock;
	}

	/* Move extent */
	ext4_double_down_write_data_sem(orig_inode, donor_inode);
	*m_len = ext4_swap_extents(handle, orig_inode, donor_inode,
				   orig_map->m_lblk, mext->donor_lblk,
				   orig_map->m_len, 1, &ret);
	ext4_double_up_write_data_sem(orig_inode, donor_inode);

	/* A short swap must not happen once ext4_swap_extents() succeeds. */
	if (WARN_ON_ONCE(!ret && (*m_len != orig_map->m_len)))
		ret = -EIO;

	if (!(*m_len) || (move_type == MEXT_MOVE_EXTENT))
		goto unlock;

	/* Copy data */
	length = (*m_len) << blkbits;
	ret2 = mext_folio_mkwrite(orig_inode, folio[0], from, from + length);
	if (ret2) {
		if (!ret)
			ret = ret2;
		goto repair_branches;
	}
	/*
	 * Even in data=writeback mode it is reasonable to pin the inode
	 * to the transaction, to prevent unexpected data loss.
	 */
	ret2 = ext4_jbd2_inode_add_write(handle, orig_inode,
			((loff_t)orig_map->m_lblk) << blkbits, length);
	if (!ret)
		ret = ret2;
unlock:
	mext_folio_double_unlock(folio);
stop_handle:
	ext4_journal_stop(handle);
out:
	trace_ext4_move_extent_exit(orig_inode, orig_map->m_lblk, donor_inode,
				    mext->donor_lblk, orig_map->m_len, *m_len,
				    move_type, ret);
	return ret;

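	/*
	 * The data copy failed after the extents were already swapped;
	 * try to swap them back so that the original inode keeps its old
	 * blocks. If even that fails, the data in the range is lost.
	 */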
repair_branches:
	ret2 = 0;
	r_len = ext4_swap_extents(handle, donor_inode, orig_inode,
				  mext->donor_lblk, orig_map->m_lblk,
				  *m_len, 0, &ret2);
	if (ret2 || r_len != *m_len) {
		ext4_error_inode_block(orig_inode, (sector_t)(orig_map->m_lblk),
				EIO, "Unable to copy data block, data will be lost!");
		ret = -EIO;
	}
	*m_len = 0;
	goto unlock;
}

/*
 * Check that the basic filesystem environment is valid and that both
 * inodes support the move-extent operation.
 */
static int mext_check_validity(struct inode *orig_inode,
			       struct inode *donor_inode)
{
	struct super_block *sb = orig_inode->i_sb;

	/* origin and donor should be different inodes */
	if (orig_inode == donor_inode) {
		ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	/* origin and donor should belong to the same filesystem */
	if (orig_inode->i_sb != donor_inode->i_sb) {
		ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	/* Regular file check */
	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
		ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if (ext4_has_feature_bigalloc(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported with bigalloc");
		return -EOPNOTSUPP;
	}

	if (IS_DAX(orig_inode)) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported with DAX");
		return -EOPNOTSUPP;
	}

	/*
	 * TODO: it's not obvious how to swap blocks for inodes with full
	 * journaling enabled.
	 */
	if (ext4_should_journal_data(orig_inode) ||
	    ext4_should_journal_data(donor_inode)) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported with data journaling");
		return -EOPNOTSUPP;
	}

	if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported for encrypted files");
		return -EOPNOTSUPP;
	}

	/* Ext4 move extent supports only extent-based files */
	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS)) ||
	    !(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported for non-extent files");
		return -EOPNOTSUPP;
	}

	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
		ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) {
		ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EPERM;
	}

	/* Ext4 move extent does not support swap files */
	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
		ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -ETXTBSY;
	}

	if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) {
		ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EOPNOTSUPP;
	}

	if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
		ext4_debug("ext4 move extent: File size is 0 byte\n");
		return -EINVAL;
	}

	return 0;
}

/*
 * Check whether the requested range of ext4_move_extents() allows the
 * two files to be exchanged with each other, and clamp the length to
 * fit within both file sizes. Return 0 on success, or a negative error
 * value on failure.
 */
static int mext_check_adjust_range(struct inode *orig_inode,
				   struct inode *donor_inode, __u64 orig_start,
				   __u64 donor_start, __u64 *len)
{
	__u64 orig_eof, donor_eof;

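	/*
	 * Illustrative note: ~(PAGE_MASK >> i_blkbits) keeps only the
	 * block offset within a page. Assuming 4K pages and 1K blocks,
	 * for example, it selects the low two bits, so both ranges must
	 * start at the same block position inside a page.
	 */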
	/* The start offsets must be aligned identically within a page */
	if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
	    (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
		ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if ((orig_start >= EXT_MAX_BLOCKS) ||
	    (donor_start >= EXT_MAX_BLOCKS) ||
	    (*len > EXT_MAX_BLOCKS) ||
	    (donor_start + *len >= EXT_MAX_BLOCKS) ||
	    (orig_start + *len >= EXT_MAX_BLOCKS)) {
		ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n",
			EXT_MAX_BLOCKS,
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	orig_eof = EXT4_B_TO_LBLK(orig_inode, i_size_read(orig_inode));
	donor_eof = EXT4_B_TO_LBLK(donor_inode, i_size_read(donor_inode));
	if (orig_eof <= orig_start)
		*len = 0;
	else if (orig_eof < orig_start + *len - 1)
		*len = orig_eof - orig_start;
	if (donor_eof <= donor_start)
		*len = 0;
	else if (donor_eof < donor_start + *len - 1)
		*len = donor_eof - donor_start;
	if (!*len) {
		ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	return 0;
}

/**
 * ext4_move_extents - Exchange the specified range of a file
 *
 * @o_filp: file structure of the original file
 * @d_filp: file structure of the donor file
 * @orig_blk: start offset in blocks for orig
 * @donor_blk: start offset in blocks for donor
 * @len: the number of blocks to be moved
 * @moved_len: moved block length
 *
 * Return 0 on success, with the moved block length stored in @moved_len;
 * otherwise return a negative error value.
 */
int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
		      __u64 donor_blk, __u64 len, __u64 *moved_len)
{
	struct inode *orig_inode = file_inode(o_filp);
	struct inode *donor_inode = file_inode(d_filp);
	struct mext_data mext;
	struct super_block *sb = orig_inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int retries = 0;
	u64 m_len;
	int ret;

	*moved_len = 0;

	/* Protect orig and donor inodes against a truncate */
	lock_two_nondirectories(orig_inode, donor_inode);

	ret = mext_check_validity(orig_inode, donor_inode);
	if (ret)
		goto out;

	/* Wait for all existing dio workers */
	inode_dio_wait(orig_inode);
	inode_dio_wait(donor_inode);

	/* Check and adjust the specified move_extent range. */
	ret = mext_check_adjust_range(orig_inode, donor_inode, orig_blk,
				      donor_blk, &len);
	if (ret)
		goto out;

	mext.orig_inode = orig_inode;
	mext.donor_inode = donor_inode;
	while (len) {
		mext.orig_map.m_lblk = orig_blk;
		mext.orig_map.m_len = len;
		mext.orig_map.m_flags = 0;
		mext.donor_lblk = donor_blk;

		ret = ext4_map_blocks(NULL, orig_inode, &mext.orig_map, 0);
		if (ret < 0)
			goto out;

		/* Skip moving if it is a hole or a delalloc extent. */
		if (mext.orig_map.m_flags &
		    (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)) {
			ret = mext_move_extent(&mext, &m_len);
			*moved_len += m_len;
			if (!ret)
				goto next;

			/* The move failed, possibly after partial progress. */
			if (m_len) {
				orig_blk += m_len;
				donor_blk += m_len;
				len -= m_len;
			}
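			/*
			 * Retry the failed batch where it makes sense: a
			 * stale mapping (-ESTALE) is simply looked up
			 * again, -ENOSPC is retried through ext4's
			 * allocation retry logic, and -EBUSY is retried
			 * after forcing a journal commit to release the
			 * buffers pinned by the journal.
			 */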
			if (ret == -ESTALE)
				continue;
			if (ret == -ENOSPC &&
			    ext4_should_retry_alloc(sb, &retries))
				continue;
			if (ret == -EBUSY &&
			    sbi->s_journal && retries++ < 4 &&
			    jbd2_journal_force_commit_nested(sbi->s_journal))
				continue;

			goto out;
		}
next:
		orig_blk += mext.orig_map.m_len;
		donor_blk += mext.orig_map.m_len;
		len -= mext.orig_map.m_len;
		retries = 0;
	}

out:
	if (*moved_len) {
		ext4_discard_preallocations(orig_inode);
		ext4_discard_preallocations(donor_inode);
	}

	unlock_two_nondirectories(orig_inode, donor_inode);
	return ret;
}