Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
10#include "ext4.h"
11#include "ext4_jbd2.h"
12#include "ext4_extents.h"
13#include "mballoc.h"
14
15/*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
68 * Not all operations are supported by fast commits today (e.g extended
 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
71 *
72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73 * back to full commit. This is useful in case of transient errors.
74 *
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76 * the fast commits happening between ext4_fc_start_ineligible() and
77 * ext4_fc_stop_ineligible() and one fast commit after the call to
78 * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79 * make one more fast commit to fall back to full commit after stop call so
 * that it is guaranteed that the fast commit ineligible operation contained
81 * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82 * followed by at least 1 full commit.
83 *
84 * Atomicity of commits
85 * --------------------
86 * In order to guarantee atomicity during the commit operation, fast commit
87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88 * tag contains CRC of the contents and TID of the transaction after which
89 * this fast commit should be applied. Recovery code replays fast commit
90 * logs only if there's at least 1 valid tail present. For every fast commit
91 * operation, there is 1 tail. This means, we may end up with multiple tails
92 * in the fast commit space. Here's an example:
93 *
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
99 *
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
103 *
104 * Replay code should thus check for all the valid tails in the FC area.
105 *
106 * Fast Commit Replay Idempotence
107 * ------------------------------
108 *
109 * Fast commits tags are idempotent in nature provided the recovery code follows
110 * certain rules. The guiding principle that the commit path follows while
111 * committing is that it stores the result of a particular operation instead of
112 * storing the procedure.
113 *
114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
115 * was associated with inode 10. During fast commit, instead of storing this
116 * operation as a procedure "rename a to b", we store the resulting file system
117 * state as a "series" of outcomes:
118 *
119 * - Link dirent b to inode 10
120 * - Unlink dirent a
121 * - Inode <10> with valid refcount
122 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
124 * system. This is what guarantees idempotence of fast commit replay.
125 *
126 * Let's take an example of a procedure that is not idempotent and see how fast
127 * commits make it idempotent. Consider following sequence of operations:
128 *
129 * rm A; mv B A; read A
130 * (x) (y) (z)
131 *
132 * (x), (y) and (z) are the points at which we can crash. If we store this
133 * sequence of operations as is then the replay is not idempotent. Let's say
134 * while in replay, we crash at (z). During the second replay, file A (which was
135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
136 * file named A would be absent when we try to read A. So, this sequence of
137 * operations is not idempotent. However, as mentioned above, instead of storing
138 * the procedure fast commits store the outcome of each procedure. Thus the fast
139 * commit log for above procedure would be as follows:
140 *
141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
142 * inode 11 before the replay)
143 *
144 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
145 * (w) (x) (y) (z)
146 *
147 * If we crash at (z), we will have file A linked to inode 11. During the second
148 * replay, we will remove file A (inode 11). But we will create it back and make
149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
152 * similarly. Thus, by converting a non-idempotent procedure into a series of
153 * idempotent outcomes, fast commits ensured idempotence during the replay.
154 *
155 * TODOs
156 * -----
157 *
158 * 0) Fast commit replay path hardening: Fast commit replay code should use
159 * journal handles to make sure all the updates it does during the replay
160 * path are atomic. With that if we crash during fast commit replay, after
161 * trying to do recovery again, we will find a file system where fast commit
162 * area is invalid (because new full commit would be found). In order to deal
163 * with that, fast commit replay code should ensure that the "FC_REPLAY"
164 * superblock state is persisted before starting the replay, so that after
165 * the crash, fast commit recovery code can look at that flag and perform
166 * fast commit recovery even if that area is invalidated by later full
167 * commits.
168 *
169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
170 * eligible update must be protected within ext4_fc_start_update() and
171 * ext4_fc_stop_update(). These routines are called at much higher
172 * routines. This can be made more fine grained by combining with
173 * ext4_journal_start().
174 *
175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
176 *
177 * 3) Handle more ineligible cases.
178 */
179
180#include <trace/events/ext4.h>
181static struct kmem_cache *ext4_fc_dentry_cachep;
182
183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
184{
185 BUFFER_TRACE(bh, "");
186 if (uptodate) {
187 ext4_debug("%s: Block %lld up-to-date",
188 __func__, bh->b_blocknr);
189 set_buffer_uptodate(bh);
190 } else {
191 ext4_debug("%s: Block %lld not up-to-date",
192 __func__, bh->b_blocknr);
193 clear_buffer_uptodate(bh);
194 }
195
196 unlock_buffer(bh);
197}
198
199static inline void ext4_fc_reset_inode(struct inode *inode)
200{
201 struct ext4_inode_info *ei = EXT4_I(inode);
202
203 ei->i_fc_lblk_start = 0;
204 ei->i_fc_lblk_len = 0;
205}
206
207void ext4_fc_init_inode(struct inode *inode)
208{
209 struct ext4_inode_info *ei = EXT4_I(inode);
210
211 ext4_fc_reset_inode(inode);
212 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
213 INIT_LIST_HEAD(&ei->i_fc_list);
214 init_waitqueue_head(&ei->i_fc_wait);
215 atomic_set(&ei->i_fc_updates, 0);
216}
217
/*
 * Sleep until @inode finishes its in-flight fast commit.
 *
 * This function must be called with sbi->s_fc_lock held; it drops that
 * lock before sleeping (see the __releases annotation) and returns with
 * the lock released, so callers must re-take it and restart their scan.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	/* On 32-bit builds the EXT4_STATE_* bits live in i_state_flags. */
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	/* On 64-bit builds the state bits are packed into i_flags. */
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Queue ourselves before dropping the lock to avoid a lost wakeup. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
242
/*
 * Inform Ext4's fast commit machinery about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing to do when fast commits are off or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* Drops s_fc_lock while sleeping, hence the restart. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	/* Bump the update count under s_fc_lock so the commit path sees it. */
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
271
272/*
273 * Stop inode update and wake up waiting fast commits if any.
274 */
275void ext4_fc_stop_update(struct inode *inode)
276{
277 struct ext4_inode_info *ei = EXT4_I(inode);
278
279 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
280 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
281 return;
282
283 if (atomic_dec_and_test(&ei->i_fc_updates))
284 wake_up_all(&ei->i_fc_wait);
285}
286
/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing tracked if fast commits are off or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* Drops s_fc_lock while waiting; re-take it and recheck. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
313
314/*
315 * Mark file system as fast commit ineligible. This means that next commit
316 * operation would result in a full jbd2 commit.
317 */
318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
319{
320 struct ext4_sb_info *sbi = EXT4_SB(sb);
321
322 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
323 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
324 return;
325
326 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
327 WARN_ON(reason >= EXT4_FC_REASON_MAX);
328 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
329}
330
331/*
332 * Start a fast commit ineligible update. Any commits that happen while
333 * such an operation is in progress fall back to full commits.
334 */
335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
336{
337 struct ext4_sb_info *sbi = EXT4_SB(sb);
338
339 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
340 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
341 return;
342
343 WARN_ON(reason >= EXT4_FC_REASON_MAX);
344 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
345 atomic_inc(&sbi->s_fc_ineligible_updates);
346}
347
348/*
349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
350 * to ensure that after stopping the ineligible update, at least one full
351 * commit takes place.
352 */
353void ext4_fc_stop_ineligible(struct super_block *sb)
354{
355 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
356 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
357 return;
358
359 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
361}
362
363static inline int ext4_fc_is_ineligible(struct super_block *sb)
364{
365 return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
366 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
367}
368
/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		/* Same transaction as last time: just update the tracking. */
		update = true;
	} else {
		/* New transaction: start tracking from a clean slate. */
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	/* __fc_track_fn may temporarily drop i_fc_lock (dentry variant). */
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	/*
	 * While a fast commit is running, newly tracked inodes go on the
	 * staging queue and are spliced onto the main queue afterwards.
	 */
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
421
/* Arguments passed via ext4_fc_track_template() to __track_dentry_update(). */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* dentry being created/linked/unlinked */
	int op;			/* EXT4_FC_TAG_{CREAT,LINK,UNLINK} */
};
426
/*
 * __track_fn for directory entry updates. Called with ei->i_fc_lock held;
 * the lock is temporarily dropped around the allocations below and
 * re-taken before returning, as ext4_fc_track_template() expects.
 */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		/* Cannot record this update; force a full commit instead. */
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		/* Long names don't fit inline; copy to a heap buffer. */
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		/* Short names are stored inline in fcd_iname. */
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	/* Queue on staging while a commit is in flight, else on main. */
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
477
478void __ext4_fc_track_unlink(handle_t *handle,
479 struct inode *inode, struct dentry *dentry)
480{
481 struct __track_dentry_update_args args;
482 int ret;
483
484 args.dentry = dentry;
485 args.op = EXT4_FC_TAG_UNLINK;
486
487 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
488 (void *)&args, 0);
489 trace_ext4_fc_track_unlink(inode, dentry, ret);
490}
491
/* Track an unlink; the affected inode is derived from the dentry itself. */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}
496
497void __ext4_fc_track_link(handle_t *handle,
498 struct inode *inode, struct dentry *dentry)
499{
500 struct __track_dentry_update_args args;
501 int ret;
502
503 args.dentry = dentry;
504 args.op = EXT4_FC_TAG_LINK;
505
506 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
507 (void *)&args, 0);
508 trace_ext4_fc_track_link(inode, dentry, ret);
509}
510
/* Track a link; the affected inode is derived from the dentry itself. */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}
515
516void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
517 struct dentry *dentry)
518{
519 struct __track_dentry_update_args args;
520 int ret;
521
522 args.dentry = dentry;
523 args.op = EXT4_FC_TAG_CREAT;
524
525 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
526 (void *)&args, 0);
527 trace_ext4_fc_track_create(inode, dentry, ret);
528}
529
/* Track a create; the affected inode is derived from the dentry itself. */
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}
534
535/* __track_fn for inode tracking */
536static int __track_inode(struct inode *inode, void *arg, bool update)
537{
538 if (update)
539 return -EEXIST;
540
541 EXT4_I(inode)->i_fc_lblk_len = 0;
542
543 return 0;
544}
545
/* Track @inode's metadata for the next fast commit. */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	/* Directories are covered by dentry tracking, not inode tracking. */
	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		/* data=journal inodes are not supported by fast commits. */
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}
562
/* Inclusive logical block range [start, end] handed to __track_range(). */
struct __track_range_args {
	ext4_lblk_t start, end;
};
566
/*
 * __track_fn for tracking data range updates. Called with ei->i_fc_lock
 * held. Maintains a single inclusive range [i_fc_lblk_start,
 * i_fc_lblk_start + i_fc_lblk_len - 1] covering all blocks tracked so far.
 */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		/* Reserved/special inodes are not fast commit eligible. */
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	/* Save the old start: it is overwritten before the length math below. */
	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		/* Grow the existing range to also cover the new one. */
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		/* First range in this transaction: take it as-is. */
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
594
595void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
596 ext4_lblk_t end)
597{
598 struct __track_range_args args;
599 int ret;
600
601 if (S_ISDIR(inode->i_mode))
602 return;
603
604 args.start = start;
605 args.end = end;
606
607 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
608
609 trace_ext4_fc_track_range(inode, start, end, ret);
610}
611
612static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
613{
614 int write_flags = REQ_SYNC;
615 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
616
617 /* Add REQ_FUA | REQ_PREFLUSH only its tail */
618 if (test_opt(sb, BARRIER) && is_tail)
619 write_flags |= REQ_FUA | REQ_PREFLUSH;
620 lock_buffer(bh);
621 set_buffer_dirty(bh);
622 set_buffer_uptodate(bh);
623 bh->b_end_io = ext4_end_buffer_io_sync;
624 submit_bh(REQ_OP_WRITE, write_flags, bh);
625 EXT4_SB(sb)->s_fc_bh = NULL;
626}
627
628/* Ext4 commit path routines */
629
630/* memzero and update CRC */
631static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
632 u32 *crc)
633{
634 void *ret;
635
636 ret = memset(dst, 0, len);
637 if (crc)
638 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
639 return ret;
640}
641
/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and CRC is updated to reflect the
 * padding we added. Returns NULL when the request cannot be satisfied.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag to mark the rest of this block as unused. */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	/* Flush the now-exhausted block; not a tail write. */
	ext4_fc_submit_bh(sb, false);

	/* Start a fresh block from jbd2 and place this request there. */
	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
701
702/* memcpy to fc reserved space and update CRC */
703static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
704 int len, u32 *crc)
705{
706 if (crc)
707 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
708 return memcpy(dst, src, len);
709}
710
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	/* The tail's length claims everything up to the end of the block. */
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/* The CRC covers everything before the CRC field itself. */
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	/* Tail write: submitted with is_tail=true for durability flags. */
	ext4_fc_submit_bh(sb, true);

	return 0;
}
753
754/*
755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
756 * Returns false if there's not enough space.
757 */
758static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
759 u32 *crc)
760{
761 struct ext4_fc_tl tl;
762 u8 *dst;
763
764 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
765 if (!dst)
766 return false;
767
768 tl.fc_tag = cpu_to_le16(tag);
769 tl.fc_len = cpu_to_le16(len);
770
771 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
772 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
773
774 return true;
775}
776
777/* Same as above, but adds dentry tlv. */
778static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
779 int parent_ino, int ino, int dlen,
780 const unsigned char *dname,
781 u32 *crc)
782{
783 struct ext4_fc_dentry_info fcd;
784 struct ext4_fc_tl tl;
785 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
786 crc);
787
788 if (!dst)
789 return false;
790
791 fcd.fc_parent_ino = cpu_to_le32(parent_ino);
792 fcd.fc_ino = cpu_to_le32(ino);
793 tl.fc_tag = cpu_to_le16(tag);
794 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
795 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
796 dst += sizeof(tl);
797 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
798 dst += sizeof(fcd);
799 ext4_fc_memcpy(sb, dst, dname, dlen, crc);
800 dst += dlen;
801
802 return true;
803}
804
/*
 * Writes inode in the fast commit space under TLV with tag
 * EXT4_FC_TAG_INODE. The value is the inode number followed by the raw
 * on-disk inode image. Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/* Include the extra fields when the on-disk inode is larger. */
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	/* Copy the raw inode straight out of the inode table buffer. */
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
847
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 *
 * Every mapped chunk of the tracked range produces an ADD_RANGE tag;
 * holes produce DEL_RANGE tags so replay can remove stale blocks.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	/* Snapshot and reset the tracked range under i_fc_lock. */
	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		/* Lookup only (NULL handle): resolve the existing mapping. */
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* Hole: record a deletion range. */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			/* Mapped: record the extent, keeping unwritten state. */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
915
916
/*
 * Submit data for all the fast commit inodes. Marks the filesystem and
 * each queued inode as COMMITTING, waits for in-flight updates on each
 * inode to drain, then kicks off its data writeback via jbd2.
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	/* From here on, new trackings are diverted to the staging queues. */
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			/* Re-check after queuing to avoid a lost wakeup. */
			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/* Drop the spinlock around the possibly blocking submission. */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
951
/*
 * Wait for completion of data writeback for all the fast commit inodes
 * that were marked COMMITTING by ext4_fc_submit_inode_data_all().
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/* Drop the spinlock while blocking on the inode's data I/O. */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
976
/*
 * Commit all the directory entry updates.
 *
 * Walks the main dentry-update queue and emits a TLV for each operation.
 * For non-CREAT ops only the dentry TLV is written.  For CREAT, the inode
 * and its data ranges are written first, then the dentry TLV, so that replay
 * can create an unnamed inode and link it afterwards (see comment below).
 *
 * Called with sbi->s_fc_lock held; the lock is dropped while writing TLVs
 * (which may block) and re-taken before continuing the walk, and it is held
 * again on return regardless of success or failure.
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			/* LINK/UNLINK: just record the dentry operation. */
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				    sb, fc_dentry->fcd_op,
				    fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				    fc_dentry->fcd_name.len,
				    fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* CREAT: find the inode this dentry refers to. */
		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record it's create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			    sb, fc_dentry->fcd_op,
			    fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			    fc_dentry->fcd_name.len,
			    fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Re-take the lock so the caller's unlock remains balanced. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
1054
1055static int ext4_fc_perform_commit(journal_t *journal)
1056{
1057 struct super_block *sb = (struct super_block *)(journal->j_private);
1058 struct ext4_sb_info *sbi = EXT4_SB(sb);
1059 struct ext4_inode_info *iter;
1060 struct ext4_fc_head head;
1061 struct inode *inode;
1062 struct blk_plug plug;
1063 int ret = 0;
1064 u32 crc = 0;
1065
1066 ret = ext4_fc_submit_inode_data_all(journal);
1067 if (ret)
1068 return ret;
1069
1070 ret = ext4_fc_wait_inode_data_all(journal);
1071 if (ret)
1072 return ret;
1073
1074 /*
1075 * If file system device is different from journal device, issue a cache
1076 * flush before we start writing fast commit blocks.
1077 */
1078 if (journal->j_fs_dev != journal->j_dev)
1079 blkdev_issue_flush(journal->j_fs_dev);
1080
1081 blk_start_plug(&plug);
1082 if (sbi->s_fc_bytes == 0) {
1083 /*
1084 * Add a head tag only if this is the first fast commit
1085 * in this TID.
1086 */
1087 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1088 head.fc_tid = cpu_to_le32(
1089 sbi->s_journal->j_running_transaction->t_tid);
1090 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1091 (u8 *)&head, &crc))
1092 goto out;
1093 }
1094
1095 spin_lock(&sbi->s_fc_lock);
1096 ret = ext4_fc_commit_dentry_updates(journal, &crc);
1097 if (ret) {
1098 spin_unlock(&sbi->s_fc_lock);
1099 goto out;
1100 }
1101
1102 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1103 inode = &iter->vfs_inode;
1104 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1105 continue;
1106
1107 spin_unlock(&sbi->s_fc_lock);
1108 ret = ext4_fc_write_inode_data(inode, &crc);
1109 if (ret)
1110 goto out;
1111 ret = ext4_fc_write_inode(inode, &crc);
1112 if (ret)
1113 goto out;
1114 spin_lock(&sbi->s_fc_lock);
1115 }
1116 spin_unlock(&sbi->s_fc_lock);
1117
1118 ret = ext4_fc_write_tail(sb, crc);
1119
1120out:
1121 blk_finish_plug(&plug);
1122 return ret;
1123}
1124
1125/*
1126 * The main commit entry point. Performs a fast commit for transaction
1127 * commit_tid if needed. If it's not possible to perform a fast commit
1128 * due to various reasons, we fall back to full commit. Returns 0
1129 * on success, error otherwise.
1130 */
1131int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1132{
1133 struct super_block *sb = (struct super_block *)(journal->j_private);
1134 struct ext4_sb_info *sbi = EXT4_SB(sb);
1135 int nblks = 0, ret, bsize = journal->j_blocksize;
1136 int subtid = atomic_read(&sbi->s_fc_subtid);
1137 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1138 ktime_t start_time, commit_time;
1139
1140 trace_ext4_fc_commit_start(sb);
1141
1142 start_time = ktime_get();
1143
1144 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1145 (ext4_fc_is_ineligible(sb))) {
1146 reason = EXT4_FC_REASON_INELIGIBLE;
1147 goto out;
1148 }
1149
1150restart_fc:
1151 ret = jbd2_fc_begin_commit(journal, commit_tid);
1152 if (ret == -EALREADY) {
1153 /* There was an ongoing commit, check if we need to restart */
1154 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1155 commit_tid > journal->j_commit_sequence)
1156 goto restart_fc;
1157 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1158 goto out;
1159 } else if (ret) {
1160 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1161 reason = EXT4_FC_REASON_FC_START_FAILED;
1162 goto out;
1163 }
1164
1165 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1166 ret = ext4_fc_perform_commit(journal);
1167 if (ret < 0) {
1168 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1169 reason = EXT4_FC_REASON_FC_FAILED;
1170 goto out;
1171 }
1172 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1173 ret = jbd2_fc_wait_bufs(journal, nblks);
1174 if (ret < 0) {
1175 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1176 reason = EXT4_FC_REASON_FC_FAILED;
1177 goto out;
1178 }
1179 atomic_inc(&sbi->s_fc_subtid);
1180 jbd2_fc_end_commit(journal);
1181out:
1182 /* Has any ineligible update happened since we started? */
1183 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1184 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1185 reason = EXT4_FC_REASON_INELIGIBLE;
1186 }
1187
1188 spin_lock(&sbi->s_fc_lock);
1189 if (reason != EXT4_FC_REASON_OK &&
1190 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1191 sbi->s_fc_stats.fc_ineligible_commits++;
1192 } else {
1193 sbi->s_fc_stats.fc_num_commits++;
1194 sbi->s_fc_stats.fc_numblks += nblks;
1195 }
1196 spin_unlock(&sbi->s_fc_lock);
1197 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1198 trace_ext4_fc_commit_stop(sb, nblks, reason);
1199 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1200 /*
1201 * weight the commit time higher than the average time so we don't
1202 * react too strongly to vast changes in the commit time
1203 */
1204 if (likely(sbi->s_fc_avg_commit_time))
1205 sbi->s_fc_avg_commit_time = (commit_time +
1206 sbi->s_fc_avg_commit_time * 3) / 4;
1207 else
1208 sbi->s_fc_avg_commit_time = commit_time;
1209 jbd_debug(1,
1210 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1211 nblks, reason, subtid);
1212 if (reason == EXT4_FC_REASON_FC_FAILED)
1213 return jbd2_fc_end_commit_fallback(journal);
1214 if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1215 reason == EXT4_FC_REASON_INELIGIBLE)
1216 return jbd2_complete_transaction(journal, commit_tid);
1217 return 0;
1218}
1219
1220/*
1221 * Fast commit cleanup routine. This is called after every fast commit and
1222 * full commit. full is true if we are called after a full commit.
1223 */
1224static void ext4_fc_cleanup(journal_t *journal, int full)
1225{
1226 struct super_block *sb = journal->j_private;
1227 struct ext4_sb_info *sbi = EXT4_SB(sb);
1228 struct ext4_inode_info *iter, *iter_n;
1229 struct ext4_fc_dentry_update *fc_dentry;
1230
1231 if (full && sbi->s_fc_bh)
1232 sbi->s_fc_bh = NULL;
1233
1234 jbd2_fc_release_bufs(journal);
1235
1236 spin_lock(&sbi->s_fc_lock);
1237 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1238 i_fc_list) {
1239 list_del_init(&iter->i_fc_list);
1240 ext4_clear_inode_state(&iter->vfs_inode,
1241 EXT4_STATE_FC_COMMITTING);
1242 ext4_fc_reset_inode(&iter->vfs_inode);
1243 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1244 smp_mb();
1245#if (BITS_PER_LONG < 64)
1246 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1247#else
1248 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1249#endif
1250 }
1251
1252 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1253 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1254 struct ext4_fc_dentry_update,
1255 fcd_list);
1256 list_del_init(&fc_dentry->fcd_list);
1257 spin_unlock(&sbi->s_fc_lock);
1258
1259 if (fc_dentry->fcd_name.name &&
1260 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1261 kfree(fc_dentry->fcd_name.name);
1262 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1263 spin_lock(&sbi->s_fc_lock);
1264 }
1265
1266 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1267 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1268 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1269 &sbi->s_fc_q[FC_Q_MAIN]);
1270
1271 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1272 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1273
1274 if (full)
1275 sbi->s_fc_bytes = 0;
1276 spin_unlock(&sbi->s_fc_lock);
1277 trace_ext4_fc_stats(sb);
1278}
1279
1280/* Ext4 Replay Path Routines */
1281
/*
 * Helper struct for dentry replay routines.  Filled from an on-disk dentry
 * TLV by tl_to_darg(): parent directory inode, dentry name length, target
 * inode number, and (where used) the inode record length.
 */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	/* Name bytes; points into the TLV value buffer, not a copy. */
	char *dname;
};
1287
1288static inline void tl_to_darg(struct dentry_info_args *darg,
1289 struct ext4_fc_tl *tl)
1290{
1291 struct ext4_fc_dentry_info *fcd;
1292
1293 fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1294
1295 darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1296 darg->ino = le32_to_cpu(fcd->fc_ino);
1297 darg->dname = fcd->fc_dname;
1298 darg->dname_len = ext4_fc_tag_len(tl) -
1299 sizeof(struct ext4_fc_dentry_info);
1300}
1301
/*
 * Unlink replay function.
 *
 * Re-applies an EXT4_FC_TAG_UNLINK record: looks up the target inode and
 * its parent directory and removes the directory entry via __ext4_unlink().
 * Missing inodes and already-removed entries are tolerated (return 0), since
 * the on-disk state may already reflect the operation.
 */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	/* NULL handle: replay runs outside any journal transaction. */
	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT ok coz it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}
1340
/*
 * Link @inode into its parent directory under the name described by @darg.
 *
 * Obtains the parent directory, builds a dentry for the name, and calls
 * __ext4_link().  -EEXIST is tolerated: the link may already be on disk
 * (dir data persisted before the crash) or from a previous partial replay.
 * Lookup failures of the parent are treated as success (nothing to do).
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	/* d_obtain_alias() consumes our reference on dir on success. */
	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	/*
	 * Drop whichever references we still own: the dir ref was either
	 * transferred to dentry_dir or must be put directly.
	 */
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1398
1399/* Link replay function */
1400static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1401{
1402 struct inode *inode;
1403 struct dentry_info_args darg;
1404 int ret = 0;
1405
1406 tl_to_darg(&darg, tl);
1407 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1408 darg.parent_ino, darg.dname_len);
1409
1410 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1411 if (IS_ERR(inode)) {
1412 jbd_debug(1, "Inode not found.");
1413 return 0;
1414 }
1415
1416 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1417 iput(inode);
1418 return ret;
1419}
1420
1421/*
1422 * Record all the modified inodes during replay. We use this later to setup
1423 * block bitmaps correctly.
1424 */
1425static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1426{
1427 struct ext4_fc_replay_state *state;
1428 int i;
1429
1430 state = &EXT4_SB(sb)->s_fc_replay_state;
1431 for (i = 0; i < state->fc_modified_inodes_used; i++)
1432 if (state->fc_modified_inodes[i] == ino)
1433 return 0;
1434 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1435 state->fc_modified_inodes_size +=
1436 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1437 state->fc_modified_inodes = krealloc(
1438 state->fc_modified_inodes, sizeof(int) *
1439 state->fc_modified_inodes_size,
1440 GFP_KERNEL);
1441 if (!state->fc_modified_inodes)
1442 return -ENOMEM;
1443 }
1444 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1445 return 0;
1446}
1447
1448/*
1449 * Inode replay function
1450 */
1451static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1452{
1453 struct ext4_fc_inode *fc_inode;
1454 struct ext4_inode *raw_inode;
1455 struct ext4_inode *raw_fc_inode;
1456 struct inode *inode = NULL;
1457 struct ext4_iloc iloc;
1458 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1459 struct ext4_extent_header *eh;
1460
1461 fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1462
1463 ino = le32_to_cpu(fc_inode->fc_ino);
1464 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1465
1466 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1467 if (!IS_ERR(inode)) {
1468 ext4_ext_clear_bb(inode);
1469 iput(inode);
1470 }
1471 inode = NULL;
1472
1473 ext4_fc_record_modified_inode(sb, ino);
1474
1475 raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1476 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1477 if (ret)
1478 goto out;
1479
1480 inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1481 raw_inode = ext4_raw_inode(&iloc);
1482
1483 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1484 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1485 inode_len - offsetof(struct ext4_inode, i_generation));
1486 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1487 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1488 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1489 memset(eh, 0, sizeof(*eh));
1490 eh->eh_magic = EXT4_EXT_MAGIC;
1491 eh->eh_max = cpu_to_le16(
1492 (sizeof(raw_inode->i_block) -
1493 sizeof(struct ext4_extent_header))
1494 / sizeof(struct ext4_extent));
1495 }
1496 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1497 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1498 sizeof(raw_inode->i_block));
1499 }
1500
1501 /* Immediately update the inode on disk. */
1502 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1503 if (ret)
1504 goto out;
1505 ret = sync_dirty_buffer(iloc.bh);
1506 if (ret)
1507 goto out;
1508 ret = ext4_mark_inode_used(sb, ino);
1509 if (ret)
1510 goto out;
1511
1512 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1513 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1514 if (IS_ERR(inode)) {
1515 jbd_debug(1, "Inode not found.");
1516 return -EFSCORRUPTED;
1517 }
1518
1519 /*
1520 * Our allocator could have made different decisions than before
1521 * crashing. This should be fixed but until then, we calculate
1522 * the number of blocks the inode.
1523 */
1524 ext4_ext_replay_set_iblocks(inode);
1525
1526 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1527 ext4_reset_inode_seed(inode);
1528
1529 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1530 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1531 sync_dirty_buffer(iloc.bh);
1532 brelse(iloc.bh);
1533out:
1534 iput(inode);
1535 if (!ret)
1536 blkdev_issue_flush(sb->s_bdev);
1537
1538 return 0;
1539}
1540
1541/*
1542 * Dentry create replay function.
1543 *
1544 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1545 * inode for which we are trying to create a dentry here, should already have
1546 * been replayed before we start here.
1547 */
1548static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1549{
1550 int ret = 0;
1551 struct inode *inode = NULL;
1552 struct inode *dir = NULL;
1553 struct dentry_info_args darg;
1554
1555 tl_to_darg(&darg, tl);
1556
1557 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1558 darg.parent_ino, darg.dname_len);
1559
1560 /* This takes care of update group descriptor and other metadata */
1561 ret = ext4_mark_inode_used(sb, darg.ino);
1562 if (ret)
1563 goto out;
1564
1565 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1566 if (IS_ERR(inode)) {
1567 jbd_debug(1, "inode %d not found.", darg.ino);
1568 inode = NULL;
1569 ret = -EINVAL;
1570 goto out;
1571 }
1572
1573 if (S_ISDIR(inode->i_mode)) {
1574 /*
1575 * If we are creating a directory, we need to make sure that the
1576 * dot and dot dot dirents are setup properly.
1577 */
1578 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1579 if (IS_ERR(dir)) {
1580 jbd_debug(1, "Dir %d not found.", darg.ino);
1581 goto out;
1582 }
1583 ret = ext4_init_new_dir(NULL, dir, inode);
1584 iput(dir);
1585 if (ret) {
1586 ret = 0;
1587 goto out;
1588 }
1589 }
1590 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1591 if (ret)
1592 goto out;
1593 set_nlink(inode, 1);
1594 ext4_mark_inode_dirty(NULL, inode);
1595out:
1596 if (inode)
1597 iput(inode);
1598 return ret;
1599}
1600
1601/*
1602 * Record physical disk regions which are in use as per fast commit area. Our
1603 * simple replay phase allocator excludes these regions from allocation.
1604 */
1605static int ext4_fc_record_regions(struct super_block *sb, int ino,
1606 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1607{
1608 struct ext4_fc_replay_state *state;
1609 struct ext4_fc_alloc_region *region;
1610
1611 state = &EXT4_SB(sb)->s_fc_replay_state;
1612 if (state->fc_regions_used == state->fc_regions_size) {
1613 state->fc_regions_size +=
1614 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1615 state->fc_regions = krealloc(
1616 state->fc_regions,
1617 state->fc_regions_size *
1618 sizeof(struct ext4_fc_alloc_region),
1619 GFP_KERNEL);
1620 if (!state->fc_regions)
1621 return -ENOMEM;
1622 }
1623 region = &state->fc_regions[state->fc_regions_used++];
1624 region->ino = ino;
1625 region->lblk = lblk;
1626 region->pblk = pblk;
1627 region->len = len;
1628
1629 return 0;
1630}
1631
/*
 * Replay add range tag.
 *
 * Re-establishes the extent recorded in an EXT4_FC_TAG_ADD_RANGE TLV.  For
 * each sub-range of the recorded extent the current mapping is consulted:
 * unmapped ranges get a fresh extent inserted; ranges mapped to different
 * physical blocks are rewritten (and the old blocks freed in the in-memory
 * bitmaps); ranges mapped correctly only get their written/unwritten state
 * toggled if needed.  Errors are swallowed (return 0) — replay is best
 * effort per tag.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	/* Walk the recorded range; ext4_map_blocks() tells us how much of
	 * each iteration is contiguously mapped (or not). */
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1760
1761/* Replay DEL_RANGE tag */
1762static int
1763ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1764{
1765 struct inode *inode;
1766 struct ext4_fc_del_range *lrange;
1767 struct ext4_map_blocks map;
1768 ext4_lblk_t cur, remaining;
1769 int ret;
1770
1771 lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1772 cur = le32_to_cpu(lrange->fc_lblk);
1773 remaining = le32_to_cpu(lrange->fc_len);
1774
1775 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1776 le32_to_cpu(lrange->fc_ino), cur, remaining);
1777
1778 inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1779 if (IS_ERR(inode)) {
1780 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1781 return 0;
1782 }
1783
1784 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1785
1786 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1787 inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1788 le32_to_cpu(lrange->fc_len));
1789 while (remaining > 0) {
1790 map.m_lblk = cur;
1791 map.m_len = remaining;
1792
1793 ret = ext4_map_blocks(NULL, inode, &map, 0);
1794 if (ret < 0) {
1795 iput(inode);
1796 return 0;
1797 }
1798 if (ret > 0) {
1799 remaining -= ret;
1800 cur += ret;
1801 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1802 } else {
1803 remaining -= map.m_len;
1804 cur += map.m_len;
1805 }
1806 }
1807
1808 ret = ext4_punch_hole(inode,
1809 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1810 le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
1811 if (ret)
1812 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1813 ext4_ext_replay_shrink_inode(inode,
1814 i_size_read(inode) >> sb->s_blocksize_bits);
1815 ext4_mark_inode_dirty(NULL, inode);
1816 iput(inode);
1817
1818 return 0;
1819}
1820
/*
 * After all tags are replayed, walk every inode recorded as modified and
 * mark all of its mapped blocks — data blocks and the extent tree index
 * blocks along each path — as allocated in the in-memory bitmaps.  This
 * re-establishes a consistent allocation view after the best-effort
 * free/alloc toggling done during per-tag replay.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the whole logical address space of the inode. */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/* Mark the extent-tree index blocks too. */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
					map.m_len, 1);
			} else {
				/* Hole: advance past it (at least one block). */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1868
1869/*
1870 * Check if block is in excluded regions for block allocation. The simple
1871 * allocator that runs during replay phase is calls this function to see
1872 * if it is okay to use a block.
1873 */
1874bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1875{
1876 int i;
1877 struct ext4_fc_replay_state *state;
1878
1879 state = &EXT4_SB(sb)->s_fc_replay_state;
1880 for (i = 0; i < state->fc_regions_valid; i++) {
1881 if (state->fc_regions[i].ino == 0 ||
1882 state->fc_regions[i].len == 0)
1883 continue;
1884 if (blk >= state->fc_regions[i].pblk &&
1885 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1886 return true;
1887 }
1888 return false;
1889}
1890
1891/* Cleanup function called after replay */
1892void ext4_fc_replay_cleanup(struct super_block *sb)
1893{
1894 struct ext4_sb_info *sbi = EXT4_SB(sb);
1895
1896 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1897 kfree(sbi->s_fc_replay_state.fc_regions);
1898 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1899}
1900
1901/*
1902 * Recovery Scan phase handler
1903 *
1904 * This function is called during the scan phase and is responsible
1905 * for doing following things:
1906 * - Make sure the fast commit area has valid tags for replay
1907 * - Count number of tags that need to be replayed by the replay handler
1908 * - Verify CRC
1909 * - Create a list of excluded blocks for allocation during replay phase
1910 *
1911 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1912 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1913 * to indicate that scan has finished and JBD2 can now start replay phase.
1914 * It returns a negative error to indicate that there was an error. At the end
1915 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1916 * to indicate the number of tags that need to replayed during the replay phase.
1917 */
1918static int ext4_fc_replay_scan(journal_t *journal,
1919 struct buffer_head *bh, int off,
1920 tid_t expected_tid)
1921{
1922 struct super_block *sb = journal->j_private;
1923 struct ext4_sb_info *sbi = EXT4_SB(sb);
1924 struct ext4_fc_replay_state *state;
1925 int ret = JBD2_FC_REPLAY_CONTINUE;
1926 struct ext4_fc_add_range *ext;
1927 struct ext4_fc_tl *tl;
1928 struct ext4_fc_tail *tail;
1929 __u8 *start, *end;
1930 struct ext4_fc_head *head;
1931 struct ext4_extent *ex;
1932
1933 state = &sbi->s_fc_replay_state;
1934
1935 start = (u8 *)bh->b_data;
1936 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1937
1938 if (state->fc_replay_expected_off == 0) {
1939 state->fc_cur_tag = 0;
1940 state->fc_replay_num_tags = 0;
1941 state->fc_crc = 0;
1942 state->fc_regions = NULL;
1943 state->fc_regions_valid = state->fc_regions_used =
1944 state->fc_regions_size = 0;
1945 /* Check if we can stop early */
1946 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1947 != EXT4_FC_TAG_HEAD)
1948 return 0;
1949 }
1950
1951 if (off != state->fc_replay_expected_off) {
1952 ret = -EFSCORRUPTED;
1953 goto out_err;
1954 }
1955
1956 state->fc_replay_expected_off++;
1957 fc_for_each_tl(start, end, tl) {
1958 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1959 tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1960 switch (le16_to_cpu(tl->fc_tag)) {
1961 case EXT4_FC_TAG_ADD_RANGE:
1962 ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1963 ex = (struct ext4_extent *)&ext->fc_ex;
1964 ret = ext4_fc_record_regions(sb,
1965 le32_to_cpu(ext->fc_ino),
1966 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1967 ext4_ext_get_actual_len(ex));
1968 if (ret < 0)
1969 break;
1970 ret = JBD2_FC_REPLAY_CONTINUE;
1971 fallthrough;
1972 case EXT4_FC_TAG_DEL_RANGE:
1973 case EXT4_FC_TAG_LINK:
1974 case EXT4_FC_TAG_UNLINK:
1975 case EXT4_FC_TAG_CREAT:
1976 case EXT4_FC_TAG_INODE:
1977 case EXT4_FC_TAG_PAD:
1978 state->fc_cur_tag++;
1979 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1980 sizeof(*tl) + ext4_fc_tag_len(tl));
1981 break;
1982 case EXT4_FC_TAG_TAIL:
1983 state->fc_cur_tag++;
1984 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1985 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1986 sizeof(*tl) +
1987 offsetof(struct ext4_fc_tail,
1988 fc_crc));
1989 if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1990 le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1991 state->fc_replay_num_tags = state->fc_cur_tag;
1992 state->fc_regions_valid =
1993 state->fc_regions_used;
1994 } else {
1995 ret = state->fc_replay_num_tags ?
1996 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1997 }
1998 state->fc_crc = 0;
1999 break;
2000 case EXT4_FC_TAG_HEAD:
2001 head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
2002 if (le32_to_cpu(head->fc_features) &
2003 ~EXT4_FC_SUPPORTED_FEATURES) {
2004 ret = -EOPNOTSUPP;
2005 break;
2006 }
2007 if (le32_to_cpu(head->fc_tid) != expected_tid) {
2008 ret = JBD2_FC_REPLAY_STOP;
2009 break;
2010 }
2011 state->fc_cur_tag++;
2012 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
2013 sizeof(*tl) + ext4_fc_tag_len(tl));
2014 break;
2015 default:
2016 ret = state->fc_replay_num_tags ?
2017 JBD2_FC_REPLAY_STOP : -ECANCELED;
2018 }
2019 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2020 break;
2021 }
2022
2023out_err:
2024 trace_ext4_fc_replay_scan(sb, ret, off);
2025 return ret;
2026}
2027
2028/*
2029 * Main recovery path entry point.
2030 * The meaning of return codes is similar as above.
2031 */
2032static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2033 enum passtype pass, int off, tid_t expected_tid)
2034{
2035 struct super_block *sb = journal->j_private;
2036 struct ext4_sb_info *sbi = EXT4_SB(sb);
2037 struct ext4_fc_tl *tl;
2038 __u8 *start, *end;
2039 int ret = JBD2_FC_REPLAY_CONTINUE;
2040 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2041 struct ext4_fc_tail *tail;
2042
2043 if (pass == PASS_SCAN) {
2044 state->fc_current_pass = PASS_SCAN;
2045 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2046 }
2047
2048 if (state->fc_current_pass != pass) {
2049 state->fc_current_pass = pass;
2050 sbi->s_mount_state |= EXT4_FC_REPLAY;
2051 }
2052 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2053 jbd_debug(1, "Replay stops\n");
2054 ext4_fc_set_bitmaps_and_counters(sb);
2055 return 0;
2056 }
2057
2058#ifdef CONFIG_EXT4_DEBUG
2059 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2060 pr_warn("Dropping fc block %d because max_replay set\n", off);
2061 return JBD2_FC_REPLAY_STOP;
2062 }
2063#endif
2064
2065 start = (u8 *)bh->b_data;
2066 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2067
2068 fc_for_each_tl(start, end, tl) {
2069 if (state->fc_replay_num_tags == 0) {
2070 ret = JBD2_FC_REPLAY_STOP;
2071 ext4_fc_set_bitmaps_and_counters(sb);
2072 break;
2073 }
2074 jbd_debug(3, "Replay phase, tag:%s\n",
2075 tag2str(le16_to_cpu(tl->fc_tag)));
2076 state->fc_replay_num_tags--;
2077 switch (le16_to_cpu(tl->fc_tag)) {
2078 case EXT4_FC_TAG_LINK:
2079 ret = ext4_fc_replay_link(sb, tl);
2080 break;
2081 case EXT4_FC_TAG_UNLINK:
2082 ret = ext4_fc_replay_unlink(sb, tl);
2083 break;
2084 case EXT4_FC_TAG_ADD_RANGE:
2085 ret = ext4_fc_replay_add_range(sb, tl);
2086 break;
2087 case EXT4_FC_TAG_CREAT:
2088 ret = ext4_fc_replay_create(sb, tl);
2089 break;
2090 case EXT4_FC_TAG_DEL_RANGE:
2091 ret = ext4_fc_replay_del_range(sb, tl);
2092 break;
2093 case EXT4_FC_TAG_INODE:
2094 ret = ext4_fc_replay_inode(sb, tl);
2095 break;
2096 case EXT4_FC_TAG_PAD:
2097 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2098 ext4_fc_tag_len(tl), 0);
2099 break;
2100 case EXT4_FC_TAG_TAIL:
2101 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2102 ext4_fc_tag_len(tl), 0);
2103 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2104 WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2105 break;
2106 case EXT4_FC_TAG_HEAD:
2107 break;
2108 default:
2109 trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2110 ext4_fc_tag_len(tl), 0);
2111 ret = -ECANCELED;
2112 break;
2113 }
2114 if (ret < 0)
2115 break;
2116 ret = JBD2_FC_REPLAY_CONTINUE;
2117 }
2118 return ret;
2119}
2120
2121void ext4_fc_init(struct super_block *sb, journal_t *journal)
2122{
2123 /*
2124 * We set replay callback even if fast commit disabled because we may
2125 * could still have fast commit blocks that need to be replayed even if
2126 * fast commit has now been turned off.
2127 */
2128 journal->j_fc_replay_callback = ext4_fc_replay;
2129 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2130 return;
2131 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2132}
2133
/*
 * Human-readable labels for fast commit ineligibility reasons, printed by
 * ext4_fc_info_show(). Indexed by reason code, iterated up to
 * EXT4_FC_REASON_MAX there — keep the entries in sync (same count, same
 * order) with the EXT4_FC_REASON_* values.
 */
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2146
2147int ext4_fc_info_show(struct seq_file *seq, void *v)
2148{
2149 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2150 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2151 int i;
2152
2153 if (v != SEQ_START_TOKEN)
2154 return 0;
2155
2156 seq_printf(seq,
2157 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2158 stats->fc_num_commits, stats->fc_ineligible_commits,
2159 stats->fc_numblks,
2160 div_u64(sbi->s_fc_avg_commit_time, 1000));
2161 seq_puts(seq, "Ineligible reasons:\n");
2162 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2163 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2164 stats->fc_ineligible_reason_count[i]);
2165
2166 return 0;
2167}
2168
2169int __init ext4_fc_init_dentry_cache(void)
2170{
2171 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2172 SLAB_RECLAIM_ACCOUNT);
2173
2174 if (ext4_fc_dentry_cachep == NULL)
2175 return -ENOMEM;
2176
2177 return 0;
2178}