fs/ext4/fast_commit.c at v5.14 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / ext4 / fast_commit.c
at v5.14 63 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For the scenarios for which we currently
  24 * don't have replay code, fast commit falls back to full commits.
  25 * Fast commits record delta in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
  30 * - EXT4_FC_TAG_LINK		- records directory entry link
  31 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
  41 *				  during recovery. Note that iblocks field is
  42 *				  not replayed and instead derived during
  43 *				  replay.
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58 *     section for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
  60 *
  61 * All the inode updates must call ext4_fc_start_update() before starting an
  62 * update. If such an ongoing update is present, fast commit waits for it to
  63 * complete. The completion of such an update is marked by
  64 * ext4_fc_stop_update().
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
   68 * Not all operations are supported by fast commits today (e.g. extended
  69 * attributes). Fast commit ineligibility is marked by calling one of the
  70 * two following functions:
  71 *
  72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73 *   back to full commit. This is useful in case of transient errors.
  74 *
  75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76 *   the fast commits happening between ext4_fc_start_ineligible() and
  77 *   ext4_fc_stop_ineligible() and one fast commit after the call to
  78 *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
  79 *   make one more fast commit to fall back to full commit after stop call so
   80 *   that it is guaranteed that the fast commit ineligible operation contained
  81 *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
  82 *   followed by at least 1 full commit.
  83 *
  84 * Atomicity of commits
  85 * --------------------
  86 * In order to guarantee atomicity during the commit operation, fast commit
  87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88 * tag contains CRC of the contents and TID of the transaction after which
  89 * this fast commit should be applied. Recovery code replays fast commit
  90 * logs only if there's at least 1 valid tail present. For every fast commit
  91 * operation, there is 1 tail. This means, we may end up with multiple tails
  92 * in the fast commit space. Here's an example:
  93 *
  94 * - Create a new file A and remove existing file B
  95 * - fsync()
  96 * - Append contents to file A
  97 * - Truncate file A
  98 * - fsync()
  99 *
 100 * The fast commit space at the end of above operations would look like this:
 101 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103 *
 104 * Replay code should thus check for all the valid tails in the FC area.
 105 *
 106 * Fast Commit Replay Idempotence
 107 * ------------------------------
 108 *
  109 * Fast commit tags are idempotent in nature provided the recovery code follows
 110 * certain rules. The guiding principle that the commit path follows while
 111 * committing is that it stores the result of a particular operation instead of
 112 * storing the procedure.
 113 *
 114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115 * was associated with inode 10. During fast commit, instead of storing this
 116 * operation as a procedure "rename a to b", we store the resulting file system
 117 * state as a "series" of outcomes:
 118 *
 119 * - Link dirent b to inode 10
 120 * - Unlink dirent a
 121 * - Inode <10> with valid refcount
 122 *
  123 * Now when recovery code runs, it needs to "enforce" this state on the file
 124 * system. This is what guarantees idempotence of fast commit replay.
 125 *
 126 * Let's take an example of a procedure that is not idempotent and see how fast
 127 * commits make it idempotent. Consider following sequence of operations:
 128 *
 129 *     rm A;    mv B A;    read A
 130 *  (x)     (y)        (z)
 131 *
 132 * (x), (y) and (z) are the points at which we can crash. If we store this
 133 * sequence of operations as is then the replay is not idempotent. Let's say
 134 * while in replay, we crash at (z). During the second replay, file A (which was
 135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136 * file named A would be absent when we try to read A. So, this sequence of
 137 * operations is not idempotent. However, as mentioned above, instead of storing
 138 * the procedure fast commits store the outcome of each procedure. Thus the fast
 139 * commit log for above procedure would be as follows:
 140 *
 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142 * inode 11 before the replay)
 143 *
 144 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145 * (w)          (x)                    (y)          (z)
 146 *
 147 * If we crash at (z), we will have file A linked to inode 11. During the second
 148 * replay, we will remove file A (inode 11). But we will create it back and make
 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152 * similarly. Thus, by converting a non-idempotent procedure into a series of
 153 * idempotent outcomes, fast commits ensured idempotence during the replay.
 154 *
 155 * TODOs
 156 * -----
 157 *
 158 * 0) Fast commit replay path hardening: Fast commit replay code should use
 159 *    journal handles to make sure all the updates it does during the replay
 160 *    path are atomic. With that if we crash during fast commit replay, after
 161 *    trying to do recovery again, we will find a file system where fast commit
 162 *    area is invalid (because new full commit would be found). In order to deal
 163 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164 *    superblock state is persisted before starting the replay, so that after
 165 *    the crash, fast commit recovery code can look at that flag and perform
 166 *    fast commit recovery even if that area is invalidated by later full
 167 *    commits.
 168 *
 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170 *    eligible update must be protected within ext4_fc_start_update() and
 171 *    ext4_fc_stop_update(). These routines are called at much higher
 172 *    routines. This can be made more fine grained by combining with
 173 *    ext4_journal_start().
 174 *
 175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 176 *
 177 * 3) Handle more ineligible cases.
 178 */
 179
 180#include <trace/events/ext4.h>
 181static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184{
 185	BUFFER_TRACE(bh, "");
 186	if (uptodate) {
 187		ext4_debug("%s: Block %lld up-to-date",
 188			   __func__, bh->b_blocknr);
 189		set_buffer_uptodate(bh);
 190	} else {
 191		ext4_debug("%s: Block %lld not up-to-date",
 192			   __func__, bh->b_blocknr);
 193		clear_buffer_uptodate(bh);
 194	}
 195
 196	unlock_buffer(bh);
 197}
 198
 199static inline void ext4_fc_reset_inode(struct inode *inode)
 200{
 201	struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203	ei->i_fc_lblk_start = 0;
 204	ei->i_fc_lblk_len = 0;
 205}
 206
 207void ext4_fc_init_inode(struct inode *inode)
 208{
 209	struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211	ext4_fc_reset_inode(inode);
 212	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213	INIT_LIST_HEAD(&ei->i_fc_list);
 214	init_waitqueue_head(&ei->i_fc_wait);
 215	atomic_set(&ei->i_fc_updates, 0);
 216}
 217
 218/* This function must be called with sbi->s_fc_lock held. */
 219static void ext4_fc_wait_committing_inode(struct inode *inode)
 220__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221{
 222	wait_queue_head_t *wq;
 223	struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225#if (BITS_PER_LONG < 64)
 226	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227			EXT4_STATE_FC_COMMITTING);
 228	wq = bit_waitqueue(&ei->i_state_flags,
 229				EXT4_STATE_FC_COMMITTING);
 230#else
 231	DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232			EXT4_STATE_FC_COMMITTING);
 233	wq = bit_waitqueue(&ei->i_flags,
 234				EXT4_STATE_FC_COMMITTING);
 235#endif
 236	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239	schedule();
 240	finish_wait(wq, &wait.wq_entry);
 241}
 242
 243/*
 244 * Inform Ext4's fast about start of an inode update
 245 *
 246 * This function is called by the high level call VFS callbacks before
 247 * performing any inode update. This function blocks if there's an ongoing
 248 * fast commit on the inode in question.
 249 */
 250void ext4_fc_start_update(struct inode *inode)
 251{
 252	struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256		return;
 257
 258restart:
 259	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260	if (list_empty(&ei->i_fc_list))
 261		goto out;
 262
 263	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264		ext4_fc_wait_committing_inode(inode);
 265		goto restart;
 266	}
 267out:
 268	atomic_inc(&ei->i_fc_updates);
 269	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270}
 271
 272/*
 273 * Stop inode update and wake up waiting fast commits if any.
 274 */
 275void ext4_fc_stop_update(struct inode *inode)
 276{
 277	struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281		return;
 282
 283	if (atomic_dec_and_test(&ei->i_fc_updates))
 284		wake_up_all(&ei->i_fc_wait);
 285}
 286
 287/*
 288 * Remove inode from fast commit list. If the inode is being committed
 289 * we wait until inode commit is done.
 290 */
 291void ext4_fc_del(struct inode *inode)
 292{
 293	struct ext4_inode_info *ei = EXT4_I(inode);
 294
 295	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297		return;
 298
 299restart:
 300	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301	if (list_empty(&ei->i_fc_list)) {
 302		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303		return;
 304	}
 305
 306	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307		ext4_fc_wait_committing_inode(inode);
 308		goto restart;
 309	}
 310	list_del_init(&ei->i_fc_list);
 311	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312}
 313
 314/*
 315 * Mark file system as fast commit ineligible. This means that next commit
 316 * operation would result in a full jbd2 commit.
 317 */
 318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319{
 320	struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 324		return;
 325
 326	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329}
 330
 331/*
 332 * Start a fast commit ineligible update. Any commits that happen while
 333 * such an operation is in progress fall back to full commits.
 334 */
 335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336{
 337	struct ext4_sb_info *sbi = EXT4_SB(sb);
 338
 339	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341		return;
 342
 343	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345	atomic_inc(&sbi->s_fc_ineligible_updates);
 346}
 347
 348/*
 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350 * to ensure that after stopping the ineligible update, at least one full
 351 * commit takes place.
 352 */
 353void ext4_fc_stop_ineligible(struct super_block *sb)
 354{
 355	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357		return;
 358
 359	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361}
 362
 363static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364{
 365	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367}
 368
 369/*
 370 * Generic fast commit tracking function. If this is the first time this we are
 371 * called after a full commit, we initialize fast commit fields and then call
 372 * __fc_track_fn() with update = 0. If we have already been called after a full
 373 * commit, we pass update = 1. Based on that, the track function can determine
 374 * if it needs to track a field for the first time or if it needs to just
 375 * update the previously tracked value.
 376 *
 377 * If enqueue is set, this function enqueues the inode in fast commit list.
 378 */
 379static int ext4_fc_track_template(
 380	handle_t *handle, struct inode *inode,
 381	int (*__fc_track_fn)(struct inode *, void *, bool),
 382	void *args, int enqueue)
 383{
 384	bool update = false;
 385	struct ext4_inode_info *ei = EXT4_I(inode);
 386	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387	tid_t tid = 0;
 388	int ret;
 389
 390	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391	    (sbi->s_mount_state & EXT4_FC_REPLAY))
 392		return -EOPNOTSUPP;
 393
 394	if (ext4_fc_is_ineligible(inode->i_sb))
 395		return -EINVAL;
 396
 397	tid = handle->h_transaction->t_tid;
 398	mutex_lock(&ei->i_fc_lock);
 399	if (tid == ei->i_sync_tid) {
 400		update = true;
 401	} else {
 402		ext4_fc_reset_inode(inode);
 403		ei->i_sync_tid = tid;
 404	}
 405	ret = __fc_track_fn(inode, args, update);
 406	mutex_unlock(&ei->i_fc_lock);
 407
 408	if (!enqueue)
 409		return ret;
 410
 411	spin_lock(&sbi->s_fc_lock);
 412	if (list_empty(&EXT4_I(inode)->i_fc_list))
 413		list_add_tail(&EXT4_I(inode)->i_fc_list,
 414				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 415				&sbi->s_fc_q[FC_Q_STAGING] :
 416				&sbi->s_fc_q[FC_Q_MAIN]);
 417	spin_unlock(&sbi->s_fc_lock);
 418
 419	return ret;
 420}
 421
 422struct __track_dentry_update_args {
 423	struct dentry *dentry;
 424	int op;
 425};
 426
 427/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 428static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429{
 430	struct ext4_fc_dentry_update *node;
 431	struct ext4_inode_info *ei = EXT4_I(inode);
 432	struct __track_dentry_update_args *dentry_update =
 433		(struct __track_dentry_update_args *)arg;
 434	struct dentry *dentry = dentry_update->dentry;
 435	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 436
 437	mutex_unlock(&ei->i_fc_lock);
 438	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439	if (!node) {
 440		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441		mutex_lock(&ei->i_fc_lock);
 442		return -ENOMEM;
 443	}
 444
 445	node->fcd_op = dentry_update->op;
 446	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447	node->fcd_ino = inode->i_ino;
 448	if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450		if (!node->fcd_name.name) {
 451			kmem_cache_free(ext4_fc_dentry_cachep, node);
 452			ext4_fc_mark_ineligible(inode->i_sb,
 453				EXT4_FC_REASON_NOMEM);
 454			mutex_lock(&ei->i_fc_lock);
 455			return -ENOMEM;
 456		}
 457		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458			dentry->d_name.len);
 459	} else {
 460		memcpy(node->fcd_iname, dentry->d_name.name,
 461			dentry->d_name.len);
 462		node->fcd_name.name = node->fcd_iname;
 463	}
 464	node->fcd_name.len = dentry->d_name.len;
 465
 466	spin_lock(&sbi->s_fc_lock);
 467	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 468		list_add_tail(&node->fcd_list,
 469				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470	else
 471		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 472	spin_unlock(&sbi->s_fc_lock);
 473	mutex_lock(&ei->i_fc_lock);
 474
 475	return 0;
 476}
 477
 478void __ext4_fc_track_unlink(handle_t *handle,
 479		struct inode *inode, struct dentry *dentry)
 480{
 481	struct __track_dentry_update_args args;
 482	int ret;
 483
 484	args.dentry = dentry;
 485	args.op = EXT4_FC_TAG_UNLINK;
 486
 487	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488					(void *)&args, 0);
 489	trace_ext4_fc_track_unlink(inode, dentry, ret);
 490}
 491
 492void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493{
 494	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 495}
 496
 497void __ext4_fc_track_link(handle_t *handle,
 498	struct inode *inode, struct dentry *dentry)
 499{
 500	struct __track_dentry_update_args args;
 501	int ret;
 502
 503	args.dentry = dentry;
 504	args.op = EXT4_FC_TAG_LINK;
 505
 506	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507					(void *)&args, 0);
 508	trace_ext4_fc_track_link(inode, dentry, ret);
 509}
 510
 511void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512{
 513	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
 514}
 515
 516void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 517			  struct dentry *dentry)
 518{
 519	struct __track_dentry_update_args args;
 520	int ret;
 521
 522	args.dentry = dentry;
 523	args.op = EXT4_FC_TAG_CREAT;
 524
 525	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526					(void *)&args, 0);
 527	trace_ext4_fc_track_create(inode, dentry, ret);
 528}
 529
 530void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 531{
 532	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
 533}
 534
 535/* __track_fn for inode tracking */
 536static int __track_inode(struct inode *inode, void *arg, bool update)
 537{
 538	if (update)
 539		return -EEXIST;
 540
 541	EXT4_I(inode)->i_fc_lblk_len = 0;
 542
 543	return 0;
 544}
 545
 546void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 547{
 548	int ret;
 549
 550	if (S_ISDIR(inode->i_mode))
 551		return;
 552
 553	if (ext4_should_journal_data(inode)) {
 554		ext4_fc_mark_ineligible(inode->i_sb,
 555					EXT4_FC_REASON_INODE_JOURNAL_DATA);
 556		return;
 557	}
 558
 559	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 560	trace_ext4_fc_track_inode(inode, ret);
 561}
 562
 563struct __track_range_args {
 564	ext4_lblk_t start, end;
 565};
 566
 567/* __track_fn for tracking data updates */
 568static int __track_range(struct inode *inode, void *arg, bool update)
 569{
 570	struct ext4_inode_info *ei = EXT4_I(inode);
 571	ext4_lblk_t oldstart;
 572	struct __track_range_args *__arg =
 573		(struct __track_range_args *)arg;
 574
 575	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 576		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 577		return -ECANCELED;
 578	}
 579
 580	oldstart = ei->i_fc_lblk_start;
 581
 582	if (update && ei->i_fc_lblk_len > 0) {
 583		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 584		ei->i_fc_lblk_len =
 585			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 586				ei->i_fc_lblk_start + 1;
 587	} else {
 588		ei->i_fc_lblk_start = __arg->start;
 589		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 590	}
 591
 592	return 0;
 593}
 594
 595void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 596			 ext4_lblk_t end)
 597{
 598	struct __track_range_args args;
 599	int ret;
 600
 601	if (S_ISDIR(inode->i_mode))
 602		return;
 603
 604	args.start = start;
 605	args.end = end;
 606
 607	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 608
 609	trace_ext4_fc_track_range(inode, start, end, ret);
 610}
 611
 612static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 613{
 614	int write_flags = REQ_SYNC;
 615	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 616
 617	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
 618	if (test_opt(sb, BARRIER) && is_tail)
 619		write_flags |= REQ_FUA | REQ_PREFLUSH;
 620	lock_buffer(bh);
 621	set_buffer_dirty(bh);
 622	set_buffer_uptodate(bh);
 623	bh->b_end_io = ext4_end_buffer_io_sync;
 624	submit_bh(REQ_OP_WRITE, write_flags, bh);
 625	EXT4_SB(sb)->s_fc_bh = NULL;
 626}
 627
 628/* Ext4 commit path routines */
 629
 630/* memzero and update CRC */
 631static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 632				u32 *crc)
 633{
 634	void *ret;
 635
 636	ret = memset(dst, 0, len);
 637	if (crc)
 638		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 639	return ret;
 640}
 641
 642/*
 643 * Allocate len bytes on a fast commit buffer.
 644 *
 645 * During the commit time this function is used to manage fast commit
 646 * block space. We don't split a fast commit log onto different
 647 * blocks. So this function makes sure that if there's not enough space
 648 * on the current block, the remaining space in the current block is
 649 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 650 * new block is from jbd2 and CRC is updated to reflect the padding
 651 * we added.
 652 */
 653static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 654{
 655	struct ext4_fc_tl *tl;
 656	struct ext4_sb_info *sbi = EXT4_SB(sb);
 657	struct buffer_head *bh;
 658	int bsize = sbi->s_journal->j_blocksize;
 659	int ret, off = sbi->s_fc_bytes % bsize;
 660	int pad_len;
 661
 662	/*
 663	 * After allocating len, we should have space at least for a 0 byte
 664	 * padding.
 665	 */
 666	if (len + sizeof(struct ext4_fc_tl) > bsize)
 667		return NULL;
 668
 669	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 670		/*
 671		 * Only allocate from current buffer if we have enough space for
 672		 * this request AND we have space to add a zero byte padding.
 673		 */
 674		if (!sbi->s_fc_bh) {
 675			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 676			if (ret)
 677				return NULL;
 678			sbi->s_fc_bh = bh;
 679		}
 680		sbi->s_fc_bytes += len;
 681		return sbi->s_fc_bh->b_data + off;
 682	}
 683	/* Need to add PAD tag */
 684	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 685	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 686	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 687	tl->fc_len = cpu_to_le16(pad_len);
 688	if (crc)
 689		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 690	if (pad_len > 0)
 691		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 692	ext4_fc_submit_bh(sb, false);
 693
 694	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 695	if (ret)
 696		return NULL;
 697	sbi->s_fc_bh = bh;
 698	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 699	return sbi->s_fc_bh->b_data;
 700}
 701
 702/* memcpy to fc reserved space and update CRC */
 703static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 704				int len, u32 *crc)
 705{
 706	if (crc)
 707		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 708	return memcpy(dst, src, len);
 709}
 710
 711/*
 712 * Complete a fast commit by writing tail tag.
 713 *
 714 * Writing tail tag marks the end of a fast commit. In order to guarantee
 715 * atomicity, after writing tail tag, even if there's space remaining
 716 * in the block, next commit shouldn't use it. That's why tail tag
 717 * has the length as that of the remaining space on the block.
 718 */
 719static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 720{
 721	struct ext4_sb_info *sbi = EXT4_SB(sb);
 722	struct ext4_fc_tl tl;
 723	struct ext4_fc_tail tail;
 724	int off, bsize = sbi->s_journal->j_blocksize;
 725	u8 *dst;
 726
 727	/*
 728	 * ext4_fc_reserve_space takes care of allocating an extra block if
 729	 * there's no enough space on this block for accommodating this tail.
 730	 */
 731	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 732	if (!dst)
 733		return -ENOSPC;
 734
 735	off = sbi->s_fc_bytes % bsize;
 736
 737	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 738	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 739	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 740
 741	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 742	dst += sizeof(tl);
 743	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 744	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 745	dst += sizeof(tail.fc_tid);
 746	tail.fc_crc = cpu_to_le32(crc);
 747	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 748
 749	ext4_fc_submit_bh(sb, true);
 750
 751	return 0;
 752}
 753
 754/*
 755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 756 * Returns false if there's not enough space.
 757 */
 758static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 759			   u32 *crc)
 760{
 761	struct ext4_fc_tl tl;
 762	u8 *dst;
 763
 764	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 765	if (!dst)
 766		return false;
 767
 768	tl.fc_tag = cpu_to_le16(tag);
 769	tl.fc_len = cpu_to_le16(len);
 770
 771	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 772	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 773
 774	return true;
 775}
 776
 777/* Same as above, but adds dentry tlv. */
 778static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 779					int parent_ino, int ino, int dlen,
 780					const unsigned char *dname,
 781					u32 *crc)
 782{
 783	struct ext4_fc_dentry_info fcd;
 784	struct ext4_fc_tl tl;
 785	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 786					crc);
 787
 788	if (!dst)
 789		return false;
 790
 791	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 792	fcd.fc_ino = cpu_to_le32(ino);
 793	tl.fc_tag = cpu_to_le16(tag);
 794	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 795	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 796	dst += sizeof(tl);
 797	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 798	dst += sizeof(fcd);
 799	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 800	dst += dlen;
 801
 802	return true;
 803}
 804
 805/*
 806 * Writes inode in the fast commit space under TLV with tag @tag.
 807 * Returns 0 on success, error on failure.
 808 */
 809static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 810{
 811	struct ext4_inode_info *ei = EXT4_I(inode);
 812	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 813	int ret;
 814	struct ext4_iloc iloc;
 815	struct ext4_fc_inode fc_inode;
 816	struct ext4_fc_tl tl;
 817	u8 *dst;
 818
 819	ret = ext4_get_inode_loc(inode, &iloc);
 820	if (ret)
 821		return ret;
 822
 823	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 824		inode_len += ei->i_extra_isize;
 825
 826	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 827	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 828	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 829
 830	dst = ext4_fc_reserve_space(inode->i_sb,
 831			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 832	if (!dst)
 833		return -ECANCELED;
 834
 835	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 836		return -ECANCELED;
 837	dst += sizeof(tl);
 838	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 839		return -ECANCELED;
 840	dst += sizeof(fc_inode);
 841	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 842					inode_len, crc))
 843		return -ECANCELED;
 844
 845	return 0;
 846}
 847
 848/*
 849 * Writes updated data ranges for the inode in question. Updates CRC.
 850 * Returns 0 on success, error otherwise.
 851 */
 852static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 853{
 854	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 855	struct ext4_inode_info *ei = EXT4_I(inode);
 856	struct ext4_map_blocks map;
 857	struct ext4_fc_add_range fc_ext;
 858	struct ext4_fc_del_range lrange;
 859	struct ext4_extent *ex;
 860	int ret;
 861
 862	mutex_lock(&ei->i_fc_lock);
 863	if (ei->i_fc_lblk_len == 0) {
 864		mutex_unlock(&ei->i_fc_lock);
 865		return 0;
 866	}
 867	old_blk_size = ei->i_fc_lblk_start;
 868	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 869	ei->i_fc_lblk_len = 0;
 870	mutex_unlock(&ei->i_fc_lock);
 871
 872	cur_lblk_off = old_blk_size;
 873	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 874		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 875
 876	while (cur_lblk_off <= new_blk_size) {
 877		map.m_lblk = cur_lblk_off;
 878		map.m_len = new_blk_size - cur_lblk_off + 1;
 879		ret = ext4_map_blocks(NULL, inode, &map, 0);
 880		if (ret < 0)
 881			return -ECANCELED;
 882
 883		if (map.m_len == 0) {
 884			cur_lblk_off++;
 885			continue;
 886		}
 887
 888		if (ret == 0) {
 889			lrange.fc_ino = cpu_to_le32(inode->i_ino);
 890			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 891			lrange.fc_len = cpu_to_le32(map.m_len);
 892			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 893					    sizeof(lrange), (u8 *)&lrange, crc))
 894				return -ENOSPC;
 895		} else {
 896			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 897			ex = (struct ext4_extent *)&fc_ext.fc_ex;
 898			ex->ee_block = cpu_to_le32(map.m_lblk);
 899			ex->ee_len = cpu_to_le16(map.m_len);
 900			ext4_ext_store_pblock(ex, map.m_pblk);
 901			if (map.m_flags & EXT4_MAP_UNWRITTEN)
 902				ext4_ext_mark_unwritten(ex);
 903			else
 904				ext4_ext_mark_initialized(ex);
 905			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 906					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
 907				return -ENOSPC;
 908		}
 909
 910		cur_lblk_off += map.m_len;
 911	}
 912
 913	return 0;
 914}
 915
 916
 917/* Submit data for all the fast commit inodes */
 918static int ext4_fc_submit_inode_data_all(journal_t *journal)
 919{
 920	struct super_block *sb = (struct super_block *)(journal->j_private);
 921	struct ext4_sb_info *sbi = EXT4_SB(sb);
 922	struct ext4_inode_info *ei;
 923	int ret = 0;
 924
 925	spin_lock(&sbi->s_fc_lock);
 926	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 927	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 928		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 929		while (atomic_read(&ei->i_fc_updates)) {
 930			DEFINE_WAIT(wait);
 931
 932			prepare_to_wait(&ei->i_fc_wait, &wait,
 933						TASK_UNINTERRUPTIBLE);
 934			if (atomic_read(&ei->i_fc_updates)) {
 935				spin_unlock(&sbi->s_fc_lock);
 936				schedule();
 937				spin_lock(&sbi->s_fc_lock);
 938			}
 939			finish_wait(&ei->i_fc_wait, &wait);
 940		}
 941		spin_unlock(&sbi->s_fc_lock);
 942		ret = jbd2_submit_inode_data(ei->jinode);
 943		if (ret)
 944			return ret;
 945		spin_lock(&sbi->s_fc_lock);
 946	}
 947	spin_unlock(&sbi->s_fc_lock);
 948
 949	return ret;
 950}
 951
 952/* Wait for completion of data for all the fast commit inodes */
 953static int ext4_fc_wait_inode_data_all(journal_t *journal)
 954{
 955	struct super_block *sb = (struct super_block *)(journal->j_private);
 956	struct ext4_sb_info *sbi = EXT4_SB(sb);
 957	struct ext4_inode_info *pos, *n;
 958	int ret = 0;
 959
 960	spin_lock(&sbi->s_fc_lock);
 961	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 962		if (!ext4_test_inode_state(&pos->vfs_inode,
 963					   EXT4_STATE_FC_COMMITTING))
 964			continue;
 965		spin_unlock(&sbi->s_fc_lock);
 966
 967		ret = jbd2_wait_inode_data(journal, pos->jinode);
 968		if (ret)
 969			return ret;
 970		spin_lock(&sbi->s_fc_lock);
 971	}
 972	spin_unlock(&sbi->s_fc_lock);
 973
 974	return 0;
 975}
 976
 977/* Commit all the directory entry updates */
 978static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 979__acquires(&sbi->s_fc_lock)
 980__releases(&sbi->s_fc_lock)
 981{
 982	struct super_block *sb = (struct super_block *)(journal->j_private);
 983	struct ext4_sb_info *sbi = EXT4_SB(sb);
 984	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
 985	struct inode *inode;
 986	struct ext4_inode_info *ei, *ei_n;
 987	int ret;
 988
 989	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 990		return 0;
 991	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
 992				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 993		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
 994			spin_unlock(&sbi->s_fc_lock);
 995			if (!ext4_fc_add_dentry_tlv(
 996				sb, fc_dentry->fcd_op,
 997				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
 998				fc_dentry->fcd_name.len,
 999				fc_dentry->fcd_name.name, crc)) {
1000				ret = -ENOSPC;
1001				goto lock_and_exit;
1002			}
1003			spin_lock(&sbi->s_fc_lock);
1004			continue;
1005		}
1006
1007		inode = NULL;
1008		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1009					 i_fc_list) {
1010			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1011				inode = &ei->vfs_inode;
1012				break;
1013			}
1014		}
1015		/*
1016		 * If we don't find inode in our list, then it was deleted,
1017		 * in which case, we don't need to record it's create tag.
1018		 */
1019		if (!inode)
1020			continue;
1021		spin_unlock(&sbi->s_fc_lock);
1022
1023		/*
1024		 * We first write the inode and then the create dirent. This
1025		 * allows the recovery code to create an unnamed inode first
1026		 * and then link it to a directory entry. This allows us
1027		 * to use namei.c routines almost as is and simplifies
1028		 * the recovery code.
1029		 */
1030		ret = ext4_fc_write_inode(inode, crc);
1031		if (ret)
1032			goto lock_and_exit;
1033
1034		ret = ext4_fc_write_inode_data(inode, crc);
1035		if (ret)
1036			goto lock_and_exit;
1037
1038		if (!ext4_fc_add_dentry_tlv(
1039			sb, fc_dentry->fcd_op,
1040			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1041			fc_dentry->fcd_name.len,
1042			fc_dentry->fcd_name.name, crc)) {
1043			ret = -ENOSPC;
1044			goto lock_and_exit;
1045		}
1046
1047		spin_lock(&sbi->s_fc_lock);
1048	}
1049	return 0;
1050lock_and_exit:
1051	spin_lock(&sbi->s_fc_lock);
1052	return ret;
1053}
1054
1055static int ext4_fc_perform_commit(journal_t *journal)
1056{
1057	struct super_block *sb = (struct super_block *)(journal->j_private);
1058	struct ext4_sb_info *sbi = EXT4_SB(sb);
1059	struct ext4_inode_info *iter;
1060	struct ext4_fc_head head;
1061	struct inode *inode;
1062	struct blk_plug plug;
1063	int ret = 0;
1064	u32 crc = 0;
1065
1066	ret = ext4_fc_submit_inode_data_all(journal);
1067	if (ret)
1068		return ret;
1069
1070	ret = ext4_fc_wait_inode_data_all(journal);
1071	if (ret)
1072		return ret;
1073
1074	/*
1075	 * If file system device is different from journal device, issue a cache
1076	 * flush before we start writing fast commit blocks.
1077	 */
1078	if (journal->j_fs_dev != journal->j_dev)
1079		blkdev_issue_flush(journal->j_fs_dev);
1080
1081	blk_start_plug(&plug);
1082	if (sbi->s_fc_bytes == 0) {
1083		/*
1084		 * Add a head tag only if this is the first fast commit
1085		 * in this TID.
1086		 */
1087		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1088		head.fc_tid = cpu_to_le32(
1089			sbi->s_journal->j_running_transaction->t_tid);
1090		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1091			(u8 *)&head, &crc)) {
1092			ret = -ENOSPC;
1093			goto out;
1094		}
1095	}
1096
1097	spin_lock(&sbi->s_fc_lock);
1098	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1099	if (ret) {
1100		spin_unlock(&sbi->s_fc_lock);
1101		goto out;
1102	}
1103
1104	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1105		inode = &iter->vfs_inode;
1106		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1107			continue;
1108
1109		spin_unlock(&sbi->s_fc_lock);
1110		ret = ext4_fc_write_inode_data(inode, &crc);
1111		if (ret)
1112			goto out;
1113		ret = ext4_fc_write_inode(inode, &crc);
1114		if (ret)
1115			goto out;
1116		spin_lock(&sbi->s_fc_lock);
1117	}
1118	spin_unlock(&sbi->s_fc_lock);
1119
1120	ret = ext4_fc_write_tail(sb, crc);
1121
1122out:
1123	blk_finish_plug(&plug);
1124	return ret;
1125}
1126
1127/*
1128 * The main commit entry point. Performs a fast commit for transaction
1129 * commit_tid if needed. If it's not possible to perform a fast commit
1130 * due to various reasons, we fall back to full commit. Returns 0
1131 * on success, error otherwise.
1132 */
1133int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1134{
1135	struct super_block *sb = (struct super_block *)(journal->j_private);
1136	struct ext4_sb_info *sbi = EXT4_SB(sb);
1137	int nblks = 0, ret, bsize = journal->j_blocksize;
1138	int subtid = atomic_read(&sbi->s_fc_subtid);
1139	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1140	ktime_t start_time, commit_time;
1141
1142	trace_ext4_fc_commit_start(sb);
1143
1144	start_time = ktime_get();
1145
1146	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1147		(ext4_fc_is_ineligible(sb))) {
1148		reason = EXT4_FC_REASON_INELIGIBLE;
1149		goto out;
1150	}
1151
1152restart_fc:
1153	ret = jbd2_fc_begin_commit(journal, commit_tid);
1154	if (ret == -EALREADY) {
1155		/* There was an ongoing commit, check if we need to restart */
1156		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1157			commit_tid > journal->j_commit_sequence)
1158			goto restart_fc;
1159		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1160		goto out;
1161	} else if (ret) {
1162		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1163		reason = EXT4_FC_REASON_FC_START_FAILED;
1164		goto out;
1165	}
1166
1167	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1168	ret = ext4_fc_perform_commit(journal);
1169	if (ret < 0) {
1170		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1171		reason = EXT4_FC_REASON_FC_FAILED;
1172		goto out;
1173	}
1174	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1175	ret = jbd2_fc_wait_bufs(journal, nblks);
1176	if (ret < 0) {
1177		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1178		reason = EXT4_FC_REASON_FC_FAILED;
1179		goto out;
1180	}
1181	atomic_inc(&sbi->s_fc_subtid);
1182	jbd2_fc_end_commit(journal);
1183out:
1184	/* Has any ineligible update happened since we started? */
1185	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1186		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1187		reason = EXT4_FC_REASON_INELIGIBLE;
1188	}
1189
1190	spin_lock(&sbi->s_fc_lock);
1191	if (reason != EXT4_FC_REASON_OK &&
1192		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1193		sbi->s_fc_stats.fc_ineligible_commits++;
1194	} else {
1195		sbi->s_fc_stats.fc_num_commits++;
1196		sbi->s_fc_stats.fc_numblks += nblks;
1197	}
1198	spin_unlock(&sbi->s_fc_lock);
1199	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1200	trace_ext4_fc_commit_stop(sb, nblks, reason);
1201	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1202	/*
1203	 * weight the commit time higher than the average time so we don't
1204	 * react too strongly to vast changes in the commit time
1205	 */
1206	if (likely(sbi->s_fc_avg_commit_time))
1207		sbi->s_fc_avg_commit_time = (commit_time +
1208				sbi->s_fc_avg_commit_time * 3) / 4;
1209	else
1210		sbi->s_fc_avg_commit_time = commit_time;
1211	jbd_debug(1,
1212		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
1213		nblks, reason, subtid);
1214	if (reason == EXT4_FC_REASON_FC_FAILED)
1215		return jbd2_fc_end_commit_fallback(journal);
1216	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1217		reason == EXT4_FC_REASON_INELIGIBLE)
1218		return jbd2_complete_transaction(journal, commit_tid);
1219	return 0;
1220}
1221
1222/*
1223 * Fast commit cleanup routine. This is called after every fast commit and
1224 * full commit. full is true if we are called after a full commit.
1225 */
1226static void ext4_fc_cleanup(journal_t *journal, int full)
1227{
1228	struct super_block *sb = journal->j_private;
1229	struct ext4_sb_info *sbi = EXT4_SB(sb);
1230	struct ext4_inode_info *iter, *iter_n;
1231	struct ext4_fc_dentry_update *fc_dentry;
1232
1233	if (full && sbi->s_fc_bh)
1234		sbi->s_fc_bh = NULL;
1235
1236	jbd2_fc_release_bufs(journal);
1237
1238	spin_lock(&sbi->s_fc_lock);
1239	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1240				 i_fc_list) {
1241		list_del_init(&iter->i_fc_list);
1242		ext4_clear_inode_state(&iter->vfs_inode,
1243				       EXT4_STATE_FC_COMMITTING);
1244		ext4_fc_reset_inode(&iter->vfs_inode);
1245		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1246		smp_mb();
1247#if (BITS_PER_LONG < 64)
1248		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1249#else
1250		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1251#endif
1252	}
1253
1254	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1255		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1256					     struct ext4_fc_dentry_update,
1257					     fcd_list);
1258		list_del_init(&fc_dentry->fcd_list);
1259		spin_unlock(&sbi->s_fc_lock);
1260
1261		if (fc_dentry->fcd_name.name &&
1262			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1263			kfree(fc_dentry->fcd_name.name);
1264		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1265		spin_lock(&sbi->s_fc_lock);
1266	}
1267
1268	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1269				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1270	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1271				&sbi->s_fc_q[FC_Q_MAIN]);
1272
1273	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1274	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1275
1276	if (full)
1277		sbi->s_fc_bytes = 0;
1278	spin_unlock(&sbi->s_fc_lock);
1279	trace_ext4_fc_stats(sb);
1280}
1281
1282/* Ext4 Replay Path Routines */
1283
/*
 * Decoded form of a dentry TLV, shared by the dentry replay routines.
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of the name pointed to by dname */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the TLV value */
};
1289
1290static inline void tl_to_darg(struct dentry_info_args *darg,
1291			      struct  ext4_fc_tl *tl, u8 *val)
1292{
1293	struct ext4_fc_dentry_info fcd;
1294
1295	memcpy(&fcd, val, sizeof(fcd));
1296
1297	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1298	darg->ino = le32_to_cpu(fcd.fc_ino);
1299	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1300	darg->dname_len = le16_to_cpu(tl->fc_len) -
1301		sizeof(struct ext4_fc_dentry_info);
1302}
1303
1304/* Unlink replay function */
1305static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1306				 u8 *val)
1307{
1308	struct inode *inode, *old_parent;
1309	struct qstr entry;
1310	struct dentry_info_args darg;
1311	int ret = 0;
1312
1313	tl_to_darg(&darg, tl, val);
1314
1315	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1316			darg.parent_ino, darg.dname_len);
1317
1318	entry.name = darg.dname;
1319	entry.len = darg.dname_len;
1320	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1321
1322	if (IS_ERR(inode)) {
1323		jbd_debug(1, "Inode %d not found", darg.ino);
1324		return 0;
1325	}
1326
1327	old_parent = ext4_iget(sb, darg.parent_ino,
1328				EXT4_IGET_NORMAL);
1329	if (IS_ERR(old_parent)) {
1330		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1331		iput(inode);
1332		return 0;
1333	}
1334
1335	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1336	/* -ENOENT ok coz it might not exist anymore. */
1337	if (ret == -ENOENT)
1338		ret = 0;
1339	iput(old_parent);
1340	iput(inode);
1341	return ret;
1342}
1343
1344static int ext4_fc_replay_link_internal(struct super_block *sb,
1345				struct dentry_info_args *darg,
1346				struct inode *inode)
1347{
1348	struct inode *dir = NULL;
1349	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1350	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1351	int ret = 0;
1352
1353	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1354	if (IS_ERR(dir)) {
1355		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1356		dir = NULL;
1357		goto out;
1358	}
1359
1360	dentry_dir = d_obtain_alias(dir);
1361	if (IS_ERR(dentry_dir)) {
1362		jbd_debug(1, "Failed to obtain dentry");
1363		dentry_dir = NULL;
1364		goto out;
1365	}
1366
1367	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1368	if (!dentry_inode) {
1369		jbd_debug(1, "Inode dentry not created.");
1370		ret = -ENOMEM;
1371		goto out;
1372	}
1373
1374	ret = __ext4_link(dir, inode, dentry_inode);
1375	/*
1376	 * It's possible that link already existed since data blocks
1377	 * for the dir in question got persisted before we crashed OR
1378	 * we replayed this tag and crashed before the entire replay
1379	 * could complete.
1380	 */
1381	if (ret && ret != -EEXIST) {
1382		jbd_debug(1, "Failed to link\n");
1383		goto out;
1384	}
1385
1386	ret = 0;
1387out:
1388	if (dentry_dir) {
1389		d_drop(dentry_dir);
1390		dput(dentry_dir);
1391	} else if (dir) {
1392		iput(dir);
1393	}
1394	if (dentry_inode) {
1395		d_drop(dentry_inode);
1396		dput(dentry_inode);
1397	}
1398
1399	return ret;
1400}
1401
1402/* Link replay function */
1403static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1404			       u8 *val)
1405{
1406	struct inode *inode;
1407	struct dentry_info_args darg;
1408	int ret = 0;
1409
1410	tl_to_darg(&darg, tl, val);
1411	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1412			darg.parent_ino, darg.dname_len);
1413
1414	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1415	if (IS_ERR(inode)) {
1416		jbd_debug(1, "Inode not found.");
1417		return 0;
1418	}
1419
1420	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1421	iput(inode);
1422	return ret;
1423}
1424
1425/*
1426 * Record all the modified inodes during replay. We use this later to setup
1427 * block bitmaps correctly.
1428 */
1429static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1430{
1431	struct ext4_fc_replay_state *state;
1432	int i;
1433
1434	state = &EXT4_SB(sb)->s_fc_replay_state;
1435	for (i = 0; i < state->fc_modified_inodes_used; i++)
1436		if (state->fc_modified_inodes[i] == ino)
1437			return 0;
1438	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1439		state->fc_modified_inodes_size +=
1440			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1441		state->fc_modified_inodes = krealloc(
1442					state->fc_modified_inodes, sizeof(int) *
1443					state->fc_modified_inodes_size,
1444					GFP_KERNEL);
1445		if (!state->fc_modified_inodes)
1446			return -ENOMEM;
1447	}
1448	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1449	return 0;
1450}
1451
1452/*
1453 * Inode replay function
1454 */
1455static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1456				u8 *val)
1457{
1458	struct ext4_fc_inode fc_inode;
1459	struct ext4_inode *raw_inode;
1460	struct ext4_inode *raw_fc_inode;
1461	struct inode *inode = NULL;
1462	struct ext4_iloc iloc;
1463	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1464	struct ext4_extent_header *eh;
1465
1466	memcpy(&fc_inode, val, sizeof(fc_inode));
1467
1468	ino = le32_to_cpu(fc_inode.fc_ino);
1469	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1470
1471	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1472	if (!IS_ERR(inode)) {
1473		ext4_ext_clear_bb(inode);
1474		iput(inode);
1475	}
1476	inode = NULL;
1477
1478	ext4_fc_record_modified_inode(sb, ino);
1479
1480	raw_fc_inode = (struct ext4_inode *)
1481		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1482	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1483	if (ret)
1484		goto out;
1485
1486	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1487	raw_inode = ext4_raw_inode(&iloc);
1488
1489	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1490	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1491		inode_len - offsetof(struct ext4_inode, i_generation));
1492	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1493		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1494		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1495			memset(eh, 0, sizeof(*eh));
1496			eh->eh_magic = EXT4_EXT_MAGIC;
1497			eh->eh_max = cpu_to_le16(
1498				(sizeof(raw_inode->i_block) -
1499				 sizeof(struct ext4_extent_header))
1500				 / sizeof(struct ext4_extent));
1501		}
1502	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1503		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1504			sizeof(raw_inode->i_block));
1505	}
1506
1507	/* Immediately update the inode on disk. */
1508	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1509	if (ret)
1510		goto out;
1511	ret = sync_dirty_buffer(iloc.bh);
1512	if (ret)
1513		goto out;
1514	ret = ext4_mark_inode_used(sb, ino);
1515	if (ret)
1516		goto out;
1517
1518	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1519	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1520	if (IS_ERR(inode)) {
1521		jbd_debug(1, "Inode not found.");
1522		return -EFSCORRUPTED;
1523	}
1524
1525	/*
1526	 * Our allocator could have made different decisions than before
1527	 * crashing. This should be fixed but until then, we calculate
1528	 * the number of blocks the inode.
1529	 */
1530	ext4_ext_replay_set_iblocks(inode);
1531
1532	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1533	ext4_reset_inode_seed(inode);
1534
1535	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1536	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1537	sync_dirty_buffer(iloc.bh);
1538	brelse(iloc.bh);
1539out:
1540	iput(inode);
1541	if (!ret)
1542		blkdev_issue_flush(sb->s_bdev);
1543
1544	return 0;
1545}
1546
1547/*
1548 * Dentry create replay function.
1549 *
1550 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1551 * inode for which we are trying to create a dentry here, should already have
1552 * been replayed before we start here.
1553 */
1554static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1555				 u8 *val)
1556{
1557	int ret = 0;
1558	struct inode *inode = NULL;
1559	struct inode *dir = NULL;
1560	struct dentry_info_args darg;
1561
1562	tl_to_darg(&darg, tl, val);
1563
1564	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1565			darg.parent_ino, darg.dname_len);
1566
1567	/* This takes care of update group descriptor and other metadata */
1568	ret = ext4_mark_inode_used(sb, darg.ino);
1569	if (ret)
1570		goto out;
1571
1572	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1573	if (IS_ERR(inode)) {
1574		jbd_debug(1, "inode %d not found.", darg.ino);
1575		inode = NULL;
1576		ret = -EINVAL;
1577		goto out;
1578	}
1579
1580	if (S_ISDIR(inode->i_mode)) {
1581		/*
1582		 * If we are creating a directory, we need to make sure that the
1583		 * dot and dot dot dirents are setup properly.
1584		 */
1585		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1586		if (IS_ERR(dir)) {
1587			jbd_debug(1, "Dir %d not found.", darg.ino);
1588			goto out;
1589		}
1590		ret = ext4_init_new_dir(NULL, dir, inode);
1591		iput(dir);
1592		if (ret) {
1593			ret = 0;
1594			goto out;
1595		}
1596	}
1597	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1598	if (ret)
1599		goto out;
1600	set_nlink(inode, 1);
1601	ext4_mark_inode_dirty(NULL, inode);
1602out:
1603	if (inode)
1604		iput(inode);
1605	return ret;
1606}
1607
1608/*
1609 * Record physical disk regions which are in use as per fast commit area. Our
1610 * simple replay phase allocator excludes these regions from allocation.
1611 */
1612static int ext4_fc_record_regions(struct super_block *sb, int ino,
1613		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1614{
1615	struct ext4_fc_replay_state *state;
1616	struct ext4_fc_alloc_region *region;
1617
1618	state = &EXT4_SB(sb)->s_fc_replay_state;
1619	if (state->fc_regions_used == state->fc_regions_size) {
1620		state->fc_regions_size +=
1621			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1622		state->fc_regions = krealloc(
1623					state->fc_regions,
1624					state->fc_regions_size *
1625					sizeof(struct ext4_fc_alloc_region),
1626					GFP_KERNEL);
1627		if (!state->fc_regions)
1628			return -ENOMEM;
1629	}
1630	region = &state->fc_regions[state->fc_regions_used++];
1631	region->ino = ino;
1632	region->lblk = lblk;
1633	region->pblk = pblk;
1634	region->len = len;
1635
1636	return 0;
1637}
1638
1639/* Replay add range tag */
1640static int ext4_fc_replay_add_range(struct super_block *sb,
1641				    struct ext4_fc_tl *tl, u8 *val)
1642{
1643	struct ext4_fc_add_range fc_add_ex;
1644	struct ext4_extent newex, *ex;
1645	struct inode *inode;
1646	ext4_lblk_t start, cur;
1647	int remaining, len;
1648	ext4_fsblk_t start_pblk;
1649	struct ext4_map_blocks map;
1650	struct ext4_ext_path *path = NULL;
1651	int ret;
1652
1653	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1654	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1655
1656	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1657		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1658		ext4_ext_get_actual_len(ex));
1659
1660	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1661	if (IS_ERR(inode)) {
1662		jbd_debug(1, "Inode not found.");
1663		return 0;
1664	}
1665
1666	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1667
1668	start = le32_to_cpu(ex->ee_block);
1669	start_pblk = ext4_ext_pblock(ex);
1670	len = ext4_ext_get_actual_len(ex);
1671
1672	cur = start;
1673	remaining = len;
1674	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1675		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1676		  inode->i_ino);
1677
1678	while (remaining > 0) {
1679		map.m_lblk = cur;
1680		map.m_len = remaining;
1681		map.m_pblk = 0;
1682		ret = ext4_map_blocks(NULL, inode, &map, 0);
1683
1684		if (ret < 0) {
1685			iput(inode);
1686			return 0;
1687		}
1688
1689		if (ret == 0) {
1690			/* Range is not mapped */
1691			path = ext4_find_extent(inode, cur, NULL, 0);
1692			if (IS_ERR(path)) {
1693				iput(inode);
1694				return 0;
1695			}
1696			memset(&newex, 0, sizeof(newex));
1697			newex.ee_block = cpu_to_le32(cur);
1698			ext4_ext_store_pblock(
1699				&newex, start_pblk + cur - start);
1700			newex.ee_len = cpu_to_le16(map.m_len);
1701			if (ext4_ext_is_unwritten(ex))
1702				ext4_ext_mark_unwritten(&newex);
1703			down_write(&EXT4_I(inode)->i_data_sem);
1704			ret = ext4_ext_insert_extent(
1705				NULL, inode, &path, &newex, 0);
1706			up_write((&EXT4_I(inode)->i_data_sem));
1707			ext4_ext_drop_refs(path);
1708			kfree(path);
1709			if (ret) {
1710				iput(inode);
1711				return 0;
1712			}
1713			goto next;
1714		}
1715
1716		if (start_pblk + cur - start != map.m_pblk) {
1717			/*
1718			 * Logical to physical mapping changed. This can happen
1719			 * if this range was removed and then reallocated to
1720			 * map to new physical blocks during a fast commit.
1721			 */
1722			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1723					ext4_ext_is_unwritten(ex),
1724					start_pblk + cur - start);
1725			if (ret) {
1726				iput(inode);
1727				return 0;
1728			}
1729			/*
1730			 * Mark the old blocks as free since they aren't used
1731			 * anymore. We maintain an array of all the modified
1732			 * inodes. In case these blocks are still used at either
1733			 * a different logical range in the same inode or in
1734			 * some different inode, we will mark them as allocated
1735			 * at the end of the FC replay using our array of
1736			 * modified inodes.
1737			 */
1738			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1739			goto next;
1740		}
1741
1742		/* Range is mapped and needs a state change */
1743		jbd_debug(1, "Converting from %ld to %d %lld",
1744				map.m_flags & EXT4_MAP_UNWRITTEN,
1745			ext4_ext_is_unwritten(ex), map.m_pblk);
1746		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1747					ext4_ext_is_unwritten(ex), map.m_pblk);
1748		if (ret) {
1749			iput(inode);
1750			return 0;
1751		}
1752		/*
1753		 * We may have split the extent tree while toggling the state.
1754		 * Try to shrink the extent tree now.
1755		 */
1756		ext4_ext_replay_shrink_inode(inode, start + len);
1757next:
1758		cur += map.m_len;
1759		remaining -= map.m_len;
1760	}
1761	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1762					sb->s_blocksize_bits);
1763	iput(inode);
1764	return 0;
1765}
1766
1767/* Replay DEL_RANGE tag */
1768static int
1769ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1770			 u8 *val)
1771{
1772	struct inode *inode;
1773	struct ext4_fc_del_range lrange;
1774	struct ext4_map_blocks map;
1775	ext4_lblk_t cur, remaining;
1776	int ret;
1777
1778	memcpy(&lrange, val, sizeof(lrange));
1779	cur = le32_to_cpu(lrange.fc_lblk);
1780	remaining = le32_to_cpu(lrange.fc_len);
1781
1782	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1783		le32_to_cpu(lrange.fc_ino), cur, remaining);
1784
1785	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1786	if (IS_ERR(inode)) {
1787		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1788		return 0;
1789	}
1790
1791	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1792
1793	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1794			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1795			le32_to_cpu(lrange.fc_len));
1796	while (remaining > 0) {
1797		map.m_lblk = cur;
1798		map.m_len = remaining;
1799
1800		ret = ext4_map_blocks(NULL, inode, &map, 0);
1801		if (ret < 0) {
1802			iput(inode);
1803			return 0;
1804		}
1805		if (ret > 0) {
1806			remaining -= ret;
1807			cur += ret;
1808			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1809		} else {
1810			remaining -= map.m_len;
1811			cur += map.m_len;
1812		}
1813	}
1814
1815	ret = ext4_punch_hole(inode,
1816		le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1817		le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1818	if (ret)
1819		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1820	ext4_ext_replay_shrink_inode(inode,
1821		i_size_read(inode) >> sb->s_blocksize_bits);
1822	ext4_mark_inode_dirty(NULL, inode);
1823	iput(inode);
1824
1825	return 0;
1826}
1827
1828static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1829{
1830	struct ext4_fc_replay_state *state;
1831	struct inode *inode;
1832	struct ext4_ext_path *path = NULL;
1833	struct ext4_map_blocks map;
1834	int i, ret, j;
1835	ext4_lblk_t cur, end;
1836
1837	state = &EXT4_SB(sb)->s_fc_replay_state;
1838	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1839		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1840			EXT4_IGET_NORMAL);
1841		if (IS_ERR(inode)) {
1842			jbd_debug(1, "Inode %d not found.",
1843				state->fc_modified_inodes[i]);
1844			continue;
1845		}
1846		cur = 0;
1847		end = EXT_MAX_BLOCKS;
1848		while (cur < end) {
1849			map.m_lblk = cur;
1850			map.m_len = end - cur;
1851
1852			ret = ext4_map_blocks(NULL, inode, &map, 0);
1853			if (ret < 0)
1854				break;
1855
1856			if (ret > 0) {
1857				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1858				if (!IS_ERR(path)) {
1859					for (j = 0; j < path->p_depth; j++)
1860						ext4_mb_mark_bb(inode->i_sb,
1861							path[j].p_block, 1, 1);
1862					ext4_ext_drop_refs(path);
1863					kfree(path);
1864				}
1865				cur += ret;
1866				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1867							map.m_len, 1);
1868			} else {
1869				cur = cur + (map.m_len ? map.m_len : 1);
1870			}
1871		}
1872		iput(inode);
1873	}
1874}
1875
1876/*
1877 * Check if block is in excluded regions for block allocation. The simple
1878 * allocator that runs during replay phase is calls this function to see
1879 * if it is okay to use a block.
1880 */
1881bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1882{
1883	int i;
1884	struct ext4_fc_replay_state *state;
1885
1886	state = &EXT4_SB(sb)->s_fc_replay_state;
1887	for (i = 0; i < state->fc_regions_valid; i++) {
1888		if (state->fc_regions[i].ino == 0 ||
1889			state->fc_regions[i].len == 0)
1890			continue;
1891		if (blk >= state->fc_regions[i].pblk &&
1892		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1893			return true;
1894	}
1895	return false;
1896}
1897
1898/* Cleanup function called after replay */
1899void ext4_fc_replay_cleanup(struct super_block *sb)
1900{
1901	struct ext4_sb_info *sbi = EXT4_SB(sb);
1902
1903	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1904	kfree(sbi->s_fc_replay_state.fc_regions);
1905	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1906}
1907
1908/*
1909 * Recovery Scan phase handler
1910 *
1911 * This function is called during the scan phase and is responsible
1912 * for doing following things:
1913 * - Make sure the fast commit area has valid tags for replay
1914 * - Count number of tags that need to be replayed by the replay handler
1915 * - Verify CRC
1916 * - Create a list of excluded blocks for allocation during replay phase
1917 *
1918 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1919 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1920 * to indicate that scan has finished and JBD2 can now start replay phase.
1921 * It returns a negative error to indicate that there was an error. At the end
1922 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1923 * to indicate the number of tags that need to replayed during the replay phase.
1924 */
1925static int ext4_fc_replay_scan(journal_t *journal,
1926				struct buffer_head *bh, int off,
1927				tid_t expected_tid)
1928{
1929	struct super_block *sb = journal->j_private;
1930	struct ext4_sb_info *sbi = EXT4_SB(sb);
1931	struct ext4_fc_replay_state *state;
1932	int ret = JBD2_FC_REPLAY_CONTINUE;
1933	struct ext4_fc_add_range ext;
1934	struct ext4_fc_tl tl;
1935	struct ext4_fc_tail tail;
1936	__u8 *start, *end, *cur, *val;
1937	struct ext4_fc_head head;
1938	struct ext4_extent *ex;
1939
1940	state = &sbi->s_fc_replay_state;
1941
1942	start = (u8 *)bh->b_data;
1943	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1944
1945	if (state->fc_replay_expected_off == 0) {
1946		state->fc_cur_tag = 0;
1947		state->fc_replay_num_tags = 0;
1948		state->fc_crc = 0;
1949		state->fc_regions = NULL;
1950		state->fc_regions_valid = state->fc_regions_used =
1951			state->fc_regions_size = 0;
1952		/* Check if we can stop early */
1953		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1954			!= EXT4_FC_TAG_HEAD)
1955			return 0;
1956	}
1957
1958	if (off != state->fc_replay_expected_off) {
1959		ret = -EFSCORRUPTED;
1960		goto out_err;
1961	}
1962
1963	state->fc_replay_expected_off++;
1964	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1965		memcpy(&tl, cur, sizeof(tl));
1966		val = cur + sizeof(tl);
1967		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1968			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1969		switch (le16_to_cpu(tl.fc_tag)) {
1970		case EXT4_FC_TAG_ADD_RANGE:
1971			memcpy(&ext, val, sizeof(ext));
1972			ex = (struct ext4_extent *)&ext.fc_ex;
1973			ret = ext4_fc_record_regions(sb,
1974				le32_to_cpu(ext.fc_ino),
1975				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1976				ext4_ext_get_actual_len(ex));
1977			if (ret < 0)
1978				break;
1979			ret = JBD2_FC_REPLAY_CONTINUE;
1980			fallthrough;
1981		case EXT4_FC_TAG_DEL_RANGE:
1982		case EXT4_FC_TAG_LINK:
1983		case EXT4_FC_TAG_UNLINK:
1984		case EXT4_FC_TAG_CREAT:
1985		case EXT4_FC_TAG_INODE:
1986		case EXT4_FC_TAG_PAD:
1987			state->fc_cur_tag++;
1988			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1989					sizeof(tl) + le16_to_cpu(tl.fc_len));
1990			break;
1991		case EXT4_FC_TAG_TAIL:
1992			state->fc_cur_tag++;
1993			memcpy(&tail, val, sizeof(tail));
1994			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1995						sizeof(tl) +
1996						offsetof(struct ext4_fc_tail,
1997						fc_crc));
1998			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1999				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2000				state->fc_replay_num_tags = state->fc_cur_tag;
2001				state->fc_regions_valid =
2002					state->fc_regions_used;
2003			} else {
2004				ret = state->fc_replay_num_tags ?
2005					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2006			}
2007			state->fc_crc = 0;
2008			break;
2009		case EXT4_FC_TAG_HEAD:
2010			memcpy(&head, val, sizeof(head));
2011			if (le32_to_cpu(head.fc_features) &
2012				~EXT4_FC_SUPPORTED_FEATURES) {
2013				ret = -EOPNOTSUPP;
2014				break;
2015			}
2016			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2017				ret = JBD2_FC_REPLAY_STOP;
2018				break;
2019			}
2020			state->fc_cur_tag++;
2021			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2022					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2023			break;
2024		default:
2025			ret = state->fc_replay_num_tags ?
2026				JBD2_FC_REPLAY_STOP : -ECANCELED;
2027		}
2028		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2029			break;
2030	}
2031
2032out_err:
2033	trace_ext4_fc_replay_scan(sb, ret, off);
2034	return ret;
2035}
2036
2037/*
2038 * Main recovery path entry point.
2039 * The meaning of return codes is similar as above.
2040 */
2041static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2042				enum passtype pass, int off, tid_t expected_tid)
2043{
2044	struct super_block *sb = journal->j_private;
2045	struct ext4_sb_info *sbi = EXT4_SB(sb);
2046	struct ext4_fc_tl tl;
2047	__u8 *start, *end, *cur, *val;
2048	int ret = JBD2_FC_REPLAY_CONTINUE;
2049	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2050	struct ext4_fc_tail tail;
2051
2052	if (pass == PASS_SCAN) {
2053		state->fc_current_pass = PASS_SCAN;
2054		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2055	}
2056
2057	if (state->fc_current_pass != pass) {
2058		state->fc_current_pass = pass;
2059		sbi->s_mount_state |= EXT4_FC_REPLAY;
2060	}
2061	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2062		jbd_debug(1, "Replay stops\n");
2063		ext4_fc_set_bitmaps_and_counters(sb);
2064		return 0;
2065	}
2066
2067#ifdef CONFIG_EXT4_DEBUG
2068	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2069		pr_warn("Dropping fc block %d because max_replay set\n", off);
2070		return JBD2_FC_REPLAY_STOP;
2071	}
2072#endif
2073
2074	start = (u8 *)bh->b_data;
2075	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2076
2077	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2078		memcpy(&tl, cur, sizeof(tl));
2079		val = cur + sizeof(tl);
2080
2081		if (state->fc_replay_num_tags == 0) {
2082			ret = JBD2_FC_REPLAY_STOP;
2083			ext4_fc_set_bitmaps_and_counters(sb);
2084			break;
2085		}
2086		jbd_debug(3, "Replay phase, tag:%s\n",
2087				tag2str(le16_to_cpu(tl.fc_tag)));
2088		state->fc_replay_num_tags--;
2089		switch (le16_to_cpu(tl.fc_tag)) {
2090		case EXT4_FC_TAG_LINK:
2091			ret = ext4_fc_replay_link(sb, &tl, val);
2092			break;
2093		case EXT4_FC_TAG_UNLINK:
2094			ret = ext4_fc_replay_unlink(sb, &tl, val);
2095			break;
2096		case EXT4_FC_TAG_ADD_RANGE:
2097			ret = ext4_fc_replay_add_range(sb, &tl, val);
2098			break;
2099		case EXT4_FC_TAG_CREAT:
2100			ret = ext4_fc_replay_create(sb, &tl, val);
2101			break;
2102		case EXT4_FC_TAG_DEL_RANGE:
2103			ret = ext4_fc_replay_del_range(sb, &tl, val);
2104			break;
2105		case EXT4_FC_TAG_INODE:
2106			ret = ext4_fc_replay_inode(sb, &tl, val);
2107			break;
2108		case EXT4_FC_TAG_PAD:
2109			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2110					     le16_to_cpu(tl.fc_len), 0);
2111			break;
2112		case EXT4_FC_TAG_TAIL:
2113			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2114					     le16_to_cpu(tl.fc_len), 0);
2115			memcpy(&tail, val, sizeof(tail));
2116			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2117			break;
2118		case EXT4_FC_TAG_HEAD:
2119			break;
2120		default:
2121			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2122					     le16_to_cpu(tl.fc_len), 0);
2123			ret = -ECANCELED;
2124			break;
2125		}
2126		if (ret < 0)
2127			break;
2128		ret = JBD2_FC_REPLAY_CONTINUE;
2129	}
2130	return ret;
2131}
2132
2133void ext4_fc_init(struct super_block *sb, journal_t *journal)
2134{
2135	/*
2136	 * We set replay callback even if fast commit disabled because we may
2137	 * could still have fast commit blocks that need to be replayed even if
2138	 * fast commit has now been turned off.
2139	 */
2140	journal->j_fc_replay_callback = ext4_fc_replay;
2141	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2142		return;
2143	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2144}
2145
/*
 * Human-readable names for the fast commit ineligibility reasons, indexed by
 * reason number. Both the strings and the pointer table are constant, so the
 * table itself is declared const as well.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2158
2159int ext4_fc_info_show(struct seq_file *seq, void *v)
2160{
2161	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2162	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2163	int i;
2164
2165	if (v != SEQ_START_TOKEN)
2166		return 0;
2167
2168	seq_printf(seq,
2169		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2170		   stats->fc_num_commits, stats->fc_ineligible_commits,
2171		   stats->fc_numblks,
2172		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2173	seq_puts(seq, "Ineligible reasons:\n");
2174	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2175		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2176			stats->fc_ineligible_reason_count[i]);
2177
2178	return 0;
2179}
2180
2181int __init ext4_fc_init_dentry_cache(void)
2182{
2183	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2184					   SLAB_RECLAIM_ACCOUNT);
2185
2186	if (ext4_fc_dentry_cachep == NULL)
2187		return -ENOMEM;
2188
2189	return 0;
2190}