Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
10#include "ext4.h"
11#include "ext4_jbd2.h"
12#include "ext4_extents.h"
13#include "mballoc.h"
14
15/*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
68 * Not all operations are supported by fast commits today (e.g extended
 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
71 *
72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73 * back to full commit. This is useful in case of transient errors.
74 *
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76 * the fast commits happening between ext4_fc_start_ineligible() and
77 * ext4_fc_stop_ineligible() and one fast commit after the call to
78 * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79 * make one more fast commit to fall back to full commit after stop call so
 * that it is guaranteed that the fast commit ineligible operation contained
81 * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82 * followed by at least 1 full commit.
83 *
84 * Atomicity of commits
85 * --------------------
86 * In order to guarantee atomicity during the commit operation, fast commit
87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88 * tag contains CRC of the contents and TID of the transaction after which
89 * this fast commit should be applied. Recovery code replays fast commit
90 * logs only if there's at least 1 valid tail present. For every fast commit
91 * operation, there is 1 tail. This means, we may end up with multiple tails
92 * in the fast commit space. Here's an example:
93 *
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
99 *
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
103 *
104 * Replay code should thus check for all the valid tails in the FC area.
105 *
106 * Fast Commit Replay Idempotence
107 * ------------------------------
108 *
109 * Fast commits tags are idempotent in nature provided the recovery code follows
110 * certain rules. The guiding principle that the commit path follows while
111 * committing is that it stores the result of a particular operation instead of
112 * storing the procedure.
113 *
114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
115 * was associated with inode 10. During fast commit, instead of storing this
116 * operation as a procedure "rename a to b", we store the resulting file system
117 * state as a "series" of outcomes:
118 *
119 * - Link dirent b to inode 10
120 * - Unlink dirent a
121 * - Inode <10> with valid refcount
122 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
124 * system. This is what guarantees idempotence of fast commit replay.
125 *
126 * Let's take an example of a procedure that is not idempotent and see how fast
127 * commits make it idempotent. Consider following sequence of operations:
128 *
129 * rm A; mv B A; read A
130 * (x) (y) (z)
131 *
132 * (x), (y) and (z) are the points at which we can crash. If we store this
133 * sequence of operations as is then the replay is not idempotent. Let's say
134 * while in replay, we crash at (z). During the second replay, file A (which was
135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
136 * file named A would be absent when we try to read A. So, this sequence of
137 * operations is not idempotent. However, as mentioned above, instead of storing
138 * the procedure fast commits store the outcome of each procedure. Thus the fast
139 * commit log for above procedure would be as follows:
140 *
141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
142 * inode 11 before the replay)
143 *
144 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
145 * (w) (x) (y) (z)
146 *
147 * If we crash at (z), we will have file A linked to inode 11. During the second
148 * replay, we will remove file A (inode 11). But we will create it back and make
149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
152 * similarly. Thus, by converting a non-idempotent procedure into a series of
153 * idempotent outcomes, fast commits ensured idempotence during the replay.
154 *
155 * TODOs
156 * -----
157 *
158 * 0) Fast commit replay path hardening: Fast commit replay code should use
159 * journal handles to make sure all the updates it does during the replay
160 * path are atomic. With that if we crash during fast commit replay, after
161 * trying to do recovery again, we will find a file system where fast commit
162 * area is invalid (because new full commit would be found). In order to deal
163 * with that, fast commit replay code should ensure that the "FC_REPLAY"
164 * superblock state is persisted before starting the replay, so that after
165 * the crash, fast commit recovery code can look at that flag and perform
166 * fast commit recovery even if that area is invalidated by later full
167 * commits.
168 *
169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
170 * eligible update must be protected within ext4_fc_start_update() and
171 * ext4_fc_stop_update(). These routines are called at much higher
172 * routines. This can be made more fine grained by combining with
173 * ext4_journal_start().
174 *
175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
176 *
177 * 3) Handle more ineligible cases.
178 */
179
180#include <trace/events/ext4.h>
181static struct kmem_cache *ext4_fc_dentry_cachep;
182
183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
184{
185 BUFFER_TRACE(bh, "");
186 if (uptodate) {
187 ext4_debug("%s: Block %lld up-to-date",
188 __func__, bh->b_blocknr);
189 set_buffer_uptodate(bh);
190 } else {
191 ext4_debug("%s: Block %lld not up-to-date",
192 __func__, bh->b_blocknr);
193 clear_buffer_uptodate(bh);
194 }
195
196 unlock_buffer(bh);
197}
198
199static inline void ext4_fc_reset_inode(struct inode *inode)
200{
201 struct ext4_inode_info *ei = EXT4_I(inode);
202
203 ei->i_fc_lblk_start = 0;
204 ei->i_fc_lblk_len = 0;
205}
206
207void ext4_fc_init_inode(struct inode *inode)
208{
209 struct ext4_inode_info *ei = EXT4_I(inode);
210
211 ext4_fc_reset_inode(inode);
212 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
213 INIT_LIST_HEAD(&ei->i_fc_list);
214 init_waitqueue_head(&ei->i_fc_wait);
215 atomic_set(&ei->i_fc_updates, 0);
216}
217
/*
 * Sleep until @inode finishes its in-flight fast commit.
 *
 * This function must be called with sbi->s_fc_lock held; it drops that
 * lock before sleeping (see the __releases annotation) and returns with
 * the lock released, so callers must re-take it and restart their scan.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	/* On 32-bit builds the EXT4_STATE_* bits live in i_state_flags. */
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	/* On 64-bit builds the state bits are packed into i_flags. */
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Queue ourselves before dropping the lock to avoid a lost wakeup. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
242
/*
 * Inform Ext4's fast commit machinery about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing to do when fast commits are off or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* Drops s_fc_lock while sleeping, hence the restart. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	/* Bump the update count under s_fc_lock so the commit path sees it. */
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
271
272/*
273 * Stop inode update and wake up waiting fast commits if any.
274 */
275void ext4_fc_stop_update(struct inode *inode)
276{
277 struct ext4_inode_info *ei = EXT4_I(inode);
278
279 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
280 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
281 return;
282
283 if (atomic_dec_and_test(&ei->i_fc_updates))
284 wake_up_all(&ei->i_fc_wait);
285}
286
/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing tracked if fast commits are off or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* Drops s_fc_lock while waiting; re-take it and recheck. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
313
314/*
315 * Mark file system as fast commit ineligible. This means that next commit
316 * operation would result in a full jbd2 commit.
317 */
318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
319{
320 struct ext4_sb_info *sbi = EXT4_SB(sb);
321
322 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
323 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
324 return;
325
326 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
327 WARN_ON(reason >= EXT4_FC_REASON_MAX);
328 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
329}
330
331/*
332 * Start a fast commit ineligible update. Any commits that happen while
333 * such an operation is in progress fall back to full commits.
334 */
335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
336{
337 struct ext4_sb_info *sbi = EXT4_SB(sb);
338
339 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
340 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
341 return;
342
343 WARN_ON(reason >= EXT4_FC_REASON_MAX);
344 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
345 atomic_inc(&sbi->s_fc_ineligible_updates);
346}
347
348/*
349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
350 * to ensure that after stopping the ineligible update, at least one full
351 * commit takes place.
352 */
353void ext4_fc_stop_ineligible(struct super_block *sb)
354{
355 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
356 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
357 return;
358
359 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
361}
362
363static inline int ext4_fc_is_ineligible(struct super_block *sb)
364{
365 return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
366 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
367}
368
/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		/* Same transaction as last time: just update the tracking. */
		update = true;
	} else {
		/* New transaction: start tracking from a clean slate. */
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	/* __fc_track_fn may temporarily drop i_fc_lock (dentry variant). */
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	/*
	 * While a fast commit is running, newly tracked inodes go on the
	 * staging queue and are spliced onto the main queue afterwards.
	 */
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
421
/* Arguments passed via ext4_fc_track_template() to __track_dentry_update(). */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* dentry being created/linked/unlinked */
	int op;			/* EXT4_FC_TAG_{CREAT,LINK,UNLINK} */
};
426
/*
 * __track_fn for directory entry updates. Called with ei->i_fc_lock held;
 * the lock is temporarily dropped around the allocations below and
 * re-taken before returning, as ext4_fc_track_template() expects.
 */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		/* Cannot record this update; force a full commit instead. */
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		/* Long names don't fit inline; copy to a heap buffer. */
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		/* Short names are stored inline in fcd_iname. */
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	/* Queue on staging while a commit is in flight, else on main. */
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
477
478void __ext4_fc_track_unlink(handle_t *handle,
479 struct inode *inode, struct dentry *dentry)
480{
481 struct __track_dentry_update_args args;
482 int ret;
483
484 args.dentry = dentry;
485 args.op = EXT4_FC_TAG_UNLINK;
486
487 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
488 (void *)&args, 0);
489 trace_ext4_fc_track_unlink(inode, dentry, ret);
490}
491
/* Track an unlink; the affected inode is derived from the dentry itself. */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}
496
497void __ext4_fc_track_link(handle_t *handle,
498 struct inode *inode, struct dentry *dentry)
499{
500 struct __track_dentry_update_args args;
501 int ret;
502
503 args.dentry = dentry;
504 args.op = EXT4_FC_TAG_LINK;
505
506 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
507 (void *)&args, 0);
508 trace_ext4_fc_track_link(inode, dentry, ret);
509}
510
/* Track a link; the affected inode is derived from the dentry itself. */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}
515
516void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
517 struct dentry *dentry)
518{
519 struct __track_dentry_update_args args;
520 int ret;
521
522 args.dentry = dentry;
523 args.op = EXT4_FC_TAG_CREAT;
524
525 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
526 (void *)&args, 0);
527 trace_ext4_fc_track_create(inode, dentry, ret);
528}
529
/* Track a create; the affected inode is derived from the dentry itself. */
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}
534
535/* __track_fn for inode tracking */
536static int __track_inode(struct inode *inode, void *arg, bool update)
537{
538 if (update)
539 return -EEXIST;
540
541 EXT4_I(inode)->i_fc_lblk_len = 0;
542
543 return 0;
544}
545
/* Track @inode's metadata for the next fast commit. */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	/* Directories are covered by dentry tracking, not inode tracking. */
	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		/* data=journal inodes are not supported by fast commits. */
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}
562
/* Inclusive logical block range [start, end] handed to __track_range(). */
struct __track_range_args {
	ext4_lblk_t start, end;
};
566
/*
 * __track_fn for tracking data range updates. Called with ei->i_fc_lock
 * held. Maintains a single inclusive range [i_fc_lblk_start,
 * i_fc_lblk_start + i_fc_lblk_len - 1] covering all blocks tracked so far.
 */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		/* Reserved/special inodes are not fast commit eligible. */
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	/* Save the old start: it is overwritten before the length math below. */
	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		/* Grow the existing range to also cover the new one. */
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		/* First range in this transaction: take it as-is. */
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
594
595void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
596 ext4_lblk_t end)
597{
598 struct __track_range_args args;
599 int ret;
600
601 if (S_ISDIR(inode->i_mode))
602 return;
603
604 args.start = start;
605 args.end = end;
606
607 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
608
609 trace_ext4_fc_track_range(inode, start, end, ret);
610}
611
612static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
613{
614 int write_flags = REQ_SYNC;
615 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
616
617 /* Add REQ_FUA | REQ_PREFLUSH only its tail */
618 if (test_opt(sb, BARRIER) && is_tail)
619 write_flags |= REQ_FUA | REQ_PREFLUSH;
620 lock_buffer(bh);
621 set_buffer_dirty(bh);
622 set_buffer_uptodate(bh);
623 bh->b_end_io = ext4_end_buffer_io_sync;
624 submit_bh(REQ_OP_WRITE, write_flags, bh);
625 EXT4_SB(sb)->s_fc_bh = NULL;
626}
627
628/* Ext4 commit path routines */
629
630/* memzero and update CRC */
631static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
632 u32 *crc)
633{
634 void *ret;
635
636 ret = memset(dst, 0, len);
637 if (crc)
638 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
639 return ret;
640}
641
/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and CRC is updated to reflect the
 * padding we added. Returns NULL when the request cannot be satisfied.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag to mark the rest of this block as unused. */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	/* Flush the now-exhausted block; not a tail write. */
	ext4_fc_submit_bh(sb, false);

	/* Start a fresh block from jbd2 and place this request there. */
	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
701
702/* memcpy to fc reserved space and update CRC */
703static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
704 int len, u32 *crc)
705{
706 if (crc)
707 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
708 return memcpy(dst, src, len);
709}
710
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	/* The tail's length claims everything up to the end of the block. */
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/* The CRC covers everything before the CRC field itself. */
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	/* Tail write: submitted with is_tail=true for durability flags. */
	ext4_fc_submit_bh(sb, true);

	return 0;
}
753
754/*
755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
756 * Returns false if there's not enough space.
757 */
758static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
759 u32 *crc)
760{
761 struct ext4_fc_tl tl;
762 u8 *dst;
763
764 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
765 if (!dst)
766 return false;
767
768 tl.fc_tag = cpu_to_le16(tag);
769 tl.fc_len = cpu_to_le16(len);
770
771 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
772 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
773
774 return true;
775}
776
777/* Same as above, but adds dentry tlv. */
778static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
779 int parent_ino, int ino, int dlen,
780 const unsigned char *dname,
781 u32 *crc)
782{
783 struct ext4_fc_dentry_info fcd;
784 struct ext4_fc_tl tl;
785 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
786 crc);
787
788 if (!dst)
789 return false;
790
791 fcd.fc_parent_ino = cpu_to_le32(parent_ino);
792 fcd.fc_ino = cpu_to_le32(ino);
793 tl.fc_tag = cpu_to_le16(tag);
794 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
795 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
796 dst += sizeof(tl);
797 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
798 dst += sizeof(fcd);
799 ext4_fc_memcpy(sb, dst, dname, dlen, crc);
800 dst += dlen;
801
802 return true;
803}
804
/*
 * Writes inode in the fast commit space under TLV with tag
 * EXT4_FC_TAG_INODE. The value is the inode number followed by the raw
 * on-disk inode image. Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/* Include the extra fields when the on-disk inode is larger. */
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	/* Copy the raw inode straight out of the inode table buffer. */
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
847
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 *
 * Every mapped chunk of the tracked range produces an ADD_RANGE tag;
 * holes produce DEL_RANGE tags so replay can remove stale blocks.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	/* Snapshot and reset the tracked range under i_fc_lock. */
	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		/* Lookup only (NULL handle): resolve the existing mapping. */
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* Hole: record a deletion range. */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			/* Mapped: record the extent, keeping unwritten state. */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
915
916
/*
 * Submit data for all the fast commit inodes. Marks the filesystem and
 * each queued inode as COMMITTING, waits for in-flight updates on each
 * inode to drain, then kicks off its data writeback via jbd2.
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	/* From here on, new trackings are diverted to the staging queues. */
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			/* Re-check after queuing to avoid a lost wakeup. */
			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/* Drop the spinlock around the possibly blocking submission. */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
951
/*
 * Wait for completion of data writeback for all the fast commit inodes
 * that were marked COMMITTING by ext4_fc_submit_inode_data_all().
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/* Drop the spinlock while blocking on the inode's data I/O. */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
976
/*
 * Commit all the directory entry updates.
 *
 * Walks the main dentry-update queue and emits a TLV for each operation.
 * For non-CREAT ops only the dentry TLV is written.  For CREAT, the inode
 * and its data ranges are written first, then the dentry TLV, so that replay
 * can create an unnamed inode and link it afterwards (see comment below).
 *
 * Called with sbi->s_fc_lock held; the lock is dropped while writing TLVs
 * (which may block) and re-taken before continuing the walk, and it is held
 * again on return regardless of success or failure.
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			/* LINK/UNLINK: just record the dentry operation. */
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				    sb, fc_dentry->fcd_op,
				    fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				    fc_dentry->fcd_name.len,
				    fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* CREAT: find the inode this dentry refers to. */
		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record it's create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			    sb, fc_dentry->fcd_op,
			    fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			    fc_dentry->fcd_name.len,
			    fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Re-take the lock so the caller's unlock remains balanced. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
1054
1055static int ext4_fc_perform_commit(journal_t *journal)
1056{
1057 struct super_block *sb = (struct super_block *)(journal->j_private);
1058 struct ext4_sb_info *sbi = EXT4_SB(sb);
1059 struct ext4_inode_info *iter;
1060 struct ext4_fc_head head;
1061 struct inode *inode;
1062 struct blk_plug plug;
1063 int ret = 0;
1064 u32 crc = 0;
1065
1066 ret = ext4_fc_submit_inode_data_all(journal);
1067 if (ret)
1068 return ret;
1069
1070 ret = ext4_fc_wait_inode_data_all(journal);
1071 if (ret)
1072 return ret;
1073
1074 /*
1075 * If file system device is different from journal device, issue a cache
1076 * flush before we start writing fast commit blocks.
1077 */
1078 if (journal->j_fs_dev != journal->j_dev)
1079 blkdev_issue_flush(journal->j_fs_dev);
1080
1081 blk_start_plug(&plug);
1082 if (sbi->s_fc_bytes == 0) {
1083 /*
1084 * Add a head tag only if this is the first fast commit
1085 * in this TID.
1086 */
1087 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1088 head.fc_tid = cpu_to_le32(
1089 sbi->s_journal->j_running_transaction->t_tid);
1090 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1091 (u8 *)&head, &crc))
1092 goto out;
1093 }
1094
1095 spin_lock(&sbi->s_fc_lock);
1096 ret = ext4_fc_commit_dentry_updates(journal, &crc);
1097 if (ret) {
1098 spin_unlock(&sbi->s_fc_lock);
1099 goto out;
1100 }
1101
1102 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1103 inode = &iter->vfs_inode;
1104 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1105 continue;
1106
1107 spin_unlock(&sbi->s_fc_lock);
1108 ret = ext4_fc_write_inode_data(inode, &crc);
1109 if (ret)
1110 goto out;
1111 ret = ext4_fc_write_inode(inode, &crc);
1112 if (ret)
1113 goto out;
1114 spin_lock(&sbi->s_fc_lock);
1115 }
1116 spin_unlock(&sbi->s_fc_lock);
1117
1118 ret = ext4_fc_write_tail(sb, crc);
1119
1120out:
1121 blk_finish_plug(&plug);
1122 return ret;
1123}
1124
1125/*
1126 * The main commit entry point. Performs a fast commit for transaction
1127 * commit_tid if needed. If it's not possible to perform a fast commit
1128 * due to various reasons, we fall back to full commit. Returns 0
1129 * on success, error otherwise.
1130 */
1131int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1132{
1133 struct super_block *sb = (struct super_block *)(journal->j_private);
1134 struct ext4_sb_info *sbi = EXT4_SB(sb);
1135 int nblks = 0, ret, bsize = journal->j_blocksize;
1136 int subtid = atomic_read(&sbi->s_fc_subtid);
1137 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1138 ktime_t start_time, commit_time;
1139
1140 trace_ext4_fc_commit_start(sb);
1141
1142 start_time = ktime_get();
1143
1144 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1145 (ext4_fc_is_ineligible(sb))) {
1146 reason = EXT4_FC_REASON_INELIGIBLE;
1147 goto out;
1148 }
1149
1150restart_fc:
1151 ret = jbd2_fc_begin_commit(journal, commit_tid);
1152 if (ret == -EALREADY) {
1153 /* There was an ongoing commit, check if we need to restart */
1154 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1155 commit_tid > journal->j_commit_sequence)
1156 goto restart_fc;
1157 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1158 goto out;
1159 } else if (ret) {
1160 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1161 reason = EXT4_FC_REASON_FC_START_FAILED;
1162 goto out;
1163 }
1164
1165 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1166 ret = ext4_fc_perform_commit(journal);
1167 if (ret < 0) {
1168 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1169 reason = EXT4_FC_REASON_FC_FAILED;
1170 goto out;
1171 }
1172 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1173 ret = jbd2_fc_wait_bufs(journal, nblks);
1174 if (ret < 0) {
1175 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1176 reason = EXT4_FC_REASON_FC_FAILED;
1177 goto out;
1178 }
1179 atomic_inc(&sbi->s_fc_subtid);
1180 jbd2_fc_end_commit(journal);
1181out:
1182 /* Has any ineligible update happened since we started? */
1183 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1184 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1185 reason = EXT4_FC_REASON_INELIGIBLE;
1186 }
1187
1188 spin_lock(&sbi->s_fc_lock);
1189 if (reason != EXT4_FC_REASON_OK &&
1190 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1191 sbi->s_fc_stats.fc_ineligible_commits++;
1192 } else {
1193 sbi->s_fc_stats.fc_num_commits++;
1194 sbi->s_fc_stats.fc_numblks += nblks;
1195 }
1196 spin_unlock(&sbi->s_fc_lock);
1197 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1198 trace_ext4_fc_commit_stop(sb, nblks, reason);
1199 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1200 /*
1201 * weight the commit time higher than the average time so we don't
1202 * react too strongly to vast changes in the commit time
1203 */
1204 if (likely(sbi->s_fc_avg_commit_time))
1205 sbi->s_fc_avg_commit_time = (commit_time +
1206 sbi->s_fc_avg_commit_time * 3) / 4;
1207 else
1208 sbi->s_fc_avg_commit_time = commit_time;
1209 jbd_debug(1,
1210 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1211 nblks, reason, subtid);
1212 if (reason == EXT4_FC_REASON_FC_FAILED)
1213 return jbd2_fc_end_commit_fallback(journal);
1214 if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1215 reason == EXT4_FC_REASON_INELIGIBLE)
1216 return jbd2_complete_transaction(journal, commit_tid);
1217 return 0;
1218}
1219
1220/*
1221 * Fast commit cleanup routine. This is called after every fast commit and
1222 * full commit. full is true if we are called after a full commit.
1223 */
1224static void ext4_fc_cleanup(journal_t *journal, int full)
1225{
1226 struct super_block *sb = journal->j_private;
1227 struct ext4_sb_info *sbi = EXT4_SB(sb);
1228 struct ext4_inode_info *iter, *iter_n;
1229 struct ext4_fc_dentry_update *fc_dentry;
1230
1231 if (full && sbi->s_fc_bh)
1232 sbi->s_fc_bh = NULL;
1233
1234 jbd2_fc_release_bufs(journal);
1235
1236 spin_lock(&sbi->s_fc_lock);
1237 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1238 i_fc_list) {
1239 list_del_init(&iter->i_fc_list);
1240 ext4_clear_inode_state(&iter->vfs_inode,
1241 EXT4_STATE_FC_COMMITTING);
1242 ext4_fc_reset_inode(&iter->vfs_inode);
1243 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1244 smp_mb();
1245#if (BITS_PER_LONG < 64)
1246 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1247#else
1248 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1249#endif
1250 }
1251
1252 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1253 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1254 struct ext4_fc_dentry_update,
1255 fcd_list);
1256 list_del_init(&fc_dentry->fcd_list);
1257 spin_unlock(&sbi->s_fc_lock);
1258
1259 if (fc_dentry->fcd_name.name &&
1260 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1261 kfree(fc_dentry->fcd_name.name);
1262 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1263 spin_lock(&sbi->s_fc_lock);
1264 }
1265
1266 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1267 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1268 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1269 &sbi->s_fc_q[FC_Q_MAIN]);
1270
1271 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1272 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1273
1274 if (full)
1275 sbi->s_fc_bytes = 0;
1276 spin_unlock(&sbi->s_fc_lock);
1277 trace_ext4_fc_stats(sb);
1278}
1279
1280/* Ext4 Replay Path Routines */
1281
/*
 * Helper struct for dentry replay routines.  Filled from an on-disk dentry
 * TLV by tl_to_darg(): parent directory inode, dentry name length, target
 * inode number, and (where used) the inode record length.
 */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	/* Name bytes; points into the TLV value buffer, not a copy. */
	char *dname;
};
1287
1288static inline void tl_to_darg(struct dentry_info_args *darg,
1289 struct ext4_fc_tl *tl)
1290{
1291 struct ext4_fc_dentry_info *fcd;
1292
1293 fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1294
1295 darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1296 darg->ino = le32_to_cpu(fcd->fc_ino);
1297 darg->dname = fcd->fc_dname;
1298 darg->dname_len = ext4_fc_tag_len(tl) -
1299 sizeof(struct ext4_fc_dentry_info);
1300}
1301
/*
 * Unlink replay function.
 *
 * Re-applies an EXT4_FC_TAG_UNLINK record: looks up the target inode and
 * its parent directory and removes the directory entry via __ext4_unlink().
 * Missing inodes and already-removed entries are tolerated (return 0), since
 * the on-disk state may already reflect the operation.
 */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	/* NULL handle: replay runs outside any journal transaction. */
	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT ok coz it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}
1340
/*
 * Link @inode into its parent directory under the name described by @darg.
 *
 * Obtains the parent directory, builds a dentry for the name, and calls
 * __ext4_link().  -EEXIST is tolerated: the link may already be on disk
 * (dir data persisted before the crash) or from a previous partial replay.
 * Lookup failures of the parent are treated as success (nothing to do).
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	/* d_obtain_alias() consumes our reference on dir on success. */
	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	/*
	 * Drop whichever references we still own: the dir ref was either
	 * transferred to dentry_dir or must be put directly.
	 */
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1398
1399/* Link replay function */
1400static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1401{
1402 struct inode *inode;
1403 struct dentry_info_args darg;
1404 int ret = 0;
1405
1406 tl_to_darg(&darg, tl);
1407 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1408 darg.parent_ino, darg.dname_len);
1409
1410 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1411 if (IS_ERR(inode)) {
1412 jbd_debug(1, "Inode not found.");
1413 return 0;
1414 }
1415
1416 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1417 iput(inode);
1418 return ret;
1419}
1420
1421/*
1422 * Record all the modified inodes during replay. We use this later to setup
1423 * block bitmaps correctly.
1424 */
1425static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1426{
1427 struct ext4_fc_replay_state *state;
1428 int i;
1429
1430 state = &EXT4_SB(sb)->s_fc_replay_state;
1431 for (i = 0; i < state->fc_modified_inodes_used; i++)
1432 if (state->fc_modified_inodes[i] == ino)
1433 return 0;
1434 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1435 state->fc_modified_inodes_size +=
1436 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1437 state->fc_modified_inodes = krealloc(
1438 state->fc_modified_inodes, sizeof(int) *
1439 state->fc_modified_inodes_size,
1440 GFP_KERNEL);
1441 if (!state->fc_modified_inodes)
1442 return -ENOMEM;
1443 }
1444 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1445 return 0;
1446}
1447
1448/*
1449 * Inode replay function
1450 */
1451static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1452{
1453 struct ext4_fc_inode *fc_inode;
1454 struct ext4_inode *raw_inode;
1455 struct ext4_inode *raw_fc_inode;
1456 struct inode *inode = NULL;
1457 struct ext4_iloc iloc;
1458 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1459 struct ext4_extent_header *eh;
1460
1461 fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1462
1463 ino = le32_to_cpu(fc_inode->fc_ino);
1464 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1465
1466 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1467 if (!IS_ERR(inode)) {
1468 ext4_ext_clear_bb(inode);
1469 iput(inode);
1470 }
1471 inode = NULL;
1472
1473 ext4_fc_record_modified_inode(sb, ino);
1474
1475 raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1476 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1477 if (ret)
1478 goto out;
1479
1480 inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1481 raw_inode = ext4_raw_inode(&iloc);
1482
1483 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1484 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1485 inode_len - offsetof(struct ext4_inode, i_generation));
1486 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1487 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1488 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1489 memset(eh, 0, sizeof(*eh));
1490 eh->eh_magic = EXT4_EXT_MAGIC;
1491 eh->eh_max = cpu_to_le16(
1492 (sizeof(raw_inode->i_block) -
1493 sizeof(struct ext4_extent_header))
1494 / sizeof(struct ext4_extent));
1495 }
1496 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1497 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1498 sizeof(raw_inode->i_block));
1499 }
1500
1501 /* Immediately update the inode on disk. */
1502 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1503 if (ret)
1504 goto out;
1505 ret = sync_dirty_buffer(iloc.bh);
1506 if (ret)
1507 goto out;
1508 ret = ext4_mark_inode_used(sb, ino);
1509 if (ret)
1510 goto out;
1511
1512 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1513 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1514 if (IS_ERR(inode)) {
1515 jbd_debug(1, "Inode not found.");
1516 return -EFSCORRUPTED;
1517 }
1518
1519 /*
1520 * Our allocator could have made different decisions than before
1521 * crashing. This should be fixed but until then, we calculate
1522 * the number of blocks the inode.
1523 */
1524 ext4_ext_replay_set_iblocks(inode);
1525
1526 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1527 ext4_reset_inode_seed(inode);
1528
1529 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1530 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1531 sync_dirty_buffer(iloc.bh);
1532 brelse(iloc.bh);
1533out:
1534 iput(inode);
1535 if (!ret)
1536 blkdev_issue_flush(sb->s_bdev);
1537
1538 return 0;
1539}
1540
1541/*
1542 * Dentry create replay function.
1543 *
1544 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1545 * inode for which we are trying to create a dentry here, should already have
1546 * been replayed before we start here.
1547 */
1548static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1549{
1550 int ret = 0;
1551 struct inode *inode = NULL;
1552 struct inode *dir = NULL;
1553 struct dentry_info_args darg;
1554
1555 tl_to_darg(&darg, tl);
1556
1557 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1558 darg.parent_ino, darg.dname_len);
1559
1560 /* This takes care of update group descriptor and other metadata */
1561 ret = ext4_mark_inode_used(sb, darg.ino);
1562 if (ret)
1563 goto out;
1564
1565 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1566 if (IS_ERR(inode)) {
1567 jbd_debug(1, "inode %d not found.", darg.ino);
1568 inode = NULL;
1569 ret = -EINVAL;
1570 goto out;
1571 }
1572
1573 if (S_ISDIR(inode->i_mode)) {
1574 /*
1575 * If we are creating a directory, we need to make sure that the
1576 * dot and dot dot dirents are setup properly.
1577 */
1578 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1579 if (IS_ERR(dir)) {
1580 jbd_debug(1, "Dir %d not found.", darg.ino);
1581 goto out;
1582 }
1583 ret = ext4_init_new_dir(NULL, dir, inode);
1584 iput(dir);
1585 if (ret) {
1586 ret = 0;
1587 goto out;
1588 }
1589 }
1590 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1591 if (ret)
1592 goto out;
1593 set_nlink(inode, 1);
1594 ext4_mark_inode_dirty(NULL, inode);
1595out:
1596 if (inode)
1597 iput(inode);
1598 return ret;
1599}
1600
1601/*
1602 * Record physical disk regions which are in use as per fast commit area. Our
1603 * simple replay phase allocator excludes these regions from allocation.
1604 */
1605static int ext4_fc_record_regions(struct super_block *sb, int ino,
1606 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1607{
1608 struct ext4_fc_replay_state *state;
1609 struct ext4_fc_alloc_region *region;
1610
1611 state = &EXT4_SB(sb)->s_fc_replay_state;
1612 if (state->fc_regions_used == state->fc_regions_size) {
1613 state->fc_regions_size +=
1614 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1615 state->fc_regions = krealloc(
1616 state->fc_regions,
1617 state->fc_regions_size *
1618 sizeof(struct ext4_fc_alloc_region),
1619 GFP_KERNEL);
1620 if (!state->fc_regions)
1621 return -ENOMEM;
1622 }
1623 region = &state->fc_regions[state->fc_regions_used++];
1624 region->ino = ino;
1625 region->lblk = lblk;
1626 region->pblk = pblk;
1627 region->len = len;
1628
1629 return 0;
1630}
1631
/*
 * Replay add range tag.
 *
 * Re-establishes the extent recorded in an EXT4_FC_TAG_ADD_RANGE TLV.  For
 * each sub-range of the recorded extent the current mapping is consulted:
 * unmapped ranges get a fresh extent inserted; ranges mapped to different
 * physical blocks are rewritten (and the old blocks freed in the in-memory
 * bitmaps); ranges mapped correctly only get their written/unwritten state
 * toggled if needed.  Errors are swallowed (return 0) — replay is best
 * effort per tag.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	/* Walk the recorded range; ext4_map_blocks() tells us how much of
	 * each iteration is contiguously mapped (or not). */
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1760
1761/* Replay DEL_RANGE tag */
1762static int
1763ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1764{
1765 struct inode *inode;
1766 struct ext4_fc_del_range *lrange;
1767 struct ext4_map_blocks map;
1768 ext4_lblk_t cur, remaining;
1769 int ret;
1770
1771 lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1772 cur = le32_to_cpu(lrange->fc_lblk);
1773 remaining = le32_to_cpu(lrange->fc_len);
1774
1775 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1776 le32_to_cpu(lrange->fc_ino), cur, remaining);
1777
1778 inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1779 if (IS_ERR(inode)) {
1780 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1781 return 0;
1782 }
1783
1784 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1785
1786 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1787 inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1788 le32_to_cpu(lrange->fc_len));
1789 while (remaining > 0) {
1790 map.m_lblk = cur;
1791 map.m_len = remaining;
1792
1793 ret = ext4_map_blocks(NULL, inode, &map, 0);
1794 if (ret < 0) {
1795 iput(inode);
1796 return 0;
1797 }
1798 if (ret > 0) {
1799 remaining -= ret;
1800 cur += ret;
1801 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1802 } else {
1803 remaining -= map.m_len;
1804 cur += map.m_len;
1805 }
1806 }
1807
1808 ret = ext4_punch_hole(inode,
1809 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1810 le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
1811 if (ret)
1812 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1813 ext4_ext_replay_shrink_inode(inode,
1814 i_size_read(inode) >> sb->s_blocksize_bits);
1815 ext4_mark_inode_dirty(NULL, inode);
1816 iput(inode);
1817
1818 return 0;
1819}
1820
/*
 * After all tags are replayed, walk every inode recorded as modified and
 * mark all of its mapped blocks — data blocks and the extent tree index
 * blocks along each path — as allocated in the in-memory bitmaps.  This
 * re-establishes a consistent allocation view after the best-effort
 * free/alloc toggling done during per-tag replay.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the whole logical address space of the inode. */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/* Mark the extent-tree index blocks too. */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
					map.m_len, 1);
			} else {
				/* Hole: advance past it (at least one block). */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1868
1869/*
1870 * Check if block is in excluded regions for block allocation. The simple
1871 * allocator that runs during replay phase is calls this function to see
1872 * if it is okay to use a block.
1873 */
1874bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1875{
1876 int i;
1877 struct ext4_fc_replay_state *state;
1878
1879 state = &EXT4_SB(sb)->s_fc_replay_state;
1880 for (i = 0; i < state->fc_regions_valid; i++) {
1881 if (state->fc_regions[i].ino == 0 ||
1882 state->fc_regions[i].len == 0)
1883 continue;
1884 if (blk >= state->fc_regions[i].pblk &&
1885 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1886 return true;
1887 }
1888 return false;
1889}
1890
1891/* Cleanup function called after replay */
1892void ext4_fc_replay_cleanup(struct super_block *sb)
1893{
1894 struct ext4_sb_info *sbi = EXT4_SB(sb);
1895
1896 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1897 kfree(sbi->s_fc_replay_state.fc_regions);
1898 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1899}
1900
1901/*
1902 * Recovery Scan phase handler
1903 *
1904 * This function is called during the scan phase and is responsible
1905 * for doing following things:
1906 * - Make sure the fast commit area has valid tags for replay
1907 * - Count number of tags that need to be replayed by the replay handler
1908 * - Verify CRC
1909 * - Create a list of excluded blocks for allocation during replay phase
1910 *
1911 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1912 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1913 * to indicate that scan has finished and JBD2 can now start replay phase.
1914 * It returns a negative error to indicate that there was an error. At the end
1915 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1916 * to indicate the number of tags that need to replayed during the replay phase.
1917 */
1918static int ext4_fc_replay_scan(journal_t *journal,
1919 struct buffer_head *bh, int off,
1920 tid_t expected_tid)
1921{
1922 struct super_block *sb = journal->j_private;
1923 struct ext4_sb_info *sbi = EXT4_SB(sb);
1924 struct ext4_fc_replay_state *state;
1925 int ret = JBD2_FC_REPLAY_CONTINUE;
1926 struct ext4_fc_add_range *ext;
1927 struct ext4_fc_tl *tl;
1928 struct ext4_fc_tail *tail;
1929 __u8 *start, *end;
1930 struct ext4_fc_head *head;
1931 struct ext4_extent *ex;
1932
1933 state = &sbi->s_fc_replay_state;
1934
1935 start = (u8 *)bh->b_data;
1936 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1937
1938 if (state->fc_replay_expected_off == 0) {
1939 state->fc_cur_tag = 0;
1940 state->fc_replay_num_tags = 0;
1941 state->fc_crc = 0;
1942 state->fc_regions = NULL;
1943 state->fc_regions_valid = state->fc_regions_used =
1944 state->fc_regions_size = 0;
1945 /* Check if we can stop early */
1946 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1947 != EXT4_FC_TAG_HEAD)
1948 return 0;
1949 }
1950
1951 if (off != state->fc_replay_expected_off) {
1952 ret = -EFSCORRUPTED;
1953 goto out_err;
1954 }
1955
1956 state->fc_replay_expected_off++;
1957 fc_for_each_tl(start, end, tl) {
1958 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1959 tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1960 switch (le16_to_cpu(tl->fc_tag)) {
1961 case EXT4_FC_TAG_ADD_RANGE:
1962 ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1963 ex = (struct ext4_extent *)&ext->fc_ex;
1964 ret = ext4_fc_record_regions(sb,
1965 le32_to_cpu(ext->fc_ino),
1966 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1967 ext4_ext_get_actual_len(ex));
1968 if (ret < 0)
1969 break;
1970 ret = JBD2_FC_REPLAY_CONTINUE;
1971 fallthrough;
1972 case EXT4_FC_TAG_DEL_RANGE:
1973 case EXT4_FC_TAG_LINK:
1974 case EXT4_FC_TAG_UNLINK:
1975 case EXT4_FC_TAG_CREAT:
1976 case EXT4_FC_TAG_INODE:
1977 case EXT4_FC_TAG_PAD:
1978 state->fc_cur_tag++;
1979 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1980 sizeof(*tl) + ext4_fc_tag_len(tl));
1981 break;
1982 case EXT4_FC_TAG_TAIL:
1983 state->fc_cur_tag++;
1984 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1985 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1986 sizeof(*tl) +
1987 offsetof(struct ext4_fc_tail,
1988 fc_crc));
1989 if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1990 le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1991 state->fc_replay_num_tags = state->fc_cur_tag;
1992 state->fc_regions_valid =
1993 state->fc_regions_used;
1994 } else {
1995 ret = state->fc_replay_num_tags ?
1996 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1997 }
1998 state->fc_crc = 0;
1999 break;
2000 case EXT4_FC_TAG_HEAD:
2001 head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
2002 if (le32_to_cpu(head->fc_features) &
2003 ~EXT4_FC_SUPPORTED_FEATURES) {
2004 ret = -EOPNOTSUPP;
2005 break;
2006 }
2007 if (le32_to_cpu(head->fc_tid) != expected_tid) {
2008 ret = JBD2_FC_REPLAY_STOP;
2009 break;
2010 }
2011 state->fc_cur_tag++;
2012 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
2013 sizeof(*tl) + ext4_fc_tag_len(tl));
2014 break;
2015 default:
2016 ret = state->fc_replay_num_tags ?
2017 JBD2_FC_REPLAY_STOP : -ECANCELED;
2018 }
2019 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2020 break;
2021 }
2022
2023out_err:
2024 trace_ext4_fc_replay_scan(sb, ret, off);
2025 return ret;
2026}
2027
2028/*
2029 * Main recovery path entry point.
2030 * The meaning of return codes is similar as above.
2031 */
2032static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2033 enum passtype pass, int off, tid_t expected_tid)
2034{
2035 struct super_block *sb = journal->j_private;
2036 struct ext4_sb_info *sbi = EXT4_SB(sb);
2037 struct ext4_fc_tl *tl;
2038 __u8 *start, *end;
2039 int ret = JBD2_FC_REPLAY_CONTINUE;
2040 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2041 struct ext4_fc_tail *tail;
2042
2043 if (pass == PASS_SCAN) {
2044 state->fc_current_pass = PASS_SCAN;
2045 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2046 }
2047
2048 if (state->fc_current_pass != pass) {
2049 state->fc_current_pass = pass;
2050 sbi->s_mount_state |= EXT4_FC_REPLAY;
2051 }
2052 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2053 jbd_debug(1, "Replay stops\n");
2054 ext4_fc_set_bitmaps_and_counters(sb);
2055 return 0;
2056 }
2057
2058#ifdef CONFIG_EXT4_DEBUG
2059 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2060 pr_warn("Dropping fc block %d because max_replay set\n", off);
2061 return JBD2_FC_REPLAY_STOP;
2062 }
2063#endif
2064
2065 start = (u8 *)bh->b_data;
2066 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2067
2068 fc_for_each_tl(start, end, tl) {
2069 if (state->fc_replay_num_tags == 0) {
2070 ret = JBD2_FC_REPLAY_STOP;
2071 ext4_fc_set_bitmaps_and_counters(sb);
2072 break;
2073 }
2074 jbd_debug(3, "Replay phase, tag:%s\n",
2075 tag2str(le16_to_cpu(tl->fc_tag)));
2076 state->fc_replay_num_tags--;
2077 switch (le16_to_cpu(tl->fc_tag)) {
2078 case EXT4_FC_TAG_LINK:
2079 ret = ext4_fc_replay_link(sb, tl);
2080 break;
2081 case EXT4_FC_TAG_UNLINK:
2082 ret = ext4_fc_replay_unlink(sb, tl);
2083 break;
2084 case EXT4_FC_TAG_ADD_RANGE:
2085 ret = ext4_fc_replay_add_range(sb, tl);
2086 break;
2087 case EXT4_FC_TAG_CREAT:
2088 ret = ext4_fc_replay_create(sb, tl);
2089 break;
2090 case EXT4_FC_TAG_DEL_RANGE:
2091 ret = ext4_fc_replay_del_range(sb, tl);
2092 break;
2093 case EXT4_FC_TAG_INODE:
2094 ret = ext4_fc_replay_inode(sb, tl);
2095 break;
2096 case EXT4_FC_TAG_PAD:
2097 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2098 ext4_fc_tag_len(tl), 0);
2099 break;
2100 case EXT4_FC_TAG_TAIL:
2101 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2102 ext4_fc_tag_len(tl), 0);
2103 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2104 WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2105 break;
2106 case EXT4_FC_TAG_HEAD:
2107 break;
2108 default:
2109 trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2110 ext4_fc_tag_len(tl), 0);
2111 ret = -ECANCELED;
2112 break;
2113 }
2114 if (ret < 0)
2115 break;
2116 ret = JBD2_FC_REPLAY_CONTINUE;
2117 }
2118 return ret;
2119}
2120
2121void ext4_fc_init(struct super_block *sb, journal_t *journal)
2122{
2123 /*
2124 * We set replay callback even if fast commit disabled because we may
2125 * could still have fast commit blocks that need to be replayed even if
2126 * fast commit has now been turned off.
2127 */
2128 journal->j_fc_replay_callback = ext4_fc_replay;
2129 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2130 return;
2131 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2132}
2133
/*
 * Human-readable labels for fast commit ineligibility reasons, printed by
 * ext4_fc_info_show(). Indexed by reason code, iterated up to
 * EXT4_FC_REASON_MAX there — keep the entries in sync (same count, same
 * order) with the EXT4_FC_REASON_* values.
 */
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2146
2147int ext4_fc_info_show(struct seq_file *seq, void *v)
2148{
2149 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2150 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2151 int i;
2152
2153 if (v != SEQ_START_TOKEN)
2154 return 0;
2155
2156 seq_printf(seq,
2157 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2158 stats->fc_num_commits, stats->fc_ineligible_commits,
2159 stats->fc_numblks,
2160 div_u64(sbi->s_fc_avg_commit_time, 1000));
2161 seq_puts(seq, "Ineligible reasons:\n");
2162 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2163 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2164 stats->fc_ineligible_reason_count[i]);
2165
2166 return 0;
2167}
2168
2169int __init ext4_fc_init_dentry_cache(void)
2170{
2171 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2172 SLAB_RECLAIM_ACCOUNT);
2173
2174 if (ext4_fc_dentry_cachep == NULL)
2175 return -ENOMEM;
2176
2177 return 0;
2178}