fs/ext4/fast_commit.c at v5.11 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / ext4 / fast_commit.c
at v5.11 63 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For the scenarios for which we currently
  24 * don't have replay code, fast commit falls back to full commits.
  25 * Fast commits record delta in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
  30 * - EXT4_FC_TAG_LINK		- records directory entry link
  31 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
  41 *				  during recovery. Note that iblocks field is
  42 *				  not replayed and instead derived during
  43 *				  replay.
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58 *     section for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
  60 *
  61 * All the inode updates must call ext4_fc_start_update() before starting an
  62 * update. If such an ongoing update is present, fast commit waits for it to
  63 * complete. The completion of such an update is marked by
  64 * ext4_fc_stop_update().
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
  68 * Not all operations are supported by fast commits today (e.g. extended
  69 * attributes). Fast commit ineligibility is marked by calling one of the
  70 * two following functions:
  71 *
  72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73 *   back to full commit. This is useful in case of transient errors.
  74 *
  75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76 *   the fast commits happening between ext4_fc_start_ineligible() and
  77 *   ext4_fc_stop_ineligible() and one fast commit after the call to
  78 *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
  79 *   make one more fast commit to fall back to full commit after stop call so
  80 *   that it is guaranteed that the fast commit ineligible operation contained
  81 *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
  82 *   followed by at least 1 full commit.
  83 *
  84 * Atomicity of commits
  85 * --------------------
  86 * In order to guarantee atomicity during the commit operation, fast commit
  87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88 * tag contains CRC of the contents and TID of the transaction after which
  89 * this fast commit should be applied. Recovery code replays fast commit
  90 * logs only if there's at least 1 valid tail present. For every fast commit
  91 * operation, there is 1 tail. This means, we may end up with multiple tails
  92 * in the fast commit space. Here's an example:
  93 *
  94 * - Create a new file A and remove existing file B
  95 * - fsync()
  96 * - Append contents to file A
  97 * - Truncate file A
  98 * - fsync()
  99 *
 100 * The fast commit space at the end of above operations would look like this:
 101 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103 *
 104 * Replay code should thus check for all the valid tails in the FC area.
 105 *
 106 * Fast Commit Replay Idempotence
 107 * ------------------------------
 108 *
 109 * Fast commits tags are idempotent in nature provided the recovery code follows
 110 * certain rules. The guiding principle that the commit path follows while
 111 * committing is that it stores the result of a particular operation instead of
 112 * storing the procedure.
 113 *
 114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115 * was associated with inode 10. During fast commit, instead of storing this
 116 * operation as a procedure "rename a to b", we store the resulting file system
 117 * state as a "series" of outcomes:
 118 *
 119 * - Link dirent b to inode 10
 120 * - Unlink dirent a
 121 * - Inode <10> with valid refcount
 122 *
 123 * Now when recovery code runs, it needs to "enforce" this state on the file
 124 * system. This is what guarantees idempotence of fast commit replay.
 125 *
 126 * Let's take an example of a procedure that is not idempotent and see how fast
 127 * commits make it idempotent. Consider following sequence of operations:
 128 *
 129 *     rm A;    mv B A;    read A
 130 *  (x)     (y)        (z)
 131 *
 132 * (x), (y) and (z) are the points at which we can crash. If we store this
 133 * sequence of operations as is then the replay is not idempotent. Let's say
 134 * while in replay, we crash at (z). During the second replay, file A (which was
 135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136 * file named A would be absent when we try to read A. So, this sequence of
 137 * operations is not idempotent. However, as mentioned above, instead of storing
 138 * the procedure fast commits store the outcome of each procedure. Thus the fast
 139 * commit log for above procedure would be as follows:
 140 *
 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142 * inode 11 before the replay)
 143 *
 144 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145 * (w)          (x)                    (y)          (z)
 146 *
 147 * If we crash at (z), we will have file A linked to inode 11. During the second
 148 * replay, we will remove file A (inode 11). But we will create it back and make
 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152 * similarly. Thus, by converting a non-idempotent procedure into a series of
 153 * idempotent outcomes, fast commits ensured idempotence during the replay.
 154 *
 155 * TODOs
 156 * -----
 157 *
 158 * 0) Fast commit replay path hardening: Fast commit replay code should use
 159 *    journal handles to make sure all the updates it does during the replay
 160 *    path are atomic. With that if we crash during fast commit replay, after
 161 *    trying to do recovery again, we will find a file system where fast commit
 162 *    area is invalid (because new full commit would be found). In order to deal
 163 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164 *    superblock state is persisted before starting the replay, so that after
 165 *    the crash, fast commit recovery code can look at that flag and perform
 166 *    fast commit recovery even if that area is invalidated by later full
 167 *    commits.
 168 *
 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170 *    eligible update must be protected within ext4_fc_start_update() and
 171 *    ext4_fc_stop_update(). These routines are called at much higher
 172 *    routines. This can be made more fine grained by combining with
 173 *    ext4_journal_start().
 174 *
 175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 176 *
 177 * 3) Handle more ineligible cases.
 178 */
 179
 180#include <trace/events/ext4.h>
 181static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184{
 185	BUFFER_TRACE(bh, "");
 186	if (uptodate) {
 187		ext4_debug("%s: Block %lld up-to-date",
 188			   __func__, bh->b_blocknr);
 189		set_buffer_uptodate(bh);
 190	} else {
 191		ext4_debug("%s: Block %lld not up-to-date",
 192			   __func__, bh->b_blocknr);
 193		clear_buffer_uptodate(bh);
 194	}
 195
 196	unlock_buffer(bh);
 197}
 198
 199static inline void ext4_fc_reset_inode(struct inode *inode)
 200{
 201	struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203	ei->i_fc_lblk_start = 0;
 204	ei->i_fc_lblk_len = 0;
 205}
 206
 207void ext4_fc_init_inode(struct inode *inode)
 208{
 209	struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211	ext4_fc_reset_inode(inode);
 212	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213	INIT_LIST_HEAD(&ei->i_fc_list);
 214	init_waitqueue_head(&ei->i_fc_wait);
 215	atomic_set(&ei->i_fc_updates, 0);
 216}
 217
 218/* This function must be called with sbi->s_fc_lock held. */
 219static void ext4_fc_wait_committing_inode(struct inode *inode)
 220__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221{
 222	wait_queue_head_t *wq;
 223	struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225#if (BITS_PER_LONG < 64)
 226	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227			EXT4_STATE_FC_COMMITTING);
 228	wq = bit_waitqueue(&ei->i_state_flags,
 229				EXT4_STATE_FC_COMMITTING);
 230#else
 231	DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232			EXT4_STATE_FC_COMMITTING);
 233	wq = bit_waitqueue(&ei->i_flags,
 234				EXT4_STATE_FC_COMMITTING);
 235#endif
 236	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239	schedule();
 240	finish_wait(wq, &wait.wq_entry);
 241}
 242
 243/*
 244 * Inform Ext4's fast about start of an inode update
 245 *
 246 * This function is called by the high level call VFS callbacks before
 247 * performing any inode update. This function blocks if there's an ongoing
 248 * fast commit on the inode in question.
 249 */
 250void ext4_fc_start_update(struct inode *inode)
 251{
 252	struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256		return;
 257
 258restart:
 259	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260	if (list_empty(&ei->i_fc_list))
 261		goto out;
 262
 263	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264		ext4_fc_wait_committing_inode(inode);
 265		goto restart;
 266	}
 267out:
 268	atomic_inc(&ei->i_fc_updates);
 269	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270}
 271
 272/*
 273 * Stop inode update and wake up waiting fast commits if any.
 274 */
 275void ext4_fc_stop_update(struct inode *inode)
 276{
 277	struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281		return;
 282
 283	if (atomic_dec_and_test(&ei->i_fc_updates))
 284		wake_up_all(&ei->i_fc_wait);
 285}
 286
 287/*
 288 * Remove inode from fast commit list. If the inode is being committed
 289 * we wait until inode commit is done.
 290 */
 291void ext4_fc_del(struct inode *inode)
 292{
 293	struct ext4_inode_info *ei = EXT4_I(inode);
 294
 295	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297		return;
 298
 299restart:
 300	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301	if (list_empty(&ei->i_fc_list)) {
 302		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303		return;
 304	}
 305
 306	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307		ext4_fc_wait_committing_inode(inode);
 308		goto restart;
 309	}
 310	list_del_init(&ei->i_fc_list);
 311	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312}
 313
 314/*
 315 * Mark file system as fast commit ineligible. This means that next commit
 316 * operation would result in a full jbd2 commit.
 317 */
 318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319{
 320	struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 324		return;
 325
 326	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329}
 330
 331/*
 332 * Start a fast commit ineligible update. Any commits that happen while
 333 * such an operation is in progress fall back to full commits.
 334 */
 335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336{
 337	struct ext4_sb_info *sbi = EXT4_SB(sb);
 338
 339	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341		return;
 342
 343	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345	atomic_inc(&sbi->s_fc_ineligible_updates);
 346}
 347
 348/*
 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350 * to ensure that after stopping the ineligible update, at least one full
 351 * commit takes place.
 352 */
 353void ext4_fc_stop_ineligible(struct super_block *sb)
 354{
 355	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357		return;
 358
 359	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361}
 362
 363static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364{
 365	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367}
 368
 369/*
 370 * Generic fast commit tracking function. If this is the first time this we are
 371 * called after a full commit, we initialize fast commit fields and then call
 372 * __fc_track_fn() with update = 0. If we have already been called after a full
 373 * commit, we pass update = 1. Based on that, the track function can determine
 374 * if it needs to track a field for the first time or if it needs to just
 375 * update the previously tracked value.
 376 *
 377 * If enqueue is set, this function enqueues the inode in fast commit list.
 378 */
 379static int ext4_fc_track_template(
 380	handle_t *handle, struct inode *inode,
 381	int (*__fc_track_fn)(struct inode *, void *, bool),
 382	void *args, int enqueue)
 383{
 384	bool update = false;
 385	struct ext4_inode_info *ei = EXT4_I(inode);
 386	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387	tid_t tid = 0;
 388	int ret;
 389
 390	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391	    (sbi->s_mount_state & EXT4_FC_REPLAY))
 392		return -EOPNOTSUPP;
 393
 394	if (ext4_fc_is_ineligible(inode->i_sb))
 395		return -EINVAL;
 396
 397	tid = handle->h_transaction->t_tid;
 398	mutex_lock(&ei->i_fc_lock);
 399	if (tid == ei->i_sync_tid) {
 400		update = true;
 401	} else {
 402		ext4_fc_reset_inode(inode);
 403		ei->i_sync_tid = tid;
 404	}
 405	ret = __fc_track_fn(inode, args, update);
 406	mutex_unlock(&ei->i_fc_lock);
 407
 408	if (!enqueue)
 409		return ret;
 410
 411	spin_lock(&sbi->s_fc_lock);
 412	if (list_empty(&EXT4_I(inode)->i_fc_list))
 413		list_add_tail(&EXT4_I(inode)->i_fc_list,
 414				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 415				&sbi->s_fc_q[FC_Q_STAGING] :
 416				&sbi->s_fc_q[FC_Q_MAIN]);
 417	spin_unlock(&sbi->s_fc_lock);
 418
 419	return ret;
 420}
 421
/* Arguments handed to __track_dentry_update() through the void *arg. */
struct __track_dentry_update_args {
	/* Directory entry the recorded operation applies to. */
	struct dentry *dentry;
	/* Tag of the operation; callers in this file pass EXT4_FC_TAG_{UNLINK,LINK,CREAT}. */
	int op;
};
 426
 427/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 428static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429{
 430	struct ext4_fc_dentry_update *node;
 431	struct ext4_inode_info *ei = EXT4_I(inode);
 432	struct __track_dentry_update_args *dentry_update =
 433		(struct __track_dentry_update_args *)arg;
 434	struct dentry *dentry = dentry_update->dentry;
 435	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 436
 437	mutex_unlock(&ei->i_fc_lock);
 438	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439	if (!node) {
 440		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441		mutex_lock(&ei->i_fc_lock);
 442		return -ENOMEM;
 443	}
 444
 445	node->fcd_op = dentry_update->op;
 446	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447	node->fcd_ino = inode->i_ino;
 448	if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450		if (!node->fcd_name.name) {
 451			kmem_cache_free(ext4_fc_dentry_cachep, node);
 452			ext4_fc_mark_ineligible(inode->i_sb,
 453				EXT4_FC_REASON_NOMEM);
 454			mutex_lock(&ei->i_fc_lock);
 455			return -ENOMEM;
 456		}
 457		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458			dentry->d_name.len);
 459	} else {
 460		memcpy(node->fcd_iname, dentry->d_name.name,
 461			dentry->d_name.len);
 462		node->fcd_name.name = node->fcd_iname;
 463	}
 464	node->fcd_name.len = dentry->d_name.len;
 465
 466	spin_lock(&sbi->s_fc_lock);
 467	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 468		list_add_tail(&node->fcd_list,
 469				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470	else
 471		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 472	spin_unlock(&sbi->s_fc_lock);
 473	mutex_lock(&ei->i_fc_lock);
 474
 475	return 0;
 476}
 477
 478void __ext4_fc_track_unlink(handle_t *handle,
 479		struct inode *inode, struct dentry *dentry)
 480{
 481	struct __track_dentry_update_args args;
 482	int ret;
 483
 484	args.dentry = dentry;
 485	args.op = EXT4_FC_TAG_UNLINK;
 486
 487	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488					(void *)&args, 0);
 489	trace_ext4_fc_track_unlink(inode, dentry, ret);
 490}
 491
 492void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493{
 494	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 495}
 496
 497void __ext4_fc_track_link(handle_t *handle,
 498	struct inode *inode, struct dentry *dentry)
 499{
 500	struct __track_dentry_update_args args;
 501	int ret;
 502
 503	args.dentry = dentry;
 504	args.op = EXT4_FC_TAG_LINK;
 505
 506	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507					(void *)&args, 0);
 508	trace_ext4_fc_track_link(inode, dentry, ret);
 509}
 510
 511void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512{
 513	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
 514}
 515
 516void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 517{
 518	struct __track_dentry_update_args args;
 519	struct inode *inode = d_inode(dentry);
 520	int ret;
 521
 522	args.dentry = dentry;
 523	args.op = EXT4_FC_TAG_CREAT;
 524
 525	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526					(void *)&args, 0);
 527	trace_ext4_fc_track_create(inode, dentry, ret);
 528}
 529
 530/* __track_fn for inode tracking */
 531static int __track_inode(struct inode *inode, void *arg, bool update)
 532{
 533	if (update)
 534		return -EEXIST;
 535
 536	EXT4_I(inode)->i_fc_lblk_len = 0;
 537
 538	return 0;
 539}
 540
 541void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 542{
 543	int ret;
 544
 545	if (S_ISDIR(inode->i_mode))
 546		return;
 547
 548	if (ext4_should_journal_data(inode)) {
 549		ext4_fc_mark_ineligible(inode->i_sb,
 550					EXT4_FC_REASON_INODE_JOURNAL_DATA);
 551		return;
 552	}
 553
 554	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 555	trace_ext4_fc_track_inode(inode, ret);
 556}
 557
/*
 * Arguments handed to __track_range() through the void *arg: the first and
 * last logical block of the modified range. Both ends are inclusive (the
 * range length is computed as end - start + 1).
 */
struct __track_range_args {
	ext4_lblk_t start, end;
};
 561
 562/* __track_fn for tracking data updates */
 563static int __track_range(struct inode *inode, void *arg, bool update)
 564{
 565	struct ext4_inode_info *ei = EXT4_I(inode);
 566	ext4_lblk_t oldstart;
 567	struct __track_range_args *__arg =
 568		(struct __track_range_args *)arg;
 569
 570	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 571		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 572		return -ECANCELED;
 573	}
 574
 575	oldstart = ei->i_fc_lblk_start;
 576
 577	if (update && ei->i_fc_lblk_len > 0) {
 578		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 579		ei->i_fc_lblk_len =
 580			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 581				ei->i_fc_lblk_start + 1;
 582	} else {
 583		ei->i_fc_lblk_start = __arg->start;
 584		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 585	}
 586
 587	return 0;
 588}
 589
 590void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 591			 ext4_lblk_t end)
 592{
 593	struct __track_range_args args;
 594	int ret;
 595
 596	if (S_ISDIR(inode->i_mode))
 597		return;
 598
 599	args.start = start;
 600	args.end = end;
 601
 602	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 603
 604	trace_ext4_fc_track_range(inode, start, end, ret);
 605}
 606
 607static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 608{
 609	int write_flags = REQ_SYNC;
 610	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 611
 612	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
 613	if (test_opt(sb, BARRIER) && is_tail)
 614		write_flags |= REQ_FUA | REQ_PREFLUSH;
 615	lock_buffer(bh);
 616	set_buffer_dirty(bh);
 617	set_buffer_uptodate(bh);
 618	bh->b_end_io = ext4_end_buffer_io_sync;
 619	submit_bh(REQ_OP_WRITE, write_flags, bh);
 620	EXT4_SB(sb)->s_fc_bh = NULL;
 621}
 622
 623/* Ext4 commit path routines */
 624
 625/* memzero and update CRC */
 626static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 627				u32 *crc)
 628{
 629	void *ret;
 630
 631	ret = memset(dst, 0, len);
 632	if (crc)
 633		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 634	return ret;
 635}
 636
 637/*
 638 * Allocate len bytes on a fast commit buffer.
 639 *
 640 * During the commit time this function is used to manage fast commit
 641 * block space. We don't split a fast commit log onto different
 642 * blocks. So this function makes sure that if there's not enough space
 643 * on the current block, the remaining space in the current block is
 644 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 645 * new block is from jbd2 and CRC is updated to reflect the padding
 646 * we added.
 647 */
 648static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 649{
 650	struct ext4_fc_tl *tl;
 651	struct ext4_sb_info *sbi = EXT4_SB(sb);
 652	struct buffer_head *bh;
 653	int bsize = sbi->s_journal->j_blocksize;
 654	int ret, off = sbi->s_fc_bytes % bsize;
 655	int pad_len;
 656
 657	/*
 658	 * After allocating len, we should have space at least for a 0 byte
 659	 * padding.
 660	 */
 661	if (len + sizeof(struct ext4_fc_tl) > bsize)
 662		return NULL;
 663
 664	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 665		/*
 666		 * Only allocate from current buffer if we have enough space for
 667		 * this request AND we have space to add a zero byte padding.
 668		 */
 669		if (!sbi->s_fc_bh) {
 670			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 671			if (ret)
 672				return NULL;
 673			sbi->s_fc_bh = bh;
 674		}
 675		sbi->s_fc_bytes += len;
 676		return sbi->s_fc_bh->b_data + off;
 677	}
 678	/* Need to add PAD tag */
 679	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 680	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 681	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 682	tl->fc_len = cpu_to_le16(pad_len);
 683	if (crc)
 684		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 685	if (pad_len > 0)
 686		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 687	ext4_fc_submit_bh(sb, false);
 688
 689	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 690	if (ret)
 691		return NULL;
 692	sbi->s_fc_bh = bh;
 693	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 694	return sbi->s_fc_bh->b_data;
 695}
 696
 697/* memcpy to fc reserved space and update CRC */
 698static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 699				int len, u32 *crc)
 700{
 701	if (crc)
 702		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 703	return memcpy(dst, src, len);
 704}
 705
 706/*
 707 * Complete a fast commit by writing tail tag.
 708 *
 709 * Writing tail tag marks the end of a fast commit. In order to guarantee
 710 * atomicity, after writing tail tag, even if there's space remaining
 711 * in the block, next commit shouldn't use it. That's why tail tag
 712 * has the length as that of the remaining space on the block.
 713 */
 714static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 715{
 716	struct ext4_sb_info *sbi = EXT4_SB(sb);
 717	struct ext4_fc_tl tl;
 718	struct ext4_fc_tail tail;
 719	int off, bsize = sbi->s_journal->j_blocksize;
 720	u8 *dst;
 721
 722	/*
 723	 * ext4_fc_reserve_space takes care of allocating an extra block if
 724	 * there's no enough space on this block for accommodating this tail.
 725	 */
 726	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 727	if (!dst)
 728		return -ENOSPC;
 729
 730	off = sbi->s_fc_bytes % bsize;
 731
 732	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 733	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 734	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 735
 736	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 737	dst += sizeof(tl);
 738	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 739	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 740	dst += sizeof(tail.fc_tid);
 741	tail.fc_crc = cpu_to_le32(crc);
 742	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 743
 744	ext4_fc_submit_bh(sb, true);
 745
 746	return 0;
 747}
 748
 749/*
 750 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 751 * Returns false if there's not enough space.
 752 */
 753static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 754			   u32 *crc)
 755{
 756	struct ext4_fc_tl tl;
 757	u8 *dst;
 758
 759	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 760	if (!dst)
 761		return false;
 762
 763	tl.fc_tag = cpu_to_le16(tag);
 764	tl.fc_len = cpu_to_le16(len);
 765
 766	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 767	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 768
 769	return true;
 770}
 771
 772/* Same as above, but adds dentry tlv. */
 773static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 774					int parent_ino, int ino, int dlen,
 775					const unsigned char *dname,
 776					u32 *crc)
 777{
 778	struct ext4_fc_dentry_info fcd;
 779	struct ext4_fc_tl tl;
 780	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 781					crc);
 782
 783	if (!dst)
 784		return false;
 785
 786	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 787	fcd.fc_ino = cpu_to_le32(ino);
 788	tl.fc_tag = cpu_to_le16(tag);
 789	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 790	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 791	dst += sizeof(tl);
 792	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 793	dst += sizeof(fcd);
 794	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 795	dst += dlen;
 796
 797	return true;
 798}
 799
 800/*
 801 * Writes inode in the fast commit space under TLV with tag @tag.
 802 * Returns 0 on success, error on failure.
 803 */
 804static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 805{
 806	struct ext4_inode_info *ei = EXT4_I(inode);
 807	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 808	int ret;
 809	struct ext4_iloc iloc;
 810	struct ext4_fc_inode fc_inode;
 811	struct ext4_fc_tl tl;
 812	u8 *dst;
 813
 814	ret = ext4_get_inode_loc(inode, &iloc);
 815	if (ret)
 816		return ret;
 817
 818	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 819		inode_len += ei->i_extra_isize;
 820
 821	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 822	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 823	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 824
 825	dst = ext4_fc_reserve_space(inode->i_sb,
 826			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 827	if (!dst)
 828		return -ECANCELED;
 829
 830	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 831		return -ECANCELED;
 832	dst += sizeof(tl);
 833	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 834		return -ECANCELED;
 835	dst += sizeof(fc_inode);
 836	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 837					inode_len, crc))
 838		return -ECANCELED;
 839
 840	return 0;
 841}
 842
 843/*
 844 * Writes updated data ranges for the inode in question. Updates CRC.
 845 * Returns 0 on success, error otherwise.
 846 */
 847static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 848{
 849	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 850	struct ext4_inode_info *ei = EXT4_I(inode);
 851	struct ext4_map_blocks map;
 852	struct ext4_fc_add_range fc_ext;
 853	struct ext4_fc_del_range lrange;
 854	struct ext4_extent *ex;
 855	int ret;
 856
 857	mutex_lock(&ei->i_fc_lock);
 858	if (ei->i_fc_lblk_len == 0) {
 859		mutex_unlock(&ei->i_fc_lock);
 860		return 0;
 861	}
 862	old_blk_size = ei->i_fc_lblk_start;
 863	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 864	ei->i_fc_lblk_len = 0;
 865	mutex_unlock(&ei->i_fc_lock);
 866
 867	cur_lblk_off = old_blk_size;
 868	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 869		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 870
 871	while (cur_lblk_off <= new_blk_size) {
 872		map.m_lblk = cur_lblk_off;
 873		map.m_len = new_blk_size - cur_lblk_off + 1;
 874		ret = ext4_map_blocks(NULL, inode, &map, 0);
 875		if (ret < 0)
 876			return -ECANCELED;
 877
 878		if (map.m_len == 0) {
 879			cur_lblk_off++;
 880			continue;
 881		}
 882
 883		if (ret == 0) {
 884			lrange.fc_ino = cpu_to_le32(inode->i_ino);
 885			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 886			lrange.fc_len = cpu_to_le32(map.m_len);
 887			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 888					    sizeof(lrange), (u8 *)&lrange, crc))
 889				return -ENOSPC;
 890		} else {
 891			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 892			ex = (struct ext4_extent *)&fc_ext.fc_ex;
 893			ex->ee_block = cpu_to_le32(map.m_lblk);
 894			ex->ee_len = cpu_to_le16(map.m_len);
 895			ext4_ext_store_pblock(ex, map.m_pblk);
 896			if (map.m_flags & EXT4_MAP_UNWRITTEN)
 897				ext4_ext_mark_unwritten(ex);
 898			else
 899				ext4_ext_mark_initialized(ex);
 900			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 901					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
 902				return -ENOSPC;
 903		}
 904
 905		cur_lblk_off += map.m_len;
 906	}
 907
 908	return 0;
 909}
 910
 911
 912/* Submit data for all the fast commit inodes */
 913static int ext4_fc_submit_inode_data_all(journal_t *journal)
 914{
 915	struct super_block *sb = (struct super_block *)(journal->j_private);
 916	struct ext4_sb_info *sbi = EXT4_SB(sb);
 917	struct ext4_inode_info *ei;
 918	struct list_head *pos;
 919	int ret = 0;
 920
 921	spin_lock(&sbi->s_fc_lock);
 922	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 923	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
 924		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
 925		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 926		while (atomic_read(&ei->i_fc_updates)) {
 927			DEFINE_WAIT(wait);
 928
 929			prepare_to_wait(&ei->i_fc_wait, &wait,
 930						TASK_UNINTERRUPTIBLE);
 931			if (atomic_read(&ei->i_fc_updates)) {
 932				spin_unlock(&sbi->s_fc_lock);
 933				schedule();
 934				spin_lock(&sbi->s_fc_lock);
 935			}
 936			finish_wait(&ei->i_fc_wait, &wait);
 937		}
 938		spin_unlock(&sbi->s_fc_lock);
 939		ret = jbd2_submit_inode_data(ei->jinode);
 940		if (ret)
 941			return ret;
 942		spin_lock(&sbi->s_fc_lock);
 943	}
 944	spin_unlock(&sbi->s_fc_lock);
 945
 946	return ret;
 947}
 948
 949/* Wait for completion of data for all the fast commit inodes */
 950static int ext4_fc_wait_inode_data_all(journal_t *journal)
 951{
 952	struct super_block *sb = (struct super_block *)(journal->j_private);
 953	struct ext4_sb_info *sbi = EXT4_SB(sb);
 954	struct ext4_inode_info *pos, *n;
 955	int ret = 0;
 956
 957	spin_lock(&sbi->s_fc_lock);
 958	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 959		if (!ext4_test_inode_state(&pos->vfs_inode,
 960					   EXT4_STATE_FC_COMMITTING))
 961			continue;
 962		spin_unlock(&sbi->s_fc_lock);
 963
 964		ret = jbd2_wait_inode_data(journal, pos->jinode);
 965		if (ret)
 966			return ret;
 967		spin_lock(&sbi->s_fc_lock);
 968	}
 969	spin_unlock(&sbi->s_fc_lock);
 970
 971	return 0;
 972}
 973
/*
 * Commit all the directory entry updates queued on
 * s_fc_dentry_q[FC_Q_MAIN].
 *
 * Called with sbi->s_fc_lock held and returns with it held on every path;
 * the lock is dropped around each TLV / inode write and re-taken afterwards.
 *
 * @journal: journal whose j_private points at the super block
 * @crc:     running checksum, passed through to the TLV writers
 *
 * Returns 0 on success (including an empty queue), -ENOSPC if a dentry TLV
 * could not be added, or the error returned by ext4_fc_write_inode() /
 * ext4_fc_write_inode_data().
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		/* Everything except a create only needs its dentry TLV. */
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* Find the created inode on the main inode queue by number. */
		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Error paths arrive here unlocked; re-take the lock for the caller. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
1053
1054static int ext4_fc_perform_commit(journal_t *journal)
1055{
1056	struct super_block *sb = (struct super_block *)(journal->j_private);
1057	struct ext4_sb_info *sbi = EXT4_SB(sb);
1058	struct ext4_inode_info *iter;
1059	struct ext4_fc_head head;
1060	struct list_head *pos;
1061	struct inode *inode;
1062	struct blk_plug plug;
1063	int ret = 0;
1064	u32 crc = 0;
1065
1066	ret = ext4_fc_submit_inode_data_all(journal);
1067	if (ret)
1068		return ret;
1069
1070	ret = ext4_fc_wait_inode_data_all(journal);
1071	if (ret)
1072		return ret;
1073
1074	/*
1075	 * If file system device is different from journal device, issue a cache
1076	 * flush before we start writing fast commit blocks.
1077	 */
1078	if (journal->j_fs_dev != journal->j_dev)
1079		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
1080
1081	blk_start_plug(&plug);
1082	if (sbi->s_fc_bytes == 0) {
1083		/*
1084		 * Add a head tag only if this is the first fast commit
1085		 * in this TID.
1086		 */
1087		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1088		head.fc_tid = cpu_to_le32(
1089			sbi->s_journal->j_running_transaction->t_tid);
1090		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1091			(u8 *)&head, &crc))
1092			goto out;
1093	}
1094
1095	spin_lock(&sbi->s_fc_lock);
1096	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1097	if (ret) {
1098		spin_unlock(&sbi->s_fc_lock);
1099		goto out;
1100	}
1101
1102	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1103		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1104		inode = &iter->vfs_inode;
1105		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1106			continue;
1107
1108		spin_unlock(&sbi->s_fc_lock);
1109		ret = ext4_fc_write_inode_data(inode, &crc);
1110		if (ret)
1111			goto out;
1112		ret = ext4_fc_write_inode(inode, &crc);
1113		if (ret)
1114			goto out;
1115		spin_lock(&sbi->s_fc_lock);
1116	}
1117	spin_unlock(&sbi->s_fc_lock);
1118
1119	ret = ext4_fc_write_tail(sb, crc);
1120
1121out:
1122	blk_finish_plug(&plug);
1123	return ret;
1124}
1125
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 *
 * The outcome is tracked in the local 'reason' code, which drives both the
 * statistics update and the choice of fallback at the end of the function.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	/* Snapshot taken up front; compared again if a commit is in flight. */
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	/* Block count (rounded up) before this commit, to derive nblks below. */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	/* Number of blocks this commit added to the fast commit area. */
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	/* OK and ALREADY_COMMITTED count as commits; the rest as ineligible. */
	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	/* A fast commit that was begun but failed must be ended via fallback. */
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	/* No fast commit was begun: complete the transaction instead. */
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
1220
1221/*
1222 * Fast commit cleanup routine. This is called after every fast commit and
1223 * full commit. full is true if we are called after a full commit.
1224 */
1225static void ext4_fc_cleanup(journal_t *journal, int full)
1226{
1227	struct super_block *sb = journal->j_private;
1228	struct ext4_sb_info *sbi = EXT4_SB(sb);
1229	struct ext4_inode_info *iter;
1230	struct ext4_fc_dentry_update *fc_dentry;
1231	struct list_head *pos, *n;
1232
1233	if (full && sbi->s_fc_bh)
1234		sbi->s_fc_bh = NULL;
1235
1236	jbd2_fc_release_bufs(journal);
1237
1238	spin_lock(&sbi->s_fc_lock);
1239	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1240		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1241		list_del_init(&iter->i_fc_list);
1242		ext4_clear_inode_state(&iter->vfs_inode,
1243				       EXT4_STATE_FC_COMMITTING);
1244		ext4_fc_reset_inode(&iter->vfs_inode);
1245		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1246		smp_mb();
1247#if (BITS_PER_LONG < 64)
1248		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1249#else
1250		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1251#endif
1252	}
1253
1254	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1255		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1256					     struct ext4_fc_dentry_update,
1257					     fcd_list);
1258		list_del_init(&fc_dentry->fcd_list);
1259		spin_unlock(&sbi->s_fc_lock);
1260
1261		if (fc_dentry->fcd_name.name &&
1262			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1263			kfree(fc_dentry->fcd_name.name);
1264		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1265		spin_lock(&sbi->s_fc_lock);
1266	}
1267
1268	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1269				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1270	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1271				&sbi->s_fc_q[FC_Q_MAIN]);
1272
1273	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1274	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1275
1276	if (full)
1277		sbi->s_fc_bytes = 0;
1278	spin_unlock(&sbi->s_fc_lock);
1279	trace_ext4_fc_stats(sb);
1280}
1281
1282/* Ext4 Replay Path Routines */
1283
/* Decoded view of a dentry TLV, shared by the dentry replay routines */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of dname in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;		/* not filled in by tl_to_darg() */
	char *dname;		/* entry name; points into the TLV payload */
};
1289
1290static inline void tl_to_darg(struct dentry_info_args *darg,
1291				struct  ext4_fc_tl *tl)
1292{
1293	struct ext4_fc_dentry_info *fcd;
1294
1295	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1296
1297	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1298	darg->ino = le32_to_cpu(fcd->fc_ino);
1299	darg->dname = fcd->fc_dname;
1300	darg->dname_len = ext4_fc_tag_len(tl) -
1301			sizeof(struct ext4_fc_dentry_info);
1302}
1303
1304/* Unlink replay function */
1305static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1306{
1307	struct inode *inode, *old_parent;
1308	struct qstr entry;
1309	struct dentry_info_args darg;
1310	int ret = 0;
1311
1312	tl_to_darg(&darg, tl);
1313
1314	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1315			darg.parent_ino, darg.dname_len);
1316
1317	entry.name = darg.dname;
1318	entry.len = darg.dname_len;
1319	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1320
1321	if (IS_ERR(inode)) {
1322		jbd_debug(1, "Inode %d not found", darg.ino);
1323		return 0;
1324	}
1325
1326	old_parent = ext4_iget(sb, darg.parent_ino,
1327				EXT4_IGET_NORMAL);
1328	if (IS_ERR(old_parent)) {
1329		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1330		iput(inode);
1331		return 0;
1332	}
1333
1334	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1335	/* -ENOENT ok coz it might not exist anymore. */
1336	if (ret == -ENOENT)
1337		ret = 0;
1338	iput(old_parent);
1339	iput(inode);
1340	return ret;
1341}
1342
1343static int ext4_fc_replay_link_internal(struct super_block *sb,
1344				struct dentry_info_args *darg,
1345				struct inode *inode)
1346{
1347	struct inode *dir = NULL;
1348	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1349	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1350	int ret = 0;
1351
1352	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1353	if (IS_ERR(dir)) {
1354		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1355		dir = NULL;
1356		goto out;
1357	}
1358
1359	dentry_dir = d_obtain_alias(dir);
1360	if (IS_ERR(dentry_dir)) {
1361		jbd_debug(1, "Failed to obtain dentry");
1362		dentry_dir = NULL;
1363		goto out;
1364	}
1365
1366	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1367	if (!dentry_inode) {
1368		jbd_debug(1, "Inode dentry not created.");
1369		ret = -ENOMEM;
1370		goto out;
1371	}
1372
1373	ret = __ext4_link(dir, inode, dentry_inode);
1374	/*
1375	 * It's possible that link already existed since data blocks
1376	 * for the dir in question got persisted before we crashed OR
1377	 * we replayed this tag and crashed before the entire replay
1378	 * could complete.
1379	 */
1380	if (ret && ret != -EEXIST) {
1381		jbd_debug(1, "Failed to link\n");
1382		goto out;
1383	}
1384
1385	ret = 0;
1386out:
1387	if (dentry_dir) {
1388		d_drop(dentry_dir);
1389		dput(dentry_dir);
1390	} else if (dir) {
1391		iput(dir);
1392	}
1393	if (dentry_inode) {
1394		d_drop(dentry_inode);
1395		dput(dentry_inode);
1396	}
1397
1398	return ret;
1399}
1400
1401/* Link replay function */
1402static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1403{
1404	struct inode *inode;
1405	struct dentry_info_args darg;
1406	int ret = 0;
1407
1408	tl_to_darg(&darg, tl);
1409	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1410			darg.parent_ino, darg.dname_len);
1411
1412	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1413	if (IS_ERR(inode)) {
1414		jbd_debug(1, "Inode not found.");
1415		return 0;
1416	}
1417
1418	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1419	iput(inode);
1420	return ret;
1421}
1422
1423/*
1424 * Record all the modified inodes during replay. We use this later to setup
1425 * block bitmaps correctly.
1426 */
1427static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1428{
1429	struct ext4_fc_replay_state *state;
1430	int i;
1431
1432	state = &EXT4_SB(sb)->s_fc_replay_state;
1433	for (i = 0; i < state->fc_modified_inodes_used; i++)
1434		if (state->fc_modified_inodes[i] == ino)
1435			return 0;
1436	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1437		state->fc_modified_inodes_size +=
1438			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1439		state->fc_modified_inodes = krealloc(
1440					state->fc_modified_inodes, sizeof(int) *
1441					state->fc_modified_inodes_size,
1442					GFP_KERNEL);
1443		if (!state->fc_modified_inodes)
1444			return -ENOMEM;
1445	}
1446	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1447	return 0;
1448}
1449
1450/*
1451 * Inode replay function
1452 */
1453static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1454{
1455	struct ext4_fc_inode *fc_inode;
1456	struct ext4_inode *raw_inode;
1457	struct ext4_inode *raw_fc_inode;
1458	struct inode *inode = NULL;
1459	struct ext4_iloc iloc;
1460	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1461	struct ext4_extent_header *eh;
1462
1463	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1464
1465	ino = le32_to_cpu(fc_inode->fc_ino);
1466	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1467
1468	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1469	if (!IS_ERR(inode)) {
1470		ext4_ext_clear_bb(inode);
1471		iput(inode);
1472	}
1473	inode = NULL;
1474
1475	ext4_fc_record_modified_inode(sb, ino);
1476
1477	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1478	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1479	if (ret)
1480		goto out;
1481
1482	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1483	raw_inode = ext4_raw_inode(&iloc);
1484
1485	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1486	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1487		inode_len - offsetof(struct ext4_inode, i_generation));
1488	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1489		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1490		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1491			memset(eh, 0, sizeof(*eh));
1492			eh->eh_magic = EXT4_EXT_MAGIC;
1493			eh->eh_max = cpu_to_le16(
1494				(sizeof(raw_inode->i_block) -
1495				 sizeof(struct ext4_extent_header))
1496				 / sizeof(struct ext4_extent));
1497		}
1498	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1499		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1500			sizeof(raw_inode->i_block));
1501	}
1502
1503	/* Immediately update the inode on disk. */
1504	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1505	if (ret)
1506		goto out;
1507	ret = sync_dirty_buffer(iloc.bh);
1508	if (ret)
1509		goto out;
1510	ret = ext4_mark_inode_used(sb, ino);
1511	if (ret)
1512		goto out;
1513
1514	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1515	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1516	if (IS_ERR(inode)) {
1517		jbd_debug(1, "Inode not found.");
1518		return -EFSCORRUPTED;
1519	}
1520
1521	/*
1522	 * Our allocator could have made different decisions than before
1523	 * crashing. This should be fixed but until then, we calculate
1524	 * the number of blocks the inode.
1525	 */
1526	ext4_ext_replay_set_iblocks(inode);
1527
1528	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1529	ext4_reset_inode_seed(inode);
1530
1531	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1532	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1533	sync_dirty_buffer(iloc.bh);
1534	brelse(iloc.bh);
1535out:
1536	iput(inode);
1537	if (!ret)
1538		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1539
1540	return 0;
1541}
1542
1543/*
1544 * Dentry create replay function.
1545 *
1546 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1547 * inode for which we are trying to create a dentry here, should already have
1548 * been replayed before we start here.
1549 */
1550static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1551{
1552	int ret = 0;
1553	struct inode *inode = NULL;
1554	struct inode *dir = NULL;
1555	struct dentry_info_args darg;
1556
1557	tl_to_darg(&darg, tl);
1558
1559	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1560			darg.parent_ino, darg.dname_len);
1561
1562	/* This takes care of update group descriptor and other metadata */
1563	ret = ext4_mark_inode_used(sb, darg.ino);
1564	if (ret)
1565		goto out;
1566
1567	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1568	if (IS_ERR(inode)) {
1569		jbd_debug(1, "inode %d not found.", darg.ino);
1570		inode = NULL;
1571		ret = -EINVAL;
1572		goto out;
1573	}
1574
1575	if (S_ISDIR(inode->i_mode)) {
1576		/*
1577		 * If we are creating a directory, we need to make sure that the
1578		 * dot and dot dot dirents are setup properly.
1579		 */
1580		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1581		if (IS_ERR(dir)) {
1582			jbd_debug(1, "Dir %d not found.", darg.ino);
1583			goto out;
1584		}
1585		ret = ext4_init_new_dir(NULL, dir, inode);
1586		iput(dir);
1587		if (ret) {
1588			ret = 0;
1589			goto out;
1590		}
1591	}
1592	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1593	if (ret)
1594		goto out;
1595	set_nlink(inode, 1);
1596	ext4_mark_inode_dirty(NULL, inode);
1597out:
1598	if (inode)
1599		iput(inode);
1600	return ret;
1601}
1602
1603/*
1604 * Record physical disk regions which are in use as per fast commit area. Our
1605 * simple replay phase allocator excludes these regions from allocation.
1606 */
1607static int ext4_fc_record_regions(struct super_block *sb, int ino,
1608		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1609{
1610	struct ext4_fc_replay_state *state;
1611	struct ext4_fc_alloc_region *region;
1612
1613	state = &EXT4_SB(sb)->s_fc_replay_state;
1614	if (state->fc_regions_used == state->fc_regions_size) {
1615		state->fc_regions_size +=
1616			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1617		state->fc_regions = krealloc(
1618					state->fc_regions,
1619					state->fc_regions_size *
1620					sizeof(struct ext4_fc_alloc_region),
1621					GFP_KERNEL);
1622		if (!state->fc_regions)
1623			return -ENOMEM;
1624	}
1625	region = &state->fc_regions[state->fc_regions_used++];
1626	region->ino = ino;
1627	region->lblk = lblk;
1628	region->pblk = pblk;
1629	region->len = len;
1630
1631	return 0;
1632}
1633
1634/* Replay add range tag */
1635static int ext4_fc_replay_add_range(struct super_block *sb,
1636				struct ext4_fc_tl *tl)
1637{
1638	struct ext4_fc_add_range *fc_add_ex;
1639	struct ext4_extent newex, *ex;
1640	struct inode *inode;
1641	ext4_lblk_t start, cur;
1642	int remaining, len;
1643	ext4_fsblk_t start_pblk;
1644	struct ext4_map_blocks map;
1645	struct ext4_ext_path *path = NULL;
1646	int ret;
1647
1648	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1649	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1650
1651	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1652		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1653		ext4_ext_get_actual_len(ex));
1654
1655	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1656				EXT4_IGET_NORMAL);
1657	if (IS_ERR(inode)) {
1658		jbd_debug(1, "Inode not found.");
1659		return 0;
1660	}
1661
1662	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1663
1664	start = le32_to_cpu(ex->ee_block);
1665	start_pblk = ext4_ext_pblock(ex);
1666	len = ext4_ext_get_actual_len(ex);
1667
1668	cur = start;
1669	remaining = len;
1670	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1671		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1672		  inode->i_ino);
1673
1674	while (remaining > 0) {
1675		map.m_lblk = cur;
1676		map.m_len = remaining;
1677		map.m_pblk = 0;
1678		ret = ext4_map_blocks(NULL, inode, &map, 0);
1679
1680		if (ret < 0) {
1681			iput(inode);
1682			return 0;
1683		}
1684
1685		if (ret == 0) {
1686			/* Range is not mapped */
1687			path = ext4_find_extent(inode, cur, NULL, 0);
1688			if (IS_ERR(path)) {
1689				iput(inode);
1690				return 0;
1691			}
1692			memset(&newex, 0, sizeof(newex));
1693			newex.ee_block = cpu_to_le32(cur);
1694			ext4_ext_store_pblock(
1695				&newex, start_pblk + cur - start);
1696			newex.ee_len = cpu_to_le16(map.m_len);
1697			if (ext4_ext_is_unwritten(ex))
1698				ext4_ext_mark_unwritten(&newex);
1699			down_write(&EXT4_I(inode)->i_data_sem);
1700			ret = ext4_ext_insert_extent(
1701				NULL, inode, &path, &newex, 0);
1702			up_write((&EXT4_I(inode)->i_data_sem));
1703			ext4_ext_drop_refs(path);
1704			kfree(path);
1705			if (ret) {
1706				iput(inode);
1707				return 0;
1708			}
1709			goto next;
1710		}
1711
1712		if (start_pblk + cur - start != map.m_pblk) {
1713			/*
1714			 * Logical to physical mapping changed. This can happen
1715			 * if this range was removed and then reallocated to
1716			 * map to new physical blocks during a fast commit.
1717			 */
1718			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1719					ext4_ext_is_unwritten(ex),
1720					start_pblk + cur - start);
1721			if (ret) {
1722				iput(inode);
1723				return 0;
1724			}
1725			/*
1726			 * Mark the old blocks as free since they aren't used
1727			 * anymore. We maintain an array of all the modified
1728			 * inodes. In case these blocks are still used at either
1729			 * a different logical range in the same inode or in
1730			 * some different inode, we will mark them as allocated
1731			 * at the end of the FC replay using our array of
1732			 * modified inodes.
1733			 */
1734			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1735			goto next;
1736		}
1737
1738		/* Range is mapped and needs a state change */
1739		jbd_debug(1, "Converting from %d to %d %lld",
1740				map.m_flags & EXT4_MAP_UNWRITTEN,
1741			ext4_ext_is_unwritten(ex), map.m_pblk);
1742		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1743					ext4_ext_is_unwritten(ex), map.m_pblk);
1744		if (ret) {
1745			iput(inode);
1746			return 0;
1747		}
1748		/*
1749		 * We may have split the extent tree while toggling the state.
1750		 * Try to shrink the extent tree now.
1751		 */
1752		ext4_ext_replay_shrink_inode(inode, start + len);
1753next:
1754		cur += map.m_len;
1755		remaining -= map.m_len;
1756	}
1757	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1758					sb->s_blocksize_bits);
1759	iput(inode);
1760	return 0;
1761}
1762
1763/* Replay DEL_RANGE tag */
1764static int
1765ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1766{
1767	struct inode *inode;
1768	struct ext4_fc_del_range *lrange;
1769	struct ext4_map_blocks map;
1770	ext4_lblk_t cur, remaining;
1771	int ret;
1772
1773	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1774	cur = le32_to_cpu(lrange->fc_lblk);
1775	remaining = le32_to_cpu(lrange->fc_len);
1776
1777	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1778		le32_to_cpu(lrange->fc_ino), cur, remaining);
1779
1780	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1781	if (IS_ERR(inode)) {
1782		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1783		return 0;
1784	}
1785
1786	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1787
1788	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1789			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1790			le32_to_cpu(lrange->fc_len));
1791	while (remaining > 0) {
1792		map.m_lblk = cur;
1793		map.m_len = remaining;
1794
1795		ret = ext4_map_blocks(NULL, inode, &map, 0);
1796		if (ret < 0) {
1797			iput(inode);
1798			return 0;
1799		}
1800		if (ret > 0) {
1801			remaining -= ret;
1802			cur += ret;
1803			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1804		} else {
1805			remaining -= map.m_len;
1806			cur += map.m_len;
1807		}
1808	}
1809
1810	ret = ext4_punch_hole(inode,
1811		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1812		le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1813	if (ret)
1814		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1815	ext4_ext_replay_shrink_inode(inode,
1816		i_size_read(inode) >> sb->s_blocksize_bits);
1817	ext4_mark_inode_dirty(NULL, inode);
1818	iput(inode);
1819
1820	return 0;
1821}
1822
1823static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1824{
1825	struct ext4_fc_replay_state *state;
1826	struct inode *inode;
1827	struct ext4_ext_path *path = NULL;
1828	struct ext4_map_blocks map;
1829	int i, ret, j;
1830	ext4_lblk_t cur, end;
1831
1832	state = &EXT4_SB(sb)->s_fc_replay_state;
1833	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1834		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1835			EXT4_IGET_NORMAL);
1836		if (IS_ERR(inode)) {
1837			jbd_debug(1, "Inode %d not found.",
1838				state->fc_modified_inodes[i]);
1839			continue;
1840		}
1841		cur = 0;
1842		end = EXT_MAX_BLOCKS;
1843		while (cur < end) {
1844			map.m_lblk = cur;
1845			map.m_len = end - cur;
1846
1847			ret = ext4_map_blocks(NULL, inode, &map, 0);
1848			if (ret < 0)
1849				break;
1850
1851			if (ret > 0) {
1852				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1853				if (!IS_ERR(path)) {
1854					for (j = 0; j < path->p_depth; j++)
1855						ext4_mb_mark_bb(inode->i_sb,
1856							path[j].p_block, 1, 1);
1857					ext4_ext_drop_refs(path);
1858					kfree(path);
1859				}
1860				cur += ret;
1861				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1862							map.m_len, 1);
1863			} else {
1864				cur = cur + (map.m_len ? map.m_len : 1);
1865			}
1866		}
1867		iput(inode);
1868	}
1869}
1870
1871/*
1872 * Check if block is in excluded regions for block allocation. The simple
1873 * allocator that runs during replay phase is calls this function to see
1874 * if it is okay to use a block.
1875 */
1876bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1877{
1878	int i;
1879	struct ext4_fc_replay_state *state;
1880
1881	state = &EXT4_SB(sb)->s_fc_replay_state;
1882	for (i = 0; i < state->fc_regions_valid; i++) {
1883		if (state->fc_regions[i].ino == 0 ||
1884			state->fc_regions[i].len == 0)
1885			continue;
1886		if (blk >= state->fc_regions[i].pblk &&
1887		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1888			return true;
1889	}
1890	return false;
1891}
1892
1893/* Cleanup function called after replay */
1894void ext4_fc_replay_cleanup(struct super_block *sb)
1895{
1896	struct ext4_sb_info *sbi = EXT4_SB(sb);
1897
1898	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1899	kfree(sbi->s_fc_replay_state.fc_regions);
1900	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1901}
1902
1903/*
1904 * Recovery Scan phase handler
1905 *
1906 * This function is called during the scan phase and is responsible
1907 * for doing following things:
1908 * - Make sure the fast commit area has valid tags for replay
1909 * - Count number of tags that need to be replayed by the replay handler
1910 * - Verify CRC
1911 * - Create a list of excluded blocks for allocation during replay phase
1912 *
1913 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1914 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1915 * to indicate that scan has finished and JBD2 can now start replay phase.
1916 * It returns a negative error to indicate that there was an error. At the end
1917 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1918 * to indicate the number of tags that need to replayed during the replay phase.
1919 */
1920static int ext4_fc_replay_scan(journal_t *journal,
1921				struct buffer_head *bh, int off,
1922				tid_t expected_tid)
1923{
1924	struct super_block *sb = journal->j_private;
1925	struct ext4_sb_info *sbi = EXT4_SB(sb);
1926	struct ext4_fc_replay_state *state;
1927	int ret = JBD2_FC_REPLAY_CONTINUE;
1928	struct ext4_fc_add_range *ext;
1929	struct ext4_fc_tl *tl;
1930	struct ext4_fc_tail *tail;
1931	__u8 *start, *end;
1932	struct ext4_fc_head *head;
1933	struct ext4_extent *ex;
1934
1935	state = &sbi->s_fc_replay_state;
1936
1937	start = (u8 *)bh->b_data;
1938	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1939
1940	if (state->fc_replay_expected_off == 0) {
1941		state->fc_cur_tag = 0;
1942		state->fc_replay_num_tags = 0;
1943		state->fc_crc = 0;
1944		state->fc_regions = NULL;
1945		state->fc_regions_valid = state->fc_regions_used =
1946			state->fc_regions_size = 0;
1947		/* Check if we can stop early */
1948		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1949			!= EXT4_FC_TAG_HEAD)
1950			return 0;
1951	}
1952
1953	if (off != state->fc_replay_expected_off) {
1954		ret = -EFSCORRUPTED;
1955		goto out_err;
1956	}
1957
1958	state->fc_replay_expected_off++;
1959	fc_for_each_tl(start, end, tl) {
1960		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1961			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1962		switch (le16_to_cpu(tl->fc_tag)) {
1963		case EXT4_FC_TAG_ADD_RANGE:
1964			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1965			ex = (struct ext4_extent *)&ext->fc_ex;
1966			ret = ext4_fc_record_regions(sb,
1967				le32_to_cpu(ext->fc_ino),
1968				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1969				ext4_ext_get_actual_len(ex));
1970			if (ret < 0)
1971				break;
1972			ret = JBD2_FC_REPLAY_CONTINUE;
1973			fallthrough;
1974		case EXT4_FC_TAG_DEL_RANGE:
1975		case EXT4_FC_TAG_LINK:
1976		case EXT4_FC_TAG_UNLINK:
1977		case EXT4_FC_TAG_CREAT:
1978		case EXT4_FC_TAG_INODE:
1979		case EXT4_FC_TAG_PAD:
1980			state->fc_cur_tag++;
1981			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1982					sizeof(*tl) + ext4_fc_tag_len(tl));
1983			break;
1984		case EXT4_FC_TAG_TAIL:
1985			state->fc_cur_tag++;
1986			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1987			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1988						sizeof(*tl) +
1989						offsetof(struct ext4_fc_tail,
1990						fc_crc));
1991			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1992				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1993				state->fc_replay_num_tags = state->fc_cur_tag;
1994				state->fc_regions_valid =
1995					state->fc_regions_used;
1996			} else {
1997				ret = state->fc_replay_num_tags ?
1998					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1999			}
2000			state->fc_crc = 0;
2001			break;
2002		case EXT4_FC_TAG_HEAD:
2003			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
2004			if (le32_to_cpu(head->fc_features) &
2005				~EXT4_FC_SUPPORTED_FEATURES) {
2006				ret = -EOPNOTSUPP;
2007				break;
2008			}
2009			if (le32_to_cpu(head->fc_tid) != expected_tid) {
2010				ret = JBD2_FC_REPLAY_STOP;
2011				break;
2012			}
2013			state->fc_cur_tag++;
2014			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
2015					sizeof(*tl) + ext4_fc_tag_len(tl));
2016			break;
2017		default:
2018			ret = state->fc_replay_num_tags ?
2019				JBD2_FC_REPLAY_STOP : -ECANCELED;
2020		}
2021		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2022			break;
2023	}
2024
2025out_err:
2026	trace_ext4_fc_replay_scan(sb, ret, off);
2027	return ret;
2028}
2029
2030/*
2031 * Main recovery path entry point.
2032 * The meaning of return codes is similar as above.
2033 */
2034static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2035				enum passtype pass, int off, tid_t expected_tid)
2036{
2037	struct super_block *sb = journal->j_private;
2038	struct ext4_sb_info *sbi = EXT4_SB(sb);
2039	struct ext4_fc_tl *tl;
2040	__u8 *start, *end;
2041	int ret = JBD2_FC_REPLAY_CONTINUE;
2042	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2043	struct ext4_fc_tail *tail;
2044
2045	if (pass == PASS_SCAN) {
2046		state->fc_current_pass = PASS_SCAN;
2047		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2048	}
2049
2050	if (state->fc_current_pass != pass) {
2051		state->fc_current_pass = pass;
2052		sbi->s_mount_state |= EXT4_FC_REPLAY;
2053	}
2054	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2055		jbd_debug(1, "Replay stops\n");
2056		ext4_fc_set_bitmaps_and_counters(sb);
2057		return 0;
2058	}
2059
2060#ifdef CONFIG_EXT4_DEBUG
2061	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2062		pr_warn("Dropping fc block %d because max_replay set\n", off);
2063		return JBD2_FC_REPLAY_STOP;
2064	}
2065#endif
2066
2067	start = (u8 *)bh->b_data;
2068	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2069
2070	fc_for_each_tl(start, end, tl) {
2071		if (state->fc_replay_num_tags == 0) {
2072			ret = JBD2_FC_REPLAY_STOP;
2073			ext4_fc_set_bitmaps_and_counters(sb);
2074			break;
2075		}
2076		jbd_debug(3, "Replay phase, tag:%s\n",
2077				tag2str(le16_to_cpu(tl->fc_tag)));
2078		state->fc_replay_num_tags--;
2079		switch (le16_to_cpu(tl->fc_tag)) {
2080		case EXT4_FC_TAG_LINK:
2081			ret = ext4_fc_replay_link(sb, tl);
2082			break;
2083		case EXT4_FC_TAG_UNLINK:
2084			ret = ext4_fc_replay_unlink(sb, tl);
2085			break;
2086		case EXT4_FC_TAG_ADD_RANGE:
2087			ret = ext4_fc_replay_add_range(sb, tl);
2088			break;
2089		case EXT4_FC_TAG_CREAT:
2090			ret = ext4_fc_replay_create(sb, tl);
2091			break;
2092		case EXT4_FC_TAG_DEL_RANGE:
2093			ret = ext4_fc_replay_del_range(sb, tl);
2094			break;
2095		case EXT4_FC_TAG_INODE:
2096			ret = ext4_fc_replay_inode(sb, tl);
2097			break;
2098		case EXT4_FC_TAG_PAD:
2099			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2100				ext4_fc_tag_len(tl), 0);
2101			break;
2102		case EXT4_FC_TAG_TAIL:
2103			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2104				ext4_fc_tag_len(tl), 0);
2105			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2106			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2107			break;
2108		case EXT4_FC_TAG_HEAD:
2109			break;
2110		default:
2111			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2112				ext4_fc_tag_len(tl), 0);
2113			ret = -ECANCELED;
2114			break;
2115		}
2116		if (ret < 0)
2117			break;
2118		ret = JBD2_FC_REPLAY_CONTINUE;
2119	}
2120	return ret;
2121}
2122
2123void ext4_fc_init(struct super_block *sb, journal_t *journal)
2124{
2125	/*
2126	 * We set replay callback even if fast commit disabled because we may
2127	 * could still have fast commit blocks that need to be replayed even if
2128	 * fast commit has now been turned off.
2129	 */
2130	journal->j_fc_replay_callback = ext4_fc_replay;
2131	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2132		return;
2133	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2134}
2135
/*
 * Human readable names for the fast commit ineligibility reasons.
 *
 * ext4_fc_info_show() indexes this table with the same value it uses to
 * index stats->fc_ineligible_reason_count[], so the order of the entries
 * must be kept in sync with the reason codes. Both the strings and the
 * pointers are read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2148
2149int ext4_fc_info_show(struct seq_file *seq, void *v)
2150{
2151	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2152	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2153	int i;
2154
2155	if (v != SEQ_START_TOKEN)
2156		return 0;
2157
2158	seq_printf(seq,
2159		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2160		   stats->fc_num_commits, stats->fc_ineligible_commits,
2161		   stats->fc_numblks,
2162		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2163	seq_puts(seq, "Ineligible reasons:\n");
2164	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2165		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2166			stats->fc_ineligible_reason_count[i]);
2167
2168	return 0;
2169}
2170
2171int __init ext4_fc_init_dentry_cache(void)
2172{
2173	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2174					   SLAB_RECLAIM_ACCOUNT);
2175
2176	if (ext4_fc_dentry_cachep == NULL)
2177		return -ENOMEM;
2178
2179	return 0;
2180}