fs/jbd2/commit.c at v5.17 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / jbd2 / commit.c
at v5.17 37 kB view raw
   1// SPDX-License-Identifier: GPL-2.0+
   2/*
   3 * linux/fs/jbd2/commit.c
   4 *
   5 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   6 *
   7 * Copyright 1998 Red Hat corp --- All Rights Reserved
   8 *
   9 * Journal commit routines for the generic filesystem journaling code;
  10 * part of the ext2fs journaling system.
  11 */
  12
  13#include <linux/time.h>
  14#include <linux/fs.h>
  15#include <linux/jbd2.h>
  16#include <linux/errno.h>
  17#include <linux/slab.h>
  18#include <linux/mm.h>
  19#include <linux/pagemap.h>
  20#include <linux/jiffies.h>
  21#include <linux/crc32.h>
  22#include <linux/writeback.h>
  23#include <linux/backing-dev.h>
  24#include <linux/bio.h>
  25#include <linux/blkdev.h>
  26#include <linux/bitops.h>
  27#include <trace/events/jbd2.h>
  28
  29/*
  30 * IO end handler for temporary buffer_heads handling writes to the journal.
  31 */
  32static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  33{
  34	struct buffer_head *orig_bh = bh->b_private;
  35
  36	BUFFER_TRACE(bh, "");
  37	if (uptodate)
  38		set_buffer_uptodate(bh);
  39	else
  40		clear_buffer_uptodate(bh);
  41	if (orig_bh) {
  42		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
  43		smp_mb__after_atomic();
  44		wake_up_bit(&orig_bh->b_state, BH_Shadow);
  45	}
  46	unlock_buffer(bh);
  47}
  48
  49/*
  50 * When an ext4 file is truncated, it is possible that some pages are not
  51 * successfully freed, because they are attached to a committing transaction.
  52 * After the transaction commits, these pages are left on the LRU, with no
  53 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  54 * by the VM, but their apparent absence upsets the VM accounting, and it makes
  55 * the numbers in /proc/meminfo look odd.
  56 *
  57 * So here, we have a buffer which has just come off the forget list.  Look to
  58 * see if we can strip all buffers from the backing page.
  59 *
  60 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  61 * caller provided us with a ref against the buffer, and we drop that here.
  62 */
  63static void release_buffer_page(struct buffer_head *bh)
  64{
  65	struct page *page;
  66
  67	if (buffer_dirty(bh))
  68		goto nope;
  69	if (atomic_read(&bh->b_count) != 1)
  70		goto nope;
  71	page = bh->b_page;
  72	if (!page)
  73		goto nope;
  74	if (page->mapping)
  75		goto nope;
  76
  77	/* OK, it's a truncated page */
  78	if (!trylock_page(page))
  79		goto nope;
  80
  81	get_page(page);
  82	__brelse(bh);
  83	try_to_free_buffers(page);
  84	unlock_page(page);
  85	put_page(page);
  86	return;
  87
  88nope:
  89	__brelse(bh);
  90}
  91
  92static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
  93{
  94	struct commit_header *h;
  95	__u32 csum;
  96
  97	if (!jbd2_journal_has_csum_v2or3(j))
  98		return;
  99
 100	h = (struct commit_header *)(bh->b_data);
 101	h->h_chksum_type = 0;
 102	h->h_chksum_size = 0;
 103	h->h_chksum[0] = 0;
 104	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 105	h->h_chksum[0] = cpu_to_be32(csum);
 106}
 107
 108/*
 109 * Done it all: now submit the commit record.  We should have
 110 * cleaned up our previous buffers by now, so if we are in abort
 111 * mode we can now just skip the rest of the journal write
 112 * entirely.
 113 *
 114 * Returns 1 if the journal needs to be aborted or 0 on success
 115 */
 116static int journal_submit_commit_record(journal_t *journal,
 117					transaction_t *commit_transaction,
 118					struct buffer_head **cbh,
 119					__u32 crc32_sum)
 120{
 121	struct commit_header *tmp;
 122	struct buffer_head *bh;
 123	int ret;
 124	struct timespec64 now;
 125
 126	*cbh = NULL;
 127
 128	if (is_journal_aborted(journal))
 129		return 0;
 130
 131	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
 132						JBD2_COMMIT_BLOCK);
 133	if (!bh)
 134		return 1;
 135
 136	tmp = (struct commit_header *)bh->b_data;
 137	ktime_get_coarse_real_ts64(&now);
 138	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 139	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 140
 141	if (jbd2_has_feature_checksum(journal)) {
 142		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
 143		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
 144		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
 145	}
 146	jbd2_commit_block_csum_set(journal, bh);
 147
 148	BUFFER_TRACE(bh, "submit commit block");
 149	lock_buffer(bh);
 150	clear_buffer_dirty(bh);
 151	set_buffer_uptodate(bh);
 152	bh->b_end_io = journal_end_buffer_io_sync;
 153
 154	if (journal->j_flags & JBD2_BARRIER &&
 155	    !jbd2_has_feature_async_commit(journal))
 156		ret = submit_bh(REQ_OP_WRITE,
 157			REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
 158	else
 159		ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
 160
 161	*cbh = bh;
 162	return ret;
 163}
 164
 165/*
 166 * This function along with journal_submit_commit_record
 167 * allows to write the commit record asynchronously.
 168 */
 169static int journal_wait_on_commit_record(journal_t *journal,
 170					 struct buffer_head *bh)
 171{
 172	int ret = 0;
 173
 174	clear_buffer_dirty(bh);
 175	wait_on_buffer(bh);
 176
 177	if (unlikely(!buffer_uptodate(bh)))
 178		ret = -EIO;
 179	put_bh(bh);            /* One for getblk() */
 180
 181	return ret;
 182}
 183
 184/*
 185 * write the filemap data using writepage() address_space_operations.
 186 * We don't do block allocation here even for delalloc. We don't
 187 * use writepages() because with delayed allocation we may be doing
 188 * block allocation in writepages().
 189 */
 190int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
 191{
 192	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
 193	struct writeback_control wbc = {
 194		.sync_mode =  WB_SYNC_ALL,
 195		.nr_to_write = mapping->nrpages * 2,
 196		.range_start = jinode->i_dirty_start,
 197		.range_end = jinode->i_dirty_end,
 198	};
 199
 200	/*
 201	 * submit the inode data buffers. We use writepage
 202	 * instead of writepages. Because writepages can do
 203	 * block allocation with delalloc. We need to write
 204	 * only allocated blocks here.
 205	 */
 206	return generic_writepages(mapping, &wbc);
 207}
 208
 209/* Send all the data buffers related to an inode */
 210int jbd2_submit_inode_data(struct jbd2_inode *jinode)
 211{
 212
 213	if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
 214		return 0;
 215
 216	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 217	return jbd2_journal_submit_inode_data_buffers(jinode);
 218
 219}
 220EXPORT_SYMBOL(jbd2_submit_inode_data);
 221
 222int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
 223{
 224	if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
 225		!jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
 226		return 0;
 227	return filemap_fdatawait_range_keep_errors(
 228		jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
 229		jinode->i_dirty_end);
 230}
 231EXPORT_SYMBOL(jbd2_wait_inode_data);
 232
 233/*
 234 * Submit all the data buffers of inode associated with the transaction to
 235 * disk.
 236 *
 237 * We are in a committing transaction. Therefore no new inode can be added to
 238 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 239 * operate on from being released while we write out pages.
 240 */
 241static int journal_submit_data_buffers(journal_t *journal,
 242		transaction_t *commit_transaction)
 243{
 244	struct jbd2_inode *jinode;
 245	int err, ret = 0;
 246
 247	spin_lock(&journal->j_list_lock);
 248	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 249		if (!(jinode->i_flags & JI_WRITE_DATA))
 250			continue;
 251		jinode->i_flags |= JI_COMMIT_RUNNING;
 252		spin_unlock(&journal->j_list_lock);
 253		/* submit the inode data buffers. */
 254		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 255		if (journal->j_submit_inode_data_buffers) {
 256			err = journal->j_submit_inode_data_buffers(jinode);
 257			if (!ret)
 258				ret = err;
 259		}
 260		spin_lock(&journal->j_list_lock);
 261		J_ASSERT(jinode->i_transaction == commit_transaction);
 262		jinode->i_flags &= ~JI_COMMIT_RUNNING;
 263		smp_mb();
 264		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 265	}
 266	spin_unlock(&journal->j_list_lock);
 267	return ret;
 268}
 269
 270int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
 271{
 272	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
 273
 274	return filemap_fdatawait_range_keep_errors(mapping,
 275						   jinode->i_dirty_start,
 276						   jinode->i_dirty_end);
 277}
 278
 279/*
 280 * Wait for data submitted for writeout, refile inodes to proper
 281 * transaction if needed.
 282 *
 283 */
 284static int journal_finish_inode_data_buffers(journal_t *journal,
 285		transaction_t *commit_transaction)
 286{
 287	struct jbd2_inode *jinode, *next_i;
 288	int err, ret = 0;
 289
 290	/* For locking, see the comment in journal_submit_data_buffers() */
 291	spin_lock(&journal->j_list_lock);
 292	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 293		if (!(jinode->i_flags & JI_WAIT_DATA))
 294			continue;
 295		jinode->i_flags |= JI_COMMIT_RUNNING;
 296		spin_unlock(&journal->j_list_lock);
 297		/* wait for the inode data buffers writeout. */
 298		if (journal->j_finish_inode_data_buffers) {
 299			err = journal->j_finish_inode_data_buffers(jinode);
 300			if (!ret)
 301				ret = err;
 302		}
 303		spin_lock(&journal->j_list_lock);
 304		jinode->i_flags &= ~JI_COMMIT_RUNNING;
 305		smp_mb();
 306		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 307	}
 308
 309	/* Now refile inode to proper lists */
 310	list_for_each_entry_safe(jinode, next_i,
 311				 &commit_transaction->t_inode_list, i_list) {
 312		list_del(&jinode->i_list);
 313		if (jinode->i_next_transaction) {
 314			jinode->i_transaction = jinode->i_next_transaction;
 315			jinode->i_next_transaction = NULL;
 316			list_add(&jinode->i_list,
 317				&jinode->i_transaction->t_inode_list);
 318		} else {
 319			jinode->i_transaction = NULL;
 320			jinode->i_dirty_start = 0;
 321			jinode->i_dirty_end = 0;
 322		}
 323	}
 324	spin_unlock(&journal->j_list_lock);
 325
 326	return ret;
 327}
 328
 329static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 330{
 331	struct page *page = bh->b_page;
 332	char *addr;
 333	__u32 checksum;
 334
 335	addr = kmap_atomic(page);
 336	checksum = crc32_be(crc32_sum,
 337		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 338	kunmap_atomic(addr);
 339
 340	return checksum;
 341}
 342
 343static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
 344				   unsigned long long block)
 345{
 346	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 347	if (jbd2_has_feature_64bit(j))
 348		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 349}
 350
 351static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
 352				    struct buffer_head *bh, __u32 sequence)
 353{
 354	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
 355	struct page *page = bh->b_page;
 356	__u8 *addr;
 357	__u32 csum32;
 358	__be32 seq;
 359
 360	if (!jbd2_journal_has_csum_v2or3(j))
 361		return;
 362
 363	seq = cpu_to_be32(sequence);
 364	addr = kmap_atomic(page);
 365	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
 366	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
 367			     bh->b_size);
 368	kunmap_atomic(addr);
 369
 370	if (jbd2_has_feature_csum3(j))
 371		tag3->t_checksum = cpu_to_be32(csum32);
 372	else
 373		tag->t_checksum = cpu_to_be16(csum32);
 374}
 375/*
 376 * jbd2_journal_commit_transaction
 377 *
 378 * The primary function for committing a transaction to the log.  This
 379 * function is called by the journal thread to begin a complete commit.
 380 */
 381void jbd2_journal_commit_transaction(journal_t *journal)
 382{
 383	struct transaction_stats_s stats;
 384	transaction_t *commit_transaction;
 385	struct journal_head *jh;
 386	struct buffer_head *descriptor;
 387	struct buffer_head **wbuf = journal->j_wbuf;
 388	int bufs;
 389	int flags;
 390	int err;
 391	unsigned long long blocknr;
 392	ktime_t start_time;
 393	u64 commit_time;
 394	char *tagp = NULL;
 395	journal_block_tag_t *tag = NULL;
 396	int space_left = 0;
 397	int first_tag = 0;
 398	int tag_flag;
 399	int i;
 400	int tag_bytes = journal_tag_bytes(journal);
 401	struct buffer_head *cbh = NULL; /* For transactional checksums */
 402	__u32 crc32_sum = ~0;
 403	struct blk_plug plug;
 404	/* Tail of the journal */
 405	unsigned long first_block;
 406	tid_t first_tid;
 407	int update_tail;
 408	int csum_size = 0;
 409	LIST_HEAD(io_bufs);
 410	LIST_HEAD(log_bufs);
 411
 412	if (jbd2_journal_has_csum_v2or3(journal))
 413		csum_size = sizeof(struct jbd2_journal_block_tail);
 414
 415	/*
 416	 * First job: lock down the current transaction and wait for
 417	 * all outstanding updates to complete.
 418	 */
 419
 420	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 421	if (journal->j_flags & JBD2_FLUSHED) {
 422		jbd_debug(3, "super block updated\n");
 423		mutex_lock_io(&journal->j_checkpoint_mutex);
 424		/*
 425		 * We hold j_checkpoint_mutex so tail cannot change under us.
 426		 * We don't need any special data guarantees for writing sb
 427		 * since journal is empty and it is ok for write to be
 428		 * flushed only with transaction commit.
 429		 */
 430		jbd2_journal_update_sb_log_tail(journal,
 431						journal->j_tail_sequence,
 432						journal->j_tail,
 433						REQ_SYNC);
 434		mutex_unlock(&journal->j_checkpoint_mutex);
 435	} else {
 436		jbd_debug(3, "superblock not updated\n");
 437	}
 438
 439	J_ASSERT(journal->j_running_transaction != NULL);
 440	J_ASSERT(journal->j_committing_transaction == NULL);
 441
 442	write_lock(&journal->j_state_lock);
 443	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
 444	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
 445		DEFINE_WAIT(wait);
 446
 447		prepare_to_wait(&journal->j_fc_wait, &wait,
 448				TASK_UNINTERRUPTIBLE);
 449		write_unlock(&journal->j_state_lock);
 450		schedule();
 451		write_lock(&journal->j_state_lock);
 452		finish_wait(&journal->j_fc_wait, &wait);
 453		/*
 454		 * TODO: by blocking fast commits here, we are increasing
 455		 * fsync() latency slightly. Strictly speaking, we don't need
 456		 * to block fast commits until the transaction enters T_FLUSH
 457		 * state. So an optimization is possible where we block new fast
 458		 * commits here and wait for existing ones to complete
 459		 * just before we enter T_FLUSH. That way, the existing fast
 460		 * commits and this full commit can proceed parallely.
 461		 */
 462	}
 463	write_unlock(&journal->j_state_lock);
 464
 465	commit_transaction = journal->j_running_transaction;
 466
 467	trace_jbd2_start_commit(journal, commit_transaction);
 468	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
 469			commit_transaction->t_tid);
 470
 471	write_lock(&journal->j_state_lock);
 472	journal->j_fc_off = 0;
 473	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 474	commit_transaction->t_state = T_LOCKED;
 475
 476	trace_jbd2_commit_locking(journal, commit_transaction);
 477	stats.run.rs_wait = commit_transaction->t_max_wait;
 478	stats.run.rs_request_delay = 0;
 479	stats.run.rs_locked = jiffies;
 480	if (commit_transaction->t_requested)
 481		stats.run.rs_request_delay =
 482			jbd2_time_diff(commit_transaction->t_requested,
 483				       stats.run.rs_locked);
 484	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 485					      stats.run.rs_locked);
 486
 487	// waits for any t_updates to finish
 488	jbd2_journal_wait_updates(journal);
 489
 490	commit_transaction->t_state = T_SWITCH;
 491	write_unlock(&journal->j_state_lock);
 492
 493	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 494			journal->j_max_transaction_buffers);
 495
 496	/*
 497	 * First thing we are allowed to do is to discard any remaining
 498	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 499	 * that there are no such buffers: if a large filesystem
 500	 * operation like a truncate needs to split itself over multiple
 501	 * transactions, then it may try to do a jbd2_journal_restart() while
 502	 * there are still BJ_Reserved buffers outstanding.  These must
 503	 * be released cleanly from the current transaction.
 504	 *
 505	 * In this case, the filesystem must still reserve write access
 506	 * again before modifying the buffer in the new transaction, but
 507	 * we do not require it to remember exactly which old buffers it
 508	 * has reserved.  This is consistent with the existing behaviour
 509	 * that multiple jbd2_journal_get_write_access() calls to the same
 510	 * buffer are perfectly permissible.
 511	 */
 512	while (commit_transaction->t_reserved_list) {
 513		jh = commit_transaction->t_reserved_list;
 514		JBUFFER_TRACE(jh, "reserved, unused: refile");
 515		/*
 516		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 517		 * leave undo-committed data.
 518		 */
 519		if (jh->b_committed_data) {
 520			struct buffer_head *bh = jh2bh(jh);
 521
 522			spin_lock(&jh->b_state_lock);
 523			jbd2_free(jh->b_committed_data, bh->b_size);
 524			jh->b_committed_data = NULL;
 525			spin_unlock(&jh->b_state_lock);
 526		}
 527		jbd2_journal_refile_buffer(journal, jh);
 528	}
 529
 530	/*
 531	 * Now try to drop any written-back buffers from the journal's
 532	 * checkpoint lists.  We do this *before* commit because it potentially
 533	 * frees some memory
 534	 */
 535	spin_lock(&journal->j_list_lock);
 536	__jbd2_journal_clean_checkpoint_list(journal, false);
 537	spin_unlock(&journal->j_list_lock);
 538
 539	jbd_debug(3, "JBD2: commit phase 1\n");
 540
 541	/*
 542	 * Clear revoked flag to reflect there is no revoked buffers
 543	 * in the next transaction which is going to be started.
 544	 */
 545	jbd2_clear_buffer_revoked_flags(journal);
 546
 547	/*
 548	 * Switch to a new revoke table.
 549	 */
 550	jbd2_journal_switch_revoke_table(journal);
 551
 552	/*
 553	 * Reserved credits cannot be claimed anymore, free them
 554	 */
 555	atomic_sub(atomic_read(&journal->j_reserved_credits),
 556		   &commit_transaction->t_outstanding_credits);
 557
 558	write_lock(&journal->j_state_lock);
 559	trace_jbd2_commit_flushing(journal, commit_transaction);
 560	stats.run.rs_flushing = jiffies;
 561	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
 562					     stats.run.rs_flushing);
 563
 564	commit_transaction->t_state = T_FLUSH;
 565	journal->j_committing_transaction = commit_transaction;
 566	journal->j_running_transaction = NULL;
 567	start_time = ktime_get();
 568	commit_transaction->t_log_start = journal->j_head;
 569	wake_up(&journal->j_wait_transaction_locked);
 570	write_unlock(&journal->j_state_lock);
 571
 572	jbd_debug(3, "JBD2: commit phase 2a\n");
 573
 574	/*
 575	 * Now start flushing things to disk, in the order they appear
 576	 * on the transaction lists.  Data blocks go first.
 577	 */
 578	err = journal_submit_data_buffers(journal, commit_transaction);
 579	if (err)
 580		jbd2_journal_abort(journal, err);
 581
 582	blk_start_plug(&plug);
 583	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
 584
 585	jbd_debug(3, "JBD2: commit phase 2b\n");
 586
 587	/*
 588	 * Way to go: we have now written out all of the data for a
 589	 * transaction!  Now comes the tricky part: we need to write out
 590	 * metadata.  Loop over the transaction's entire buffer list:
 591	 */
 592	write_lock(&journal->j_state_lock);
 593	commit_transaction->t_state = T_COMMIT;
 594	write_unlock(&journal->j_state_lock);
 595
 596	trace_jbd2_commit_logging(journal, commit_transaction);
 597	stats.run.rs_logging = jiffies;
 598	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 599					       stats.run.rs_logging);
 600	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
 601	stats.run.rs_blocks_logged = 0;
 602
 603	J_ASSERT(commit_transaction->t_nr_buffers <=
 604		 atomic_read(&commit_transaction->t_outstanding_credits));
 605
 606	err = 0;
 607	bufs = 0;
 608	descriptor = NULL;
 609	while (commit_transaction->t_buffers) {
 610
 611		/* Find the next buffer to be journaled... */
 612
 613		jh = commit_transaction->t_buffers;
 614
 615		/* If we're in abort mode, we just un-journal the buffer and
 616		   release it. */
 617
 618		if (is_journal_aborted(journal)) {
 619			clear_buffer_jbddirty(jh2bh(jh));
 620			JBUFFER_TRACE(jh, "journal is aborting: refile");
 621			jbd2_buffer_abort_trigger(jh,
 622						  jh->b_frozen_data ?
 623						  jh->b_frozen_triggers :
 624						  jh->b_triggers);
 625			jbd2_journal_refile_buffer(journal, jh);
 626			/* If that was the last one, we need to clean up
 627			 * any descriptor buffers which may have been
 628			 * already allocated, even if we are now
 629			 * aborting. */
 630			if (!commit_transaction->t_buffers)
 631				goto start_journal_io;
 632			continue;
 633		}
 634
 635		/* Make sure we have a descriptor block in which to
 636		   record the metadata buffer. */
 637
 638		if (!descriptor) {
 639			J_ASSERT (bufs == 0);
 640
 641			jbd_debug(4, "JBD2: get descriptor\n");
 642
 643			descriptor = jbd2_journal_get_descriptor_buffer(
 644							commit_transaction,
 645							JBD2_DESCRIPTOR_BLOCK);
 646			if (!descriptor) {
 647				jbd2_journal_abort(journal, -EIO);
 648				continue;
 649			}
 650
 651			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
 652				(unsigned long long)descriptor->b_blocknr,
 653				descriptor->b_data);
 654			tagp = &descriptor->b_data[sizeof(journal_header_t)];
 655			space_left = descriptor->b_size -
 656						sizeof(journal_header_t);
 657			first_tag = 1;
 658			set_buffer_jwrite(descriptor);
 659			set_buffer_dirty(descriptor);
 660			wbuf[bufs++] = descriptor;
 661
 662			/* Record it so that we can wait for IO
 663                           completion later */
 664			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
 665			jbd2_file_log_bh(&log_bufs, descriptor);
 666		}
 667
 668		/* Where is the buffer to be written? */
 669
 670		err = jbd2_journal_next_log_block(journal, &blocknr);
 671		/* If the block mapping failed, just abandon the buffer
 672		   and repeat this loop: we'll fall into the
 673		   refile-on-abort condition above. */
 674		if (err) {
 675			jbd2_journal_abort(journal, err);
 676			continue;
 677		}
 678
 679		/*
 680		 * start_this_handle() uses t_outstanding_credits to determine
 681		 * the free space in the log.
 682		 */
 683		atomic_dec(&commit_transaction->t_outstanding_credits);
 684
 685		/* Bump b_count to prevent truncate from stumbling over
 686                   the shadowed buffer!  @@@ This can go if we ever get
 687                   rid of the shadow pairing of buffers. */
 688		atomic_inc(&jh2bh(jh)->b_count);
 689
 690		/*
 691		 * Make a temporary IO buffer with which to write it out
 692		 * (this will requeue the metadata buffer to BJ_Shadow).
 693		 */
 694		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 695		JBUFFER_TRACE(jh, "ph3: write metadata");
 696		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 697						jh, &wbuf[bufs], blocknr);
 698		if (flags < 0) {
 699			jbd2_journal_abort(journal, flags);
 700			continue;
 701		}
 702		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
 703
 704		/* Record the new block's tag in the current descriptor
 705                   buffer */
 706
 707		tag_flag = 0;
 708		if (flags & 1)
 709			tag_flag |= JBD2_FLAG_ESCAPE;
 710		if (!first_tag)
 711			tag_flag |= JBD2_FLAG_SAME_UUID;
 712
 713		tag = (journal_block_tag_t *) tagp;
 714		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
 715		tag->t_flags = cpu_to_be16(tag_flag);
 716		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
 717					commit_transaction->t_tid);
 718		tagp += tag_bytes;
 719		space_left -= tag_bytes;
 720		bufs++;
 721
 722		if (first_tag) {
 723			memcpy (tagp, journal->j_uuid, 16);
 724			tagp += 16;
 725			space_left -= 16;
 726			first_tag = 0;
 727		}
 728
 729		/* If there's no more to do, or if the descriptor is full,
 730		   let the IO rip! */
 731
 732		if (bufs == journal->j_wbufsize ||
 733		    commit_transaction->t_buffers == NULL ||
 734		    space_left < tag_bytes + 16 + csum_size) {
 735
 736			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
 737
 738			/* Write an end-of-descriptor marker before
 739                           submitting the IOs.  "tag" still points to
 740                           the last tag we set up. */
 741
 742			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
 743start_journal_io:
 744			if (descriptor)
 745				jbd2_descriptor_block_csum_set(journal,
 746							descriptor);
 747
 748			for (i = 0; i < bufs; i++) {
 749				struct buffer_head *bh = wbuf[i];
 750				/*
 751				 * Compute checksum.
 752				 */
 753				if (jbd2_has_feature_checksum(journal)) {
 754					crc32_sum =
 755					    jbd2_checksum_data(crc32_sum, bh);
 756				}
 757
 758				lock_buffer(bh);
 759				clear_buffer_dirty(bh);
 760				set_buffer_uptodate(bh);
 761				bh->b_end_io = journal_end_buffer_io_sync;
 762				submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
 763			}
 764			cond_resched();
 765
 766			/* Force a new descriptor to be generated next
 767                           time round the loop. */
 768			descriptor = NULL;
 769			bufs = 0;
 770		}
 771	}
 772
 773	err = journal_finish_inode_data_buffers(journal, commit_transaction);
 774	if (err) {
 775		printk(KERN_WARNING
 776			"JBD2: Detected IO errors while flushing file data "
 777		       "on %s\n", journal->j_devname);
 778		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 779			jbd2_journal_abort(journal, err);
 780		err = 0;
 781	}
 782
 783	/*
 784	 * Get current oldest transaction in the log before we issue flush
 785	 * to the filesystem device. After the flush we can be sure that
 786	 * blocks of all older transactions are checkpointed to persistent
 787	 * storage and we will be safe to update journal start in the
 788	 * superblock with the numbers we get here.
 789	 */
 790	update_tail =
 791		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
 792
 793	write_lock(&journal->j_state_lock);
 794	if (update_tail) {
 795		long freed = first_block - journal->j_tail;
 796
 797		if (first_block < journal->j_tail)
 798			freed += journal->j_last - journal->j_first;
 799		/* Update tail only if we free significant amount of space */
 800		if (freed < jbd2_journal_get_max_txn_bufs(journal))
 801			update_tail = 0;
 802	}
 803	J_ASSERT(commit_transaction->t_state == T_COMMIT);
 804	commit_transaction->t_state = T_COMMIT_DFLUSH;
 805	write_unlock(&journal->j_state_lock);
 806
 807	/*
 808	 * If the journal is not located on the file system device,
 809	 * then we must flush the file system device before we issue
 810	 * the commit record
 811	 */
 812	if (commit_transaction->t_need_data_flush &&
 813	    (journal->j_fs_dev != journal->j_dev) &&
 814	    (journal->j_flags & JBD2_BARRIER))
 815		blkdev_issue_flush(journal->j_fs_dev);
 816
 817	/* Done it all: now write the commit record asynchronously. */
 818	if (jbd2_has_feature_async_commit(journal)) {
 819		err = journal_submit_commit_record(journal, commit_transaction,
 820						 &cbh, crc32_sum);
 821		if (err)
 822			jbd2_journal_abort(journal, err);
 823	}
 824
 825	blk_finish_plug(&plug);
 826
 827	/* Lo and behold: we have just managed to send a transaction to
 828           the log.  Before we can commit it, wait for the IO so far to
 829           complete.  Control buffers being written are on the
 830           transaction's t_log_list queue, and metadata buffers are on
 831           the io_bufs list.
 832
 833	   Wait for the buffers in reverse order.  That way we are
 834	   less likely to be woken up until all IOs have completed, and
 835	   so we incur less scheduling load.
 836	*/
 837
 838	jbd_debug(3, "JBD2: commit phase 3\n");
 839
 840	while (!list_empty(&io_bufs)) {
 841		struct buffer_head *bh = list_entry(io_bufs.prev,
 842						    struct buffer_head,
 843						    b_assoc_buffers);
 844
 845		wait_on_buffer(bh);
 846		cond_resched();
 847
 848		if (unlikely(!buffer_uptodate(bh)))
 849			err = -EIO;
 850		jbd2_unfile_log_bh(bh);
 851		stats.run.rs_blocks_logged++;
 852
 853		/*
 854		 * The list contains temporary buffer heads created by
 855		 * jbd2_journal_write_metadata_buffer().
 856		 */
 857		BUFFER_TRACE(bh, "dumping temporary bh");
 858		__brelse(bh);
 859		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 860		free_buffer_head(bh);
 861
 862		/* We also have to refile the corresponding shadowed buffer */
 863		jh = commit_transaction->t_shadow_list->b_tprev;
 864		bh = jh2bh(jh);
 865		clear_buffer_jwrite(bh);
 866		J_ASSERT_BH(bh, buffer_jbddirty(bh));
 867		J_ASSERT_BH(bh, !buffer_shadow(bh));
 868
 869		/* The metadata is now released for reuse, but we need
 870                   to remember it against this transaction so that when
 871                   we finally commit, we can do any checkpointing
 872                   required. */
 873		JBUFFER_TRACE(jh, "file as BJ_Forget");
 874		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 875		JBUFFER_TRACE(jh, "brelse shadowed buffer");
 876		__brelse(bh);
 877	}
 878
 879	J_ASSERT (commit_transaction->t_shadow_list == NULL);
 880
 881	jbd_debug(3, "JBD2: commit phase 4\n");
 882
 883	/* Here we wait for the revoke record and descriptor record buffers */
 884	while (!list_empty(&log_bufs)) {
 885		struct buffer_head *bh;
 886
 887		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
 888		wait_on_buffer(bh);
 889		cond_resched();
 890
 891		if (unlikely(!buffer_uptodate(bh)))
 892			err = -EIO;
 893
 894		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 895		clear_buffer_jwrite(bh);
 896		jbd2_unfile_log_bh(bh);
 897		stats.run.rs_blocks_logged++;
 898		__brelse(bh);		/* One for getblk */
 899		/* AKPM: bforget here */
 900	}
 901
 902	if (err)
 903		jbd2_journal_abort(journal, err);
 904
 905	jbd_debug(3, "JBD2: commit phase 5\n");
 906	write_lock(&journal->j_state_lock);
 907	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 908	commit_transaction->t_state = T_COMMIT_JFLUSH;
 909	write_unlock(&journal->j_state_lock);
 910
 911	if (!jbd2_has_feature_async_commit(journal)) {
 912		err = journal_submit_commit_record(journal, commit_transaction,
 913						&cbh, crc32_sum);
 914		if (err)
 915			jbd2_journal_abort(journal, err);
 916	}
 917	if (cbh)
 918		err = journal_wait_on_commit_record(journal, cbh);
 919	stats.run.rs_blocks_logged++;
 920	if (jbd2_has_feature_async_commit(journal) &&
 921	    journal->j_flags & JBD2_BARRIER) {
 922		blkdev_issue_flush(journal->j_dev);
 923	}
 924
 925	if (err)
 926		jbd2_journal_abort(journal, err);
 927
 928	WARN_ON_ONCE(
 929		atomic_read(&commit_transaction->t_outstanding_credits) < 0);
 930
 931	/*
 932	 * Now disk caches for filesystem device are flushed so we are safe to
 933	 * erase checkpointed transactions from the log by updating journal
 934	 * superblock.
 935	 */
 936	if (update_tail)
 937		jbd2_update_log_tail(journal, first_tid, first_block);
 938
 939	/* End of a transaction!  Finally, we can do checkpoint
 940           processing: any buffers committed as a result of this
 941           transaction can be removed from any checkpoint list it was on
 942           before. */
 943
 944	jbd_debug(3, "JBD2: commit phase 6\n");
 945
 946	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 947	J_ASSERT(commit_transaction->t_buffers == NULL);
 948	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 949	J_ASSERT(commit_transaction->t_shadow_list == NULL);
 950
 951restart_loop:
 952	/*
 953	 * As there are other places (journal_unmap_buffer()) adding buffers
 954	 * to this list we have to be careful and hold the j_list_lock.
 955	 */
 956	spin_lock(&journal->j_list_lock);
 957	while (commit_transaction->t_forget) {
 958		transaction_t *cp_transaction;
 959		struct buffer_head *bh;
 960		int try_to_free = 0;
 961		bool drop_ref;
 962
 963		jh = commit_transaction->t_forget;
 964		spin_unlock(&journal->j_list_lock);
 965		bh = jh2bh(jh);
 966		/*
 967		 * Get a reference so that bh cannot be freed before we are
 968		 * done with it.
 969		 */
 970		get_bh(bh);
 971		spin_lock(&jh->b_state_lock);
 972		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
 973
 974		/*
 975		 * If there is undo-protected committed data against
 976		 * this buffer, then we can remove it now.  If it is a
 977		 * buffer needing such protection, the old frozen_data
 978		 * field now points to a committed version of the
 979		 * buffer, so rotate that field to the new committed
 980		 * data.
 981		 *
 982		 * Otherwise, we can just throw away the frozen data now.
 983		 *
 984		 * We also know that the frozen data has already fired
 985		 * its triggers if they exist, so we can clear that too.
 986		 */
 987		if (jh->b_committed_data) {
 988			jbd2_free(jh->b_committed_data, bh->b_size);
 989			jh->b_committed_data = NULL;
 990			if (jh->b_frozen_data) {
 991				jh->b_committed_data = jh->b_frozen_data;
 992				jh->b_frozen_data = NULL;
 993				jh->b_frozen_triggers = NULL;
 994			}
 995		} else if (jh->b_frozen_data) {
 996			jbd2_free(jh->b_frozen_data, bh->b_size);
 997			jh->b_frozen_data = NULL;
 998			jh->b_frozen_triggers = NULL;
 999		}
1000
1001		spin_lock(&journal->j_list_lock);
1002		cp_transaction = jh->b_cp_transaction;
1003		if (cp_transaction) {
1004			JBUFFER_TRACE(jh, "remove from old cp transaction");
1005			cp_transaction->t_chp_stats.cs_dropped++;
1006			__jbd2_journal_remove_checkpoint(jh);
1007		}
1008
1009		/* Only re-checkpoint the buffer_head if it is marked
1010		 * dirty.  If the buffer was added to the BJ_Forget list
1011		 * by jbd2_journal_forget, it may no longer be dirty and
1012		 * there's no point in keeping a checkpoint record for
1013		 * it. */
1014
1015		/*
1016		 * A buffer which has been freed while still being journaled
1017		 * by a previous transaction, refile the buffer to BJ_Forget of
1018		 * the running transaction. If the just committed transaction
1019		 * contains "add to orphan" operation, we can completely
1020			 * invalidate the buffer now. We are rather thorough in that
1021		 * since the buffer may be still accessible when blocksize <
1022		 * pagesize and it is attached to the last partial page.
1023		 */
1024		if (buffer_freed(bh) && !jh->b_next_transaction) {
1025			struct address_space *mapping;
1026
1027			clear_buffer_freed(bh);
1028			clear_buffer_jbddirty(bh);
1029
1030			/*
1031			 * Block device buffers need to stay mapped all the
1032			 * time, so it is enough to clear buffer_jbddirty and
1033			 * buffer_freed bits. For the file mapping buffers (i.e.
1034			 * journalled data) we need to unmap buffer and clear
1035			 * more bits. We also need to be careful about the check
1036			 * because the data page mapping can get cleared under
1037			 * our hands. Note that if mapping == NULL, we don't
1038			 * need to make buffer unmapped because the page is
1039			 * already detached from the mapping and buffers cannot
1040			 * get reused.
1041			 */
1042			mapping = READ_ONCE(bh->b_page->mapping);
1043			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
1044				clear_buffer_mapped(bh);
1045				clear_buffer_new(bh);
1046				clear_buffer_req(bh);
1047				bh->b_bdev = NULL;
1048			}
1049		}
1050
1051		if (buffer_jbddirty(bh)) {
1052			JBUFFER_TRACE(jh, "add to new checkpointing trans");
1053			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
1054			if (is_journal_aborted(journal))
1055				clear_buffer_jbddirty(bh);
1056		} else {
1057			J_ASSERT_BH(bh, !buffer_dirty(bh));
1058			/*
1059			 * The buffer on BJ_Forget list and not jbddirty means
1060			 * it has been freed by this transaction and hence it
1061			 * could not have been reallocated until this
1062			 * transaction has committed. *BUT* it could be
1063			 * reallocated once we have written all the data to
1064			 * disk and before we process the buffer on BJ_Forget
1065			 * list.
1066			 */
1067			if (!jh->b_next_transaction)
1068				try_to_free = 1;
1069		}
1070		JBUFFER_TRACE(jh, "refile or unfile buffer");
1071		drop_ref = __jbd2_journal_refile_buffer(jh);
1072		spin_unlock(&jh->b_state_lock);
1073		if (drop_ref)
1074			jbd2_journal_put_journal_head(jh);
1075		if (try_to_free)
1076			release_buffer_page(bh);	/* Drops bh reference */
1077		else
1078			__brelse(bh);
1079		cond_resched_lock(&journal->j_list_lock);
1080	}
1081	spin_unlock(&journal->j_list_lock);
1082	/*
1083	 * This is a bit sleazy.  We use j_list_lock to protect transition
1084	 * of a transaction into T_FINISHED state and calling
1085	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1086	 * other checkpointing code processing the transaction...
1087	 */
1088	write_lock(&journal->j_state_lock);
1089	spin_lock(&journal->j_list_lock);
1090	/*
1091	 * Now recheck if some buffers did not get attached to the transaction
1092	 * while the lock was dropped...
1093	 */
1094	if (commit_transaction->t_forget) {
1095		spin_unlock(&journal->j_list_lock);
1096		write_unlock(&journal->j_state_lock);
1097		goto restart_loop;
1098	}
1099
1100	/* Add the transaction to the checkpoint list.
1101	 * __jbd2_journal_remove_checkpoint() cannot destroy the transaction
1102	 * under us because it is not marked as T_FINISHED yet. */
1103	if (journal->j_checkpoint_transactions == NULL) {
1104		journal->j_checkpoint_transactions = commit_transaction;
1105		commit_transaction->t_cpnext = commit_transaction;
1106		commit_transaction->t_cpprev = commit_transaction;
1107	} else {
1108		commit_transaction->t_cpnext =
1109			journal->j_checkpoint_transactions;
1110		commit_transaction->t_cpprev =
1111			commit_transaction->t_cpnext->t_cpprev;
1112		commit_transaction->t_cpnext->t_cpprev =
1113			commit_transaction;
1114		commit_transaction->t_cpprev->t_cpnext =
1115				commit_transaction;
1116	}
1117	spin_unlock(&journal->j_list_lock);
1118
1119	/* Done with this transaction! */
1120
1121	jbd_debug(3, "JBD2: commit phase 7\n");
1122
1123	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1124
1125	commit_transaction->t_start = jiffies;
1126	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1127					      commit_transaction->t_start);
1128
1129	/*
1130	 * File the transaction statistics
1131	 */
1132	stats.ts_tid = commit_transaction->t_tid;
1133	stats.run.rs_handle_count =
1134		atomic_read(&commit_transaction->t_handle_count);
1135	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1136			     commit_transaction->t_tid, &stats.run);
1137	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1138
1139	commit_transaction->t_state = T_COMMIT_CALLBACK;
1140	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1141	journal->j_commit_sequence = commit_transaction->t_tid;
1142	journal->j_committing_transaction = NULL;
1143	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1144
1145	/*
1146	 * weight the running average higher than the latest commit time so
1147	 * we don't react too strongly to vast changes in the commit time
1148	 */
1149	if (likely(journal->j_average_commit_time))
1150		journal->j_average_commit_time = (commit_time +
1151				journal->j_average_commit_time*3) / 4;
1152	else
1153		journal->j_average_commit_time = commit_time;
1154
1155	write_unlock(&journal->j_state_lock);
1156
1157	if (journal->j_commit_callback)
1158		journal->j_commit_callback(journal, commit_transaction);
1159	if (journal->j_fc_cleanup_callback)
1160		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
1161
1162	trace_jbd2_end_commit(journal, commit_transaction);
1163	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1164		  journal->j_commit_sequence, journal->j_tail_sequence);
1165
1166	write_lock(&journal->j_state_lock);
1167	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
1168	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
1169	spin_lock(&journal->j_list_lock);
1170	commit_transaction->t_state = T_FINISHED;
1171	/* Check if the transaction can be dropped now that we are finished */
1172	if (commit_transaction->t_checkpoint_list == NULL &&
1173	    commit_transaction->t_checkpoint_io_list == NULL) {
1174		__jbd2_journal_drop_transaction(journal, commit_transaction);
1175		jbd2_journal_free_transaction(commit_transaction);
1176	}
1177	spin_unlock(&journal->j_list_lock);
1178	write_unlock(&journal->j_state_lock);
1179	wake_up(&journal->j_wait_done_commit);
1180	wake_up(&journal->j_fc_wait);
1181
1182	/*
1183	 * Calculate overall stats
1184	 */
1185	spin_lock(&journal->j_history_lock);
1186	journal->j_stats.ts_tid++;
1187	journal->j_stats.ts_requested += stats.ts_requested;
1188	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1189	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1190	journal->j_stats.run.rs_running += stats.run.rs_running;
1191	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1192	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1193	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1194	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1195	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1196	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1197	spin_unlock(&journal->j_history_lock);
1198}