fs/jbd2/commit.c at v2.6.26-rc4 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / jbd2 / commit.c
at v2.6.26-rc4 1064 lines 32 kB view raw
   1/*
   2 * linux/fs/jbd2/commit.c
   3 *
   4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5 *
   6 * Copyright 1998 Red Hat corp --- All Rights Reserved
   7 *
   8 * This file is part of the Linux kernel and is made available under
   9 * the terms of the GNU General Public License, version 2, or at your
  10 * option, any later version, incorporated herein by reference.
  11 *
  12 * Journal commit routines for the generic filesystem journaling code;
  13 * part of the ext2fs journaling system.
  14 */
  15
  16#include <linux/time.h>
  17#include <linux/fs.h>
  18#include <linux/jbd2.h>
  19#include <linux/errno.h>
  20#include <linux/slab.h>
  21#include <linux/mm.h>
  22#include <linux/pagemap.h>
  23#include <linux/jiffies.h>
  24#include <linux/crc32.h>
  25
  26/*
  27 * Default IO end handler for temporary BJ_IO buffer_heads.
  28 */
  29static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  30{
  31	BUFFER_TRACE(bh, "");
  32	if (uptodate)
  33		set_buffer_uptodate(bh);
  34	else
  35		clear_buffer_uptodate(bh);
  36	unlock_buffer(bh);
  37}
  38
  39/*
  40 * When an ext3-ordered file is truncated, it is possible that many pages are
  41 * not sucessfully freed, because they are attached to a committing transaction.
  42 * After the transaction commits, these pages are left on the LRU, with no
  43 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  44 * by the VM, but their apparent absence upsets the VM accounting, and it makes
  45 * the numbers in /proc/meminfo look odd.
  46 *
  47 * So here, we have a buffer which has just come off the forget list.  Look to
  48 * see if we can strip all buffers from the backing page.
  49 *
  50 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  51 * caller provided us with a ref against the buffer, and we drop that here.
  52 */
  53static void release_buffer_page(struct buffer_head *bh)
  54{
  55	struct page *page;
  56
  57	if (buffer_dirty(bh))
  58		goto nope;
  59	if (atomic_read(&bh->b_count) != 1)
  60		goto nope;
  61	page = bh->b_page;
  62	if (!page)
  63		goto nope;
  64	if (page->mapping)
  65		goto nope;
  66
  67	/* OK, it's a truncated page */
  68	if (TestSetPageLocked(page))
  69		goto nope;
  70
  71	page_cache_get(page);
  72	__brelse(bh);
  73	try_to_free_buffers(page);
  74	unlock_page(page);
  75	page_cache_release(page);
  76	return;
  77
  78nope:
  79	__brelse(bh);
  80}
  81
  82/*
  83 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
  84 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
  85 * return 0.  j_list_lock is dropped in this case.
  86 */
  87static int inverted_lock(journal_t *journal, struct buffer_head *bh)
  88{
  89	if (!jbd_trylock_bh_state(bh)) {
  90		spin_unlock(&journal->j_list_lock);
  91		schedule();
  92		return 0;
  93	}
  94	return 1;
  95}
  96
  97/*
  98 * Done it all: now submit the commit record.  We should have
  99 * cleaned up our previous buffers by now, so if we are in abort
 100 * mode we can now just skip the rest of the journal write
 101 * entirely.
 102 *
 103 * Returns 1 if the journal needs to be aborted or 0 on success
 104 */
 105static int journal_submit_commit_record(journal_t *journal,
 106					transaction_t *commit_transaction,
 107					struct buffer_head **cbh,
 108					__u32 crc32_sum)
 109{
 110	struct journal_head *descriptor;
 111	struct commit_header *tmp;
 112	struct buffer_head *bh;
 113	int ret;
 114	int barrier_done = 0;
 115
 116	if (is_journal_aborted(journal))
 117		return 0;
 118
 119	descriptor = jbd2_journal_get_descriptor_buffer(journal);
 120	if (!descriptor)
 121		return 1;
 122
 123	bh = jh2bh(descriptor);
 124
 125	tmp = (struct commit_header *)bh->b_data;
 126	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 127	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 128	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 129
 130	if (JBD2_HAS_COMPAT_FEATURE(journal,
 131				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
 132		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
 133		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
 134		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
 135	}
 136
 137	JBUFFER_TRACE(descriptor, "submit commit block");
 138	lock_buffer(bh);
 139	get_bh(bh);
 140	set_buffer_dirty(bh);
 141	set_buffer_uptodate(bh);
 142	bh->b_end_io = journal_end_buffer_io_sync;
 143
 144	if (journal->j_flags & JBD2_BARRIER &&
 145		!JBD2_HAS_INCOMPAT_FEATURE(journal,
 146					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 147		set_buffer_ordered(bh);
 148		barrier_done = 1;
 149	}
 150	ret = submit_bh(WRITE, bh);
 151	if (barrier_done)
 152		clear_buffer_ordered(bh);
 153
 154	/* is it possible for another commit to fail at roughly
 155	 * the same time as this one?  If so, we don't want to
 156	 * trust the barrier flag in the super, but instead want
 157	 * to remember if we sent a barrier request
 158	 */
 159	if (ret == -EOPNOTSUPP && barrier_done) {
 160		char b[BDEVNAME_SIZE];
 161
 162		printk(KERN_WARNING
 163			"JBD: barrier-based sync failed on %s - "
 164			"disabling barriers\n",
 165			bdevname(journal->j_dev, b));
 166		spin_lock(&journal->j_state_lock);
 167		journal->j_flags &= ~JBD2_BARRIER;
 168		spin_unlock(&journal->j_state_lock);
 169
 170		/* And try again, without the barrier */
 171		set_buffer_uptodate(bh);
 172		set_buffer_dirty(bh);
 173		ret = submit_bh(WRITE, bh);
 174	}
 175	*cbh = bh;
 176	return ret;
 177}
 178
 179/*
 180 * This function along with journal_submit_commit_record
 181 * allows to write the commit record asynchronously.
 182 */
 183static int journal_wait_on_commit_record(struct buffer_head *bh)
 184{
 185	int ret = 0;
 186
 187	clear_buffer_dirty(bh);
 188	wait_on_buffer(bh);
 189
 190	if (unlikely(!buffer_uptodate(bh)))
 191		ret = -EIO;
 192	put_bh(bh);            /* One for getblk() */
 193	jbd2_journal_put_journal_head(bh2jh(bh));
 194
 195	return ret;
 196}
 197
 198/*
 199 * Wait for all submitted IO to complete.
 200 */
 201static int journal_wait_on_locked_list(journal_t *journal,
 202				       transaction_t *commit_transaction)
 203{
 204	int ret = 0;
 205	struct journal_head *jh;
 206
 207	while (commit_transaction->t_locked_list) {
 208		struct buffer_head *bh;
 209
 210		jh = commit_transaction->t_locked_list->b_tprev;
 211		bh = jh2bh(jh);
 212		get_bh(bh);
 213		if (buffer_locked(bh)) {
 214			spin_unlock(&journal->j_list_lock);
 215			wait_on_buffer(bh);
 216			if (unlikely(!buffer_uptodate(bh)))
 217				ret = -EIO;
 218			spin_lock(&journal->j_list_lock);
 219		}
 220		if (!inverted_lock(journal, bh)) {
 221			put_bh(bh);
 222			spin_lock(&journal->j_list_lock);
 223			continue;
 224		}
 225		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
 226			__jbd2_journal_unfile_buffer(jh);
 227			jbd_unlock_bh_state(bh);
 228			jbd2_journal_remove_journal_head(bh);
 229			put_bh(bh);
 230		} else {
 231			jbd_unlock_bh_state(bh);
 232		}
 233		put_bh(bh);
 234		cond_resched_lock(&journal->j_list_lock);
 235	}
 236	return ret;
 237  }
 238
 239static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 240{
 241	int i;
 242
 243	for (i = 0; i < bufs; i++) {
 244		wbuf[i]->b_end_io = end_buffer_write_sync;
 245		/* We use-up our safety reference in submit_bh() */
 246		submit_bh(WRITE, wbuf[i]);
 247	}
 248}
 249
/*
 *  Submit all the data buffers to disk
 *
 * Walks commit_transaction->t_sync_datalist until it is empty.  Each
 * buffer found there ends up in one of three places:
 *   - dirty and lockable: queued in journal->j_wbuf for writeout and
 *     refiled onto BJ_Locked;
 *   - not dirty but locked by someone else: refiled onto BJ_Locked;
 *   - otherwise: unfiled from the transaction and its journal_head removed.
 *
 * Queued buffers are handed to journal_do_submit_data() whenever the array
 * fills up, whenever we are about to block on a buffer lock, and once more
 * on the way out.  This function only submits the IO; it does not wait
 * for it.
 *
 * Takes and releases journal->j_list_lock internally; the code below
 * acquires it itself, so the caller is expected not to hold it.
 */
static void journal_submit_data_buffers(journal_t *journal,
				transaction_t *commit_transaction)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;		/* non-zero while we hold bh's buffer lock */
	int bufs = 0;		/* number of buffers currently queued in wbuf */
	struct buffer_head **wbuf = journal->j_wbuf;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (test_set_buffer_locked(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			/*
			 * The trylock failed and inverted_lock() dropped
			 * j_list_lock: take the locks in the blocking order
			 * instead (bh state first, then the list lock).
			 */
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh)
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			put_bh(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			/*
			 * Still dirty and we own the buffer lock: queue it.
			 * The reference taken above and the buffer lock stay
			 * with the buffer for the upcoming submit_bh().
			 */
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__jbd2_journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				/* Array full: flush it and rescan the list. */
				spin_unlock(&journal->j_list_lock);
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				goto write_out_data;
			}
		} else if (!locked && buffer_locked(bh)) {
			/*
			 * Not dirty, but somebody else holds the buffer lock:
			 * park it on BJ_Locked and drop our reference.
			 */
			__jbd2_journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			__jbd2_journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			jbd2_journal_remove_journal_head(bh);
			/* Once for our safety reference, once for
			 * jbd2_journal_remove_journal_head() */
			put_bh(bh);
			put_bh(bh);
		}

		/* Be nice: back off if a reschedule or the lock is wanted. */
		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	/* Submit whatever is still queued from the final pass. */
	journal_do_submit_data(wbuf, bufs);
}
 353
 354static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 355{
 356	struct page *page = bh->b_page;
 357	char *addr;
 358	__u32 checksum;
 359
 360	addr = kmap_atomic(page, KM_USER0);
 361	checksum = crc32_be(crc32_sum,
 362		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 363	kunmap_atomic(addr, KM_USER0);
 364
 365	return checksum;
 366}
 367
 368static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 369				   unsigned long long block)
 370{
 371	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 372	if (tag_bytes > JBD2_TAG_SIZE32)
 373		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 374}
 375
 376/*
 377 * jbd2_journal_commit_transaction
 378 *
 379 * The primary function for committing a transaction to the log.  This
 380 * function is called by the journal thread to begin a complete commit.
 381 */
 382void jbd2_journal_commit_transaction(journal_t *journal)
 383{
 384	struct transaction_stats_s stats;
 385	transaction_t *commit_transaction;
 386	struct journal_head *jh, *new_jh, *descriptor;
 387	struct buffer_head **wbuf = journal->j_wbuf;
 388	int bufs;
 389	int flags;
 390	int err;
 391	unsigned long long blocknr;
 392	char *tagp = NULL;
 393	journal_header_t *header;
 394	journal_block_tag_t *tag = NULL;
 395	int space_left = 0;
 396	int first_tag = 0;
 397	int tag_flag;
 398	int i;
 399	int tag_bytes = journal_tag_bytes(journal);
 400	struct buffer_head *cbh = NULL; /* For transactional checksums */
 401	__u32 crc32_sum = ~0;
 402
 403	/*
 404	 * First job: lock down the current transaction and wait for
 405	 * all outstanding updates to complete.
 406	 */
 407
 408#ifdef COMMIT_STATS
 409	spin_lock(&journal->j_list_lock);
 410	summarise_journal_usage(journal);
 411	spin_unlock(&journal->j_list_lock);
 412#endif
 413
 414	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 415	if (journal->j_flags & JBD2_FLUSHED) {
 416		jbd_debug(3, "super block updated\n");
 417		jbd2_journal_update_superblock(journal, 1);
 418	} else {
 419		jbd_debug(3, "superblock not updated\n");
 420	}
 421
 422	J_ASSERT(journal->j_running_transaction != NULL);
 423	J_ASSERT(journal->j_committing_transaction == NULL);
 424
 425	commit_transaction = journal->j_running_transaction;
 426	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 427
 428	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 429			commit_transaction->t_tid);
 430
 431	spin_lock(&journal->j_state_lock);
 432	commit_transaction->t_state = T_LOCKED;
 433
 434	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 435	stats.u.run.rs_locked = jiffies;
 436	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 437						stats.u.run.rs_locked);
 438
 439	spin_lock(&commit_transaction->t_handle_lock);
 440	while (commit_transaction->t_updates) {
 441		DEFINE_WAIT(wait);
 442
 443		prepare_to_wait(&journal->j_wait_updates, &wait,
 444					TASK_UNINTERRUPTIBLE);
 445		if (commit_transaction->t_updates) {
 446			spin_unlock(&commit_transaction->t_handle_lock);
 447			spin_unlock(&journal->j_state_lock);
 448			schedule();
 449			spin_lock(&journal->j_state_lock);
 450			spin_lock(&commit_transaction->t_handle_lock);
 451		}
 452		finish_wait(&journal->j_wait_updates, &wait);
 453	}
 454	spin_unlock(&commit_transaction->t_handle_lock);
 455
 456	J_ASSERT (commit_transaction->t_outstanding_credits <=
 457			journal->j_max_transaction_buffers);
 458
 459	/*
 460	 * First thing we are allowed to do is to discard any remaining
 461	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 462	 * that there are no such buffers: if a large filesystem
 463	 * operation like a truncate needs to split itself over multiple
 464	 * transactions, then it may try to do a jbd2_journal_restart() while
 465	 * there are still BJ_Reserved buffers outstanding.  These must
 466	 * be released cleanly from the current transaction.
 467	 *
 468	 * In this case, the filesystem must still reserve write access
 469	 * again before modifying the buffer in the new transaction, but
 470	 * we do not require it to remember exactly which old buffers it
 471	 * has reserved.  This is consistent with the existing behaviour
 472	 * that multiple jbd2_journal_get_write_access() calls to the same
 473	 * buffer are perfectly permissable.
 474	 */
 475	while (commit_transaction->t_reserved_list) {
 476		jh = commit_transaction->t_reserved_list;
 477		JBUFFER_TRACE(jh, "reserved, unused: refile");
 478		/*
 479		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 480		 * leave undo-committed data.
 481		 */
 482		if (jh->b_committed_data) {
 483			struct buffer_head *bh = jh2bh(jh);
 484
 485			jbd_lock_bh_state(bh);
 486			jbd2_free(jh->b_committed_data, bh->b_size);
 487			jh->b_committed_data = NULL;
 488			jbd_unlock_bh_state(bh);
 489		}
 490		jbd2_journal_refile_buffer(journal, jh);
 491	}
 492
 493	/*
 494	 * Now try to drop any written-back buffers from the journal's
 495	 * checkpoint lists.  We do this *before* commit because it potentially
 496	 * frees some memory
 497	 */
 498	spin_lock(&journal->j_list_lock);
 499	__jbd2_journal_clean_checkpoint_list(journal);
 500	spin_unlock(&journal->j_list_lock);
 501
 502	jbd_debug (3, "JBD: commit phase 1\n");
 503
 504	/*
 505	 * Switch to a new revoke table.
 506	 */
 507	jbd2_journal_switch_revoke_table(journal);
 508
 509	stats.u.run.rs_flushing = jiffies;
 510	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
 511					       stats.u.run.rs_flushing);
 512
 513	commit_transaction->t_state = T_FLUSH;
 514	journal->j_committing_transaction = commit_transaction;
 515	journal->j_running_transaction = NULL;
 516	commit_transaction->t_log_start = journal->j_head;
 517	wake_up(&journal->j_wait_transaction_locked);
 518	spin_unlock(&journal->j_state_lock);
 519
 520	jbd_debug (3, "JBD: commit phase 2\n");
 521
 522	/*
 523	 * Now start flushing things to disk, in the order they appear
 524	 * on the transaction lists.  Data blocks go first.
 525	 */
 526	err = 0;
 527	journal_submit_data_buffers(journal, commit_transaction);
 528
 529	/*
 530	 * Wait for all previously submitted IO to complete if commit
 531	 * record is to be written synchronously.
 532	 */
 533	spin_lock(&journal->j_list_lock);
 534	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 535		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 536		err = journal_wait_on_locked_list(journal,
 537						commit_transaction);
 538
 539	spin_unlock(&journal->j_list_lock);
 540
 541	if (err)
 542		jbd2_journal_abort(journal, err);
 543
 544	jbd2_journal_write_revoke_records(journal, commit_transaction);
 545
 546	jbd_debug(3, "JBD: commit phase 2\n");
 547
 548	/*
 549	 * If we found any dirty or locked buffers, then we should have
 550	 * looped back up to the write_out_data label.  If there weren't
 551	 * any then journal_clean_data_list should have wiped the list
 552	 * clean by now, so check that it is in fact empty.
 553	 */
 554	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
 555
 556	jbd_debug (3, "JBD: commit phase 3\n");
 557
 558	/*
 559	 * Way to go: we have now written out all of the data for a
 560	 * transaction!  Now comes the tricky part: we need to write out
 561	 * metadata.  Loop over the transaction's entire buffer list:
 562	 */
 563	spin_lock(&journal->j_state_lock);
 564	commit_transaction->t_state = T_COMMIT;
 565	spin_unlock(&journal->j_state_lock);
 566
 567	stats.u.run.rs_logging = jiffies;
 568	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
 569						 stats.u.run.rs_logging);
 570	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
 571	stats.u.run.rs_blocks_logged = 0;
 572
 573	J_ASSERT(commit_transaction->t_nr_buffers <=
 574		 commit_transaction->t_outstanding_credits);
 575
 576	descriptor = NULL;
 577	bufs = 0;
 578	while (commit_transaction->t_buffers) {
 579
 580		/* Find the next buffer to be journaled... */
 581
 582		jh = commit_transaction->t_buffers;
 583
 584		/* If we're in abort mode, we just un-journal the buffer and
 585		   release it for background writing. */
 586
 587		if (is_journal_aborted(journal)) {
 588			JBUFFER_TRACE(jh, "journal is aborting: refile");
 589			jbd2_journal_refile_buffer(journal, jh);
 590			/* If that was the last one, we need to clean up
 591			 * any descriptor buffers which may have been
 592			 * already allocated, even if we are now
 593			 * aborting. */
 594			if (!commit_transaction->t_buffers)
 595				goto start_journal_io;
 596			continue;
 597		}
 598
 599		/* Make sure we have a descriptor block in which to
 600		   record the metadata buffer. */
 601
 602		if (!descriptor) {
 603			struct buffer_head *bh;
 604
 605			J_ASSERT (bufs == 0);
 606
 607			jbd_debug(4, "JBD: get descriptor\n");
 608
 609			descriptor = jbd2_journal_get_descriptor_buffer(journal);
 610			if (!descriptor) {
 611				jbd2_journal_abort(journal, -EIO);
 612				continue;
 613			}
 614
 615			bh = jh2bh(descriptor);
 616			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 617				(unsigned long long)bh->b_blocknr, bh->b_data);
 618			header = (journal_header_t *)&bh->b_data[0];
 619			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 620			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 621			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 622
 623			tagp = &bh->b_data[sizeof(journal_header_t)];
 624			space_left = bh->b_size - sizeof(journal_header_t);
 625			first_tag = 1;
 626			set_buffer_jwrite(bh);
 627			set_buffer_dirty(bh);
 628			wbuf[bufs++] = bh;
 629
 630			/* Record it so that we can wait for IO
 631                           completion later */
 632			BUFFER_TRACE(bh, "ph3: file as descriptor");
 633			jbd2_journal_file_buffer(descriptor, commit_transaction,
 634					BJ_LogCtl);
 635		}
 636
 637		/* Where is the buffer to be written? */
 638
 639		err = jbd2_journal_next_log_block(journal, &blocknr);
 640		/* If the block mapping failed, just abandon the buffer
 641		   and repeat this loop: we'll fall into the
 642		   refile-on-abort condition above. */
 643		if (err) {
 644			jbd2_journal_abort(journal, err);
 645			continue;
 646		}
 647
 648		/*
 649		 * start_this_handle() uses t_outstanding_credits to determine
 650		 * the free space in the log, but this counter is changed
 651		 * by jbd2_journal_next_log_block() also.
 652		 */
 653		commit_transaction->t_outstanding_credits--;
 654
 655		/* Bump b_count to prevent truncate from stumbling over
 656                   the shadowed buffer!  @@@ This can go if we ever get
 657                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 658		atomic_inc(&jh2bh(jh)->b_count);
 659
 660		/* Make a temporary IO buffer with which to write it out
 661                   (this will requeue both the metadata buffer and the
 662                   temporary IO buffer). new_bh goes on BJ_IO*/
 663
 664		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 665		/*
 666		 * akpm: jbd2_journal_write_metadata_buffer() sets
 667		 * new_bh->b_transaction to commit_transaction.
 668		 * We need to clean this up before we release new_bh
 669		 * (which is of type BJ_IO)
 670		 */
 671		JBUFFER_TRACE(jh, "ph3: write metadata");
 672		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 673						      jh, &new_jh, blocknr);
 674		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 675		wbuf[bufs++] = jh2bh(new_jh);
 676
 677		/* Record the new block's tag in the current descriptor
 678                   buffer */
 679
 680		tag_flag = 0;
 681		if (flags & 1)
 682			tag_flag |= JBD2_FLAG_ESCAPE;
 683		if (!first_tag)
 684			tag_flag |= JBD2_FLAG_SAME_UUID;
 685
 686		tag = (journal_block_tag_t *) tagp;
 687		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 688		tag->t_flags = cpu_to_be32(tag_flag);
 689		tagp += tag_bytes;
 690		space_left -= tag_bytes;
 691
 692		if (first_tag) {
 693			memcpy (tagp, journal->j_uuid, 16);
 694			tagp += 16;
 695			space_left -= 16;
 696			first_tag = 0;
 697		}
 698
 699		/* If there's no more to do, or if the descriptor is full,
 700		   let the IO rip! */
 701
 702		if (bufs == journal->j_wbufsize ||
 703		    commit_transaction->t_buffers == NULL ||
 704		    space_left < tag_bytes + 16) {
 705
 706			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 707
 708			/* Write an end-of-descriptor marker before
 709                           submitting the IOs.  "tag" still points to
 710                           the last tag we set up. */
 711
 712			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 713
 714start_journal_io:
 715			for (i = 0; i < bufs; i++) {
 716				struct buffer_head *bh = wbuf[i];
 717				/*
 718				 * Compute checksum.
 719				 */
 720				if (JBD2_HAS_COMPAT_FEATURE(journal,
 721					JBD2_FEATURE_COMPAT_CHECKSUM)) {
 722					crc32_sum =
 723					    jbd2_checksum_data(crc32_sum, bh);
 724				}
 725
 726				lock_buffer(bh);
 727				clear_buffer_dirty(bh);
 728				set_buffer_uptodate(bh);
 729				bh->b_end_io = journal_end_buffer_io_sync;
 730				submit_bh(WRITE, bh);
 731			}
 732			cond_resched();
 733			stats.u.run.rs_blocks_logged += bufs;
 734
 735			/* Force a new descriptor to be generated next
 736                           time round the loop. */
 737			descriptor = NULL;
 738			bufs = 0;
 739		}
 740	}
 741
 742	/* Done it all: now write the commit record asynchronously. */
 743
 744	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 745		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 746		err = journal_submit_commit_record(journal, commit_transaction,
 747						 &cbh, crc32_sum);
 748		if (err)
 749			__jbd2_journal_abort_hard(journal);
 750
 751		spin_lock(&journal->j_list_lock);
 752		err = journal_wait_on_locked_list(journal,
 753						commit_transaction);
 754		spin_unlock(&journal->j_list_lock);
 755		if (err)
 756			__jbd2_journal_abort_hard(journal);
 757	}
 758
 759	/* Lo and behold: we have just managed to send a transaction to
 760           the log.  Before we can commit it, wait for the IO so far to
 761           complete.  Control buffers being written are on the
 762           transaction's t_log_list queue, and metadata buffers are on
 763           the t_iobuf_list queue.
 764
 765	   Wait for the buffers in reverse order.  That way we are
 766	   less likely to be woken up until all IOs have completed, and
 767	   so we incur less scheduling load.
 768	*/
 769
 770	jbd_debug(3, "JBD: commit phase 4\n");
 771
 772	/*
 773	 * akpm: these are BJ_IO, and j_list_lock is not needed.
 774	 * See __journal_try_to_free_buffer.
 775	 */
 776wait_for_iobuf:
 777	while (commit_transaction->t_iobuf_list != NULL) {
 778		struct buffer_head *bh;
 779
 780		jh = commit_transaction->t_iobuf_list->b_tprev;
 781		bh = jh2bh(jh);
 782		if (buffer_locked(bh)) {
 783			wait_on_buffer(bh);
 784			goto wait_for_iobuf;
 785		}
 786		if (cond_resched())
 787			goto wait_for_iobuf;
 788
 789		if (unlikely(!buffer_uptodate(bh)))
 790			err = -EIO;
 791
 792		clear_buffer_jwrite(bh);
 793
 794		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 795		jbd2_journal_unfile_buffer(journal, jh);
 796
 797		/*
 798		 * ->t_iobuf_list should contain only dummy buffer_heads
 799		 * which were created by jbd2_journal_write_metadata_buffer().
 800		 */
 801		BUFFER_TRACE(bh, "dumping temporary bh");
 802		jbd2_journal_put_journal_head(jh);
 803		__brelse(bh);
 804		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 805		free_buffer_head(bh);
 806
 807		/* We also have to unlock and free the corresponding
 808                   shadowed buffer */
 809		jh = commit_transaction->t_shadow_list->b_tprev;
 810		bh = jh2bh(jh);
 811		clear_bit(BH_JWrite, &bh->b_state);
 812		J_ASSERT_BH(bh, buffer_jbddirty(bh));
 813
 814		/* The metadata is now released for reuse, but we need
 815                   to remember it against this transaction so that when
 816                   we finally commit, we can do any checkpointing
 817                   required. */
 818		JBUFFER_TRACE(jh, "file as BJ_Forget");
 819		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 820		/* Wake up any transactions which were waiting for this
 821		   IO to complete */
 822		wake_up_bit(&bh->b_state, BH_Unshadow);
 823		JBUFFER_TRACE(jh, "brelse shadowed buffer");
 824		__brelse(bh);
 825	}
 826
 827	J_ASSERT (commit_transaction->t_shadow_list == NULL);
 828
 829	jbd_debug(3, "JBD: commit phase 5\n");
 830
 831	/* Here we wait for the revoke record and descriptor record buffers */
 832 wait_for_ctlbuf:
 833	while (commit_transaction->t_log_list != NULL) {
 834		struct buffer_head *bh;
 835
 836		jh = commit_transaction->t_log_list->b_tprev;
 837		bh = jh2bh(jh);
 838		if (buffer_locked(bh)) {
 839			wait_on_buffer(bh);
 840			goto wait_for_ctlbuf;
 841		}
 842		if (cond_resched())
 843			goto wait_for_ctlbuf;
 844
 845		if (unlikely(!buffer_uptodate(bh)))
 846			err = -EIO;
 847
 848		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 849		clear_buffer_jwrite(bh);
 850		jbd2_journal_unfile_buffer(journal, jh);
 851		jbd2_journal_put_journal_head(jh);
 852		__brelse(bh);		/* One for getblk */
 853		/* AKPM: bforget here */
 854	}
 855
 856	jbd_debug(3, "JBD: commit phase 6\n");
 857
 858	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 859		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 860		err = journal_submit_commit_record(journal, commit_transaction,
 861						&cbh, crc32_sum);
 862		if (err)
 863			__jbd2_journal_abort_hard(journal);
 864	}
 865	if (!err && !is_journal_aborted(journal))
 866		err = journal_wait_on_commit_record(cbh);
 867
 868	if (err)
 869		jbd2_journal_abort(journal, err);
 870
 871	/* End of a transaction!  Finally, we can do checkpoint
 872           processing: any buffers committed as a result of this
 873           transaction can be removed from any checkpoint list it was on
 874           before. */
 875
 876	jbd_debug(3, "JBD: commit phase 7\n");
 877
 878	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 879	J_ASSERT(commit_transaction->t_buffers == NULL);
 880	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 881	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 882	J_ASSERT(commit_transaction->t_shadow_list == NULL);
 883	J_ASSERT(commit_transaction->t_log_list == NULL);
 884
 885restart_loop:
 886	/*
 887	 * As there are other places (journal_unmap_buffer()) adding buffers
 888	 * to this list we have to be careful and hold the j_list_lock.
 889	 */
 890	spin_lock(&journal->j_list_lock);
 891	while (commit_transaction->t_forget) {
 892		transaction_t *cp_transaction;
 893		struct buffer_head *bh;
 894
 895		jh = commit_transaction->t_forget;
 896		spin_unlock(&journal->j_list_lock);
 897		bh = jh2bh(jh);
 898		jbd_lock_bh_state(bh);
 899		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
 900			jh->b_transaction == journal->j_running_transaction);
 901
 902		/*
 903		 * If there is undo-protected committed data against
 904		 * this buffer, then we can remove it now.  If it is a
 905		 * buffer needing such protection, the old frozen_data
 906		 * field now points to a committed version of the
 907		 * buffer, so rotate that field to the new committed
 908		 * data.
 909		 *
 910		 * Otherwise, we can just throw away the frozen data now.
 911		 */
 912		if (jh->b_committed_data) {
 913			jbd2_free(jh->b_committed_data, bh->b_size);
 914			jh->b_committed_data = NULL;
 915			if (jh->b_frozen_data) {
 916				jh->b_committed_data = jh->b_frozen_data;
 917				jh->b_frozen_data = NULL;
 918			}
 919		} else if (jh->b_frozen_data) {
 920			jbd2_free(jh->b_frozen_data, bh->b_size);
 921			jh->b_frozen_data = NULL;
 922		}
 923
 924		spin_lock(&journal->j_list_lock);
 925		cp_transaction = jh->b_cp_transaction;
 926		if (cp_transaction) {
 927			JBUFFER_TRACE(jh, "remove from old cp transaction");
 928			cp_transaction->t_chp_stats.cs_dropped++;
 929			__jbd2_journal_remove_checkpoint(jh);
 930		}
 931
 932		/* Only re-checkpoint the buffer_head if it is marked
 933		 * dirty.  If the buffer was added to the BJ_Forget list
 934		 * by jbd2_journal_forget, it may no longer be dirty and
 935		 * there's no point in keeping a checkpoint record for
 936		 * it. */
 937
 938		/* A buffer which has been freed while still being
 939		 * journaled by a previous transaction may end up still
 940		 * being dirty here, but we want to avoid writing back
 941		 * that buffer in the future now that the last use has
 942		 * been committed.  That's not only a performance gain,
 943		 * it also stops aliasing problems if the buffer is left
 944		 * behind for writeback and gets reallocated for another
 945		 * use in a different page. */
 946		if (buffer_freed(bh)) {
 947			clear_buffer_freed(bh);
 948			clear_buffer_jbddirty(bh);
 949		}
 950
 951		if (buffer_jbddirty(bh)) {
 952			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 953			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
 954			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 955			__jbd2_journal_refile_buffer(jh);
 956			jbd_unlock_bh_state(bh);
 957		} else {
 958			J_ASSERT_BH(bh, !buffer_dirty(bh));
 959			/* The buffer on BJ_Forget list and not jbddirty means
 960			 * it has been freed by this transaction and hence it
 961			 * could not have been reallocated until this
 962			 * transaction has committed. *BUT* it could be
 963			 * reallocated once we have written all the data to
 964			 * disk and before we process the buffer on BJ_Forget
 965			 * list. */
 966			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 967			__jbd2_journal_refile_buffer(jh);
 968			if (!jh->b_transaction) {
 969				jbd_unlock_bh_state(bh);
 970				 /* needs a brelse */
 971				jbd2_journal_remove_journal_head(bh);
 972				release_buffer_page(bh);
 973			} else
 974				jbd_unlock_bh_state(bh);
 975		}
 976		cond_resched_lock(&journal->j_list_lock);
 977	}
 978	spin_unlock(&journal->j_list_lock);
 979	/*
 980	 * This is a bit sleazy.  We use j_list_lock to protect transition
 981	 * of a transaction into T_FINISHED state and calling
 982	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
 983	 * other checkpointing code processing the transaction...
 984	 */
 985	spin_lock(&journal->j_state_lock);
 986	spin_lock(&journal->j_list_lock);
 987	/*
 988	 * Now recheck whether any buffers got attached to the transaction's
 989	 * forget list while the lock was dropped...
 990	 */
 991	if (commit_transaction->t_forget) {
 992		spin_unlock(&journal->j_list_lock);
 993		spin_unlock(&journal->j_state_lock);
 994		goto restart_loop;
 995	}
 996
 997	/* Done with this transaction! */
 998
 999	jbd_debug(3, "JBD: commit phase 8\n");
1000
1001	J_ASSERT(commit_transaction->t_state == T_COMMIT);
1002
1003	commit_transaction->t_start = jiffies;
1004	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1005						commit_transaction->t_start);
1006
1007	/*
1008	 * File the transaction for history
1009	 */
1010	stats.ts_type = JBD2_STATS_RUN;
1011	stats.ts_tid = commit_transaction->t_tid;
1012	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1013	spin_lock(&journal->j_history_lock);
1014	memcpy(journal->j_history + journal->j_history_cur, &stats,
1015			sizeof(stats));
1016	if (++journal->j_history_cur == journal->j_history_max)
1017		journal->j_history_cur = 0;
1018
1019	/*
1020	 * Calculate overall stats
1021	 */
1022	journal->j_stats.ts_tid++;
1023	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1024	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1025	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1026	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1027	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1028	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1029	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1030	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1031	spin_unlock(&journal->j_history_lock);
1032
1033	commit_transaction->t_state = T_FINISHED;
1034	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1035	journal->j_commit_sequence = commit_transaction->t_tid;
1036	journal->j_committing_transaction = NULL;
1037	spin_unlock(&journal->j_state_lock);
1038
1039	if (commit_transaction->t_checkpoint_list == NULL &&
1040	    commit_transaction->t_checkpoint_io_list == NULL) {
1041		__jbd2_journal_drop_transaction(journal, commit_transaction);
1042	} else {
1043		if (journal->j_checkpoint_transactions == NULL) {
1044			journal->j_checkpoint_transactions = commit_transaction;
1045			commit_transaction->t_cpnext = commit_transaction;
1046			commit_transaction->t_cpprev = commit_transaction;
1047		} else {
1048			commit_transaction->t_cpnext =
1049				journal->j_checkpoint_transactions;
1050			commit_transaction->t_cpprev =
1051				commit_transaction->t_cpnext->t_cpprev;
1052			commit_transaction->t_cpnext->t_cpprev =
1053				commit_transaction;
1054			commit_transaction->t_cpprev->t_cpnext =
1055				commit_transaction;
1056		}
1057	}
1058	spin_unlock(&journal->j_list_lock);
1059
1060	jbd_debug(1, "JBD: commit %d complete, head %d\n",
1061		  journal->j_commit_sequence, journal->j_tail_sequence);
1062
1063	wake_up(&journal->j_wait_done_commit);
1064}