fs/jbd2/commit.c at v2.6.26 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / jbd2 / commit.c
at v2.6.26 1065 lines 32 kB view raw
   1/*
   2 * linux/fs/jbd2/commit.c
   3 *
   4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5 *
   6 * Copyright 1998 Red Hat corp --- All Rights Reserved
   7 *
   8 * This file is part of the Linux kernel and is made available under
   9 * the terms of the GNU General Public License, version 2, or at your
  10 * option, any later version, incorporated herein by reference.
  11 *
  12 * Journal commit routines for the generic filesystem journaling code;
  13 * part of the ext2fs journaling system.
  14 */
  15
  16#include <linux/time.h>
  17#include <linux/fs.h>
  18#include <linux/jbd2.h>
  19#include <linux/errno.h>
  20#include <linux/slab.h>
  21#include <linux/mm.h>
  22#include <linux/pagemap.h>
  23#include <linux/jiffies.h>
  24#include <linux/crc32.h>
  25
  26/*
  27 * Default IO end handler for temporary BJ_IO buffer_heads.
  28 */
  29static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  30{
  31	BUFFER_TRACE(bh, "");
  32	if (uptodate)
  33		set_buffer_uptodate(bh);
  34	else
  35		clear_buffer_uptodate(bh);
  36	unlock_buffer(bh);
  37}
  38
  39/*
  40 * When an ext3-ordered file is truncated, it is possible that many pages are
  41 * not sucessfully freed, because they are attached to a committing transaction.
  42 * After the transaction commits, these pages are left on the LRU, with no
  43 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  44 * by the VM, but their apparent absence upsets the VM accounting, and it makes
  45 * the numbers in /proc/meminfo look odd.
  46 *
  47 * So here, we have a buffer which has just come off the forget list.  Look to
  48 * see if we can strip all buffers from the backing page.
  49 *
  50 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  51 * caller provided us with a ref against the buffer, and we drop that here.
  52 */
  53static void release_buffer_page(struct buffer_head *bh)
  54{
  55	struct page *page;
  56
  57	if (buffer_dirty(bh))
  58		goto nope;
  59	if (atomic_read(&bh->b_count) != 1)
  60		goto nope;
  61	page = bh->b_page;
  62	if (!page)
  63		goto nope;
  64	if (page->mapping)
  65		goto nope;
  66
  67	/* OK, it's a truncated page */
  68	if (TestSetPageLocked(page))
  69		goto nope;
  70
  71	page_cache_get(page);
  72	__brelse(bh);
  73	try_to_free_buffers(page);
  74	unlock_page(page);
  75	page_cache_release(page);
  76	return;
  77
  78nope:
  79	__brelse(bh);
  80}
  81
  82/*
  83 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
  84 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
  85 * return 0.  j_list_lock is dropped in this case.
  86 */
  87static int inverted_lock(journal_t *journal, struct buffer_head *bh)
  88{
  89	if (!jbd_trylock_bh_state(bh)) {
  90		spin_unlock(&journal->j_list_lock);
  91		schedule();
  92		return 0;
  93	}
  94	return 1;
  95}
  96
  97/*
  98 * Done it all: now submit the commit record.  We should have
  99 * cleaned up our previous buffers by now, so if we are in abort
 100 * mode we can now just skip the rest of the journal write
 101 * entirely.
 102 *
 103 * Returns 1 if the journal needs to be aborted or 0 on success
 104 */
 105static int journal_submit_commit_record(journal_t *journal,
 106					transaction_t *commit_transaction,
 107					struct buffer_head **cbh,
 108					__u32 crc32_sum)
 109{
 110	struct journal_head *descriptor;
 111	struct commit_header *tmp;
 112	struct buffer_head *bh;
 113	int ret;
 114	int barrier_done = 0;
 115
 116	if (is_journal_aborted(journal))
 117		return 0;
 118
 119	descriptor = jbd2_journal_get_descriptor_buffer(journal);
 120	if (!descriptor)
 121		return 1;
 122
 123	bh = jh2bh(descriptor);
 124
 125	tmp = (struct commit_header *)bh->b_data;
 126	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 127	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 128	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 129
 130	if (JBD2_HAS_COMPAT_FEATURE(journal,
 131				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
 132		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
 133		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
 134		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
 135	}
 136
 137	JBUFFER_TRACE(descriptor, "submit commit block");
 138	lock_buffer(bh);
 139	get_bh(bh);
 140	set_buffer_dirty(bh);
 141	set_buffer_uptodate(bh);
 142	bh->b_end_io = journal_end_buffer_io_sync;
 143
 144	if (journal->j_flags & JBD2_BARRIER &&
 145		!JBD2_HAS_INCOMPAT_FEATURE(journal,
 146					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 147		set_buffer_ordered(bh);
 148		barrier_done = 1;
 149	}
 150	ret = submit_bh(WRITE, bh);
 151	if (barrier_done)
 152		clear_buffer_ordered(bh);
 153
 154	/* is it possible for another commit to fail at roughly
 155	 * the same time as this one?  If so, we don't want to
 156	 * trust the barrier flag in the super, but instead want
 157	 * to remember if we sent a barrier request
 158	 */
 159	if (ret == -EOPNOTSUPP && barrier_done) {
 160		char b[BDEVNAME_SIZE];
 161
 162		printk(KERN_WARNING
 163			"JBD: barrier-based sync failed on %s - "
 164			"disabling barriers\n",
 165			bdevname(journal->j_dev, b));
 166		spin_lock(&journal->j_state_lock);
 167		journal->j_flags &= ~JBD2_BARRIER;
 168		spin_unlock(&journal->j_state_lock);
 169
 170		/* And try again, without the barrier */
 171		lock_buffer(bh);
 172		set_buffer_uptodate(bh);
 173		set_buffer_dirty(bh);
 174		ret = submit_bh(WRITE, bh);
 175	}
 176	*cbh = bh;
 177	return ret;
 178}
 179
 180/*
 181 * This function along with journal_submit_commit_record
 182 * allows to write the commit record asynchronously.
 183 */
 184static int journal_wait_on_commit_record(struct buffer_head *bh)
 185{
 186	int ret = 0;
 187
 188	clear_buffer_dirty(bh);
 189	wait_on_buffer(bh);
 190
 191	if (unlikely(!buffer_uptodate(bh)))
 192		ret = -EIO;
 193	put_bh(bh);            /* One for getblk() */
 194	jbd2_journal_put_journal_head(bh2jh(bh));
 195
 196	return ret;
 197}
 198
 199/*
 200 * Wait for all submitted IO to complete.
 201 */
 202static int journal_wait_on_locked_list(journal_t *journal,
 203				       transaction_t *commit_transaction)
 204{
 205	int ret = 0;
 206	struct journal_head *jh;
 207
 208	while (commit_transaction->t_locked_list) {
 209		struct buffer_head *bh;
 210
 211		jh = commit_transaction->t_locked_list->b_tprev;
 212		bh = jh2bh(jh);
 213		get_bh(bh);
 214		if (buffer_locked(bh)) {
 215			spin_unlock(&journal->j_list_lock);
 216			wait_on_buffer(bh);
 217			if (unlikely(!buffer_uptodate(bh)))
 218				ret = -EIO;
 219			spin_lock(&journal->j_list_lock);
 220		}
 221		if (!inverted_lock(journal, bh)) {
 222			put_bh(bh);
 223			spin_lock(&journal->j_list_lock);
 224			continue;
 225		}
 226		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
 227			__jbd2_journal_unfile_buffer(jh);
 228			jbd_unlock_bh_state(bh);
 229			jbd2_journal_remove_journal_head(bh);
 230			put_bh(bh);
 231		} else {
 232			jbd_unlock_bh_state(bh);
 233		}
 234		put_bh(bh);
 235		cond_resched_lock(&journal->j_list_lock);
 236	}
 237	return ret;
 238  }
 239
 240static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 241{
 242	int i;
 243
 244	for (i = 0; i < bufs; i++) {
 245		wbuf[i]->b_end_io = end_buffer_write_sync;
 246		/* We use-up our safety reference in submit_bh() */
 247		submit_bh(WRITE, wbuf[i]);
 248	}
 249}
 250
 251/*
 252 *  Submit all the data buffers to disk
 253 */
 254static void journal_submit_data_buffers(journal_t *journal,
 255				transaction_t *commit_transaction)
 256{
 257	struct journal_head *jh;
 258	struct buffer_head *bh;
 259	int locked;
 260	int bufs = 0;
 261	struct buffer_head **wbuf = journal->j_wbuf;
 262
 263	/*
 264	 * Whenever we unlock the journal and sleep, things can get added
 265	 * onto ->t_sync_datalist, so we have to keep looping back to
 266	 * write_out_data until we *know* that the list is empty.
 267	 *
 268	 * Cleanup any flushed data buffers from the data list.  Even in
 269	 * abort mode, we want to flush this out as soon as possible.
 270	 */
 271write_out_data:
 272	cond_resched();
 273	spin_lock(&journal->j_list_lock);
 274
 275	while (commit_transaction->t_sync_datalist) {
 276		jh = commit_transaction->t_sync_datalist;
 277		bh = jh2bh(jh);
 278		locked = 0;
 279
 280		/* Get reference just to make sure buffer does not disappear
 281		 * when we are forced to drop various locks */
 282		get_bh(bh);
 283		/* If the buffer is dirty, we need to submit IO and hence
 284		 * we need the buffer lock. We try to lock the buffer without
 285		 * blocking. If we fail, we need to drop j_list_lock and do
 286		 * blocking lock_buffer().
 287		 */
 288		if (buffer_dirty(bh)) {
 289			if (test_set_buffer_locked(bh)) {
 290				BUFFER_TRACE(bh, "needs blocking lock");
 291				spin_unlock(&journal->j_list_lock);
 292				/* Write out all data to prevent deadlocks */
 293				journal_do_submit_data(wbuf, bufs);
 294				bufs = 0;
 295				lock_buffer(bh);
 296				spin_lock(&journal->j_list_lock);
 297			}
 298			locked = 1;
 299		}
 300		/* We have to get bh_state lock. Again out of order, sigh. */
 301		if (!inverted_lock(journal, bh)) {
 302			jbd_lock_bh_state(bh);
 303			spin_lock(&journal->j_list_lock);
 304		}
 305		/* Someone already cleaned up the buffer? */
 306		if (!buffer_jbd(bh)
 307			|| jh->b_transaction != commit_transaction
 308			|| jh->b_jlist != BJ_SyncData) {
 309			jbd_unlock_bh_state(bh);
 310			if (locked)
 311				unlock_buffer(bh);
 312			BUFFER_TRACE(bh, "already cleaned up");
 313			put_bh(bh);
 314			continue;
 315		}
 316		if (locked && test_clear_buffer_dirty(bh)) {
 317			BUFFER_TRACE(bh, "needs writeout, adding to array");
 318			wbuf[bufs++] = bh;
 319			__jbd2_journal_file_buffer(jh, commit_transaction,
 320						BJ_Locked);
 321			jbd_unlock_bh_state(bh);
 322			if (bufs == journal->j_wbufsize) {
 323				spin_unlock(&journal->j_list_lock);
 324				journal_do_submit_data(wbuf, bufs);
 325				bufs = 0;
 326				goto write_out_data;
 327			}
 328		} else if (!locked && buffer_locked(bh)) {
 329			__jbd2_journal_file_buffer(jh, commit_transaction,
 330						BJ_Locked);
 331			jbd_unlock_bh_state(bh);
 332			put_bh(bh);
 333		} else {
 334			BUFFER_TRACE(bh, "writeout complete: unfile");
 335			__jbd2_journal_unfile_buffer(jh);
 336			jbd_unlock_bh_state(bh);
 337			if (locked)
 338				unlock_buffer(bh);
 339			jbd2_journal_remove_journal_head(bh);
 340			/* Once for our safety reference, once for
 341			 * jbd2_journal_remove_journal_head() */
 342			put_bh(bh);
 343			put_bh(bh);
 344		}
 345
 346		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
 347			spin_unlock(&journal->j_list_lock);
 348			goto write_out_data;
 349		}
 350	}
 351	spin_unlock(&journal->j_list_lock);
 352	journal_do_submit_data(wbuf, bufs);
 353}
 354
 355static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 356{
 357	struct page *page = bh->b_page;
 358	char *addr;
 359	__u32 checksum;
 360
 361	addr = kmap_atomic(page, KM_USER0);
 362	checksum = crc32_be(crc32_sum,
 363		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 364	kunmap_atomic(addr, KM_USER0);
 365
 366	return checksum;
 367}
 368
 369static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 370				   unsigned long long block)
 371{
 372	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 373	if (tag_bytes > JBD2_TAG_SIZE32)
 374		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 375}
 376
 377/*
 378 * jbd2_journal_commit_transaction
 379 *
 380 * The primary function for committing a transaction to the log.  This
 381 * function is called by the journal thread to begin a complete commit.
 382 */
 383void jbd2_journal_commit_transaction(journal_t *journal)
 384{
 385	struct transaction_stats_s stats;
 386	transaction_t *commit_transaction;
 387	struct journal_head *jh, *new_jh, *descriptor;
 388	struct buffer_head **wbuf = journal->j_wbuf;
 389	int bufs;
 390	int flags;
 391	int err;
 392	unsigned long long blocknr;
 393	char *tagp = NULL;
 394	journal_header_t *header;
 395	journal_block_tag_t *tag = NULL;
 396	int space_left = 0;
 397	int first_tag = 0;
 398	int tag_flag;
 399	int i;
 400	int tag_bytes = journal_tag_bytes(journal);
 401	struct buffer_head *cbh = NULL; /* For transactional checksums */
 402	__u32 crc32_sum = ~0;
 403
 404	/*
 405	 * First job: lock down the current transaction and wait for
 406	 * all outstanding updates to complete.
 407	 */
 408
 409#ifdef COMMIT_STATS
 410	spin_lock(&journal->j_list_lock);
 411	summarise_journal_usage(journal);
 412	spin_unlock(&journal->j_list_lock);
 413#endif
 414
 415	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 416	if (journal->j_flags & JBD2_FLUSHED) {
 417		jbd_debug(3, "super block updated\n");
 418		jbd2_journal_update_superblock(journal, 1);
 419	} else {
 420		jbd_debug(3, "superblock not updated\n");
 421	}
 422
 423	J_ASSERT(journal->j_running_transaction != NULL);
 424	J_ASSERT(journal->j_committing_transaction == NULL);
 425
 426	commit_transaction = journal->j_running_transaction;
 427	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 428
 429	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 430			commit_transaction->t_tid);
 431
 432	spin_lock(&journal->j_state_lock);
 433	commit_transaction->t_state = T_LOCKED;
 434
 435	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 436	stats.u.run.rs_locked = jiffies;
 437	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 438						stats.u.run.rs_locked);
 439
 440	spin_lock(&commit_transaction->t_handle_lock);
 441	while (commit_transaction->t_updates) {
 442		DEFINE_WAIT(wait);
 443
 444		prepare_to_wait(&journal->j_wait_updates, &wait,
 445					TASK_UNINTERRUPTIBLE);
 446		if (commit_transaction->t_updates) {
 447			spin_unlock(&commit_transaction->t_handle_lock);
 448			spin_unlock(&journal->j_state_lock);
 449			schedule();
 450			spin_lock(&journal->j_state_lock);
 451			spin_lock(&commit_transaction->t_handle_lock);
 452		}
 453		finish_wait(&journal->j_wait_updates, &wait);
 454	}
 455	spin_unlock(&commit_transaction->t_handle_lock);
 456
 457	J_ASSERT (commit_transaction->t_outstanding_credits <=
 458			journal->j_max_transaction_buffers);
 459
 460	/*
 461	 * First thing we are allowed to do is to discard any remaining
 462	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 463	 * that there are no such buffers: if a large filesystem
 464	 * operation like a truncate needs to split itself over multiple
 465	 * transactions, then it may try to do a jbd2_journal_restart() while
 466	 * there are still BJ_Reserved buffers outstanding.  These must
 467	 * be released cleanly from the current transaction.
 468	 *
 469	 * In this case, the filesystem must still reserve write access
 470	 * again before modifying the buffer in the new transaction, but
 471	 * we do not require it to remember exactly which old buffers it
 472	 * has reserved.  This is consistent with the existing behaviour
 473	 * that multiple jbd2_journal_get_write_access() calls to the same
 474	 * buffer are perfectly permissable.
 475	 */
 476	while (commit_transaction->t_reserved_list) {
 477		jh = commit_transaction->t_reserved_list;
 478		JBUFFER_TRACE(jh, "reserved, unused: refile");
 479		/*
 480		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 481		 * leave undo-committed data.
 482		 */
 483		if (jh->b_committed_data) {
 484			struct buffer_head *bh = jh2bh(jh);
 485
 486			jbd_lock_bh_state(bh);
 487			jbd2_free(jh->b_committed_data, bh->b_size);
 488			jh->b_committed_data = NULL;
 489			jbd_unlock_bh_state(bh);
 490		}
 491		jbd2_journal_refile_buffer(journal, jh);
 492	}
 493
 494	/*
 495	 * Now try to drop any written-back buffers from the journal's
 496	 * checkpoint lists.  We do this *before* commit because it potentially
 497	 * frees some memory
 498	 */
 499	spin_lock(&journal->j_list_lock);
 500	__jbd2_journal_clean_checkpoint_list(journal);
 501	spin_unlock(&journal->j_list_lock);
 502
 503	jbd_debug (3, "JBD: commit phase 1\n");
 504
 505	/*
 506	 * Switch to a new revoke table.
 507	 */
 508	jbd2_journal_switch_revoke_table(journal);
 509
 510	stats.u.run.rs_flushing = jiffies;
 511	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
 512					       stats.u.run.rs_flushing);
 513
 514	commit_transaction->t_state = T_FLUSH;
 515	journal->j_committing_transaction = commit_transaction;
 516	journal->j_running_transaction = NULL;
 517	commit_transaction->t_log_start = journal->j_head;
 518	wake_up(&journal->j_wait_transaction_locked);
 519	spin_unlock(&journal->j_state_lock);
 520
 521	jbd_debug (3, "JBD: commit phase 2\n");
 522
 523	/*
 524	 * Now start flushing things to disk, in the order they appear
 525	 * on the transaction lists.  Data blocks go first.
 526	 */
 527	err = 0;
 528	journal_submit_data_buffers(journal, commit_transaction);
 529
 530	/*
 531	 * Wait for all previously submitted IO to complete if commit
 532	 * record is to be written synchronously.
 533	 */
 534	spin_lock(&journal->j_list_lock);
 535	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 536		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 537		err = journal_wait_on_locked_list(journal,
 538						commit_transaction);
 539
 540	spin_unlock(&journal->j_list_lock);
 541
 542	if (err)
 543		jbd2_journal_abort(journal, err);
 544
 545	jbd2_journal_write_revoke_records(journal, commit_transaction);
 546
 547	jbd_debug(3, "JBD: commit phase 2\n");
 548
 549	/*
 550	 * If we found any dirty or locked buffers, then we should have
 551	 * looped back up to the write_out_data label.  If there weren't
 552	 * any then journal_clean_data_list should have wiped the list
 553	 * clean by now, so check that it is in fact empty.
 554	 */
 555	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
 556
 557	jbd_debug (3, "JBD: commit phase 3\n");
 558
 559	/*
 560	 * Way to go: we have now written out all of the data for a
 561	 * transaction!  Now comes the tricky part: we need to write out
 562	 * metadata.  Loop over the transaction's entire buffer list:
 563	 */
 564	spin_lock(&journal->j_state_lock);
 565	commit_transaction->t_state = T_COMMIT;
 566	spin_unlock(&journal->j_state_lock);
 567
 568	stats.u.run.rs_logging = jiffies;
 569	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
 570						 stats.u.run.rs_logging);
 571	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
 572	stats.u.run.rs_blocks_logged = 0;
 573
 574	J_ASSERT(commit_transaction->t_nr_buffers <=
 575		 commit_transaction->t_outstanding_credits);
 576
 577	descriptor = NULL;
 578	bufs = 0;
 579	while (commit_transaction->t_buffers) {
 580
 581		/* Find the next buffer to be journaled... */
 582
 583		jh = commit_transaction->t_buffers;
 584
 585		/* If we're in abort mode, we just un-journal the buffer and
 586		   release it for background writing. */
 587
 588		if (is_journal_aborted(journal)) {
 589			JBUFFER_TRACE(jh, "journal is aborting: refile");
 590			jbd2_journal_refile_buffer(journal, jh);
 591			/* If that was the last one, we need to clean up
 592			 * any descriptor buffers which may have been
 593			 * already allocated, even if we are now
 594			 * aborting. */
 595			if (!commit_transaction->t_buffers)
 596				goto start_journal_io;
 597			continue;
 598		}
 599
 600		/* Make sure we have a descriptor block in which to
 601		   record the metadata buffer. */
 602
 603		if (!descriptor) {
 604			struct buffer_head *bh;
 605
 606			J_ASSERT (bufs == 0);
 607
 608			jbd_debug(4, "JBD: get descriptor\n");
 609
 610			descriptor = jbd2_journal_get_descriptor_buffer(journal);
 611			if (!descriptor) {
 612				jbd2_journal_abort(journal, -EIO);
 613				continue;
 614			}
 615
 616			bh = jh2bh(descriptor);
 617			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 618				(unsigned long long)bh->b_blocknr, bh->b_data);
 619			header = (journal_header_t *)&bh->b_data[0];
 620			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 621			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 622			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 623
 624			tagp = &bh->b_data[sizeof(journal_header_t)];
 625			space_left = bh->b_size - sizeof(journal_header_t);
 626			first_tag = 1;
 627			set_buffer_jwrite(bh);
 628			set_buffer_dirty(bh);
 629			wbuf[bufs++] = bh;
 630
 631			/* Record it so that we can wait for IO
 632                           completion later */
 633			BUFFER_TRACE(bh, "ph3: file as descriptor");
 634			jbd2_journal_file_buffer(descriptor, commit_transaction,
 635					BJ_LogCtl);
 636		}
 637
 638		/* Where is the buffer to be written? */
 639
 640		err = jbd2_journal_next_log_block(journal, &blocknr);
 641		/* If the block mapping failed, just abandon the buffer
 642		   and repeat this loop: we'll fall into the
 643		   refile-on-abort condition above. */
 644		if (err) {
 645			jbd2_journal_abort(journal, err);
 646			continue;
 647		}
 648
 649		/*
 650		 * start_this_handle() uses t_outstanding_credits to determine
 651		 * the free space in the log, but this counter is changed
 652		 * by jbd2_journal_next_log_block() also.
 653		 */
 654		commit_transaction->t_outstanding_credits--;
 655
 656		/* Bump b_count to prevent truncate from stumbling over
 657                   the shadowed buffer!  @@@ This can go if we ever get
 658                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 659		atomic_inc(&jh2bh(jh)->b_count);
 660
 661		/* Make a temporary IO buffer with which to write it out
 662                   (this will requeue both the metadata buffer and the
 663                   temporary IO buffer). new_bh goes on BJ_IO*/
 664
 665		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 666		/*
 667		 * akpm: jbd2_journal_write_metadata_buffer() sets
 668		 * new_bh->b_transaction to commit_transaction.
 669		 * We need to clean this up before we release new_bh
 670		 * (which is of type BJ_IO)
 671		 */
 672		JBUFFER_TRACE(jh, "ph3: write metadata");
 673		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 674						      jh, &new_jh, blocknr);
 675		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 676		wbuf[bufs++] = jh2bh(new_jh);
 677
 678		/* Record the new block's tag in the current descriptor
 679                   buffer */
 680
 681		tag_flag = 0;
 682		if (flags & 1)
 683			tag_flag |= JBD2_FLAG_ESCAPE;
 684		if (!first_tag)
 685			tag_flag |= JBD2_FLAG_SAME_UUID;
 686
 687		tag = (journal_block_tag_t *) tagp;
 688		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 689		tag->t_flags = cpu_to_be32(tag_flag);
 690		tagp += tag_bytes;
 691		space_left -= tag_bytes;
 692
 693		if (first_tag) {
 694			memcpy (tagp, journal->j_uuid, 16);
 695			tagp += 16;
 696			space_left -= 16;
 697			first_tag = 0;
 698		}
 699
 700		/* If there's no more to do, or if the descriptor is full,
 701		   let the IO rip! */
 702
 703		if (bufs == journal->j_wbufsize ||
 704		    commit_transaction->t_buffers == NULL ||
 705		    space_left < tag_bytes + 16) {
 706
 707			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 708
 709			/* Write an end-of-descriptor marker before
 710                           submitting the IOs.  "tag" still points to
 711                           the last tag we set up. */
 712
 713			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 714
 715start_journal_io:
 716			for (i = 0; i < bufs; i++) {
 717				struct buffer_head *bh = wbuf[i];
 718				/*
 719				 * Compute checksum.
 720				 */
 721				if (JBD2_HAS_COMPAT_FEATURE(journal,
 722					JBD2_FEATURE_COMPAT_CHECKSUM)) {
 723					crc32_sum =
 724					    jbd2_checksum_data(crc32_sum, bh);
 725				}
 726
 727				lock_buffer(bh);
 728				clear_buffer_dirty(bh);
 729				set_buffer_uptodate(bh);
 730				bh->b_end_io = journal_end_buffer_io_sync;
 731				submit_bh(WRITE, bh);
 732			}
 733			cond_resched();
 734			stats.u.run.rs_blocks_logged += bufs;
 735
 736			/* Force a new descriptor to be generated next
 737                           time round the loop. */
 738			descriptor = NULL;
 739			bufs = 0;
 740		}
 741	}
 742
 743	/* Done it all: now write the commit record asynchronously. */
 744
 745	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 746		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 747		err = journal_submit_commit_record(journal, commit_transaction,
 748						 &cbh, crc32_sum);
 749		if (err)
 750			__jbd2_journal_abort_hard(journal);
 751
 752		spin_lock(&journal->j_list_lock);
 753		err = journal_wait_on_locked_list(journal,
 754						commit_transaction);
 755		spin_unlock(&journal->j_list_lock);
 756		if (err)
 757			__jbd2_journal_abort_hard(journal);
 758	}
 759
 760	/* Lo and behold: we have just managed to send a transaction to
 761           the log.  Before we can commit it, wait for the IO so far to
 762           complete.  Control buffers being written are on the
 763           transaction's t_log_list queue, and metadata buffers are on
 764           the t_iobuf_list queue.
 765
 766	   Wait for the buffers in reverse order.  That way we are
 767	   less likely to be woken up until all IOs have completed, and
 768	   so we incur less scheduling load.
 769	*/
 770
 771	jbd_debug(3, "JBD: commit phase 4\n");
 772
 773	/*
 774	 * akpm: these are BJ_IO, and j_list_lock is not needed.
 775	 * See __journal_try_to_free_buffer.
 776	 */
 777wait_for_iobuf:
 778	while (commit_transaction->t_iobuf_list != NULL) {
 779		struct buffer_head *bh;
 780
 781		jh = commit_transaction->t_iobuf_list->b_tprev;
 782		bh = jh2bh(jh);
 783		if (buffer_locked(bh)) {
 784			wait_on_buffer(bh);
 785			goto wait_for_iobuf;
 786		}
 787		if (cond_resched())
 788			goto wait_for_iobuf;
 789
 790		if (unlikely(!buffer_uptodate(bh)))
 791			err = -EIO;
 792
 793		clear_buffer_jwrite(bh);
 794
 795		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 796		jbd2_journal_unfile_buffer(journal, jh);
 797
 798		/*
 799		 * ->t_iobuf_list should contain only dummy buffer_heads
 800		 * which were created by jbd2_journal_write_metadata_buffer().
 801		 */
 802		BUFFER_TRACE(bh, "dumping temporary bh");
 803		jbd2_journal_put_journal_head(jh);
 804		__brelse(bh);
 805		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 806		free_buffer_head(bh);
 807
 808		/* We also have to unlock and free the corresponding
 809                   shadowed buffer */
 810		jh = commit_transaction->t_shadow_list->b_tprev;
 811		bh = jh2bh(jh);
 812		clear_bit(BH_JWrite, &bh->b_state);
 813		J_ASSERT_BH(bh, buffer_jbddirty(bh));
 814
 815		/* The metadata is now released for reuse, but we need
 816                   to remember it against this transaction so that when
 817                   we finally commit, we can do any checkpointing
 818                   required. */
 819		JBUFFER_TRACE(jh, "file as BJ_Forget");
 820		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 821		/* Wake up any transactions which were waiting for this
 822		   IO to complete */
 823		wake_up_bit(&bh->b_state, BH_Unshadow);
 824		JBUFFER_TRACE(jh, "brelse shadowed buffer");
 825		__brelse(bh);
 826	}
 827
 828	J_ASSERT (commit_transaction->t_shadow_list == NULL);
 829
 830	jbd_debug(3, "JBD: commit phase 5\n");
 831
 832	/* Here we wait for the revoke record and descriptor record buffers */
 833 wait_for_ctlbuf:
 834	while (commit_transaction->t_log_list != NULL) {
 835		struct buffer_head *bh;
 836
 837		jh = commit_transaction->t_log_list->b_tprev;
 838		bh = jh2bh(jh);
 839		if (buffer_locked(bh)) {
 840			wait_on_buffer(bh);
 841			goto wait_for_ctlbuf;
 842		}
 843		if (cond_resched())
 844			goto wait_for_ctlbuf;
 845
 846		if (unlikely(!buffer_uptodate(bh)))
 847			err = -EIO;
 848
 849		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 850		clear_buffer_jwrite(bh);
 851		jbd2_journal_unfile_buffer(journal, jh);
 852		jbd2_journal_put_journal_head(jh);
 853		__brelse(bh);		/* One for getblk */
 854		/* AKPM: bforget here */
 855	}
 856
 857	jbd_debug(3, "JBD: commit phase 6\n");
 858
 859	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 860		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 861		err = journal_submit_commit_record(journal, commit_transaction,
 862						&cbh, crc32_sum);
 863		if (err)
 864			__jbd2_journal_abort_hard(journal);
 865	}
 866	if (!err && !is_journal_aborted(journal))
 867		err = journal_wait_on_commit_record(cbh);
 868
 869	if (err)
 870		jbd2_journal_abort(journal, err);
 871
 872	/* End of a transaction!  Finally, we can do checkpoint
 873           processing: any buffers committed as a result of this
 874           transaction can be removed from any checkpoint list it was on
 875           before. */
 876
 877	jbd_debug(3, "JBD: commit phase 7\n");
 878
 879	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 880	J_ASSERT(commit_transaction->t_buffers == NULL);
 881	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 882	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 883	J_ASSERT(commit_transaction->t_shadow_list == NULL);
 884	J_ASSERT(commit_transaction->t_log_list == NULL);
 885
 886restart_loop:
 887	/*
 888	 * As there are other places (journal_unmap_buffer()) adding buffers
 889	 * to this list we have to be careful and hold the j_list_lock.
 890	 */
 891	spin_lock(&journal->j_list_lock);
 892	while (commit_transaction->t_forget) {
 893		transaction_t *cp_transaction;
 894		struct buffer_head *bh;
 895
 896		jh = commit_transaction->t_forget;
 897		spin_unlock(&journal->j_list_lock);
 898		bh = jh2bh(jh);
 899		jbd_lock_bh_state(bh);
 900		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
 901			jh->b_transaction == journal->j_running_transaction);
 902
 903		/*
 904		 * If there is undo-protected committed data against
 905		 * this buffer, then we can remove it now.  If it is a
 906		 * buffer needing such protection, the old frozen_data
 907		 * field now points to a committed version of the
 908		 * buffer, so rotate that field to the new committed
 909		 * data.
 910		 *
 911		 * Otherwise, we can just throw away the frozen data now.
 912		 */
 913		if (jh->b_committed_data) {
 914			jbd2_free(jh->b_committed_data, bh->b_size);
 915			jh->b_committed_data = NULL;
 916			if (jh->b_frozen_data) {
 917				jh->b_committed_data = jh->b_frozen_data;
 918				jh->b_frozen_data = NULL;
 919			}
 920		} else if (jh->b_frozen_data) {
 921			jbd2_free(jh->b_frozen_data, bh->b_size);
 922			jh->b_frozen_data = NULL;
 923		}
 924
 925		spin_lock(&journal->j_list_lock);
 926		cp_transaction = jh->b_cp_transaction;
 927		if (cp_transaction) {
 928			JBUFFER_TRACE(jh, "remove from old cp transaction");
 929			cp_transaction->t_chp_stats.cs_dropped++;
 930			__jbd2_journal_remove_checkpoint(jh);
 931		}
 932
 933		/* Only re-checkpoint the buffer_head if it is marked
 934		 * dirty.  If the buffer was added to the BJ_Forget list
 935		 * by jbd2_journal_forget, it may no longer be dirty and
 936		 * there's no point in keeping a checkpoint record for
 937		 * it. */
 938
 939		/* A buffer which has been freed while still being
 940		 * journaled by a previous transaction may end up still
 941		 * being dirty here, but we want to avoid writing back
 942		 * that buffer in the future now that the last use has
 943		 * been committed.  That's not only a performance gain,
 944		 * it also stops aliasing problems if the buffer is left
 945		 * behind for writeback and gets reallocated for another
 946		 * use in a different page. */
 947		if (buffer_freed(bh)) {
 948			clear_buffer_freed(bh);
 949			clear_buffer_jbddirty(bh);
 950		}
 951
 952		if (buffer_jbddirty(bh)) {
 953			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 954			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
 955			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 956			__jbd2_journal_refile_buffer(jh);
 957			jbd_unlock_bh_state(bh);
 958		} else {
 959			J_ASSERT_BH(bh, !buffer_dirty(bh));
 960			/* The buffer on BJ_Forget list and not jbddirty means
 961			 * it has been freed by this transaction and hence it
 962			 * could not have been reallocated until this
 963			 * transaction has committed. *BUT* it could be
 964			 * reallocated once we have written all the data to
 965			 * disk and before we process the buffer on BJ_Forget
 966			 * list. */
 967			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 968			__jbd2_journal_refile_buffer(jh);
 969			if (!jh->b_transaction) {
 970				jbd_unlock_bh_state(bh);
 971				 /* needs a brelse */
 972				jbd2_journal_remove_journal_head(bh);
 973				release_buffer_page(bh);
 974			} else
 975				jbd_unlock_bh_state(bh);
 976		}
 977		cond_resched_lock(&journal->j_list_lock);
 978	}
 979	spin_unlock(&journal->j_list_lock);
 980	/*
 981	 * This is a bit sleazy.  We use j_list_lock to protect transition
 982	 * of a transaction into T_FINISHED state and calling
 983	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
 984	 * other checkpointing code processing the transaction...
 985	 */
 986	spin_lock(&journal->j_state_lock);
 987	spin_lock(&journal->j_list_lock);
 988	/*
 989	 * Now recheck if some buffers did not get attached to the transaction
 990	 * while the lock was dropped...
 991	 */
 992	if (commit_transaction->t_forget) {
 993		spin_unlock(&journal->j_list_lock);
 994		spin_unlock(&journal->j_state_lock);
 995		goto restart_loop;
 996	}
 997
 998	/* Done with this transaction! */
 999
1000	jbd_debug(3, "JBD: commit phase 8\n");
1001
1002	J_ASSERT(commit_transaction->t_state == T_COMMIT);
1003
1004	commit_transaction->t_start = jiffies;
1005	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1006						commit_transaction->t_start);
1007
1008	/*
1009	 * File the transaction for history
1010	 */
1011	stats.ts_type = JBD2_STATS_RUN;
1012	stats.ts_tid = commit_transaction->t_tid;
1013	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1014	spin_lock(&journal->j_history_lock);
1015	memcpy(journal->j_history + journal->j_history_cur, &stats,
1016			sizeof(stats));
1017	if (++journal->j_history_cur == journal->j_history_max)
1018		journal->j_history_cur = 0;
1019
1020	/*
1021	 * Calculate overall stats
1022	 */
1023	journal->j_stats.ts_tid++;
1024	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1025	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1026	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1027	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1028	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1029	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1030	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1031	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1032	spin_unlock(&journal->j_history_lock);
1033
1034	commit_transaction->t_state = T_FINISHED;
1035	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1036	journal->j_commit_sequence = commit_transaction->t_tid;
1037	journal->j_committing_transaction = NULL;
1038	spin_unlock(&journal->j_state_lock);
1039
1040	if (commit_transaction->t_checkpoint_list == NULL &&
1041	    commit_transaction->t_checkpoint_io_list == NULL) {
1042		__jbd2_journal_drop_transaction(journal, commit_transaction);
1043	} else {
1044		if (journal->j_checkpoint_transactions == NULL) {
1045			journal->j_checkpoint_transactions = commit_transaction;
1046			commit_transaction->t_cpnext = commit_transaction;
1047			commit_transaction->t_cpprev = commit_transaction;
1048		} else {
1049			commit_transaction->t_cpnext =
1050				journal->j_checkpoint_transactions;
1051			commit_transaction->t_cpprev =
1052				commit_transaction->t_cpnext->t_cpprev;
1053			commit_transaction->t_cpnext->t_cpprev =
1054				commit_transaction;
1055			commit_transaction->t_cpprev->t_cpnext =
1056				commit_transaction;
1057		}
1058	}
1059	spin_unlock(&journal->j_list_lock);
1060
1061	jbd_debug(1, "JBD: commit %d complete, head %d\n",
1062		  journal->j_commit_sequence, journal->j_tail_sequence);
1063
1064	wake_up(&journal->j_wait_done_commit);
1065}