fs/jbd/commit.c at v2.6.22 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / jbd / commit.c
at v2.6.22 914 lines 27 kB view raw
  1/*
  2 * linux/fs/jbd/commit.c
  3 *
  4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  5 *
  6 * Copyright 1998 Red Hat corp --- All Rights Reserved
  7 *
  8 * This file is part of the Linux kernel and is made available under
  9 * the terms of the GNU General Public License, version 2, or at your
 10 * option, any later version, incorporated herein by reference.
 11 *
 12 * Journal commit routines for the generic filesystem journaling code;
 13 * part of the ext2fs journaling system.
 14 */
 15
 16#include <linux/time.h>
 17#include <linux/fs.h>
 18#include <linux/jbd.h>
 19#include <linux/errno.h>
 20#include <linux/slab.h>
 21#include <linux/mm.h>
 22#include <linux/pagemap.h>
 23
 24/*
 25 * Default IO end handler for temporary BJ_IO buffer_heads.
 26 */
 27static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 28{
 29	BUFFER_TRACE(bh, "");
 30	if (uptodate)
 31		set_buffer_uptodate(bh);
 32	else
 33		clear_buffer_uptodate(bh);
 34	unlock_buffer(bh);
 35}
 36
 37/*
 38 * When an ext3-ordered file is truncated, it is possible that many pages are
 39 * not sucessfully freed, because they are attached to a committing transaction.
 40 * After the transaction commits, these pages are left on the LRU, with no
 41 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 42 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 43 * the numbers in /proc/meminfo look odd.
 44 *
 45 * So here, we have a buffer which has just come off the forget list.  Look to
 46 * see if we can strip all buffers from the backing page.
 47 *
 48 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 49 * caller provided us with a ref against the buffer, and we drop that here.
 50 */
 51static void release_buffer_page(struct buffer_head *bh)
 52{
 53	struct page *page;
 54
 55	if (buffer_dirty(bh))
 56		goto nope;
 57	if (atomic_read(&bh->b_count) != 1)
 58		goto nope;
 59	page = bh->b_page;
 60	if (!page)
 61		goto nope;
 62	if (page->mapping)
 63		goto nope;
 64
 65	/* OK, it's a truncated page */
 66	if (TestSetPageLocked(page))
 67		goto nope;
 68
 69	page_cache_get(page);
 70	__brelse(bh);
 71	try_to_free_buffers(page);
 72	unlock_page(page);
 73	page_cache_release(page);
 74	return;
 75
 76nope:
 77	__brelse(bh);
 78}
 79
 80/*
 81 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 82 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 83 * return 0.  j_list_lock is dropped in this case.
 84 */
 85static int inverted_lock(journal_t *journal, struct buffer_head *bh)
 86{
 87	if (!jbd_trylock_bh_state(bh)) {
 88		spin_unlock(&journal->j_list_lock);
 89		schedule();
 90		return 0;
 91	}
 92	return 1;
 93}
 94
 95/* Done it all: now write the commit record.  We should have
 96 * cleaned up our previous buffers by now, so if we are in abort
 97 * mode we can now just skip the rest of the journal write
 98 * entirely.
 99 *
100 * Returns 1 if the journal needs to be aborted or 0 on success
101 */
102static int journal_write_commit_record(journal_t *journal,
103					transaction_t *commit_transaction)
104{
105	struct journal_head *descriptor;
106	struct buffer_head *bh;
107	int i, ret;
108	int barrier_done = 0;
109
110	if (is_journal_aborted(journal))
111		return 0;
112
113	descriptor = journal_get_descriptor_buffer(journal);
114	if (!descriptor)
115		return 1;
116
117	bh = jh2bh(descriptor);
118
119	/* AKPM: buglet - add `i' to tmp! */
120	for (i = 0; i < bh->b_size; i += 512) {
121		journal_header_t *tmp = (journal_header_t*)bh->b_data;
122		tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
123		tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
124		tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
125	}
126
127	JBUFFER_TRACE(descriptor, "write commit block");
128	set_buffer_dirty(bh);
129	if (journal->j_flags & JFS_BARRIER) {
130		set_buffer_ordered(bh);
131		barrier_done = 1;
132	}
133	ret = sync_dirty_buffer(bh);
134	/* is it possible for another commit to fail at roughly
135	 * the same time as this one?  If so, we don't want to
136	 * trust the barrier flag in the super, but instead want
137	 * to remember if we sent a barrier request
138	 */
139	if (ret == -EOPNOTSUPP && barrier_done) {
140		char b[BDEVNAME_SIZE];
141
142		printk(KERN_WARNING
143			"JBD: barrier-based sync failed on %s - "
144			"disabling barriers\n",
145			bdevname(journal->j_dev, b));
146		spin_lock(&journal->j_state_lock);
147		journal->j_flags &= ~JFS_BARRIER;
148		spin_unlock(&journal->j_state_lock);
149
150		/* And try again, without the barrier */
151		clear_buffer_ordered(bh);
152		set_buffer_uptodate(bh);
153		set_buffer_dirty(bh);
154		ret = sync_dirty_buffer(bh);
155	}
156	put_bh(bh);		/* One for getblk() */
157	journal_put_journal_head(descriptor);
158
159	return (ret == -EIO);
160}
161
162static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
163{
164	int i;
165
166	for (i = 0; i < bufs; i++) {
167		wbuf[i]->b_end_io = end_buffer_write_sync;
168		/* We use-up our safety reference in submit_bh() */
169		submit_bh(WRITE, wbuf[i]);
170	}
171}
172
/*
 *  Submit all the data buffers to disk
 *
 * Walks commit_transaction->t_sync_datalist until it is empty.  Each buffer
 * is either batched into journal->j_wbuf for writeout and refiled onto
 * BJ_Locked, refiled onto BJ_Locked because somebody else is already writing
 * it, or unfiled because it needs no IO.  Batches are submitted via
 * journal_do_submit_data() whenever the array fills up, whenever we are
 * about to block on a buffer lock, and once more at the end.
 *
 * Takes and releases journal->j_list_lock internally; it is not held on
 * return.
 */
static void journal_submit_data_buffers(journal_t *journal,
				transaction_t *commit_transaction)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;		/* nonzero while we hold bh's buffer lock */
	int bufs = 0;		/* number of buffers batched in wbuf[] */
	struct buffer_head **wbuf = journal->j_wbuf;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (test_set_buffer_locked(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh.
		 * inverted_lock() drops j_list_lock on failure, so take the
		 * two locks again in the normal order. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer?  The locks were
		 * possibly dropped above, so recheck that it is still ours. */
		if (!buffer_jbd(bh)
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			put_bh(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			/* Still dirty and we own the buffer lock: batch it.
			 * The reference taken above is kept for the IO. */
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				/* Batch array is full: flush and rescan. */
				spin_unlock(&journal->j_list_lock);
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				goto write_out_data;
			}
		} else if (!locked && buffer_locked(bh)) {
			/* Not dirty, but locked by someone else: move it to
			 * BJ_Locked without submitting anything ourselves. */
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			journal_remove_journal_head(bh);
			/* Once for our safety reference, once for
			 * journal_remove_journal_head() */
			put_bh(bh);
			put_bh(bh);
		}

		if (lock_need_resched(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	/* Flush whatever is left in the batch array. */
	journal_do_submit_data(wbuf, bufs);
}
276
/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 *
 * The stages below follow the jbd_debug() "commit phase" markers:
 *   - lock the running transaction (T_LOCKED), wait for t_updates to drain
 *     and release any unused BJ_Reserved buffers;
 *   - phase 1: switch revoke tables and turn the running transaction into
 *     the committing one (T_FLUSH);
 *   - phase 2: clear b_modified on the metadata buffers, submit the data
 *     buffers, wait for them, and write the revoke records;
 *   - phase 3: (T_COMMIT) write the metadata buffers to the log, grouped
 *     behind descriptor blocks;
 *   - phase 4: wait for the temporary IO buffers and refile the shadowed
 *     originals onto BJ_Forget;
 *   - phase 5: wait for the control (descriptor/revoke) buffers;
 *   - phase 6: write the commit record;
 *   - phase 7: walk the forget list, updating checkpoint membership;
 *   - phase 8: mark the transaction T_FINISHED and either drop it or link
 *     it onto journal->j_checkpoint_transactions.
 *
 * On IO errors the journal is aborted via __journal_abort_hard(); nothing
 * is returned to the caller.
 */
void journal_commit_transaction(journal_t *journal)
{
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long blocknr;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior journal_flush? */
	if (journal->j_flags & JFS_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	/*
	 * Sleep on j_wait_updates until t_updates reaches zero.  Both locks
	 * are dropped around schedule() and retaken in the same order.
	 */
	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A journal_get_undo_access()+journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd_slab_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	journal_switch_revoke_table(journal);

	/*
	 * Hand the transaction over from "running" to "committing" and wake
	 * anybody sleeping on j_wait_transaction_locked.
	 */
	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

	/*
	 * First, drop modified flag: all accesses to the buffers
	 * will be tracked for a new transaction only -bzzz
	 */
	spin_lock(&journal->j_list_lock);
	if (commit_transaction->t_buffers) {
		/* t_buffers is circular: walk b_tnext until we are back. */
		new_jh = jh = commit_transaction->t_buffers->b_tnext;
		do {
			J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
					new_jh->b_modified == 0);
			new_jh->b_modified = 0;
			new_jh = new_jh->b_tnext;
		} while (new_jh != jh);
	}
	spin_unlock(&journal->j_list_lock);

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = 0;
	journal_submit_data_buffers(journal, commit_transaction);

	/*
	 * Wait for all previously submitted IO to complete.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			spin_lock(&journal->j_list_lock);
		}
		/* inverted_lock() dropped j_list_lock on failure: retry. */
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			journal_remove_journal_head(bh);
			put_bh(bh);
		} else {
			jbd_unlock_bh_state(bh);
		}
		/* Drop the reference taken by get_bh() above. */
		put_bh(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (err)
		__journal_abort_hard(journal);

	journal_write_revoke_records(journal, commit_transaction);

	/* NOTE(review): this repeats the "commit phase 2" message emitted
	 * earlier; the string is runtime output and is left untouched. */
	jbd_debug(3, "JBD: commit phase 2\n");

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 *
	 * NOTE(review): neither the write_out_data label nor
	 * journal_clean_data_list appears in this function; the comment
	 * looks stale.  The data list is drained by
	 * journal_submit_data_buffers() above.
	 */
	J_ASSERT (commit_transaction->t_sync_datalist == NULL);

	jbd_debug (3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	commit_transaction->t_state = T_COMMIT;

	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it for background writing. */

		if (is_journal_aborted(journal)) {
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				/* Abort; the next pass takes the
				   refile-on-abort branch above. */
				__journal_abort_hard(journal);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			/* Tags start right after the header. */
			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			__journal_abort_hard(journal);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JFS_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JFS_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += sizeof(journal_block_tag_t);
		space_left -= sizeof(journal_block_tag_t);

		/* Only the first tag of a descriptor is followed by the
		   16-byte journal UUID. */
		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < sizeof(journal_block_tag_t) + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

			/* Also entered directly from the abort branch above
			   to flush any buffers already batched in wbuf[]. */
start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		/* After sleeping, restart and re-read the list tail. */
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		journal_unfile_buffer(journal, jh);
		journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	jbd_debug(3, "JBD: commit phase 6\n");

	if (journal_write_commit_record(journal, commit_transaction))
		err = -EIO;

	if (err)
		__journal_abort_hard(journal);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		/* Drop j_list_lock so the bh_state lock can be taken first. */
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd_slab_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd_slab_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			__journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__journal_insert_checkpoint(jh, commit_transaction);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				 /* needs a brelse */
				journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We borrow j_list_lock to protect
	 * journal->j_committing_transaction in __journal_remove_checkpoint.
	 * Really, __journal_remove_checkpoint should be using j_state_lock but
	 * it's a bit hassle to hold that across __journal_remove_checkpoint
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	spin_unlock(&journal->j_state_lock);

	/* j_list_lock is still held from here until the final unlock. */
	if (commit_transaction->t_checkpoint_list == NULL) {
		/* Nothing left to checkpoint: drop the transaction now. */
		__journal_drop_transaction(journal, commit_transaction);
	} else {
		/* Link into the circular t_cpnext/t_cpprev list, just
		 * before the current j_checkpoint_transactions entry. */
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}