fs/jbd/commit.c at v2.6.16 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / jbd / commit.c
at v2.6.16 856 lines 25 kB view raw
  1/*
  2 * linux/fs/commit.c
  3 *
  4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  5 *
  6 * Copyright 1998 Red Hat corp --- All Rights Reserved
  7 *
  8 * This file is part of the Linux kernel and is made available under
  9 * the terms of the GNU General Public License, version 2, or at your
 10 * option, any later version, incorporated herein by reference.
 11 *
 12 * Journal commit routines for the generic filesystem journaling code;
 13 * part of the ext2fs journaling system.
 14 */
 15
 16#include <linux/time.h>
 17#include <linux/fs.h>
 18#include <linux/jbd.h>
 19#include <linux/errno.h>
 20#include <linux/slab.h>
 21#include <linux/mm.h>
 22#include <linux/pagemap.h>
 23#include <linux/smp_lock.h>
 24
 25/*
 26 * Default IO end handler for temporary BJ_IO buffer_heads.
 27 */
 28static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 29{
 30	BUFFER_TRACE(bh, "");
 31	if (uptodate)
 32		set_buffer_uptodate(bh);
 33	else
 34		clear_buffer_uptodate(bh);
 35	unlock_buffer(bh);
 36}
 37
 38/*
 39 * When an ext3-ordered file is truncated, it is possible that many pages are
 40 * not sucessfully freed, because they are attached to a committing transaction.
 41 * After the transaction commits, these pages are left on the LRU, with no
 42 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 44 * the numbers in /proc/meminfo look odd.
 45 *
 46 * So here, we have a buffer which has just come off the forget list.  Look to
 47 * see if we can strip all buffers from the backing page.
 48 *
 49 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 50 * caller provided us with a ref against the buffer, and we drop that here.
 51 */
 52static void release_buffer_page(struct buffer_head *bh)
 53{
 54	struct page *page;
 55
 56	if (buffer_dirty(bh))
 57		goto nope;
 58	if (atomic_read(&bh->b_count) != 1)
 59		goto nope;
 60	page = bh->b_page;
 61	if (!page)
 62		goto nope;
 63	if (page->mapping)
 64		goto nope;
 65
 66	/* OK, it's a truncated page */
 67	if (TestSetPageLocked(page))
 68		goto nope;
 69
 70	page_cache_get(page);
 71	__brelse(bh);
 72	try_to_free_buffers(page);
 73	unlock_page(page);
 74	page_cache_release(page);
 75	return;
 76
 77nope:
 78	__brelse(bh);
 79}
 80
 81/*
 82 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 83 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 84 * return 0.  j_list_lock is dropped in this case.
 85 */
 86static int inverted_lock(journal_t *journal, struct buffer_head *bh)
 87{
 88	if (!jbd_trylock_bh_state(bh)) {
 89		spin_unlock(&journal->j_list_lock);
 90		schedule();
 91		return 0;
 92	}
 93	return 1;
 94}
 95
 96/* Done it all: now write the commit record.  We should have
 97 * cleaned up our previous buffers by now, so if we are in abort
 98 * mode we can now just skip the rest of the journal write
 99 * entirely.
100 *
101 * Returns 1 if the journal needs to be aborted or 0 on success
102 */
103static int journal_write_commit_record(journal_t *journal,
104					transaction_t *commit_transaction)
105{
106	struct journal_head *descriptor;
107	struct buffer_head *bh;
108	int i, ret;
109	int barrier_done = 0;
110
111	if (is_journal_aborted(journal))
112		return 0;
113
114	descriptor = journal_get_descriptor_buffer(journal);
115	if (!descriptor)
116		return 1;
117
118	bh = jh2bh(descriptor);
119
120	/* AKPM: buglet - add `i' to tmp! */
121	for (i = 0; i < bh->b_size; i += 512) {
122		journal_header_t *tmp = (journal_header_t*)bh->b_data;
123		tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
124		tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
125		tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
126	}
127
128	JBUFFER_TRACE(descriptor, "write commit block");
129	set_buffer_dirty(bh);
130	if (journal->j_flags & JFS_BARRIER) {
131		set_buffer_ordered(bh);
132		barrier_done = 1;
133	}
134	ret = sync_dirty_buffer(bh);
135	/* is it possible for another commit to fail at roughly
136	 * the same time as this one?  If so, we don't want to
137	 * trust the barrier flag in the super, but instead want
138	 * to remember if we sent a barrier request
139	 */
140	if (ret == -EOPNOTSUPP && barrier_done) {
141		char b[BDEVNAME_SIZE];
142
143		printk(KERN_WARNING
144			"JBD: barrier-based sync failed on %s - "
145			"disabling barriers\n",
146			bdevname(journal->j_dev, b));
147		spin_lock(&journal->j_state_lock);
148		journal->j_flags &= ~JFS_BARRIER;
149		spin_unlock(&journal->j_state_lock);
150
151		/* And try again, without the barrier */
152		clear_buffer_ordered(bh);
153		set_buffer_uptodate(bh);
154		set_buffer_dirty(bh);
155		ret = sync_dirty_buffer(bh);
156	}
157	put_bh(bh);		/* One for getblk() */
158	journal_put_journal_head(descriptor);
159
160	return (ret == -EIO);
161}
162
163/*
164 * journal_commit_transaction
165 *
166 * The primary function for committing a transaction to the log.  This
167 * function is called by the journal thread to begin a complete commit.
168 */
169void journal_commit_transaction(journal_t *journal)
170{
171	transaction_t *commit_transaction;
172	struct journal_head *jh, *new_jh, *descriptor;
173	struct buffer_head **wbuf = journal->j_wbuf;
174	int bufs;
175	int flags;
176	int err;
177	unsigned long blocknr;
178	char *tagp = NULL;
179	journal_header_t *header;
180	journal_block_tag_t *tag = NULL;
181	int space_left = 0;
182	int first_tag = 0;
183	int tag_flag;
184	int i;
185
186	/*
187	 * First job: lock down the current transaction and wait for
188	 * all outstanding updates to complete.
189	 */
190
191#ifdef COMMIT_STATS
192	spin_lock(&journal->j_list_lock);
193	summarise_journal_usage(journal);
194	spin_unlock(&journal->j_list_lock);
195#endif
196
197	/* Do we need to erase the effects of a prior journal_flush? */
198	if (journal->j_flags & JFS_FLUSHED) {
199		jbd_debug(3, "super block updated\n");
200		journal_update_superblock(journal, 1);
201	} else {
202		jbd_debug(3, "superblock not updated\n");
203	}
204
205	J_ASSERT(journal->j_running_transaction != NULL);
206	J_ASSERT(journal->j_committing_transaction == NULL);
207
208	commit_transaction = journal->j_running_transaction;
209	J_ASSERT(commit_transaction->t_state == T_RUNNING);
210
211	jbd_debug(1, "JBD: starting commit of transaction %d\n",
212			commit_transaction->t_tid);
213
214	spin_lock(&journal->j_state_lock);
215	commit_transaction->t_state = T_LOCKED;
216
217	spin_lock(&commit_transaction->t_handle_lock);
218	while (commit_transaction->t_updates) {
219		DEFINE_WAIT(wait);
220
221		prepare_to_wait(&journal->j_wait_updates, &wait,
222					TASK_UNINTERRUPTIBLE);
223		if (commit_transaction->t_updates) {
224			spin_unlock(&commit_transaction->t_handle_lock);
225			spin_unlock(&journal->j_state_lock);
226			schedule();
227			spin_lock(&journal->j_state_lock);
228			spin_lock(&commit_transaction->t_handle_lock);
229		}
230		finish_wait(&journal->j_wait_updates, &wait);
231	}
232	spin_unlock(&commit_transaction->t_handle_lock);
233
234	J_ASSERT (commit_transaction->t_outstanding_credits <=
235			journal->j_max_transaction_buffers);
236
237	/*
238	 * First thing we are allowed to do is to discard any remaining
239	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
240	 * that there are no such buffers: if a large filesystem
241	 * operation like a truncate needs to split itself over multiple
242	 * transactions, then it may try to do a journal_restart() while
243	 * there are still BJ_Reserved buffers outstanding.  These must
244	 * be released cleanly from the current transaction.
245	 *
246	 * In this case, the filesystem must still reserve write access
247	 * again before modifying the buffer in the new transaction, but
248	 * we do not require it to remember exactly which old buffers it
249	 * has reserved.  This is consistent with the existing behaviour
250	 * that multiple journal_get_write_access() calls to the same
251	 * buffer are perfectly permissable.
252	 */
253	while (commit_transaction->t_reserved_list) {
254		jh = commit_transaction->t_reserved_list;
255		JBUFFER_TRACE(jh, "reserved, unused: refile");
256		/*
257		 * A journal_get_undo_access()+journal_release_buffer() may
258		 * leave undo-committed data.
259		 */
260		if (jh->b_committed_data) {
261			struct buffer_head *bh = jh2bh(jh);
262
263			jbd_lock_bh_state(bh);
264			kfree(jh->b_committed_data);
265			jh->b_committed_data = NULL;
266			jbd_unlock_bh_state(bh);
267		}
268		journal_refile_buffer(journal, jh);
269	}
270
271	/*
272	 * Now try to drop any written-back buffers from the journal's
273	 * checkpoint lists.  We do this *before* commit because it potentially
274	 * frees some memory
275	 */
276	spin_lock(&journal->j_list_lock);
277	__journal_clean_checkpoint_list(journal);
278	spin_unlock(&journal->j_list_lock);
279
280	jbd_debug (3, "JBD: commit phase 1\n");
281
282	/*
283	 * Switch to a new revoke table.
284	 */
285	journal_switch_revoke_table(journal);
286
287	commit_transaction->t_state = T_FLUSH;
288	journal->j_committing_transaction = commit_transaction;
289	journal->j_running_transaction = NULL;
290	commit_transaction->t_log_start = journal->j_head;
291	wake_up(&journal->j_wait_transaction_locked);
292	spin_unlock(&journal->j_state_lock);
293
294	jbd_debug (3, "JBD: commit phase 2\n");
295
296	/*
297	 * First, drop modified flag: all accesses to the buffers
298	 * will be tracked for a new trasaction only -bzzz
299	 */
300	spin_lock(&journal->j_list_lock);
301	if (commit_transaction->t_buffers) {
302		new_jh = jh = commit_transaction->t_buffers->b_tnext;
303		do {
304			J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
305					new_jh->b_modified == 0);
306			new_jh->b_modified = 0;
307			new_jh = new_jh->b_tnext;
308		} while (new_jh != jh);
309	}
310	spin_unlock(&journal->j_list_lock);
311
312	/*
313	 * Now start flushing things to disk, in the order they appear
314	 * on the transaction lists.  Data blocks go first.
315	 */
316
317	err = 0;
318	/*
319	 * Whenever we unlock the journal and sleep, things can get added
320	 * onto ->t_sync_datalist, so we have to keep looping back to
321	 * write_out_data until we *know* that the list is empty.
322	 */
323	bufs = 0;
324	/*
325	 * Cleanup any flushed data buffers from the data list.  Even in
326	 * abort mode, we want to flush this out as soon as possible.
327	 */
328write_out_data:
329	cond_resched();
330	spin_lock(&journal->j_list_lock);
331
332	while (commit_transaction->t_sync_datalist) {
333		struct buffer_head *bh;
334
335		jh = commit_transaction->t_sync_datalist;
336		commit_transaction->t_sync_datalist = jh->b_tnext;
337		bh = jh2bh(jh);
338		if (buffer_locked(bh)) {
339			BUFFER_TRACE(bh, "locked");
340			if (!inverted_lock(journal, bh))
341				goto write_out_data;
342			__journal_temp_unlink_buffer(jh);
343			__journal_file_buffer(jh, commit_transaction,
344						BJ_Locked);
345			jbd_unlock_bh_state(bh);
346			if (lock_need_resched(&journal->j_list_lock)) {
347				spin_unlock(&journal->j_list_lock);
348				goto write_out_data;
349			}
350		} else {
351			if (buffer_dirty(bh)) {
352				BUFFER_TRACE(bh, "start journal writeout");
353				get_bh(bh);
354				wbuf[bufs++] = bh;
355				if (bufs == journal->j_wbufsize) {
356					jbd_debug(2, "submit %d writes\n",
357							bufs);
358					spin_unlock(&journal->j_list_lock);
359					ll_rw_block(SWRITE, bufs, wbuf);
360					journal_brelse_array(wbuf, bufs);
361					bufs = 0;
362					goto write_out_data;
363				}
364			} else {
365				BUFFER_TRACE(bh, "writeout complete: unfile");
366				if (!inverted_lock(journal, bh))
367					goto write_out_data;
368				__journal_unfile_buffer(jh);
369				jbd_unlock_bh_state(bh);
370				journal_remove_journal_head(bh);
371				put_bh(bh);
372				if (lock_need_resched(&journal->j_list_lock)) {
373					spin_unlock(&journal->j_list_lock);
374					goto write_out_data;
375				}
376			}
377		}
378	}
379
380	if (bufs) {
381		spin_unlock(&journal->j_list_lock);
382		ll_rw_block(SWRITE, bufs, wbuf);
383		journal_brelse_array(wbuf, bufs);
384		spin_lock(&journal->j_list_lock);
385	}
386
387	/*
388	 * Wait for all previously submitted IO to complete.
389	 */
390	while (commit_transaction->t_locked_list) {
391		struct buffer_head *bh;
392
393		jh = commit_transaction->t_locked_list->b_tprev;
394		bh = jh2bh(jh);
395		get_bh(bh);
396		if (buffer_locked(bh)) {
397			spin_unlock(&journal->j_list_lock);
398			wait_on_buffer(bh);
399			if (unlikely(!buffer_uptodate(bh)))
400				err = -EIO;
401			spin_lock(&journal->j_list_lock);
402		}
403		if (!inverted_lock(journal, bh)) {
404			put_bh(bh);
405			spin_lock(&journal->j_list_lock);
406			continue;
407		}
408		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
409			__journal_unfile_buffer(jh);
410			jbd_unlock_bh_state(bh);
411			journal_remove_journal_head(bh);
412			put_bh(bh);
413		} else {
414			jbd_unlock_bh_state(bh);
415		}
416		put_bh(bh);
417		cond_resched_lock(&journal->j_list_lock);
418	}
419	spin_unlock(&journal->j_list_lock);
420
421	if (err)
422		__journal_abort_hard(journal);
423
424	journal_write_revoke_records(journal, commit_transaction);
425
426	jbd_debug(3, "JBD: commit phase 2\n");
427
428	/*
429	 * If we found any dirty or locked buffers, then we should have
430	 * looped back up to the write_out_data label.  If there weren't
431	 * any then journal_clean_data_list should have wiped the list
432	 * clean by now, so check that it is in fact empty.
433	 */
434	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
435
436	jbd_debug (3, "JBD: commit phase 3\n");
437
438	/*
439	 * Way to go: we have now written out all of the data for a
440	 * transaction!  Now comes the tricky part: we need to write out
441	 * metadata.  Loop over the transaction's entire buffer list:
442	 */
443	commit_transaction->t_state = T_COMMIT;
444
445	descriptor = NULL;
446	bufs = 0;
447	while (commit_transaction->t_buffers) {
448
449		/* Find the next buffer to be journaled... */
450
451		jh = commit_transaction->t_buffers;
452
453		/* If we're in abort mode, we just un-journal the buffer and
454		   release it for background writing. */
455
456		if (is_journal_aborted(journal)) {
457			JBUFFER_TRACE(jh, "journal is aborting: refile");
458			journal_refile_buffer(journal, jh);
459			/* If that was the last one, we need to clean up
460			 * any descriptor buffers which may have been
461			 * already allocated, even if we are now
462			 * aborting. */
463			if (!commit_transaction->t_buffers)
464				goto start_journal_io;
465			continue;
466		}
467
468		/* Make sure we have a descriptor block in which to
469		   record the metadata buffer. */
470
471		if (!descriptor) {
472			struct buffer_head *bh;
473
474			J_ASSERT (bufs == 0);
475
476			jbd_debug(4, "JBD: get descriptor\n");
477
478			descriptor = journal_get_descriptor_buffer(journal);
479			if (!descriptor) {
480				__journal_abort_hard(journal);
481				continue;
482			}
483
484			bh = jh2bh(descriptor);
485			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
486				(unsigned long long)bh->b_blocknr, bh->b_data);
487			header = (journal_header_t *)&bh->b_data[0];
488			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
489			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
490			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
491
492			tagp = &bh->b_data[sizeof(journal_header_t)];
493			space_left = bh->b_size - sizeof(journal_header_t);
494			first_tag = 1;
495			set_buffer_jwrite(bh);
496			set_buffer_dirty(bh);
497			wbuf[bufs++] = bh;
498
499			/* Record it so that we can wait for IO
500                           completion later */
501			BUFFER_TRACE(bh, "ph3: file as descriptor");
502			journal_file_buffer(descriptor, commit_transaction,
503					BJ_LogCtl);
504		}
505
506		/* Where is the buffer to be written? */
507
508		err = journal_next_log_block(journal, &blocknr);
509		/* If the block mapping failed, just abandon the buffer
510		   and repeat this loop: we'll fall into the
511		   refile-on-abort condition above. */
512		if (err) {
513			__journal_abort_hard(journal);
514			continue;
515		}
516
517		/*
518		 * start_this_handle() uses t_outstanding_credits to determine
519		 * the free space in the log, but this counter is changed
520		 * by journal_next_log_block() also.
521		 */
522		commit_transaction->t_outstanding_credits--;
523
524		/* Bump b_count to prevent truncate from stumbling over
525                   the shadowed buffer!  @@@ This can go if we ever get
526                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
527		atomic_inc(&jh2bh(jh)->b_count);
528
529		/* Make a temporary IO buffer with which to write it out
530                   (this will requeue both the metadata buffer and the
531                   temporary IO buffer). new_bh goes on BJ_IO*/
532
533		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
534		/*
535		 * akpm: journal_write_metadata_buffer() sets
536		 * new_bh->b_transaction to commit_transaction.
537		 * We need to clean this up before we release new_bh
538		 * (which is of type BJ_IO)
539		 */
540		JBUFFER_TRACE(jh, "ph3: write metadata");
541		flags = journal_write_metadata_buffer(commit_transaction,
542						      jh, &new_jh, blocknr);
543		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
544		wbuf[bufs++] = jh2bh(new_jh);
545
546		/* Record the new block's tag in the current descriptor
547                   buffer */
548
549		tag_flag = 0;
550		if (flags & 1)
551			tag_flag |= JFS_FLAG_ESCAPE;
552		if (!first_tag)
553			tag_flag |= JFS_FLAG_SAME_UUID;
554
555		tag = (journal_block_tag_t *) tagp;
556		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
557		tag->t_flags = cpu_to_be32(tag_flag);
558		tagp += sizeof(journal_block_tag_t);
559		space_left -= sizeof(journal_block_tag_t);
560
561		if (first_tag) {
562			memcpy (tagp, journal->j_uuid, 16);
563			tagp += 16;
564			space_left -= 16;
565			first_tag = 0;
566		}
567
568		/* If there's no more to do, or if the descriptor is full,
569		   let the IO rip! */
570
571		if (bufs == journal->j_wbufsize ||
572		    commit_transaction->t_buffers == NULL ||
573		    space_left < sizeof(journal_block_tag_t) + 16) {
574
575			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
576
577			/* Write an end-of-descriptor marker before
578                           submitting the IOs.  "tag" still points to
579                           the last tag we set up. */
580
581			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
582
583start_journal_io:
584			for (i = 0; i < bufs; i++) {
585				struct buffer_head *bh = wbuf[i];
586				lock_buffer(bh);
587				clear_buffer_dirty(bh);
588				set_buffer_uptodate(bh);
589				bh->b_end_io = journal_end_buffer_io_sync;
590				submit_bh(WRITE, bh);
591			}
592			cond_resched();
593
594			/* Force a new descriptor to be generated next
595                           time round the loop. */
596			descriptor = NULL;
597			bufs = 0;
598		}
599	}
600
601	/* Lo and behold: we have just managed to send a transaction to
602           the log.  Before we can commit it, wait for the IO so far to
603           complete.  Control buffers being written are on the
604           transaction's t_log_list queue, and metadata buffers are on
605           the t_iobuf_list queue.
606
607	   Wait for the buffers in reverse order.  That way we are
608	   less likely to be woken up until all IOs have completed, and
609	   so we incur less scheduling load.
610	*/
611
612	jbd_debug(3, "JBD: commit phase 4\n");
613
614	/*
615	 * akpm: these are BJ_IO, and j_list_lock is not needed.
616	 * See __journal_try_to_free_buffer.
617	 */
618wait_for_iobuf:
619	while (commit_transaction->t_iobuf_list != NULL) {
620		struct buffer_head *bh;
621
622		jh = commit_transaction->t_iobuf_list->b_tprev;
623		bh = jh2bh(jh);
624		if (buffer_locked(bh)) {
625			wait_on_buffer(bh);
626			goto wait_for_iobuf;
627		}
628		if (cond_resched())
629			goto wait_for_iobuf;
630
631		if (unlikely(!buffer_uptodate(bh)))
632			err = -EIO;
633
634		clear_buffer_jwrite(bh);
635
636		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
637		journal_unfile_buffer(journal, jh);
638
639		/*
640		 * ->t_iobuf_list should contain only dummy buffer_heads
641		 * which were created by journal_write_metadata_buffer().
642		 */
643		BUFFER_TRACE(bh, "dumping temporary bh");
644		journal_put_journal_head(jh);
645		__brelse(bh);
646		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
647		free_buffer_head(bh);
648
649		/* We also have to unlock and free the corresponding
650                   shadowed buffer */
651		jh = commit_transaction->t_shadow_list->b_tprev;
652		bh = jh2bh(jh);
653		clear_bit(BH_JWrite, &bh->b_state);
654		J_ASSERT_BH(bh, buffer_jbddirty(bh));
655
656		/* The metadata is now released for reuse, but we need
657                   to remember it against this transaction so that when
658                   we finally commit, we can do any checkpointing
659                   required. */
660		JBUFFER_TRACE(jh, "file as BJ_Forget");
661		journal_file_buffer(jh, commit_transaction, BJ_Forget);
662		/* Wake up any transactions which were waiting for this
663		   IO to complete */
664		wake_up_bit(&bh->b_state, BH_Unshadow);
665		JBUFFER_TRACE(jh, "brelse shadowed buffer");
666		__brelse(bh);
667	}
668
669	J_ASSERT (commit_transaction->t_shadow_list == NULL);
670
671	jbd_debug(3, "JBD: commit phase 5\n");
672
673	/* Here we wait for the revoke record and descriptor record buffers */
674 wait_for_ctlbuf:
675	while (commit_transaction->t_log_list != NULL) {
676		struct buffer_head *bh;
677
678		jh = commit_transaction->t_log_list->b_tprev;
679		bh = jh2bh(jh);
680		if (buffer_locked(bh)) {
681			wait_on_buffer(bh);
682			goto wait_for_ctlbuf;
683		}
684		if (cond_resched())
685			goto wait_for_ctlbuf;
686
687		if (unlikely(!buffer_uptodate(bh)))
688			err = -EIO;
689
690		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
691		clear_buffer_jwrite(bh);
692		journal_unfile_buffer(journal, jh);
693		journal_put_journal_head(jh);
694		__brelse(bh);		/* One for getblk */
695		/* AKPM: bforget here */
696	}
697
698	jbd_debug(3, "JBD: commit phase 6\n");
699
700	if (journal_write_commit_record(journal, commit_transaction))
701		err = -EIO;
702
703	if (err)
704		__journal_abort_hard(journal);
705
706	/* End of a transaction!  Finally, we can do checkpoint
707           processing: any buffers committed as a result of this
708           transaction can be removed from any checkpoint list it was on
709           before. */
710
711	jbd_debug(3, "JBD: commit phase 7\n");
712
713	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
714	J_ASSERT(commit_transaction->t_buffers == NULL);
715	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
716	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
717	J_ASSERT(commit_transaction->t_shadow_list == NULL);
718	J_ASSERT(commit_transaction->t_log_list == NULL);
719
720restart_loop:
721	/*
722	 * As there are other places (journal_unmap_buffer()) adding buffers
723	 * to this list we have to be careful and hold the j_list_lock.
724	 */
725	spin_lock(&journal->j_list_lock);
726	while (commit_transaction->t_forget) {
727		transaction_t *cp_transaction;
728		struct buffer_head *bh;
729
730		jh = commit_transaction->t_forget;
731		spin_unlock(&journal->j_list_lock);
732		bh = jh2bh(jh);
733		jbd_lock_bh_state(bh);
734		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
735			jh->b_transaction == journal->j_running_transaction);
736
737		/*
738		 * If there is undo-protected committed data against
739		 * this buffer, then we can remove it now.  If it is a
740		 * buffer needing such protection, the old frozen_data
741		 * field now points to a committed version of the
742		 * buffer, so rotate that field to the new committed
743		 * data.
744		 *
745		 * Otherwise, we can just throw away the frozen data now.
746		 */
747		if (jh->b_committed_data) {
748			kfree(jh->b_committed_data);
749			jh->b_committed_data = NULL;
750			if (jh->b_frozen_data) {
751				jh->b_committed_data = jh->b_frozen_data;
752				jh->b_frozen_data = NULL;
753			}
754		} else if (jh->b_frozen_data) {
755			kfree(jh->b_frozen_data);
756			jh->b_frozen_data = NULL;
757		}
758
759		spin_lock(&journal->j_list_lock);
760		cp_transaction = jh->b_cp_transaction;
761		if (cp_transaction) {
762			JBUFFER_TRACE(jh, "remove from old cp transaction");
763			__journal_remove_checkpoint(jh);
764		}
765
766		/* Only re-checkpoint the buffer_head if it is marked
767		 * dirty.  If the buffer was added to the BJ_Forget list
768		 * by journal_forget, it may no longer be dirty and
769		 * there's no point in keeping a checkpoint record for
770		 * it. */
771
772		/* A buffer which has been freed while still being
773		 * journaled by a previous transaction may end up still
774		 * being dirty here, but we want to avoid writing back
775		 * that buffer in the future now that the last use has
776		 * been committed.  That's not only a performance gain,
777		 * it also stops aliasing problems if the buffer is left
778		 * behind for writeback and gets reallocated for another
779		 * use in a different page. */
780		if (buffer_freed(bh)) {
781			clear_buffer_freed(bh);
782			clear_buffer_jbddirty(bh);
783		}
784
785		if (buffer_jbddirty(bh)) {
786			JBUFFER_TRACE(jh, "add to new checkpointing trans");
787			__journal_insert_checkpoint(jh, commit_transaction);
788			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
789			__journal_refile_buffer(jh);
790			jbd_unlock_bh_state(bh);
791		} else {
792			J_ASSERT_BH(bh, !buffer_dirty(bh));
793			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
794			__journal_unfile_buffer(jh);
795			jbd_unlock_bh_state(bh);
796			journal_remove_journal_head(bh);  /* needs a brelse */
797			release_buffer_page(bh);
798		}
799		cond_resched_lock(&journal->j_list_lock);
800	}
801	spin_unlock(&journal->j_list_lock);
802	/*
803	 * This is a bit sleazy.  We borrow j_list_lock to protect
804	 * journal->j_committing_transaction in __journal_remove_checkpoint.
805	 * Really, __journal_remove_checkpoint should be using j_state_lock but
806	 * it's a bit hassle to hold that across __journal_remove_checkpoint
807	 */
808	spin_lock(&journal->j_state_lock);
809	spin_lock(&journal->j_list_lock);
810	/*
811	 * Now recheck if some buffers did not get attached to the transaction
812	 * while the lock was dropped...
813	 */
814	if (commit_transaction->t_forget) {
815		spin_unlock(&journal->j_list_lock);
816		spin_unlock(&journal->j_state_lock);
817		goto restart_loop;
818	}
819
820	/* Done with this transaction! */
821
822	jbd_debug(3, "JBD: commit phase 8\n");
823
824	J_ASSERT(commit_transaction->t_state == T_COMMIT);
825
826	commit_transaction->t_state = T_FINISHED;
827	J_ASSERT(commit_transaction == journal->j_committing_transaction);
828	journal->j_commit_sequence = commit_transaction->t_tid;
829	journal->j_committing_transaction = NULL;
830	spin_unlock(&journal->j_state_lock);
831
832	if (commit_transaction->t_checkpoint_list == NULL) {
833		__journal_drop_transaction(journal, commit_transaction);
834	} else {
835		if (journal->j_checkpoint_transactions == NULL) {
836			journal->j_checkpoint_transactions = commit_transaction;
837			commit_transaction->t_cpnext = commit_transaction;
838			commit_transaction->t_cpprev = commit_transaction;
839		} else {
840			commit_transaction->t_cpnext =
841				journal->j_checkpoint_transactions;
842			commit_transaction->t_cpprev =
843				commit_transaction->t_cpnext->t_cpprev;
844			commit_transaction->t_cpnext->t_cpprev =
845				commit_transaction;
846			commit_transaction->t_cpprev->t_cpnext =
847				commit_transaction;
848		}
849	}
850	spin_unlock(&journal->j_list_lock);
851
852	jbd_debug(1, "JBD: commit %d complete, head %d\n",
853		  journal->j_commit_sequence, journal->j_tail_sequence);
854
855	wake_up(&journal->j_wait_done_commit);
856}