fs/jbd/commit.c at v2.6.35 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / jbd / commit.c
at v2.6.35 977 lines 29 kB view raw
  1/*
  2 * linux/fs/jbd/commit.c
  3 *
  4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  5 *
  6 * Copyright 1998 Red Hat corp --- All Rights Reserved
  7 *
  8 * This file is part of the Linux kernel and is made available under
  9 * the terms of the GNU General Public License, version 2, or at your
 10 * option, any later version, incorporated herein by reference.
 11 *
 12 * Journal commit routines for the generic filesystem journaling code;
 13 * part of the ext2fs journaling system.
 14 */
 15
 16#include <linux/time.h>
 17#include <linux/fs.h>
 18#include <linux/jbd.h>
 19#include <linux/errno.h>
 20#include <linux/mm.h>
 21#include <linux/pagemap.h>
 22#include <linux/bio.h>
 23
 24/*
 25 * Default IO end handler for temporary BJ_IO buffer_heads.
 26 */
 27static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 28{
 29	BUFFER_TRACE(bh, "");
 30	if (uptodate)
 31		set_buffer_uptodate(bh);
 32	else
 33		clear_buffer_uptodate(bh);
 34	unlock_buffer(bh);
 35}
 36
 37/*
 38 * When an ext3-ordered file is truncated, it is possible that many pages are
 39 * not successfully freed, because they are attached to a committing transaction.
 40 * After the transaction commits, these pages are left on the LRU, with no
 41 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 42 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 43 * the numbers in /proc/meminfo look odd.
 44 *
 45 * So here, we have a buffer which has just come off the forget list.  Look to
 46 * see if we can strip all buffers from the backing page.
 47 *
 48 * Called under journal->j_list_lock.  The caller provided us with a ref
 49 * against the buffer, and we drop that here.
 50 */
 51static void release_buffer_page(struct buffer_head *bh)
 52{
 53	struct page *page;
 54
 55	if (buffer_dirty(bh))
 56		goto nope;
 57	if (atomic_read(&bh->b_count) != 1)
 58		goto nope;
 59	page = bh->b_page;
 60	if (!page)
 61		goto nope;
 62	if (page->mapping)
 63		goto nope;
 64
 65	/* OK, it's a truncated page */
 66	if (!trylock_page(page))
 67		goto nope;
 68
 69	page_cache_get(page);
 70	__brelse(bh);
 71	try_to_free_buffers(page);
 72	unlock_page(page);
 73	page_cache_release(page);
 74	return;
 75
 76nope:
 77	__brelse(bh);
 78}
 79
 80/*
 81 * Decrement reference counter for data buffer. If it has been marked
 82 * 'BH_Freed', release it and the page to which it belongs if possible.
 83 */
 84static void release_data_buffer(struct buffer_head *bh)
 85{
 86	if (buffer_freed(bh)) {
 87		clear_buffer_freed(bh);
 88		release_buffer_page(bh);
 89	} else
 90		put_bh(bh);
 91}
 92
 93/*
 94 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 95 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 96 * return 0.  j_list_lock is dropped in this case.
 97 */
 98static int inverted_lock(journal_t *journal, struct buffer_head *bh)
 99{
100	if (!jbd_trylock_bh_state(bh)) {
101		spin_unlock(&journal->j_list_lock);
102		schedule();
103		return 0;
104	}
105	return 1;
106}
107
108/* Done it all: now write the commit record.  We should have
109 * cleaned up our previous buffers by now, so if we are in abort
110 * mode we can now just skip the rest of the journal write
111 * entirely.
112 *
113 * Returns 1 if the journal needs to be aborted or 0 on success
114 */
115static int journal_write_commit_record(journal_t *journal,
116					transaction_t *commit_transaction)
117{
118	struct journal_head *descriptor;
119	struct buffer_head *bh;
120	journal_header_t *header;
121	int ret;
122	int barrier_done = 0;
123
124	if (is_journal_aborted(journal))
125		return 0;
126
127	descriptor = journal_get_descriptor_buffer(journal);
128	if (!descriptor)
129		return 1;
130
131	bh = jh2bh(descriptor);
132
133	header = (journal_header_t *)(bh->b_data);
134	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
135	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
136	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
137
138	JBUFFER_TRACE(descriptor, "write commit block");
139	set_buffer_dirty(bh);
140	if (journal->j_flags & JFS_BARRIER) {
141		set_buffer_ordered(bh);
142		barrier_done = 1;
143	}
144	ret = sync_dirty_buffer(bh);
145	if (barrier_done)
146		clear_buffer_ordered(bh);
147	/* is it possible for another commit to fail at roughly
148	 * the same time as this one?  If so, we don't want to
149	 * trust the barrier flag in the super, but instead want
150	 * to remember if we sent a barrier request
151	 */
152	if (ret == -EOPNOTSUPP && barrier_done) {
153		char b[BDEVNAME_SIZE];
154
155		printk(KERN_WARNING
156			"JBD: barrier-based sync failed on %s - "
157			"disabling barriers\n",
158			bdevname(journal->j_dev, b));
159		spin_lock(&journal->j_state_lock);
160		journal->j_flags &= ~JFS_BARRIER;
161		spin_unlock(&journal->j_state_lock);
162
163		/* And try again, without the barrier */
164		set_buffer_uptodate(bh);
165		set_buffer_dirty(bh);
166		ret = sync_dirty_buffer(bh);
167	}
168	put_bh(bh);		/* One for getblk() */
169	journal_put_journal_head(descriptor);
170
171	return (ret == -EIO);
172}
173
174static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
175				   int write_op)
176{
177	int i;
178
179	for (i = 0; i < bufs; i++) {
180		wbuf[i]->b_end_io = end_buffer_write_sync;
181		/* We use-up our safety reference in submit_bh() */
182		submit_bh(write_op, wbuf[i]);
183	}
184}
185
186/*
187 *  Submit all the data buffers to disk
188 */
189static int journal_submit_data_buffers(journal_t *journal,
190				       transaction_t *commit_transaction,
191				       int write_op)
192{
193	struct journal_head *jh;
194	struct buffer_head *bh;
195	int locked;
196	int bufs = 0;
197	struct buffer_head **wbuf = journal->j_wbuf;
198	int err = 0;
199
200	/*
201	 * Whenever we unlock the journal and sleep, things can get added
202	 * onto ->t_sync_datalist, so we have to keep looping back to
203	 * write_out_data until we *know* that the list is empty.
204	 *
205	 * Cleanup any flushed data buffers from the data list.  Even in
206	 * abort mode, we want to flush this out as soon as possible.
207	 */
208write_out_data:
209	cond_resched();
210	spin_lock(&journal->j_list_lock);
211
212	while (commit_transaction->t_sync_datalist) {
213		jh = commit_transaction->t_sync_datalist;
214		bh = jh2bh(jh);
215		locked = 0;
216
217		/* Get reference just to make sure buffer does not disappear
218		 * when we are forced to drop various locks */
219		get_bh(bh);
220		/* If the buffer is dirty, we need to submit IO and hence
221		 * we need the buffer lock. We try to lock the buffer without
222		 * blocking. If we fail, we need to drop j_list_lock and do
223		 * blocking lock_buffer().
224		 */
225		if (buffer_dirty(bh)) {
226			if (!trylock_buffer(bh)) {
227				BUFFER_TRACE(bh, "needs blocking lock");
228				spin_unlock(&journal->j_list_lock);
229				/* Write out all data to prevent deadlocks */
230				journal_do_submit_data(wbuf, bufs, write_op);
231				bufs = 0;
232				lock_buffer(bh);
233				spin_lock(&journal->j_list_lock);
234			}
235			locked = 1;
236		}
237		/* We have to get bh_state lock. Again out of order, sigh. */
238		if (!inverted_lock(journal, bh)) {
239			jbd_lock_bh_state(bh);
240			spin_lock(&journal->j_list_lock);
241		}
242		/* Someone already cleaned up the buffer? */
243		if (!buffer_jbd(bh) || bh2jh(bh) != jh
244			|| jh->b_transaction != commit_transaction
245			|| jh->b_jlist != BJ_SyncData) {
246			jbd_unlock_bh_state(bh);
247			if (locked)
248				unlock_buffer(bh);
249			BUFFER_TRACE(bh, "already cleaned up");
250			release_data_buffer(bh);
251			continue;
252		}
253		if (locked && test_clear_buffer_dirty(bh)) {
254			BUFFER_TRACE(bh, "needs writeout, adding to array");
255			wbuf[bufs++] = bh;
256			__journal_file_buffer(jh, commit_transaction,
257						BJ_Locked);
258			jbd_unlock_bh_state(bh);
259			if (bufs == journal->j_wbufsize) {
260				spin_unlock(&journal->j_list_lock);
261				journal_do_submit_data(wbuf, bufs, write_op);
262				bufs = 0;
263				goto write_out_data;
264			}
265		} else if (!locked && buffer_locked(bh)) {
266			__journal_file_buffer(jh, commit_transaction,
267						BJ_Locked);
268			jbd_unlock_bh_state(bh);
269			put_bh(bh);
270		} else {
271			BUFFER_TRACE(bh, "writeout complete: unfile");
272			if (unlikely(!buffer_uptodate(bh)))
273				err = -EIO;
274			__journal_unfile_buffer(jh);
275			jbd_unlock_bh_state(bh);
276			if (locked)
277				unlock_buffer(bh);
278			journal_remove_journal_head(bh);
279			/* One for our safety reference, other for
280			 * journal_remove_journal_head() */
281			put_bh(bh);
282			release_data_buffer(bh);
283		}
284
285		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
286			spin_unlock(&journal->j_list_lock);
287			goto write_out_data;
288		}
289	}
290	spin_unlock(&journal->j_list_lock);
291	journal_do_submit_data(wbuf, bufs, write_op);
292
293	return err;
294}
295
296/*
297 * journal_commit_transaction
298 *
299 * The primary function for committing a transaction to the log.  This
300 * function is called by the journal thread to begin a complete commit.
301 */
302void journal_commit_transaction(journal_t *journal)
303{
304	transaction_t *commit_transaction;
305	struct journal_head *jh, *new_jh, *descriptor;
306	struct buffer_head **wbuf = journal->j_wbuf;
307	int bufs;
308	int flags;
309	int err;
310	unsigned int blocknr;
311	ktime_t start_time;
312	u64 commit_time;
313	char *tagp = NULL;
314	journal_header_t *header;
315	journal_block_tag_t *tag = NULL;
316	int space_left = 0;
317	int first_tag = 0;
318	int tag_flag;
319	int i;
320	int write_op = WRITE;
321
322	/*
323	 * First job: lock down the current transaction and wait for
324	 * all outstanding updates to complete.
325	 */
326
327#ifdef COMMIT_STATS
328	spin_lock(&journal->j_list_lock);
329	summarise_journal_usage(journal);
330	spin_unlock(&journal->j_list_lock);
331#endif
332
333	/* Do we need to erase the effects of a prior journal_flush? */
334	if (journal->j_flags & JFS_FLUSHED) {
335		jbd_debug(3, "super block updated\n");
336		journal_update_superblock(journal, 1);
337	} else {
338		jbd_debug(3, "superblock not updated\n");
339	}
340
341	J_ASSERT(journal->j_running_transaction != NULL);
342	J_ASSERT(journal->j_committing_transaction == NULL);
343
344	commit_transaction = journal->j_running_transaction;
345	J_ASSERT(commit_transaction->t_state == T_RUNNING);
346
347	jbd_debug(1, "JBD: starting commit of transaction %d\n",
348			commit_transaction->t_tid);
349
350	spin_lock(&journal->j_state_lock);
351	commit_transaction->t_state = T_LOCKED;
352
353	/*
354	 * Use plugged writes here, since we want to submit several before
355	 * we unplug the device. We don't do explicit unplugging in here,
356	 * instead we rely on sync_buffer() doing the unplug for us.
357	 */
358	if (commit_transaction->t_synchronous_commit)
359		write_op = WRITE_SYNC_PLUG;
360	spin_lock(&commit_transaction->t_handle_lock);
361	while (commit_transaction->t_updates) {
362		DEFINE_WAIT(wait);
363
364		prepare_to_wait(&journal->j_wait_updates, &wait,
365					TASK_UNINTERRUPTIBLE);
366		if (commit_transaction->t_updates) {
367			spin_unlock(&commit_transaction->t_handle_lock);
368			spin_unlock(&journal->j_state_lock);
369			schedule();
370			spin_lock(&journal->j_state_lock);
371			spin_lock(&commit_transaction->t_handle_lock);
372		}
373		finish_wait(&journal->j_wait_updates, &wait);
374	}
375	spin_unlock(&commit_transaction->t_handle_lock);
376
377	J_ASSERT (commit_transaction->t_outstanding_credits <=
378			journal->j_max_transaction_buffers);
379
380	/*
381	 * First thing we are allowed to do is to discard any remaining
382	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
383	 * that there are no such buffers: if a large filesystem
384	 * operation like a truncate needs to split itself over multiple
385	 * transactions, then it may try to do a journal_restart() while
386	 * there are still BJ_Reserved buffers outstanding.  These must
387	 * be released cleanly from the current transaction.
388	 *
389	 * In this case, the filesystem must still reserve write access
390	 * again before modifying the buffer in the new transaction, but
391	 * we do not require it to remember exactly which old buffers it
392	 * has reserved.  This is consistent with the existing behaviour
393	 * that multiple journal_get_write_access() calls to the same
394	 * buffer are perfectly permissable.
395	 */
396	while (commit_transaction->t_reserved_list) {
397		jh = commit_transaction->t_reserved_list;
398		JBUFFER_TRACE(jh, "reserved, unused: refile");
399		/*
400		 * A journal_get_undo_access()+journal_release_buffer() may
401		 * leave undo-committed data.
402		 */
403		if (jh->b_committed_data) {
404			struct buffer_head *bh = jh2bh(jh);
405
406			jbd_lock_bh_state(bh);
407			jbd_free(jh->b_committed_data, bh->b_size);
408			jh->b_committed_data = NULL;
409			jbd_unlock_bh_state(bh);
410		}
411		journal_refile_buffer(journal, jh);
412	}
413
414	/*
415	 * Now try to drop any written-back buffers from the journal's
416	 * checkpoint lists.  We do this *before* commit because it potentially
417	 * frees some memory
418	 */
419	spin_lock(&journal->j_list_lock);
420	__journal_clean_checkpoint_list(journal);
421	spin_unlock(&journal->j_list_lock);
422
423	jbd_debug (3, "JBD: commit phase 1\n");
424
425	/*
426	 * Switch to a new revoke table.
427	 */
428	journal_switch_revoke_table(journal);
429
430	commit_transaction->t_state = T_FLUSH;
431	journal->j_committing_transaction = commit_transaction;
432	journal->j_running_transaction = NULL;
433	start_time = ktime_get();
434	commit_transaction->t_log_start = journal->j_head;
435	wake_up(&journal->j_wait_transaction_locked);
436	spin_unlock(&journal->j_state_lock);
437
438	jbd_debug (3, "JBD: commit phase 2\n");
439
440	/*
441	 * Now start flushing things to disk, in the order they appear
442	 * on the transaction lists.  Data blocks go first.
443	 */
444	err = journal_submit_data_buffers(journal, commit_transaction,
445					  write_op);
446
447	/*
448	 * Wait for all previously submitted IO to complete.
449	 */
450	spin_lock(&journal->j_list_lock);
451	while (commit_transaction->t_locked_list) {
452		struct buffer_head *bh;
453
454		jh = commit_transaction->t_locked_list->b_tprev;
455		bh = jh2bh(jh);
456		get_bh(bh);
457		if (buffer_locked(bh)) {
458			spin_unlock(&journal->j_list_lock);
459			wait_on_buffer(bh);
460			spin_lock(&journal->j_list_lock);
461		}
462		if (unlikely(!buffer_uptodate(bh))) {
463			if (!trylock_page(bh->b_page)) {
464				spin_unlock(&journal->j_list_lock);
465				lock_page(bh->b_page);
466				spin_lock(&journal->j_list_lock);
467			}
468			if (bh->b_page->mapping)
469				set_bit(AS_EIO, &bh->b_page->mapping->flags);
470
471			unlock_page(bh->b_page);
472			SetPageError(bh->b_page);
473			err = -EIO;
474		}
475		if (!inverted_lock(journal, bh)) {
476			put_bh(bh);
477			spin_lock(&journal->j_list_lock);
478			continue;
479		}
480		if (buffer_jbd(bh) && bh2jh(bh) == jh &&
481		    jh->b_transaction == commit_transaction &&
482		    jh->b_jlist == BJ_Locked) {
483			__journal_unfile_buffer(jh);
484			jbd_unlock_bh_state(bh);
485			journal_remove_journal_head(bh);
486			put_bh(bh);
487		} else {
488			jbd_unlock_bh_state(bh);
489		}
490		release_data_buffer(bh);
491		cond_resched_lock(&journal->j_list_lock);
492	}
493	spin_unlock(&journal->j_list_lock);
494
495	if (err) {
496		char b[BDEVNAME_SIZE];
497
498		printk(KERN_WARNING
499			"JBD: Detected IO errors while flushing file data "
500			"on %s\n", bdevname(journal->j_fs_dev, b));
501		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
502			journal_abort(journal, err);
503		err = 0;
504	}
505
506	journal_write_revoke_records(journal, commit_transaction, write_op);
507
508	/*
509	 * If we found any dirty or locked buffers, then we should have
510	 * looped back up to the write_out_data label.  If there weren't
511	 * any then journal_clean_data_list should have wiped the list
512	 * clean by now, so check that it is in fact empty.
513	 */
514	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
515
516	jbd_debug (3, "JBD: commit phase 3\n");
517
518	/*
519	 * Way to go: we have now written out all of the data for a
520	 * transaction!  Now comes the tricky part: we need to write out
521	 * metadata.  Loop over the transaction's entire buffer list:
522	 */
523	spin_lock(&journal->j_state_lock);
524	commit_transaction->t_state = T_COMMIT;
525	spin_unlock(&journal->j_state_lock);
526
527	J_ASSERT(commit_transaction->t_nr_buffers <=
528		 commit_transaction->t_outstanding_credits);
529
530	descriptor = NULL;
531	bufs = 0;
532	while (commit_transaction->t_buffers) {
533
534		/* Find the next buffer to be journaled... */
535
536		jh = commit_transaction->t_buffers;
537
538		/* If we're in abort mode, we just un-journal the buffer and
539		   release it. */
540
541		if (is_journal_aborted(journal)) {
542			clear_buffer_jbddirty(jh2bh(jh));
543			JBUFFER_TRACE(jh, "journal is aborting: refile");
544			journal_refile_buffer(journal, jh);
545			/* If that was the last one, we need to clean up
546			 * any descriptor buffers which may have been
547			 * already allocated, even if we are now
548			 * aborting. */
549			if (!commit_transaction->t_buffers)
550				goto start_journal_io;
551			continue;
552		}
553
554		/* Make sure we have a descriptor block in which to
555		   record the metadata buffer. */
556
557		if (!descriptor) {
558			struct buffer_head *bh;
559
560			J_ASSERT (bufs == 0);
561
562			jbd_debug(4, "JBD: get descriptor\n");
563
564			descriptor = journal_get_descriptor_buffer(journal);
565			if (!descriptor) {
566				journal_abort(journal, -EIO);
567				continue;
568			}
569
570			bh = jh2bh(descriptor);
571			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
572				(unsigned long long)bh->b_blocknr, bh->b_data);
573			header = (journal_header_t *)&bh->b_data[0];
574			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
575			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
576			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
577
578			tagp = &bh->b_data[sizeof(journal_header_t)];
579			space_left = bh->b_size - sizeof(journal_header_t);
580			first_tag = 1;
581			set_buffer_jwrite(bh);
582			set_buffer_dirty(bh);
583			wbuf[bufs++] = bh;
584
585			/* Record it so that we can wait for IO
586                           completion later */
587			BUFFER_TRACE(bh, "ph3: file as descriptor");
588			journal_file_buffer(descriptor, commit_transaction,
589					BJ_LogCtl);
590		}
591
592		/* Where is the buffer to be written? */
593
594		err = journal_next_log_block(journal, &blocknr);
595		/* If the block mapping failed, just abandon the buffer
596		   and repeat this loop: we'll fall into the
597		   refile-on-abort condition above. */
598		if (err) {
599			journal_abort(journal, err);
600			continue;
601		}
602
603		/*
604		 * start_this_handle() uses t_outstanding_credits to determine
605		 * the free space in the log, but this counter is changed
606		 * by journal_next_log_block() also.
607		 */
608		commit_transaction->t_outstanding_credits--;
609
610		/* Bump b_count to prevent truncate from stumbling over
611                   the shadowed buffer!  @@@ This can go if we ever get
612                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
613		atomic_inc(&jh2bh(jh)->b_count);
614
615		/* Make a temporary IO buffer with which to write it out
616                   (this will requeue both the metadata buffer and the
617                   temporary IO buffer). new_bh goes on BJ_IO*/
618
619		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
620		/*
621		 * akpm: journal_write_metadata_buffer() sets
622		 * new_bh->b_transaction to commit_transaction.
623		 * We need to clean this up before we release new_bh
624		 * (which is of type BJ_IO)
625		 */
626		JBUFFER_TRACE(jh, "ph3: write metadata");
627		flags = journal_write_metadata_buffer(commit_transaction,
628						      jh, &new_jh, blocknr);
629		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
630		wbuf[bufs++] = jh2bh(new_jh);
631
632		/* Record the new block's tag in the current descriptor
633                   buffer */
634
635		tag_flag = 0;
636		if (flags & 1)
637			tag_flag |= JFS_FLAG_ESCAPE;
638		if (!first_tag)
639			tag_flag |= JFS_FLAG_SAME_UUID;
640
641		tag = (journal_block_tag_t *) tagp;
642		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
643		tag->t_flags = cpu_to_be32(tag_flag);
644		tagp += sizeof(journal_block_tag_t);
645		space_left -= sizeof(journal_block_tag_t);
646
647		if (first_tag) {
648			memcpy (tagp, journal->j_uuid, 16);
649			tagp += 16;
650			space_left -= 16;
651			first_tag = 0;
652		}
653
654		/* If there's no more to do, or if the descriptor is full,
655		   let the IO rip! */
656
657		if (bufs == journal->j_wbufsize ||
658		    commit_transaction->t_buffers == NULL ||
659		    space_left < sizeof(journal_block_tag_t) + 16) {
660
661			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
662
663			/* Write an end-of-descriptor marker before
664                           submitting the IOs.  "tag" still points to
665                           the last tag we set up. */
666
667			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
668
669start_journal_io:
670			for (i = 0; i < bufs; i++) {
671				struct buffer_head *bh = wbuf[i];
672				lock_buffer(bh);
673				clear_buffer_dirty(bh);
674				set_buffer_uptodate(bh);
675				bh->b_end_io = journal_end_buffer_io_sync;
676				submit_bh(write_op, bh);
677			}
678			cond_resched();
679
680			/* Force a new descriptor to be generated next
681                           time round the loop. */
682			descriptor = NULL;
683			bufs = 0;
684		}
685	}
686
687	/* Lo and behold: we have just managed to send a transaction to
688           the log.  Before we can commit it, wait for the IO so far to
689           complete.  Control buffers being written are on the
690           transaction's t_log_list queue, and metadata buffers are on
691           the t_iobuf_list queue.
692
693	   Wait for the buffers in reverse order.  That way we are
694	   less likely to be woken up until all IOs have completed, and
695	   so we incur less scheduling load.
696	*/
697
698	jbd_debug(3, "JBD: commit phase 4\n");
699
700	/*
701	 * akpm: these are BJ_IO, and j_list_lock is not needed.
702	 * See __journal_try_to_free_buffer.
703	 */
704wait_for_iobuf:
705	while (commit_transaction->t_iobuf_list != NULL) {
706		struct buffer_head *bh;
707
708		jh = commit_transaction->t_iobuf_list->b_tprev;
709		bh = jh2bh(jh);
710		if (buffer_locked(bh)) {
711			wait_on_buffer(bh);
712			goto wait_for_iobuf;
713		}
714		if (cond_resched())
715			goto wait_for_iobuf;
716
717		if (unlikely(!buffer_uptodate(bh)))
718			err = -EIO;
719
720		clear_buffer_jwrite(bh);
721
722		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
723		journal_unfile_buffer(journal, jh);
724
725		/*
726		 * ->t_iobuf_list should contain only dummy buffer_heads
727		 * which were created by journal_write_metadata_buffer().
728		 */
729		BUFFER_TRACE(bh, "dumping temporary bh");
730		journal_put_journal_head(jh);
731		__brelse(bh);
732		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
733		free_buffer_head(bh);
734
735		/* We also have to unlock and free the corresponding
736                   shadowed buffer */
737		jh = commit_transaction->t_shadow_list->b_tprev;
738		bh = jh2bh(jh);
739		clear_bit(BH_JWrite, &bh->b_state);
740		J_ASSERT_BH(bh, buffer_jbddirty(bh));
741
742		/* The metadata is now released for reuse, but we need
743                   to remember it against this transaction so that when
744                   we finally commit, we can do any checkpointing
745                   required. */
746		JBUFFER_TRACE(jh, "file as BJ_Forget");
747		journal_file_buffer(jh, commit_transaction, BJ_Forget);
748		/* Wake up any transactions which were waiting for this
749		   IO to complete */
750		wake_up_bit(&bh->b_state, BH_Unshadow);
751		JBUFFER_TRACE(jh, "brelse shadowed buffer");
752		__brelse(bh);
753	}
754
755	J_ASSERT (commit_transaction->t_shadow_list == NULL);
756
757	jbd_debug(3, "JBD: commit phase 5\n");
758
759	/* Here we wait for the revoke record and descriptor record buffers */
760 wait_for_ctlbuf:
761	while (commit_transaction->t_log_list != NULL) {
762		struct buffer_head *bh;
763
764		jh = commit_transaction->t_log_list->b_tprev;
765		bh = jh2bh(jh);
766		if (buffer_locked(bh)) {
767			wait_on_buffer(bh);
768			goto wait_for_ctlbuf;
769		}
770		if (cond_resched())
771			goto wait_for_ctlbuf;
772
773		if (unlikely(!buffer_uptodate(bh)))
774			err = -EIO;
775
776		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
777		clear_buffer_jwrite(bh);
778		journal_unfile_buffer(journal, jh);
779		journal_put_journal_head(jh);
780		__brelse(bh);		/* One for getblk */
781		/* AKPM: bforget here */
782	}
783
784	if (err)
785		journal_abort(journal, err);
786
787	jbd_debug(3, "JBD: commit phase 6\n");
788
789	/* All metadata is written, now write commit record and do cleanup */
790	spin_lock(&journal->j_state_lock);
791	J_ASSERT(commit_transaction->t_state == T_COMMIT);
792	commit_transaction->t_state = T_COMMIT_RECORD;
793	spin_unlock(&journal->j_state_lock);
794
795	if (journal_write_commit_record(journal, commit_transaction))
796		err = -EIO;
797
798	if (err)
799		journal_abort(journal, err);
800
801	/* End of a transaction!  Finally, we can do checkpoint
802           processing: any buffers committed as a result of this
803           transaction can be removed from any checkpoint list it was on
804           before. */
805
806	jbd_debug(3, "JBD: commit phase 7\n");
807
808	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
809	J_ASSERT(commit_transaction->t_buffers == NULL);
810	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
811	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
812	J_ASSERT(commit_transaction->t_shadow_list == NULL);
813	J_ASSERT(commit_transaction->t_log_list == NULL);
814
815restart_loop:
816	/*
817	 * As there are other places (journal_unmap_buffer()) adding buffers
818	 * to this list we have to be careful and hold the j_list_lock.
819	 */
820	spin_lock(&journal->j_list_lock);
821	while (commit_transaction->t_forget) {
822		transaction_t *cp_transaction;
823		struct buffer_head *bh;
824
825		jh = commit_transaction->t_forget;
826		spin_unlock(&journal->j_list_lock);
827		bh = jh2bh(jh);
828		jbd_lock_bh_state(bh);
829		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
830			jh->b_transaction == journal->j_running_transaction);
831
832		/*
833		 * If there is undo-protected committed data against
834		 * this buffer, then we can remove it now.  If it is a
835		 * buffer needing such protection, the old frozen_data
836		 * field now points to a committed version of the
837		 * buffer, so rotate that field to the new committed
838		 * data.
839		 *
840		 * Otherwise, we can just throw away the frozen data now.
841		 */
842		if (jh->b_committed_data) {
843			jbd_free(jh->b_committed_data, bh->b_size);
844			jh->b_committed_data = NULL;
845			if (jh->b_frozen_data) {
846				jh->b_committed_data = jh->b_frozen_data;
847				jh->b_frozen_data = NULL;
848			}
849		} else if (jh->b_frozen_data) {
850			jbd_free(jh->b_frozen_data, bh->b_size);
851			jh->b_frozen_data = NULL;
852		}
853
854		spin_lock(&journal->j_list_lock);
855		cp_transaction = jh->b_cp_transaction;
856		if (cp_transaction) {
857			JBUFFER_TRACE(jh, "remove from old cp transaction");
858			__journal_remove_checkpoint(jh);
859		}
860
861		/* Only re-checkpoint the buffer_head if it is marked
862		 * dirty.  If the buffer was added to the BJ_Forget list
863		 * by journal_forget, it may no longer be dirty and
864		 * there's no point in keeping a checkpoint record for
865		 * it. */
866
867		/* A buffer which has been freed while still being
868		 * journaled by a previous transaction may end up still
869		 * being dirty here, but we want to avoid writing back
870		 * that buffer in the future after the "add to orphan"
871		 * operation been committed,  That's not only a performance
872		 * gain, it also stops aliasing problems if the buffer is
873		 * left behind for writeback and gets reallocated for another
874		 * use in a different page. */
875		if (buffer_freed(bh) && !jh->b_next_transaction) {
876			clear_buffer_freed(bh);
877			clear_buffer_jbddirty(bh);
878		}
879
880		if (buffer_jbddirty(bh)) {
881			JBUFFER_TRACE(jh, "add to new checkpointing trans");
882			__journal_insert_checkpoint(jh, commit_transaction);
883			if (is_journal_aborted(journal))
884				clear_buffer_jbddirty(bh);
885			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
886			__journal_refile_buffer(jh);
887			jbd_unlock_bh_state(bh);
888		} else {
889			J_ASSERT_BH(bh, !buffer_dirty(bh));
890			/* The buffer on BJ_Forget list and not jbddirty means
891			 * it has been freed by this transaction and hence it
892			 * could not have been reallocated until this
893			 * transaction has committed. *BUT* it could be
894			 * reallocated once we have written all the data to
895			 * disk and before we process the buffer on BJ_Forget
896			 * list. */
897			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
898			__journal_refile_buffer(jh);
899			if (!jh->b_transaction) {
900				jbd_unlock_bh_state(bh);
901				 /* needs a brelse */
902				journal_remove_journal_head(bh);
903				release_buffer_page(bh);
904			} else
905				jbd_unlock_bh_state(bh);
906		}
907		cond_resched_lock(&journal->j_list_lock);
908	}
909	spin_unlock(&journal->j_list_lock);
910	/*
911	 * This is a bit sleazy.  We use j_list_lock to protect transition
912	 * of a transaction into T_FINISHED state and calling
913	 * __journal_drop_transaction(). Otherwise we could race with
914	 * other checkpointing code processing the transaction...
915	 */
916	spin_lock(&journal->j_state_lock);
917	spin_lock(&journal->j_list_lock);
918	/*
919	 * Now recheck if some buffers did not get attached to the transaction
920	 * while the lock was dropped...
921	 */
922	if (commit_transaction->t_forget) {
923		spin_unlock(&journal->j_list_lock);
924		spin_unlock(&journal->j_state_lock);
925		goto restart_loop;
926	}
927
928	/* Done with this transaction! */
929
930	jbd_debug(3, "JBD: commit phase 8\n");
931
932	J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
933
934	commit_transaction->t_state = T_FINISHED;
935	J_ASSERT(commit_transaction == journal->j_committing_transaction);
936	journal->j_commit_sequence = commit_transaction->t_tid;
937	journal->j_committing_transaction = NULL;
938	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
939
940	/*
941	 * weight the commit time higher than the average time so we don't
942	 * react too strongly to vast changes in commit time
943	 */
944	if (likely(journal->j_average_commit_time))
945		journal->j_average_commit_time = (commit_time*3 +
946				journal->j_average_commit_time) / 4;
947	else
948		journal->j_average_commit_time = commit_time;
949
950	spin_unlock(&journal->j_state_lock);
951
952	if (commit_transaction->t_checkpoint_list == NULL &&
953	    commit_transaction->t_checkpoint_io_list == NULL) {
954		__journal_drop_transaction(journal, commit_transaction);
955	} else {
956		if (journal->j_checkpoint_transactions == NULL) {
957			journal->j_checkpoint_transactions = commit_transaction;
958			commit_transaction->t_cpnext = commit_transaction;
959			commit_transaction->t_cpprev = commit_transaction;
960		} else {
961			commit_transaction->t_cpnext =
962				journal->j_checkpoint_transactions;
963			commit_transaction->t_cpprev =
964				commit_transaction->t_cpnext->t_cpprev;
965			commit_transaction->t_cpnext->t_cpprev =
966				commit_transaction;
967			commit_transaction->t_cpprev->t_cpnext =
968				commit_transaction;
969		}
970	}
971	spin_unlock(&journal->j_list_lock);
972
973	jbd_debug(1, "JBD: commit %d complete, head %d\n",
974		  journal->j_commit_sequence, journal->j_tail_sequence);
975
976	wake_up(&journal->j_wait_done_commit);
977}