/* Source snapshot: v2.6.14 */
1/* 2 * linux/fs/checkpoint.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 5 * 6 * Copyright 1999 Red Hat Software --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Checkpoint routines for the generic filesystem journaling code. 13 * Part of the ext2fs journaling system. 14 * 15 * Checkpointing is the process of ensuring that a section of the log is 16 * committed fully to disk, so that that portion of the log can be 17 * reused. 18 */ 19 20#include <linux/time.h> 21#include <linux/fs.h> 22#include <linux/jbd.h> 23#include <linux/errno.h> 24#include <linux/slab.h> 25 26/* 27 * Unlink a buffer from a transaction. 28 * 29 * Called with j_list_lock held. 30 */ 31 32static inline void __buffer_unlink(struct journal_head *jh) 33{ 34 transaction_t *transaction; 35 36 transaction = jh->b_cp_transaction; 37 jh->b_cp_transaction = NULL; 38 39 jh->b_cpnext->b_cpprev = jh->b_cpprev; 40 jh->b_cpprev->b_cpnext = jh->b_cpnext; 41 if (transaction->t_checkpoint_list == jh) 42 transaction->t_checkpoint_list = jh->b_cpnext; 43 if (transaction->t_checkpoint_list == jh) 44 transaction->t_checkpoint_list = NULL; 45} 46 47/* 48 * Try to release a checkpointed buffer from its transaction. 49 * Returns 1 if we released it. 
50 * Requires j_list_lock 51 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 52 */ 53static int __try_to_free_cp_buf(struct journal_head *jh) 54{ 55 int ret = 0; 56 struct buffer_head *bh = jh2bh(jh); 57 58 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 59 JBUFFER_TRACE(jh, "remove from checkpoint list"); 60 __journal_remove_checkpoint(jh); 61 jbd_unlock_bh_state(bh); 62 journal_remove_journal_head(bh); 63 BUFFER_TRACE(bh, "release"); 64 __brelse(bh); 65 ret = 1; 66 } else { 67 jbd_unlock_bh_state(bh); 68 } 69 return ret; 70} 71 72/* 73 * __log_wait_for_space: wait until there is space in the journal. 74 * 75 * Called under j-state_lock *only*. It will be unlocked if we have to wait 76 * for a checkpoint to free up some space in the log. 77 */ 78void __log_wait_for_space(journal_t *journal) 79{ 80 int nblocks; 81 assert_spin_locked(&journal->j_state_lock); 82 83 nblocks = jbd_space_needed(journal); 84 while (__log_space_left(journal) < nblocks) { 85 if (journal->j_flags & JFS_ABORT) 86 return; 87 spin_unlock(&journal->j_state_lock); 88 down(&journal->j_checkpoint_sem); 89 90 /* 91 * Test again, another process may have checkpointed while we 92 * were waiting for the checkpoint lock 93 */ 94 spin_lock(&journal->j_state_lock); 95 nblocks = jbd_space_needed(journal); 96 if (__log_space_left(journal) < nblocks) { 97 spin_unlock(&journal->j_state_lock); 98 log_do_checkpoint(journal); 99 spin_lock(&journal->j_state_lock); 100 } 101 up(&journal->j_checkpoint_sem); 102 } 103} 104 105/* 106 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. 107 * The caller must restart a list walk. Wait for someone else to run 108 * jbd_unlock_bh_state(). 109 */ 110static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) 111{ 112 get_bh(bh); 113 spin_unlock(&journal->j_list_lock); 114 jbd_lock_bh_state(bh); 115 jbd_unlock_bh_state(bh); 116 put_bh(bh); 117} 118 119/* 120 * Clean up a transaction's checkpoint list. 
121 * 122 * We wait for any pending IO to complete and make sure any clean 123 * buffers are removed from the transaction. 124 * 125 * Return 1 if we performed any actions which might have destroyed the 126 * checkpoint. (journal_remove_checkpoint() deletes the transaction when 127 * the last checkpoint buffer is cleansed) 128 * 129 * Called with j_list_lock held. 130 */ 131static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) 132{ 133 struct journal_head *jh, *next_jh, *last_jh; 134 struct buffer_head *bh; 135 int ret = 0; 136 137 assert_spin_locked(&journal->j_list_lock); 138 jh = transaction->t_checkpoint_list; 139 if (!jh) 140 return 0; 141 142 last_jh = jh->b_cpprev; 143 next_jh = jh; 144 do { 145 jh = next_jh; 146 bh = jh2bh(jh); 147 if (buffer_locked(bh)) { 148 atomic_inc(&bh->b_count); 149 spin_unlock(&journal->j_list_lock); 150 wait_on_buffer(bh); 151 /* the journal_head may have gone by now */ 152 BUFFER_TRACE(bh, "brelse"); 153 __brelse(bh); 154 goto out_return_1; 155 } 156 157 /* 158 * This is foul 159 */ 160 if (!jbd_trylock_bh_state(bh)) { 161 jbd_sync_bh(journal, bh); 162 goto out_return_1; 163 } 164 165 if (jh->b_transaction != NULL) { 166 transaction_t *t = jh->b_transaction; 167 tid_t tid = t->t_tid; 168 169 spin_unlock(&journal->j_list_lock); 170 jbd_unlock_bh_state(bh); 171 log_start_commit(journal, tid); 172 log_wait_commit(journal, tid); 173 goto out_return_1; 174 } 175 176 /* 177 * AKPM: I think the buffer_jbddirty test is redundant - it 178 * shouldn't have NULL b_transaction? 
179 */ 180 next_jh = jh->b_cpnext; 181 if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { 182 BUFFER_TRACE(bh, "remove from checkpoint"); 183 __journal_remove_checkpoint(jh); 184 jbd_unlock_bh_state(bh); 185 journal_remove_journal_head(bh); 186 __brelse(bh); 187 ret = 1; 188 } else { 189 jbd_unlock_bh_state(bh); 190 } 191 } while (jh != last_jh); 192 193 return ret; 194out_return_1: 195 spin_lock(&journal->j_list_lock); 196 return 1; 197} 198 199#define NR_BATCH 64 200 201static void 202__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 203{ 204 int i; 205 206 spin_unlock(&journal->j_list_lock); 207 ll_rw_block(SWRITE, *batch_count, bhs); 208 spin_lock(&journal->j_list_lock); 209 for (i = 0; i < *batch_count; i++) { 210 struct buffer_head *bh = bhs[i]; 211 clear_buffer_jwrite(bh); 212 BUFFER_TRACE(bh, "brelse"); 213 __brelse(bh); 214 } 215 *batch_count = 0; 216} 217 218/* 219 * Try to flush one buffer from the checkpoint list to disk. 220 * 221 * Return 1 if something happened which requires us to abort the current 222 * scan of the checkpoint list. 223 * 224 * Called with j_list_lock held. 225 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 226 */ 227static int __flush_buffer(journal_t *journal, struct journal_head *jh, 228 struct buffer_head **bhs, int *batch_count, 229 int *drop_count) 230{ 231 struct buffer_head *bh = jh2bh(jh); 232 int ret = 0; 233 234 if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { 235 J_ASSERT_JH(jh, jh->b_transaction == NULL); 236 237 /* 238 * Important: we are about to write the buffer, and 239 * possibly block, while still holding the journal lock. 240 * We cannot afford to let the transaction logic start 241 * messing around with this buffer before we write it to 242 * disk, as that would break recoverability. 
243 */ 244 BUFFER_TRACE(bh, "queue"); 245 get_bh(bh); 246 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 247 set_buffer_jwrite(bh); 248 bhs[*batch_count] = bh; 249 jbd_unlock_bh_state(bh); 250 (*batch_count)++; 251 if (*batch_count == NR_BATCH) { 252 __flush_batch(journal, bhs, batch_count); 253 ret = 1; 254 } 255 } else { 256 int last_buffer = 0; 257 if (jh->b_cpnext == jh) { 258 /* We may be about to drop the transaction. Tell the 259 * caller that the lists have changed. 260 */ 261 last_buffer = 1; 262 } 263 if (__try_to_free_cp_buf(jh)) { 264 (*drop_count)++; 265 ret = last_buffer; 266 } 267 } 268 return ret; 269} 270 271/* 272 * Perform an actual checkpoint. We don't write out only enough to 273 * satisfy the current blocked requests: rather we submit a reasonably 274 * sized chunk of the outstanding data to disk at once for 275 * efficiency. __log_wait_for_space() will retry if we didn't free enough. 276 * 277 * However, we _do_ take into account the amount requested so that once 278 * the IO has been queued, we can return as soon as enough of it has 279 * completed to disk. 280 * 281 * The journal should be locked before calling this function. 282 */ 283int log_do_checkpoint(journal_t *journal) 284{ 285 int result; 286 int batch_count = 0; 287 struct buffer_head *bhs[NR_BATCH]; 288 289 jbd_debug(1, "Start checkpoint\n"); 290 291 /* 292 * First thing: if there are any transactions in the log which 293 * don't need checkpointing, just eliminate them from the 294 * journal straight away. 295 */ 296 result = cleanup_journal_tail(journal); 297 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 298 if (result <= 0) 299 return result; 300 301 /* 302 * OK, we need to start writing disk blocks. Try to free up a 303 * quarter of the log in a single checkpoint if we can. 304 */ 305 /* 306 * AKPM: check this code. I had a feeling a while back that it 307 * degenerates into a busy loop at unmount time. 
308 */ 309 spin_lock(&journal->j_list_lock); 310 while (journal->j_checkpoint_transactions) { 311 transaction_t *transaction; 312 struct journal_head *jh, *last_jh, *next_jh; 313 int drop_count = 0; 314 int cleanup_ret, retry = 0; 315 tid_t this_tid; 316 317 transaction = journal->j_checkpoint_transactions; 318 this_tid = transaction->t_tid; 319 jh = transaction->t_checkpoint_list; 320 last_jh = jh->b_cpprev; 321 next_jh = jh; 322 do { 323 struct buffer_head *bh; 324 325 jh = next_jh; 326 next_jh = jh->b_cpnext; 327 bh = jh2bh(jh); 328 if (!jbd_trylock_bh_state(bh)) { 329 jbd_sync_bh(journal, bh); 330 spin_lock(&journal->j_list_lock); 331 retry = 1; 332 break; 333 } 334 retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); 335 if (cond_resched_lock(&journal->j_list_lock)) { 336 retry = 1; 337 break; 338 } 339 } while (jh != last_jh && !retry); 340 341 if (batch_count) { 342 __flush_batch(journal, bhs, &batch_count); 343 retry = 1; 344 } 345 346 /* 347 * If someone cleaned up this transaction while we slept, we're 348 * done 349 */ 350 if (journal->j_checkpoint_transactions != transaction) 351 break; 352 if (retry) 353 continue; 354 /* 355 * Maybe it's a new transaction, but it fell at the same 356 * address 357 */ 358 if (transaction->t_tid != this_tid) 359 continue; 360 /* 361 * We have walked the whole transaction list without 362 * finding anything to write to disk. We had better be 363 * able to make some progress or we are in trouble. 
364 */ 365 cleanup_ret = __cleanup_transaction(journal, transaction); 366 J_ASSERT(drop_count != 0 || cleanup_ret != 0); 367 if (journal->j_checkpoint_transactions != transaction) 368 break; 369 } 370 spin_unlock(&journal->j_list_lock); 371 result = cleanup_journal_tail(journal); 372 if (result < 0) 373 return result; 374 375 return 0; 376} 377 378/* 379 * Check the list of checkpoint transactions for the journal to see if 380 * we have already got rid of any since the last update of the log tail 381 * in the journal superblock. If so, we can instantly roll the 382 * superblock forward to remove those transactions from the log. 383 * 384 * Return <0 on error, 0 on success, 1 if there was nothing to clean up. 385 * 386 * Called with the journal lock held. 387 * 388 * This is the only part of the journaling code which really needs to be 389 * aware of transaction aborts. Checkpointing involves writing to the 390 * main filesystem area rather than to the journal, so it can proceed 391 * even in abort state, but we must not update the journal superblock if 392 * we have an abort error outstanding. 393 */ 394 395int cleanup_journal_tail(journal_t *journal) 396{ 397 transaction_t * transaction; 398 tid_t first_tid; 399 unsigned long blocknr, freed; 400 401 /* OK, work out the oldest transaction remaining in the log, and 402 * the log block it starts at. 403 * 404 * If the log is now empty, we need to work out which is the 405 * next transaction ID we will write, and where it will 406 * start. 
*/ 407 408 spin_lock(&journal->j_state_lock); 409 spin_lock(&journal->j_list_lock); 410 transaction = journal->j_checkpoint_transactions; 411 if (transaction) { 412 first_tid = transaction->t_tid; 413 blocknr = transaction->t_log_start; 414 } else if ((transaction = journal->j_committing_transaction) != NULL) { 415 first_tid = transaction->t_tid; 416 blocknr = transaction->t_log_start; 417 } else if ((transaction = journal->j_running_transaction) != NULL) { 418 first_tid = transaction->t_tid; 419 blocknr = journal->j_head; 420 } else { 421 first_tid = journal->j_transaction_sequence; 422 blocknr = journal->j_head; 423 } 424 spin_unlock(&journal->j_list_lock); 425 J_ASSERT(blocknr != 0); 426 427 /* If the oldest pinned transaction is at the tail of the log 428 already then there's not much we can do right now. */ 429 if (journal->j_tail_sequence == first_tid) { 430 spin_unlock(&journal->j_state_lock); 431 return 1; 432 } 433 434 /* OK, update the superblock to recover the freed space. 435 * Physical blocks come first: have we wrapped beyond the end of 436 * the log? */ 437 freed = blocknr - journal->j_tail; 438 if (blocknr < journal->j_tail) 439 freed = freed + journal->j_last - journal->j_first; 440 441 jbd_debug(1, 442 "Cleaning journal tail from %d to %d (offset %lu), " 443 "freeing %lu\n", 444 journal->j_tail_sequence, first_tid, blocknr, freed); 445 446 journal->j_free += freed; 447 journal->j_tail_sequence = first_tid; 448 journal->j_tail = blocknr; 449 spin_unlock(&journal->j_state_lock); 450 if (!(journal->j_flags & JFS_ABORT)) 451 journal_update_superblock(journal, 1); 452 return 0; 453} 454 455 456/* Checkpoint list management */ 457 458/* 459 * journal_clean_checkpoint_list 460 * 461 * Find all the written-back checkpoint buffers in the journal and release them. 462 * 463 * Called with the journal locked. 464 * Called with j_list_lock held. 
465 * Returns number of bufers reaped (for debug) 466 */ 467 468int __journal_clean_checkpoint_list(journal_t *journal) 469{ 470 transaction_t *transaction, *last_transaction, *next_transaction; 471 int ret = 0; 472 473 transaction = journal->j_checkpoint_transactions; 474 if (transaction == 0) 475 goto out; 476 477 last_transaction = transaction->t_cpprev; 478 next_transaction = transaction; 479 do { 480 struct journal_head *jh; 481 482 transaction = next_transaction; 483 next_transaction = transaction->t_cpnext; 484 jh = transaction->t_checkpoint_list; 485 if (jh) { 486 struct journal_head *last_jh = jh->b_cpprev; 487 struct journal_head *next_jh = jh; 488 489 do { 490 jh = next_jh; 491 next_jh = jh->b_cpnext; 492 /* Use trylock because of the ranknig */ 493 if (jbd_trylock_bh_state(jh2bh(jh))) 494 ret += __try_to_free_cp_buf(jh); 495 /* 496 * This function only frees up some memory 497 * if possible so we dont have an obligation 498 * to finish processing. Bail out if preemption 499 * requested: 500 */ 501 if (need_resched()) 502 goto out; 503 } while (jh != last_jh); 504 } 505 } while (transaction != last_transaction); 506out: 507 return ret; 508} 509 510/* 511 * journal_remove_checkpoint: called after a buffer has been committed 512 * to disk (either by being write-back flushed to disk, or being 513 * committed to the log). 514 * 515 * We cannot safely clean a transaction out of the log until all of the 516 * buffer updates committed in that transaction have safely been stored 517 * elsewhere on disk. To achieve this, all of the buffers in a 518 * transaction need to be maintained on the transaction's checkpoint 519 * list until they have been rewritten, at which point this function is 520 * called to remove the buffer from the existing transaction's 521 * checkpoint list. 522 * 523 * This function is called with the journal locked. 524 * This function is called with j_list_lock held. 
525 */ 526 527void __journal_remove_checkpoint(struct journal_head *jh) 528{ 529 transaction_t *transaction; 530 journal_t *journal; 531 532 JBUFFER_TRACE(jh, "entry"); 533 534 if ((transaction = jh->b_cp_transaction) == NULL) { 535 JBUFFER_TRACE(jh, "not on transaction"); 536 goto out; 537 } 538 journal = transaction->t_journal; 539 540 __buffer_unlink(jh); 541 542 if (transaction->t_checkpoint_list != NULL) 543 goto out; 544 JBUFFER_TRACE(jh, "transaction has no more buffers"); 545 546 /* 547 * There is one special case to worry about: if we have just pulled the 548 * buffer off a committing transaction's forget list, then even if the 549 * checkpoint list is empty, the transaction obviously cannot be 550 * dropped! 551 * 552 * The locking here around j_committing_transaction is a bit sleazy. 553 * See the comment at the end of journal_commit_transaction(). 554 */ 555 if (transaction == journal->j_committing_transaction) { 556 JBUFFER_TRACE(jh, "belongs to committing transaction"); 557 goto out; 558 } 559 560 /* OK, that was the last buffer for the transaction: we can now 561 safely remove this transaction from the log */ 562 563 __journal_drop_transaction(journal, transaction); 564 565 /* Just in case anybody was waiting for more transactions to be 566 checkpointed... */ 567 wake_up(&journal->j_wait_logspace); 568out: 569 JBUFFER_TRACE(jh, "exit"); 570} 571 572/* 573 * journal_insert_checkpoint: put a committed buffer onto a checkpoint 574 * list so that we know when it is safe to clean the transaction out of 575 * the log. 576 * 577 * Called with the journal locked. 578 * Called with j_list_lock held. 
579 */ 580void __journal_insert_checkpoint(struct journal_head *jh, 581 transaction_t *transaction) 582{ 583 JBUFFER_TRACE(jh, "entry"); 584 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); 585 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 586 587 jh->b_cp_transaction = transaction; 588 589 if (!transaction->t_checkpoint_list) { 590 jh->b_cpnext = jh->b_cpprev = jh; 591 } else { 592 jh->b_cpnext = transaction->t_checkpoint_list; 593 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; 594 jh->b_cpprev->b_cpnext = jh; 595 jh->b_cpnext->b_cpprev = jh; 596 } 597 transaction->t_checkpoint_list = jh; 598} 599 600/* 601 * We've finished with this transaction structure: adios... 602 * 603 * The transaction must have no links except for the checkpoint by this 604 * point. 605 * 606 * Called with the journal locked. 607 * Called with j_list_lock held. 608 */ 609 610void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) 611{ 612 assert_spin_locked(&journal->j_list_lock); 613 if (transaction->t_cpnext) { 614 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 615 transaction->t_cpprev->t_cpnext = transaction->t_cpnext; 616 if (journal->j_checkpoint_transactions == transaction) 617 journal->j_checkpoint_transactions = 618 transaction->t_cpnext; 619 if (journal->j_checkpoint_transactions == transaction) 620 journal->j_checkpoint_transactions = NULL; 621 } 622 623 J_ASSERT(transaction->t_state == T_FINISHED); 624 J_ASSERT(transaction->t_buffers == NULL); 625 J_ASSERT(transaction->t_sync_datalist == NULL); 626 J_ASSERT(transaction->t_forget == NULL); 627 J_ASSERT(transaction->t_iobuf_list == NULL); 628 J_ASSERT(transaction->t_shadow_list == NULL); 629 J_ASSERT(transaction->t_log_list == NULL); 630 J_ASSERT(transaction->t_checkpoint_list == NULL); 631 J_ASSERT(transaction->t_updates == 0); 632 J_ASSERT(journal->j_committing_transaction != transaction); 633 J_ASSERT(journal->j_running_transaction != transaction); 634 635 
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 636 kfree(transaction); 637}