/* Source viewer banner (not part of the file): at v2.6.25 698 lines 20 kB view raw */
1/* 2 * linux/fs/jbd/checkpoint.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 5 * 6 * Copyright 1999 Red Hat Software --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Checkpoint routines for the generic filesystem journaling code. 13 * Part of the ext2fs journaling system. 14 * 15 * Checkpointing is the process of ensuring that a section of the log is 16 * committed fully to disk, so that that portion of the log can be 17 * reused. 18 */ 19 20#include <linux/time.h> 21#include <linux/fs.h> 22#include <linux/jbd.h> 23#include <linux/errno.h> 24#include <linux/slab.h> 25 26/* 27 * Unlink a buffer from a transaction checkpoint list. 28 * 29 * Called with j_list_lock held. 30 */ 31static inline void __buffer_unlink_first(struct journal_head *jh) 32{ 33 transaction_t *transaction = jh->b_cp_transaction; 34 35 jh->b_cpnext->b_cpprev = jh->b_cpprev; 36 jh->b_cpprev->b_cpnext = jh->b_cpnext; 37 if (transaction->t_checkpoint_list == jh) { 38 transaction->t_checkpoint_list = jh->b_cpnext; 39 if (transaction->t_checkpoint_list == jh) 40 transaction->t_checkpoint_list = NULL; 41 } 42} 43 44/* 45 * Unlink a buffer from a transaction checkpoint(io) list. 46 * 47 * Called with j_list_lock held. 
48 */ 49static inline void __buffer_unlink(struct journal_head *jh) 50{ 51 transaction_t *transaction = jh->b_cp_transaction; 52 53 __buffer_unlink_first(jh); 54 if (transaction->t_checkpoint_io_list == jh) { 55 transaction->t_checkpoint_io_list = jh->b_cpnext; 56 if (transaction->t_checkpoint_io_list == jh) 57 transaction->t_checkpoint_io_list = NULL; 58 } 59} 60 61/* 62 * Move a buffer from the checkpoint list to the checkpoint io list 63 * 64 * Called with j_list_lock held 65 */ 66static inline void __buffer_relink_io(struct journal_head *jh) 67{ 68 transaction_t *transaction = jh->b_cp_transaction; 69 70 __buffer_unlink_first(jh); 71 72 if (!transaction->t_checkpoint_io_list) { 73 jh->b_cpnext = jh->b_cpprev = jh; 74 } else { 75 jh->b_cpnext = transaction->t_checkpoint_io_list; 76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; 77 jh->b_cpprev->b_cpnext = jh; 78 jh->b_cpnext->b_cpprev = jh; 79 } 80 transaction->t_checkpoint_io_list = jh; 81} 82 83/* 84 * Try to release a checkpointed buffer from its transaction. 85 * Returns 1 if we released it and 2 if we also released the 86 * whole transaction. 87 * 88 * Requires j_list_lock 89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 90 */ 91static int __try_to_free_cp_buf(struct journal_head *jh) 92{ 93 int ret = 0; 94 struct buffer_head *bh = jh2bh(jh); 95 96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 98 ret = __journal_remove_checkpoint(jh) + 1; 99 jbd_unlock_bh_state(bh); 100 journal_remove_journal_head(bh); 101 BUFFER_TRACE(bh, "release"); 102 __brelse(bh); 103 } else { 104 jbd_unlock_bh_state(bh); 105 } 106 return ret; 107} 108 109/* 110 * __log_wait_for_space: wait until there is space in the journal. 111 * 112 * Called under j-state_lock *only*. It will be unlocked if we have to wait 113 * for a checkpoint to free up some space in the log. 
114 */ 115void __log_wait_for_space(journal_t *journal) 116{ 117 int nblocks; 118 assert_spin_locked(&journal->j_state_lock); 119 120 nblocks = jbd_space_needed(journal); 121 while (__log_space_left(journal) < nblocks) { 122 if (journal->j_flags & JFS_ABORT) 123 return; 124 spin_unlock(&journal->j_state_lock); 125 mutex_lock(&journal->j_checkpoint_mutex); 126 127 /* 128 * Test again, another process may have checkpointed while we 129 * were waiting for the checkpoint lock 130 */ 131 spin_lock(&journal->j_state_lock); 132 nblocks = jbd_space_needed(journal); 133 if (__log_space_left(journal) < nblocks) { 134 spin_unlock(&journal->j_state_lock); 135 log_do_checkpoint(journal); 136 spin_lock(&journal->j_state_lock); 137 } 138 mutex_unlock(&journal->j_checkpoint_mutex); 139 } 140} 141 142/* 143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. 144 * The caller must restart a list walk. Wait for someone else to run 145 * jbd_unlock_bh_state(). 146 */ 147static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) 148 __releases(journal->j_list_lock) 149{ 150 get_bh(bh); 151 spin_unlock(&journal->j_list_lock); 152 jbd_lock_bh_state(bh); 153 jbd_unlock_bh_state(bh); 154 put_bh(bh); 155} 156 157/* 158 * Clean up transaction's list of buffers submitted for io. 159 * We wait for any pending IO to complete and remove any clean 160 * buffers. Note that we take the buffers in the opposite ordering 161 * from the one in which they were submitted for IO. 162 * 163 * Called with j_list_lock held. 164 */ 165static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 166{ 167 struct journal_head *jh; 168 struct buffer_head *bh; 169 tid_t this_tid; 170 int released = 0; 171 172 this_tid = transaction->t_tid; 173restart: 174 /* Did somebody clean up the transaction in the meanwhile? 
*/ 175 if (journal->j_checkpoint_transactions != transaction || 176 transaction->t_tid != this_tid) 177 return; 178 while (!released && transaction->t_checkpoint_io_list) { 179 jh = transaction->t_checkpoint_io_list; 180 bh = jh2bh(jh); 181 if (!jbd_trylock_bh_state(bh)) { 182 jbd_sync_bh(journal, bh); 183 spin_lock(&journal->j_list_lock); 184 goto restart; 185 } 186 if (buffer_locked(bh)) { 187 atomic_inc(&bh->b_count); 188 spin_unlock(&journal->j_list_lock); 189 jbd_unlock_bh_state(bh); 190 wait_on_buffer(bh); 191 /* the journal_head may have gone by now */ 192 BUFFER_TRACE(bh, "brelse"); 193 __brelse(bh); 194 spin_lock(&journal->j_list_lock); 195 goto restart; 196 } 197 /* 198 * Now in whatever state the buffer currently is, we know that 199 * it has been written out and so we can drop it from the list 200 */ 201 released = __journal_remove_checkpoint(jh); 202 jbd_unlock_bh_state(bh); 203 journal_remove_journal_head(bh); 204 __brelse(bh); 205 } 206} 207 208#define NR_BATCH 64 209 210static void 211__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 212{ 213 int i; 214 215 ll_rw_block(SWRITE, *batch_count, bhs); 216 for (i = 0; i < *batch_count; i++) { 217 struct buffer_head *bh = bhs[i]; 218 clear_buffer_jwrite(bh); 219 BUFFER_TRACE(bh, "brelse"); 220 __brelse(bh); 221 } 222 *batch_count = 0; 223} 224 225/* 226 * Try to flush one buffer from the checkpoint list to disk. 227 * 228 * Return 1 if something happened which requires us to abort the current 229 * scan of the checkpoint list. 
230 * 231 * Called with j_list_lock held and drops it if 1 is returned 232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 233 */ 234static int __process_buffer(journal_t *journal, struct journal_head *jh, 235 struct buffer_head **bhs, int *batch_count) 236{ 237 struct buffer_head *bh = jh2bh(jh); 238 int ret = 0; 239 240 if (buffer_locked(bh)) { 241 atomic_inc(&bh->b_count); 242 spin_unlock(&journal->j_list_lock); 243 jbd_unlock_bh_state(bh); 244 wait_on_buffer(bh); 245 /* the journal_head may have gone by now */ 246 BUFFER_TRACE(bh, "brelse"); 247 __brelse(bh); 248 ret = 1; 249 } else if (jh->b_transaction != NULL) { 250 transaction_t *t = jh->b_transaction; 251 tid_t tid = t->t_tid; 252 253 spin_unlock(&journal->j_list_lock); 254 jbd_unlock_bh_state(bh); 255 log_start_commit(journal, tid); 256 log_wait_commit(journal, tid); 257 ret = 1; 258 } else if (!buffer_dirty(bh)) { 259 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 260 BUFFER_TRACE(bh, "remove from checkpoint"); 261 __journal_remove_checkpoint(jh); 262 spin_unlock(&journal->j_list_lock); 263 jbd_unlock_bh_state(bh); 264 journal_remove_journal_head(bh); 265 __brelse(bh); 266 ret = 1; 267 } else { 268 /* 269 * Important: we are about to write the buffer, and 270 * possibly block, while still holding the journal lock. 271 * We cannot afford to let the transaction logic start 272 * messing around with this buffer before we write it to 273 * disk, as that would break recoverability. 274 */ 275 BUFFER_TRACE(bh, "queue"); 276 get_bh(bh); 277 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 278 set_buffer_jwrite(bh); 279 bhs[*batch_count] = bh; 280 __buffer_relink_io(jh); 281 jbd_unlock_bh_state(bh); 282 (*batch_count)++; 283 if (*batch_count == NR_BATCH) { 284 spin_unlock(&journal->j_list_lock); 285 __flush_batch(journal, bhs, batch_count); 286 ret = 1; 287 } 288 } 289 return ret; 290} 291 292/* 293 * Perform an actual checkpoint. 
We take the first transaction on the 294 * list of transactions to be checkpointed and send all its buffers 295 * to disk. We submit larger chunks of data at once. 296 * 297 * The journal should be locked before calling this function. 298 */ 299int log_do_checkpoint(journal_t *journal) 300{ 301 transaction_t *transaction; 302 tid_t this_tid; 303 int result; 304 305 jbd_debug(1, "Start checkpoint\n"); 306 307 /* 308 * First thing: if there are any transactions in the log which 309 * don't need checkpointing, just eliminate them from the 310 * journal straight away. 311 */ 312 result = cleanup_journal_tail(journal); 313 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 314 if (result <= 0) 315 return result; 316 317 /* 318 * OK, we need to start writing disk blocks. Take one transaction 319 * and write it. 320 */ 321 spin_lock(&journal->j_list_lock); 322 if (!journal->j_checkpoint_transactions) 323 goto out; 324 transaction = journal->j_checkpoint_transactions; 325 this_tid = transaction->t_tid; 326restart: 327 /* 328 * If someone cleaned up this transaction while we slept, we're 329 * done (maybe it's a new transaction, but it fell at the same 330 * address). 
331 */ 332 if (journal->j_checkpoint_transactions == transaction && 333 transaction->t_tid == this_tid) { 334 int batch_count = 0; 335 struct buffer_head *bhs[NR_BATCH]; 336 struct journal_head *jh; 337 int retry = 0; 338 339 while (!retry && transaction->t_checkpoint_list) { 340 struct buffer_head *bh; 341 342 jh = transaction->t_checkpoint_list; 343 bh = jh2bh(jh); 344 if (!jbd_trylock_bh_state(bh)) { 345 jbd_sync_bh(journal, bh); 346 retry = 1; 347 break; 348 } 349 retry = __process_buffer(journal, jh, bhs,&batch_count); 350 if (!retry && (need_resched() || 351 spin_needbreak(&journal->j_list_lock))) { 352 spin_unlock(&journal->j_list_lock); 353 retry = 1; 354 break; 355 } 356 } 357 358 if (batch_count) { 359 if (!retry) { 360 spin_unlock(&journal->j_list_lock); 361 retry = 1; 362 } 363 __flush_batch(journal, bhs, &batch_count); 364 } 365 366 if (retry) { 367 spin_lock(&journal->j_list_lock); 368 goto restart; 369 } 370 /* 371 * Now we have cleaned up the first transaction's checkpoint 372 * list. Let's clean up the second one 373 */ 374 __wait_cp_io(journal, transaction); 375 } 376out: 377 spin_unlock(&journal->j_list_lock); 378 result = cleanup_journal_tail(journal); 379 if (result < 0) 380 return result; 381 return 0; 382} 383 384/* 385 * Check the list of checkpoint transactions for the journal to see if 386 * we have already got rid of any since the last update of the log tail 387 * in the journal superblock. If so, we can instantly roll the 388 * superblock forward to remove those transactions from the log. 389 * 390 * Return <0 on error, 0 on success, 1 if there was nothing to clean up. 391 * 392 * Called with the journal lock held. 393 * 394 * This is the only part of the journaling code which really needs to be 395 * aware of transaction aborts. 
Checkpointing involves writing to the 396 * main filesystem area rather than to the journal, so it can proceed 397 * even in abort state, but we must not update the journal superblock if 398 * we have an abort error outstanding. 399 */ 400 401int cleanup_journal_tail(journal_t *journal) 402{ 403 transaction_t * transaction; 404 tid_t first_tid; 405 unsigned long blocknr, freed; 406 407 /* OK, work out the oldest transaction remaining in the log, and 408 * the log block it starts at. 409 * 410 * If the log is now empty, we need to work out which is the 411 * next transaction ID we will write, and where it will 412 * start. */ 413 414 spin_lock(&journal->j_state_lock); 415 spin_lock(&journal->j_list_lock); 416 transaction = journal->j_checkpoint_transactions; 417 if (transaction) { 418 first_tid = transaction->t_tid; 419 blocknr = transaction->t_log_start; 420 } else if ((transaction = journal->j_committing_transaction) != NULL) { 421 first_tid = transaction->t_tid; 422 blocknr = transaction->t_log_start; 423 } else if ((transaction = journal->j_running_transaction) != NULL) { 424 first_tid = transaction->t_tid; 425 blocknr = journal->j_head; 426 } else { 427 first_tid = journal->j_transaction_sequence; 428 blocknr = journal->j_head; 429 } 430 spin_unlock(&journal->j_list_lock); 431 J_ASSERT(blocknr != 0); 432 433 /* If the oldest pinned transaction is at the tail of the log 434 already then there's not much we can do right now. */ 435 if (journal->j_tail_sequence == first_tid) { 436 spin_unlock(&journal->j_state_lock); 437 return 1; 438 } 439 440 /* OK, update the superblock to recover the freed space. 441 * Physical blocks come first: have we wrapped beyond the end of 442 * the log? 
*/ 443 freed = blocknr - journal->j_tail; 444 if (blocknr < journal->j_tail) 445 freed = freed + journal->j_last - journal->j_first; 446 447 jbd_debug(1, 448 "Cleaning journal tail from %d to %d (offset %lu), " 449 "freeing %lu\n", 450 journal->j_tail_sequence, first_tid, blocknr, freed); 451 452 journal->j_free += freed; 453 journal->j_tail_sequence = first_tid; 454 journal->j_tail = blocknr; 455 spin_unlock(&journal->j_state_lock); 456 if (!(journal->j_flags & JFS_ABORT)) 457 journal_update_superblock(journal, 1); 458 return 0; 459} 460 461 462/* Checkpoint list management */ 463 464/* 465 * journal_clean_one_cp_list 466 * 467 * Find all the written-back checkpoint buffers in the given list and release them. 468 * 469 * Called with the journal locked. 470 * Called with j_list_lock held. 471 * Returns number of bufers reaped (for debug) 472 */ 473 474static int journal_clean_one_cp_list(struct journal_head *jh, int *released) 475{ 476 struct journal_head *last_jh; 477 struct journal_head *next_jh = jh; 478 int ret, freed = 0; 479 480 *released = 0; 481 if (!jh) 482 return 0; 483 484 last_jh = jh->b_cpprev; 485 do { 486 jh = next_jh; 487 next_jh = jh->b_cpnext; 488 /* Use trylock because of the ranking */ 489 if (jbd_trylock_bh_state(jh2bh(jh))) { 490 ret = __try_to_free_cp_buf(jh); 491 if (ret) { 492 freed++; 493 if (ret == 2) { 494 *released = 1; 495 return freed; 496 } 497 } 498 } 499 /* 500 * This function only frees up some memory 501 * if possible so we dont have an obligation 502 * to finish processing. Bail out if preemption 503 * requested: 504 */ 505 if (need_resched()) 506 return freed; 507 } while (jh != last_jh); 508 509 return freed; 510} 511 512/* 513 * journal_clean_checkpoint_list 514 * 515 * Find all the written-back checkpoint buffers in the journal and release them. 516 * 517 * Called with the journal locked. 518 * Called with j_list_lock held. 
519 * Returns number of buffers reaped (for debug) 520 */ 521 522int __journal_clean_checkpoint_list(journal_t *journal) 523{ 524 transaction_t *transaction, *last_transaction, *next_transaction; 525 int ret = 0; 526 int released; 527 528 transaction = journal->j_checkpoint_transactions; 529 if (!transaction) 530 goto out; 531 532 last_transaction = transaction->t_cpprev; 533 next_transaction = transaction; 534 do { 535 transaction = next_transaction; 536 next_transaction = transaction->t_cpnext; 537 ret += journal_clean_one_cp_list(transaction-> 538 t_checkpoint_list, &released); 539 /* 540 * This function only frees up some memory if possible so we 541 * dont have an obligation to finish processing. Bail out if 542 * preemption requested: 543 */ 544 if (need_resched()) 545 goto out; 546 if (released) 547 continue; 548 /* 549 * It is essential that we are as careful as in the case of 550 * t_checkpoint_list with removing the buffer from the list as 551 * we can possibly see not yet submitted buffers on io_list 552 */ 553 ret += journal_clean_one_cp_list(transaction-> 554 t_checkpoint_io_list, &released); 555 if (need_resched()) 556 goto out; 557 } while (transaction != last_transaction); 558out: 559 return ret; 560} 561 562/* 563 * journal_remove_checkpoint: called after a buffer has been committed 564 * to disk (either by being write-back flushed to disk, or being 565 * committed to the log). 566 * 567 * We cannot safely clean a transaction out of the log until all of the 568 * buffer updates committed in that transaction have safely been stored 569 * elsewhere on disk. To achieve this, all of the buffers in a 570 * transaction need to be maintained on the transaction's checkpoint 571 * lists until they have been rewritten, at which point this function is 572 * called to remove the buffer from the existing transaction's 573 * checkpoint lists. 574 * 575 * The function returns 1 if it frees the transaction, 0 otherwise. 
576 * 577 * This function is called with the journal locked. 578 * This function is called with j_list_lock held. 579 * This function is called with jbd_lock_bh_state(jh2bh(jh)) 580 */ 581 582int __journal_remove_checkpoint(struct journal_head *jh) 583{ 584 transaction_t *transaction; 585 journal_t *journal; 586 int ret = 0; 587 588 JBUFFER_TRACE(jh, "entry"); 589 590 if ((transaction = jh->b_cp_transaction) == NULL) { 591 JBUFFER_TRACE(jh, "not on transaction"); 592 goto out; 593 } 594 journal = transaction->t_journal; 595 596 __buffer_unlink(jh); 597 jh->b_cp_transaction = NULL; 598 599 if (transaction->t_checkpoint_list != NULL || 600 transaction->t_checkpoint_io_list != NULL) 601 goto out; 602 JBUFFER_TRACE(jh, "transaction has no more buffers"); 603 604 /* 605 * There is one special case to worry about: if we have just pulled the 606 * buffer off a running or committing transaction's checkpoing list, 607 * then even if the checkpoint list is empty, the transaction obviously 608 * cannot be dropped! 609 * 610 * The locking here around t_state is a bit sleazy. 611 * See the comment at the end of journal_commit_transaction(). 612 */ 613 if (transaction->t_state != T_FINISHED) { 614 JBUFFER_TRACE(jh, "belongs to running/committing transaction"); 615 goto out; 616 } 617 618 /* OK, that was the last buffer for the transaction: we can now 619 safely remove this transaction from the log */ 620 621 __journal_drop_transaction(journal, transaction); 622 623 /* Just in case anybody was waiting for more transactions to be 624 checkpointed... */ 625 wake_up(&journal->j_wait_logspace); 626 ret = 1; 627out: 628 JBUFFER_TRACE(jh, "exit"); 629 return ret; 630} 631 632/* 633 * journal_insert_checkpoint: put a committed buffer onto a checkpoint 634 * list so that we know when it is safe to clean the transaction out of 635 * the log. 636 * 637 * Called with the journal locked. 638 * Called with j_list_lock held. 
639 */ 640void __journal_insert_checkpoint(struct journal_head *jh, 641 transaction_t *transaction) 642{ 643 JBUFFER_TRACE(jh, "entry"); 644 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); 645 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 646 647 jh->b_cp_transaction = transaction; 648 649 if (!transaction->t_checkpoint_list) { 650 jh->b_cpnext = jh->b_cpprev = jh; 651 } else { 652 jh->b_cpnext = transaction->t_checkpoint_list; 653 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; 654 jh->b_cpprev->b_cpnext = jh; 655 jh->b_cpnext->b_cpprev = jh; 656 } 657 transaction->t_checkpoint_list = jh; 658} 659 660/* 661 * We've finished with this transaction structure: adios... 662 * 663 * The transaction must have no links except for the checkpoint by this 664 * point. 665 * 666 * Called with the journal locked. 667 * Called with j_list_lock held. 668 */ 669 670void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) 671{ 672 assert_spin_locked(&journal->j_list_lock); 673 if (transaction->t_cpnext) { 674 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 675 transaction->t_cpprev->t_cpnext = transaction->t_cpnext; 676 if (journal->j_checkpoint_transactions == transaction) 677 journal->j_checkpoint_transactions = 678 transaction->t_cpnext; 679 if (journal->j_checkpoint_transactions == transaction) 680 journal->j_checkpoint_transactions = NULL; 681 } 682 683 J_ASSERT(transaction->t_state == T_FINISHED); 684 J_ASSERT(transaction->t_buffers == NULL); 685 J_ASSERT(transaction->t_sync_datalist == NULL); 686 J_ASSERT(transaction->t_forget == NULL); 687 J_ASSERT(transaction->t_iobuf_list == NULL); 688 J_ASSERT(transaction->t_shadow_list == NULL); 689 J_ASSERT(transaction->t_log_list == NULL); 690 J_ASSERT(transaction->t_checkpoint_list == NULL); 691 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 692 J_ASSERT(transaction->t_updates == 0); 693 J_ASSERT(journal->j_committing_transaction != transaction); 694 
J_ASSERT(journal->j_running_transaction != transaction); 695 696 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 697 kfree(transaction); 698}