at v2.6.29 1986 lines 54 kB view raw
1/* 2 * linux/fs/jbd/journal.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 5 * 6 * Copyright 1998 Red Hat corp --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Generic filesystem journal-writing code; part of the ext2fs 13 * journaling system. 14 * 15 * This file manages journals: areas of disk reserved for logging 16 * transactional updates. This includes the kernel journaling thread 17 * which is responsible for scheduling updates to the log. 18 * 19 * We do not actually manage the physical storage of the journal in this 20 * file: that is left to a per-journal policy function, which allows us 21 * to store the journal within a filesystem-specified area for ext2 22 * journaling (ext2 can use a reserved inode for storing the log). 23 */ 24 25#include <linux/module.h> 26#include <linux/time.h> 27#include <linux/fs.h> 28#include <linux/jbd.h> 29#include <linux/errno.h> 30#include <linux/slab.h> 31#include <linux/init.h> 32#include <linux/mm.h> 33#include <linux/freezer.h> 34#include <linux/pagemap.h> 35#include <linux/kthread.h> 36#include <linux/poison.h> 37#include <linux/proc_fs.h> 38#include <linux/debugfs.h> 39 40#include <asm/uaccess.h> 41#include <asm/page.h> 42 43EXPORT_SYMBOL(journal_start); 44EXPORT_SYMBOL(journal_restart); 45EXPORT_SYMBOL(journal_extend); 46EXPORT_SYMBOL(journal_stop); 47EXPORT_SYMBOL(journal_lock_updates); 48EXPORT_SYMBOL(journal_unlock_updates); 49EXPORT_SYMBOL(journal_get_write_access); 50EXPORT_SYMBOL(journal_get_create_access); 51EXPORT_SYMBOL(journal_get_undo_access); 52EXPORT_SYMBOL(journal_dirty_data); 53EXPORT_SYMBOL(journal_dirty_metadata); 54EXPORT_SYMBOL(journal_release_buffer); 55EXPORT_SYMBOL(journal_forget); 56#if 0 57EXPORT_SYMBOL(journal_sync_buffer); 58#endif 59EXPORT_SYMBOL(journal_flush); 
60EXPORT_SYMBOL(journal_revoke); 61 62EXPORT_SYMBOL(journal_init_dev); 63EXPORT_SYMBOL(journal_init_inode); 64EXPORT_SYMBOL(journal_update_format); 65EXPORT_SYMBOL(journal_check_used_features); 66EXPORT_SYMBOL(journal_check_available_features); 67EXPORT_SYMBOL(journal_set_features); 68EXPORT_SYMBOL(journal_create); 69EXPORT_SYMBOL(journal_load); 70EXPORT_SYMBOL(journal_destroy); 71EXPORT_SYMBOL(journal_abort); 72EXPORT_SYMBOL(journal_errno); 73EXPORT_SYMBOL(journal_ack_err); 74EXPORT_SYMBOL(journal_clear_err); 75EXPORT_SYMBOL(log_wait_commit); 76EXPORT_SYMBOL(journal_start_commit); 77EXPORT_SYMBOL(journal_force_commit_nested); 78EXPORT_SYMBOL(journal_wipe); 79EXPORT_SYMBOL(journal_blocks_per_page); 80EXPORT_SYMBOL(journal_invalidatepage); 81EXPORT_SYMBOL(journal_try_to_free_buffers); 82EXPORT_SYMBOL(journal_force_commit); 83 84static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 85static void __journal_abort_soft (journal_t *journal, int errno); 86 87/* 88 * Helper function used to manage commit timeouts 89 */ 90 91static void commit_timeout(unsigned long __data) 92{ 93 struct task_struct * p = (struct task_struct *) __data; 94 95 wake_up_process(p); 96} 97 98/* 99 * kjournald: The main thread function used to manage a logging device 100 * journal. 101 * 102 * This kernel thread is responsible for two things: 103 * 104 * 1) COMMIT: Every so often we need to commit the current state of the 105 * filesystem to disk. The journal thread is responsible for writing 106 * all of the metadata buffers to disk. 107 * 108 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all 109 * of the data in that part of the log has been rewritten elsewhere on 110 * the disk. Flushing these old buffers to reclaim space in the log is 111 * known as checkpointing, and this thread is responsible for that job. 
112 */ 113 114static int kjournald(void *arg) 115{ 116 journal_t *journal = arg; 117 transaction_t *transaction; 118 119 /* 120 * Set up an interval timer which can be used to trigger a commit wakeup 121 * after the commit interval expires 122 */ 123 setup_timer(&journal->j_commit_timer, commit_timeout, 124 (unsigned long)current); 125 126 /* Record that the journal thread is running */ 127 journal->j_task = current; 128 wake_up(&journal->j_wait_done_commit); 129 130 printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", 131 journal->j_commit_interval / HZ); 132 133 /* 134 * And now, wait forever for commit wakeup events. 135 */ 136 spin_lock(&journal->j_state_lock); 137 138loop: 139 if (journal->j_flags & JFS_UNMOUNT) 140 goto end_loop; 141 142 jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", 143 journal->j_commit_sequence, journal->j_commit_request); 144 145 if (journal->j_commit_sequence != journal->j_commit_request) { 146 jbd_debug(1, "OK, requests differ\n"); 147 spin_unlock(&journal->j_state_lock); 148 del_timer_sync(&journal->j_commit_timer); 149 journal_commit_transaction(journal); 150 spin_lock(&journal->j_state_lock); 151 goto loop; 152 } 153 154 wake_up(&journal->j_wait_done_commit); 155 if (freezing(current)) { 156 /* 157 * The simpler the better. Flushing journal isn't a 158 * good idea, because that depends on threads that may 159 * be already stopped. 
160 */ 161 jbd_debug(1, "Now suspending kjournald\n"); 162 spin_unlock(&journal->j_state_lock); 163 refrigerator(); 164 spin_lock(&journal->j_state_lock); 165 } else { 166 /* 167 * We assume on resume that commits are already there, 168 * so we don't sleep 169 */ 170 DEFINE_WAIT(wait); 171 int should_sleep = 1; 172 173 prepare_to_wait(&journal->j_wait_commit, &wait, 174 TASK_INTERRUPTIBLE); 175 if (journal->j_commit_sequence != journal->j_commit_request) 176 should_sleep = 0; 177 transaction = journal->j_running_transaction; 178 if (transaction && time_after_eq(jiffies, 179 transaction->t_expires)) 180 should_sleep = 0; 181 if (journal->j_flags & JFS_UNMOUNT) 182 should_sleep = 0; 183 if (should_sleep) { 184 spin_unlock(&journal->j_state_lock); 185 schedule(); 186 spin_lock(&journal->j_state_lock); 187 } 188 finish_wait(&journal->j_wait_commit, &wait); 189 } 190 191 jbd_debug(1, "kjournald wakes\n"); 192 193 /* 194 * Were we woken up by a commit wakeup event? 195 */ 196 transaction = journal->j_running_transaction; 197 if (transaction && time_after_eq(jiffies, transaction->t_expires)) { 198 journal->j_commit_request = transaction->t_tid; 199 jbd_debug(1, "woke because of timeout\n"); 200 } 201 goto loop; 202 203end_loop: 204 spin_unlock(&journal->j_state_lock); 205 del_timer_sync(&journal->j_commit_timer); 206 journal->j_task = NULL; 207 wake_up(&journal->j_wait_done_commit); 208 jbd_debug(1, "Journal thread exiting.\n"); 209 return 0; 210} 211 212static int journal_start_thread(journal_t *journal) 213{ 214 struct task_struct *t; 215 216 t = kthread_run(kjournald, journal, "kjournald"); 217 if (IS_ERR(t)) 218 return PTR_ERR(t); 219 220 wait_event(journal->j_wait_done_commit, journal->j_task != NULL); 221 return 0; 222} 223 224static void journal_kill_thread(journal_t *journal) 225{ 226 spin_lock(&journal->j_state_lock); 227 journal->j_flags |= JFS_UNMOUNT; 228 229 while (journal->j_task) { 230 wake_up(&journal->j_wait_commit); 231 
spin_unlock(&journal->j_state_lock); 232 wait_event(journal->j_wait_done_commit, 233 journal->j_task == NULL); 234 spin_lock(&journal->j_state_lock); 235 } 236 spin_unlock(&journal->j_state_lock); 237} 238 239/* 240 * journal_write_metadata_buffer: write a metadata buffer to the journal. 241 * 242 * Writes a metadata buffer to a given disk block. The actual IO is not 243 * performed but a new buffer_head is constructed which labels the data 244 * to be written with the correct destination disk block. 245 * 246 * Any magic-number escaping which needs to be done will cause a 247 * copy-out here. If the buffer happens to start with the 248 * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the 249 * magic number is only written to the log for descripter blocks. In 250 * this case, we copy the data and replace the first word with 0, and we 251 * return a result code which indicates that this buffer needs to be 252 * marked as an escaped buffer in the corresponding log descriptor 253 * block. The missing word can then be restored when the block is read 254 * during recovery. 255 * 256 * If the source buffer has already been modified by a new transaction 257 * since we took the last commit snapshot, we use the frozen copy of 258 * that data for IO. If we end up using the existing buffer_head's data 259 * for the write, then we *have* to lock the buffer to prevent anyone 260 * else from using and possibly modifying it while the IO is in 261 * progress. 262 * 263 * The function returns a pointer to the buffer_heads to be used for IO. 264 * 265 * We assume that the journal has already been locked in this function. 
*
 * Return value:
 *  <0: Error
 * >=0: Finished OK
 *
 * On success:
 * Bit 0 set == escape performed on the data
 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
 *
 * NOTE(review): the body below has no path that returns a negative
 * value; the only return is the bit mask described above.
 *
 * @transaction: transaction the buffers are filed on
 * @jh_in:       journal head of the buffer whose contents are to be logged
 * @jh_out:      receives the journal head of the newly built IO buffer
 * @blocknr:     block number stored in the new buffer_head (b_blocknr)
 */

int journal_write_metadata_buffer(transaction_t *transaction,
				  struct journal_head  *jh_in,
				  struct journal_head **jh_out,
				  unsigned long blocknr)
{
	int need_copy_out = 0;
	int done_copy_out = 0;
	int do_escape = 0;
	char *mapped_data;
	struct buffer_head *new_bh;
	struct journal_head *new_jh;
	struct page *new_page;
	unsigned int new_offset;
	struct buffer_head *bh_in = jh2bh(jh_in);

	/*
	 * The buffer really shouldn't be locked: only the current committing
	 * transaction is allowed to write it, so nobody else is allowed
	 * to do any IO.
	 *
	 * akpm: except if we're journalling data, and write() output is
	 * also part of a shared mapping, and another thread has
	 * decided to launch a writepage() against this buffer.
	 */
	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

	/* Allocated before taking the bh state lock below. */
	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);

	/*
	 * If a new transaction has already done a buffer copy-out, then
	 * we use that version of the data for the commit.
	 */
	jbd_lock_bh_state(bh_in);
repeat:
	/*
	 * Pick the data source: the frozen copy if one exists, otherwise
	 * the live buffer contents.
	 *
	 * NOTE(review): need_copy_out and do_escape are only ever set, never
	 * cleared, so a decision taken before a "goto repeat" is kept even
	 * though the data source may have changed -- confirm this is intended.
	 */
	if (jh_in->b_frozen_data) {
		done_copy_out = 1;
		new_page = virt_to_page(jh_in->b_frozen_data);
		new_offset = offset_in_page(jh_in->b_frozen_data);
	} else {
		new_page = jh2bh(jh_in)->b_page;
		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
	}

	mapped_data = kmap_atomic(new_page, KM_USER0);
	/*
	 * Check for escaping
	 */
	if (*((__be32 *)(mapped_data + new_offset)) ==
				cpu_to_be32(JFS_MAGIC_NUMBER)) {
		need_copy_out = 1;
		do_escape = 1;
	}
	kunmap_atomic(mapped_data, KM_USER0);

	/*
	 * Do we need to do a data copy?
	 */
	if (need_copy_out && !done_copy_out) {
		char *tmp;

		/*
		 * The bh state lock is dropped around the allocation, so
		 * b_frozen_data has to be re-checked once it is retaken.
		 *
		 * NOTE(review): tmp is used below without a NULL check --
		 * confirm whether jbd_alloc() can fail with GFP_NOFS.
		 */
		jbd_unlock_bh_state(bh_in);
		tmp = jbd_alloc(bh_in->b_size, GFP_NOFS);
		jbd_lock_bh_state(bh_in);
		if (jh_in->b_frozen_data) {
			/* Someone else froze the data meanwhile: start over. */
			jbd_free(tmp, bh_in->b_size);
			goto repeat;
		}

		jh_in->b_frozen_data = tmp;
		mapped_data = kmap_atomic(new_page, KM_USER0);
		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
		kunmap_atomic(mapped_data, KM_USER0);

		/* From here on the IO is done from the private copy. */
		new_page = virt_to_page(tmp);
		new_offset = offset_in_page(tmp);
		done_copy_out = 1;
	}

	/*
	 * Did we need to do an escaping?  Now we've done all the
	 * copying, we can finally do so.
	 */
	if (do_escape) {
		mapped_data = kmap_atomic(new_page, KM_USER0);
		*((unsigned int *)(mapped_data + new_offset)) = 0;
		kunmap_atomic(mapped_data, KM_USER0);
	}

	/* keep subsequent assertions sane */
	new_bh->b_state = 0;
	init_buffer(new_bh, NULL, NULL);
	atomic_set(&new_bh->b_count, 1);
	jbd_unlock_bh_state(bh_in);

	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */

	/* Point the new buffer_head at the chosen data and target block. */
	set_bh_page(new_bh, new_page, new_offset);
	new_jh->b_transaction = NULL;
	new_bh->b_size = jh2bh(jh_in)->b_size;
	new_bh->b_bdev = transaction->t_journal->j_dev;
	new_bh->b_blocknr = blocknr;
	set_buffer_mapped(new_bh);
	set_buffer_dirty(new_bh);

	*jh_out = new_jh;

	/*
	 * The to-be-written buffer needs to get moved to the io queue,
	 * and the original buffer whose contents we are shadowing or
	 * copying is moved to the transaction's shadow queue.
	 */
	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
	journal_file_buffer(jh_in, transaction, BJ_Shadow);
	JBUFFER_TRACE(new_jh, "file as BJ_IO");
	journal_file_buffer(new_jh, transaction, BJ_IO);

	/* Bit 0: escaped, bit 1: copied out. */
	return do_escape | (done_copy_out << 1);
}

/*
 * Allocation code for the journal file.
Manage the space left in the 397 * journal, so that we can begin checkpointing when appropriate. 398 */ 399 400/* 401 * __log_space_left: Return the number of free blocks left in the journal. 402 * 403 * Called with the journal already locked. 404 * 405 * Called under j_state_lock 406 */ 407 408int __log_space_left(journal_t *journal) 409{ 410 int left = journal->j_free; 411 412 assert_spin_locked(&journal->j_state_lock); 413 414 /* 415 * Be pessimistic here about the number of those free blocks which 416 * might be required for log descriptor control blocks. 417 */ 418 419#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ 420 421 left -= MIN_LOG_RESERVED_BLOCKS; 422 423 if (left <= 0) 424 return 0; 425 left -= (left >> 3); 426 return left; 427} 428 429/* 430 * Called under j_state_lock. Returns true if a transaction commit was started. 431 */ 432int __log_start_commit(journal_t *journal, tid_t target) 433{ 434 /* 435 * Are we already doing a recent enough commit? 436 */ 437 if (!tid_geq(journal->j_commit_request, target)) { 438 /* 439 * We want a new commit: OK, mark the request and wakup the 440 * commit thread. We do _not_ do the commit ourselves. 441 */ 442 443 journal->j_commit_request = target; 444 jbd_debug(1, "JBD: requesting commit %d/%d\n", 445 journal->j_commit_request, 446 journal->j_commit_sequence); 447 wake_up(&journal->j_wait_commit); 448 return 1; 449 } 450 return 0; 451} 452 453int log_start_commit(journal_t *journal, tid_t tid) 454{ 455 int ret; 456 457 spin_lock(&journal->j_state_lock); 458 ret = __log_start_commit(journal, tid); 459 spin_unlock(&journal->j_state_lock); 460 return ret; 461} 462 463/* 464 * Force and wait upon a commit if the calling process is not within 465 * transaction. This is used for forcing out undo-protected data which contains 466 * bitmaps, when the fs is running out of space. 467 * 468 * We can only force the running transaction if we don't have an active handle; 469 * otherwise, we will deadlock. 
470 * 471 * Returns true if a transaction was started. 472 */ 473int journal_force_commit_nested(journal_t *journal) 474{ 475 transaction_t *transaction = NULL; 476 tid_t tid; 477 478 spin_lock(&journal->j_state_lock); 479 if (journal->j_running_transaction && !current->journal_info) { 480 transaction = journal->j_running_transaction; 481 __log_start_commit(journal, transaction->t_tid); 482 } else if (journal->j_committing_transaction) 483 transaction = journal->j_committing_transaction; 484 485 if (!transaction) { 486 spin_unlock(&journal->j_state_lock); 487 return 0; /* Nothing to retry */ 488 } 489 490 tid = transaction->t_tid; 491 spin_unlock(&journal->j_state_lock); 492 log_wait_commit(journal, tid); 493 return 1; 494} 495 496/* 497 * Start a commit of the current running transaction (if any). Returns true 498 * if a transaction is going to be committed (or is currently already 499 * committing), and fills its tid in at *ptid 500 */ 501int journal_start_commit(journal_t *journal, tid_t *ptid) 502{ 503 int ret = 0; 504 505 spin_lock(&journal->j_state_lock); 506 if (journal->j_running_transaction) { 507 tid_t tid = journal->j_running_transaction->t_tid; 508 509 __log_start_commit(journal, tid); 510 /* There's a running transaction and we've just made sure 511 * it's commit has been scheduled. */ 512 if (ptid) 513 *ptid = tid; 514 ret = 1; 515 } else if (journal->j_committing_transaction) { 516 /* 517 * If ext3_write_super() recently started a commit, then we 518 * have to wait for completion of that transaction 519 */ 520 if (ptid) 521 *ptid = journal->j_committing_transaction->t_tid; 522 ret = 1; 523 } 524 spin_unlock(&journal->j_state_lock); 525 return ret; 526} 527 528/* 529 * Wait for a specified commit to complete. 530 * The caller may not hold the journal lock. 
531 */ 532int log_wait_commit(journal_t *journal, tid_t tid) 533{ 534 int err = 0; 535 536#ifdef CONFIG_JBD_DEBUG 537 spin_lock(&journal->j_state_lock); 538 if (!tid_geq(journal->j_commit_request, tid)) { 539 printk(KERN_EMERG 540 "%s: error: j_commit_request=%d, tid=%d\n", 541 __func__, journal->j_commit_request, tid); 542 } 543 spin_unlock(&journal->j_state_lock); 544#endif 545 spin_lock(&journal->j_state_lock); 546 while (tid_gt(tid, journal->j_commit_sequence)) { 547 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", 548 tid, journal->j_commit_sequence); 549 wake_up(&journal->j_wait_commit); 550 spin_unlock(&journal->j_state_lock); 551 wait_event(journal->j_wait_done_commit, 552 !tid_gt(tid, journal->j_commit_sequence)); 553 spin_lock(&journal->j_state_lock); 554 } 555 spin_unlock(&journal->j_state_lock); 556 557 if (unlikely(is_journal_aborted(journal))) { 558 printk(KERN_EMERG "journal commit I/O error\n"); 559 err = -EIO; 560 } 561 return err; 562} 563 564/* 565 * Log buffer allocation routines: 566 */ 567 568int journal_next_log_block(journal_t *journal, unsigned long *retp) 569{ 570 unsigned long blocknr; 571 572 spin_lock(&journal->j_state_lock); 573 J_ASSERT(journal->j_free > 1); 574 575 blocknr = journal->j_head; 576 journal->j_head++; 577 journal->j_free--; 578 if (journal->j_head == journal->j_last) 579 journal->j_head = journal->j_first; 580 spin_unlock(&journal->j_state_lock); 581 return journal_bmap(journal, blocknr, retp); 582} 583 584/* 585 * Conversion of logical to physical block numbers for the journal 586 * 587 * On external journals the journal blocks are identity-mapped, so 588 * this is a no-op. If needed, we can use j_blk_offset - everything is 589 * ready. 
590 */ 591int journal_bmap(journal_t *journal, unsigned long blocknr, 592 unsigned long *retp) 593{ 594 int err = 0; 595 unsigned long ret; 596 597 if (journal->j_inode) { 598 ret = bmap(journal->j_inode, blocknr); 599 if (ret) 600 *retp = ret; 601 else { 602 char b[BDEVNAME_SIZE]; 603 604 printk(KERN_ALERT "%s: journal block not found " 605 "at offset %lu on %s\n", 606 __func__, 607 blocknr, 608 bdevname(journal->j_dev, b)); 609 err = -EIO; 610 __journal_abort_soft(journal, err); 611 } 612 } else { 613 *retp = blocknr; /* +journal->j_blk_offset */ 614 } 615 return err; 616} 617 618/* 619 * We play buffer_head aliasing tricks to write data/metadata blocks to 620 * the journal without copying their contents, but for journal 621 * descriptor blocks we do need to generate bona fide buffers. 622 * 623 * After the caller of journal_get_descriptor_buffer() has finished modifying 624 * the buffer's contents they really should run flush_dcache_page(bh->b_page). 625 * But we don't bother doing that, so there will be coherency problems with 626 * mmaps of blockdevs which hold live JBD-controlled filesystems. 627 */ 628struct journal_head *journal_get_descriptor_buffer(journal_t *journal) 629{ 630 struct buffer_head *bh; 631 unsigned long blocknr; 632 int err; 633 634 err = journal_next_log_block(journal, &blocknr); 635 636 if (err) 637 return NULL; 638 639 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 640 lock_buffer(bh); 641 memset(bh->b_data, 0, journal->j_blocksize); 642 set_buffer_uptodate(bh); 643 unlock_buffer(bh); 644 BUFFER_TRACE(bh, "return this buffer"); 645 return journal_add_journal_head(bh); 646} 647 648/* 649 * Management for journal control blocks: functions to create and 650 * destroy journal_t structures, and to initialise and read existing 651 * journal blocks from disk. */ 652 653/* First: create and setup a journal_t object in memory. 
We initialise 654 * very few fields yet: that has to wait until we have created the 655 * journal structures from from scratch, or loaded them from disk. */ 656 657static journal_t * journal_init_common (void) 658{ 659 journal_t *journal; 660 int err; 661 662 journal = kzalloc(sizeof(*journal), GFP_KERNEL); 663 if (!journal) 664 goto fail; 665 666 init_waitqueue_head(&journal->j_wait_transaction_locked); 667 init_waitqueue_head(&journal->j_wait_logspace); 668 init_waitqueue_head(&journal->j_wait_done_commit); 669 init_waitqueue_head(&journal->j_wait_checkpoint); 670 init_waitqueue_head(&journal->j_wait_commit); 671 init_waitqueue_head(&journal->j_wait_updates); 672 mutex_init(&journal->j_barrier); 673 mutex_init(&journal->j_checkpoint_mutex); 674 spin_lock_init(&journal->j_revoke_lock); 675 spin_lock_init(&journal->j_list_lock); 676 spin_lock_init(&journal->j_state_lock); 677 678 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); 679 680 /* The journal is marked for error until we succeed with recovery! */ 681 journal->j_flags = JFS_ABORT; 682 683 /* Set up a default-sized revoke table for the new mount. */ 684 err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); 685 if (err) { 686 kfree(journal); 687 goto fail; 688 } 689 return journal; 690fail: 691 return NULL; 692} 693 694/* journal_init_dev and journal_init_inode: 695 * 696 * Create a journal structure assigned some fixed set of disk blocks to 697 * the journal. We don't actually touch those disk blocks yet, but we 698 * need to set up all of the mapping information to tell the journaling 699 * system where the journal blocks are. 700 * 701 */ 702 703/** 704 * journal_t * journal_init_dev() - creates and initialises a journal structure 705 * @bdev: Block device on which to create the journal 706 * @fs_dev: Device which hold journalled filesystem for this journal. 707 * @start: Block nr Start of journal. 708 * @len: Length of the journal in blocks. 
709 * @blocksize: blocksize of journalling device 710 * 711 * Returns: a newly created journal_t * 712 * 713 * journal_init_dev creates a journal which maps a fixed contiguous 714 * range of blocks on an arbitrary block device. 715 * 716 */ 717journal_t * journal_init_dev(struct block_device *bdev, 718 struct block_device *fs_dev, 719 int start, int len, int blocksize) 720{ 721 journal_t *journal = journal_init_common(); 722 struct buffer_head *bh; 723 int n; 724 725 if (!journal) 726 return NULL; 727 728 /* journal descriptor can store up to n blocks -bzzz */ 729 journal->j_blocksize = blocksize; 730 n = journal->j_blocksize / sizeof(journal_block_tag_t); 731 journal->j_wbufsize = n; 732 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 733 if (!journal->j_wbuf) { 734 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 735 __func__); 736 kfree(journal); 737 journal = NULL; 738 goto out; 739 } 740 journal->j_dev = bdev; 741 journal->j_fs_dev = fs_dev; 742 journal->j_blk_offset = start; 743 journal->j_maxlen = len; 744 745 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 746 J_ASSERT(bh != NULL); 747 journal->j_sb_buffer = bh; 748 journal->j_superblock = (journal_superblock_t *)bh->b_data; 749out: 750 return journal; 751} 752 753/** 754 * journal_t * journal_init_inode () - creates a journal which maps to a inode. 755 * @inode: An inode to create the journal in 756 * 757 * journal_init_inode creates a journal which maps an on-disk inode as 758 * the journal. The inode must exist already, must support bmap() and 759 * must have all data blocks preallocated. 
760 */ 761journal_t * journal_init_inode (struct inode *inode) 762{ 763 struct buffer_head *bh; 764 journal_t *journal = journal_init_common(); 765 int err; 766 int n; 767 unsigned long blocknr; 768 769 if (!journal) 770 return NULL; 771 772 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; 773 journal->j_inode = inode; 774 jbd_debug(1, 775 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 776 journal, inode->i_sb->s_id, inode->i_ino, 777 (long long) inode->i_size, 778 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); 779 780 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; 781 journal->j_blocksize = inode->i_sb->s_blocksize; 782 783 /* journal descriptor can store up to n blocks -bzzz */ 784 n = journal->j_blocksize / sizeof(journal_block_tag_t); 785 journal->j_wbufsize = n; 786 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 787 if (!journal->j_wbuf) { 788 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 789 __func__); 790 kfree(journal); 791 return NULL; 792 } 793 794 err = journal_bmap(journal, 0, &blocknr); 795 /* If that failed, give up */ 796 if (err) { 797 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 798 __func__); 799 kfree(journal); 800 return NULL; 801 } 802 803 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 804 J_ASSERT(bh != NULL); 805 journal->j_sb_buffer = bh; 806 journal->j_superblock = (journal_superblock_t *)bh->b_data; 807 808 return journal; 809} 810 811/* 812 * If the journal init or create aborts, we need to mark the journal 813 * superblock as being NULL to prevent the journal destroy from writing 814 * back a bogus superblock. 815 */ 816static void journal_fail_superblock (journal_t *journal) 817{ 818 struct buffer_head *bh = journal->j_sb_buffer; 819 brelse(bh); 820 journal->j_sb_buffer = NULL; 821} 822 823/* 824 * Given a journal_t structure, initialise the various fields for 825 * startup of a new journaling session. 
We use this both when creating 826 * a journal, and after recovering an old journal to reset it for 827 * subsequent use. 828 */ 829 830static int journal_reset(journal_t *journal) 831{ 832 journal_superblock_t *sb = journal->j_superblock; 833 unsigned long first, last; 834 835 first = be32_to_cpu(sb->s_first); 836 last = be32_to_cpu(sb->s_maxlen); 837 838 journal->j_first = first; 839 journal->j_last = last; 840 841 journal->j_head = first; 842 journal->j_tail = first; 843 journal->j_free = last - first; 844 845 journal->j_tail_sequence = journal->j_transaction_sequence; 846 journal->j_commit_sequence = journal->j_transaction_sequence - 1; 847 journal->j_commit_request = journal->j_commit_sequence; 848 849 journal->j_max_transaction_buffers = journal->j_maxlen / 4; 850 851 /* Add the dynamic fields and write it to disk. */ 852 journal_update_superblock(journal, 1); 853 return journal_start_thread(journal); 854} 855 856/** 857 * int journal_create() - Initialise the new journal file 858 * @journal: Journal to create. This structure must have been initialised 859 * 860 * Given a journal_t structure which tells us which disk blocks we can 861 * use, create a new journal superblock and initialise all of the 862 * journal fields from scratch. 863 **/ 864int journal_create(journal_t *journal) 865{ 866 unsigned long blocknr; 867 struct buffer_head *bh; 868 journal_superblock_t *sb; 869 int i, err; 870 871 if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { 872 printk (KERN_ERR "Journal length (%d blocks) too short.\n", 873 journal->j_maxlen); 874 journal_fail_superblock(journal); 875 return -EINVAL; 876 } 877 878 if (journal->j_inode == NULL) { 879 /* 880 * We don't know what block to start at! 881 */ 882 printk(KERN_EMERG 883 "%s: creation of journal on external device!\n", 884 __func__); 885 BUG(); 886 } 887 888 /* Zero out the entire journal on disk. We cannot afford to 889 have any blocks on disk beginning with JFS_MAGIC_NUMBER. 
*/ 890 jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); 891 for (i = 0; i < journal->j_maxlen; i++) { 892 err = journal_bmap(journal, i, &blocknr); 893 if (err) 894 return err; 895 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 896 lock_buffer(bh); 897 memset (bh->b_data, 0, journal->j_blocksize); 898 BUFFER_TRACE(bh, "marking dirty"); 899 mark_buffer_dirty(bh); 900 BUFFER_TRACE(bh, "marking uptodate"); 901 set_buffer_uptodate(bh); 902 unlock_buffer(bh); 903 __brelse(bh); 904 } 905 906 sync_blockdev(journal->j_dev); 907 jbd_debug(1, "JBD: journal cleared.\n"); 908 909 /* OK, fill in the initial static fields in the new superblock */ 910 sb = journal->j_superblock; 911 912 sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); 913 sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); 914 915 sb->s_blocksize = cpu_to_be32(journal->j_blocksize); 916 sb->s_maxlen = cpu_to_be32(journal->j_maxlen); 917 sb->s_first = cpu_to_be32(1); 918 919 journal->j_transaction_sequence = 1; 920 921 journal->j_flags &= ~JFS_ABORT; 922 journal->j_format_version = 2; 923 924 return journal_reset(journal); 925} 926 927/** 928 * void journal_update_superblock() - Update journal sb on disk. 929 * @journal: The journal to update. 930 * @wait: Set to '0' if you don't want to wait for IO completion. 931 * 932 * Update a journal's dynamic superblock fields and write it to disk, 933 * optionally waiting for the IO to complete. 934 */ 935void journal_update_superblock(journal_t *journal, int wait) 936{ 937 journal_superblock_t *sb = journal->j_superblock; 938 struct buffer_head *bh = journal->j_sb_buffer; 939 940 /* 941 * As a special case, if the on-disk copy is already marked as needing 942 * no recovery (s_start == 0) and there are no outstanding transactions 943 * in the filesystem, then we can safely defer the superblock update 944 * until the next commit by setting JFS_FLUSHED. This avoids 945 * attempting a write to a potential-readonly device. 
946 */ 947 if (sb->s_start == 0 && journal->j_tail_sequence == 948 journal->j_transaction_sequence) { 949 jbd_debug(1,"JBD: Skipping superblock update on recovered sb " 950 "(start %ld, seq %d, errno %d)\n", 951 journal->j_tail, journal->j_tail_sequence, 952 journal->j_errno); 953 goto out; 954 } 955 956 spin_lock(&journal->j_state_lock); 957 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 958 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 959 960 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 961 sb->s_start = cpu_to_be32(journal->j_tail); 962 sb->s_errno = cpu_to_be32(journal->j_errno); 963 spin_unlock(&journal->j_state_lock); 964 965 BUFFER_TRACE(bh, "marking dirty"); 966 mark_buffer_dirty(bh); 967 if (wait) 968 sync_dirty_buffer(bh); 969 else 970 ll_rw_block(SWRITE, 1, &bh); 971 972out: 973 /* If we have just flushed the log (by marking s_start==0), then 974 * any future commit will have to be careful to update the 975 * superblock again to re-record the true start of the log. */ 976 977 spin_lock(&journal->j_state_lock); 978 if (sb->s_start) 979 journal->j_flags &= ~JFS_FLUSHED; 980 else 981 journal->j_flags |= JFS_FLUSHED; 982 spin_unlock(&journal->j_state_lock); 983} 984 985/* 986 * Read the superblock for a given journal, performing initial 987 * validation of the format. 
988 */ 989 990static int journal_get_superblock(journal_t *journal) 991{ 992 struct buffer_head *bh; 993 journal_superblock_t *sb; 994 int err = -EIO; 995 996 bh = journal->j_sb_buffer; 997 998 J_ASSERT(bh != NULL); 999 if (!buffer_uptodate(bh)) { 1000 ll_rw_block(READ, 1, &bh); 1001 wait_on_buffer(bh); 1002 if (!buffer_uptodate(bh)) { 1003 printk (KERN_ERR 1004 "JBD: IO error reading journal superblock\n"); 1005 goto out; 1006 } 1007 } 1008 1009 sb = journal->j_superblock; 1010 1011 err = -EINVAL; 1012 1013 if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || 1014 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { 1015 printk(KERN_WARNING "JBD: no valid journal superblock found\n"); 1016 goto out; 1017 } 1018 1019 switch(be32_to_cpu(sb->s_header.h_blocktype)) { 1020 case JFS_SUPERBLOCK_V1: 1021 journal->j_format_version = 1; 1022 break; 1023 case JFS_SUPERBLOCK_V2: 1024 journal->j_format_version = 2; 1025 break; 1026 default: 1027 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); 1028 goto out; 1029 } 1030 1031 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) 1032 journal->j_maxlen = be32_to_cpu(sb->s_maxlen); 1033 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { 1034 printk (KERN_WARNING "JBD: journal file too short\n"); 1035 goto out; 1036 } 1037 1038 return 0; 1039 1040out: 1041 journal_fail_superblock(journal); 1042 return err; 1043} 1044 1045/* 1046 * Load the on-disk journal superblock and read the key fields into the 1047 * journal_t. 
1048 */ 1049 1050static int load_superblock(journal_t *journal) 1051{ 1052 int err; 1053 journal_superblock_t *sb; 1054 1055 err = journal_get_superblock(journal); 1056 if (err) 1057 return err; 1058 1059 sb = journal->j_superblock; 1060 1061 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); 1062 journal->j_tail = be32_to_cpu(sb->s_start); 1063 journal->j_first = be32_to_cpu(sb->s_first); 1064 journal->j_last = be32_to_cpu(sb->s_maxlen); 1065 journal->j_errno = be32_to_cpu(sb->s_errno); 1066 1067 return 0; 1068} 1069 1070 1071/** 1072 * int journal_load() - Read journal from disk. 1073 * @journal: Journal to act on. 1074 * 1075 * Given a journal_t structure which tells us which disk blocks contain 1076 * a journal, read the journal from disk to initialise the in-memory 1077 * structures. 1078 */ 1079int journal_load(journal_t *journal) 1080{ 1081 int err; 1082 journal_superblock_t *sb; 1083 1084 err = load_superblock(journal); 1085 if (err) 1086 return err; 1087 1088 sb = journal->j_superblock; 1089 /* If this is a V2 superblock, then we have to check the 1090 * features flags on it. */ 1091 1092 if (journal->j_format_version >= 2) { 1093 if ((sb->s_feature_ro_compat & 1094 ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || 1095 (sb->s_feature_incompat & 1096 ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { 1097 printk (KERN_WARNING 1098 "JBD: Unrecognised features on journal\n"); 1099 return -EINVAL; 1100 } 1101 } 1102 1103 /* Let the recovery code check whether it needs to recover any 1104 * data from the journal. */ 1105 if (journal_recover(journal)) 1106 goto recovery_error; 1107 1108 /* OK, we've finished with the dynamic journal bits: 1109 * reinitialise the dynamic contents of the superblock in memory 1110 * and reset them on disk. 
*/ 1111 if (journal_reset(journal)) 1112 goto recovery_error; 1113 1114 journal->j_flags &= ~JFS_ABORT; 1115 journal->j_flags |= JFS_LOADED; 1116 return 0; 1117 1118recovery_error: 1119 printk (KERN_WARNING "JBD: recovery failed\n"); 1120 return -EIO; 1121} 1122 1123/** 1124 * void journal_destroy() - Release a journal_t structure. 1125 * @journal: Journal to act on. 1126 * 1127 * Release a journal_t structure once it is no longer in use by the 1128 * journaled object. 1129 * Return <0 if we couldn't clean up the journal. 1130 */ 1131int journal_destroy(journal_t *journal) 1132{ 1133 int err = 0; 1134 1135 /* Wait for the commit thread to wake up and die. */ 1136 journal_kill_thread(journal); 1137 1138 /* Force a final log commit */ 1139 if (journal->j_running_transaction) 1140 journal_commit_transaction(journal); 1141 1142 /* Force any old transactions to disk */ 1143 1144 /* Totally anal locking here... */ 1145 spin_lock(&journal->j_list_lock); 1146 while (journal->j_checkpoint_transactions != NULL) { 1147 spin_unlock(&journal->j_list_lock); 1148 log_do_checkpoint(journal); 1149 spin_lock(&journal->j_list_lock); 1150 } 1151 1152 J_ASSERT(journal->j_running_transaction == NULL); 1153 J_ASSERT(journal->j_committing_transaction == NULL); 1154 J_ASSERT(journal->j_checkpoint_transactions == NULL); 1155 spin_unlock(&journal->j_list_lock); 1156 1157 if (journal->j_sb_buffer) { 1158 if (!is_journal_aborted(journal)) { 1159 /* We can now mark the journal as empty. 
*/ 1160 journal->j_tail = 0; 1161 journal->j_tail_sequence = 1162 ++journal->j_transaction_sequence; 1163 journal_update_superblock(journal, 1); 1164 } else { 1165 err = -EIO; 1166 } 1167 brelse(journal->j_sb_buffer); 1168 } 1169 1170 if (journal->j_inode) 1171 iput(journal->j_inode); 1172 if (journal->j_revoke) 1173 journal_destroy_revoke(journal); 1174 kfree(journal->j_wbuf); 1175 kfree(journal); 1176 1177 return err; 1178} 1179 1180 1181/** 1182 *int journal_check_used_features () - Check if features specified are used. 1183 * @journal: Journal to check. 1184 * @compat: bitmask of compatible features 1185 * @ro: bitmask of features that force read-only mount 1186 * @incompat: bitmask of incompatible features 1187 * 1188 * Check whether the journal uses all of a given set of 1189 * features. Return true (non-zero) if it does. 1190 **/ 1191 1192int journal_check_used_features (journal_t *journal, unsigned long compat, 1193 unsigned long ro, unsigned long incompat) 1194{ 1195 journal_superblock_t *sb; 1196 1197 if (!compat && !ro && !incompat) 1198 return 1; 1199 if (journal->j_format_version == 1) 1200 return 0; 1201 1202 sb = journal->j_superblock; 1203 1204 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && 1205 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && 1206 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) 1207 return 1; 1208 1209 return 0; 1210} 1211 1212/** 1213 * int journal_check_available_features() - Check feature set in journalling layer 1214 * @journal: Journal to check. 1215 * @compat: bitmask of compatible features 1216 * @ro: bitmask of features that force read-only mount 1217 * @incompat: bitmask of incompatible features 1218 * 1219 * Check whether the journaling code supports the use of 1220 * all of a given set of features on this journal. Return true 1221 * (non-zero) if it can. 
*/ 1222 1223int journal_check_available_features (journal_t *journal, unsigned long compat, 1224 unsigned long ro, unsigned long incompat) 1225{ 1226 journal_superblock_t *sb; 1227 1228 if (!compat && !ro && !incompat) 1229 return 1; 1230 1231 sb = journal->j_superblock; 1232 1233 /* We can support any known requested features iff the 1234 * superblock is in version 2. Otherwise we fail to support any 1235 * extended sb features. */ 1236 1237 if (journal->j_format_version != 2) 1238 return 0; 1239 1240 if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && 1241 (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && 1242 (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) 1243 return 1; 1244 1245 return 0; 1246} 1247 1248/** 1249 * int journal_set_features () - Mark a given journal feature in the superblock 1250 * @journal: Journal to act on. 1251 * @compat: bitmask of compatible features 1252 * @ro: bitmask of features that force read-only mount 1253 * @incompat: bitmask of incompatible features 1254 * 1255 * Mark a given journal feature as present on the 1256 * superblock. Returns true if the requested features could be set. 1257 * 1258 */ 1259 1260int journal_set_features (journal_t *journal, unsigned long compat, 1261 unsigned long ro, unsigned long incompat) 1262{ 1263 journal_superblock_t *sb; 1264 1265 if (journal_check_used_features(journal, compat, ro, incompat)) 1266 return 1; 1267 1268 if (!journal_check_available_features(journal, compat, ro, incompat)) 1269 return 0; 1270 1271 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", 1272 compat, ro, incompat); 1273 1274 sb = journal->j_superblock; 1275 1276 sb->s_feature_compat |= cpu_to_be32(compat); 1277 sb->s_feature_ro_compat |= cpu_to_be32(ro); 1278 sb->s_feature_incompat |= cpu_to_be32(incompat); 1279 1280 return 1; 1281} 1282 1283 1284/** 1285 * int journal_update_format () - Update on-disk journal structure. 1286 * @journal: Journal to act on. 
1287 * 1288 * Given an initialised but unloaded journal struct, poke about in the 1289 * on-disk structure to update it to the most recent supported version. 1290 */ 1291int journal_update_format (journal_t *journal) 1292{ 1293 journal_superblock_t *sb; 1294 int err; 1295 1296 err = journal_get_superblock(journal); 1297 if (err) 1298 return err; 1299 1300 sb = journal->j_superblock; 1301 1302 switch (be32_to_cpu(sb->s_header.h_blocktype)) { 1303 case JFS_SUPERBLOCK_V2: 1304 return 0; 1305 case JFS_SUPERBLOCK_V1: 1306 return journal_convert_superblock_v1(journal, sb); 1307 default: 1308 break; 1309 } 1310 return -EINVAL; 1311} 1312 1313static int journal_convert_superblock_v1(journal_t *journal, 1314 journal_superblock_t *sb) 1315{ 1316 int offset, blocksize; 1317 struct buffer_head *bh; 1318 1319 printk(KERN_WARNING 1320 "JBD: Converting superblock from version 1 to 2.\n"); 1321 1322 /* Pre-initialise new fields to zero */ 1323 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); 1324 blocksize = be32_to_cpu(sb->s_blocksize); 1325 memset(&sb->s_feature_compat, 0, blocksize-offset); 1326 1327 sb->s_nr_users = cpu_to_be32(1); 1328 sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); 1329 journal->j_format_version = 2; 1330 1331 bh = journal->j_sb_buffer; 1332 BUFFER_TRACE(bh, "marking dirty"); 1333 mark_buffer_dirty(bh); 1334 sync_dirty_buffer(bh); 1335 return 0; 1336} 1337 1338 1339/** 1340 * int journal_flush () - Flush journal 1341 * @journal: Journal to act on. 1342 * 1343 * Flush all data for a given journal to disk and empty the journal. 1344 * Filesystems can use this when remounting readonly to ensure that 1345 * recovery does not need to happen on remount. 1346 */ 1347 1348int journal_flush(journal_t *journal) 1349{ 1350 int err = 0; 1351 transaction_t *transaction = NULL; 1352 unsigned long old_tail; 1353 1354 spin_lock(&journal->j_state_lock); 1355 1356 /* Force everything buffered to the log... 
*/ 1357 if (journal->j_running_transaction) { 1358 transaction = journal->j_running_transaction; 1359 __log_start_commit(journal, transaction->t_tid); 1360 } else if (journal->j_committing_transaction) 1361 transaction = journal->j_committing_transaction; 1362 1363 /* Wait for the log commit to complete... */ 1364 if (transaction) { 1365 tid_t tid = transaction->t_tid; 1366 1367 spin_unlock(&journal->j_state_lock); 1368 log_wait_commit(journal, tid); 1369 } else { 1370 spin_unlock(&journal->j_state_lock); 1371 } 1372 1373 /* ...and flush everything in the log out to disk. */ 1374 spin_lock(&journal->j_list_lock); 1375 while (!err && journal->j_checkpoint_transactions != NULL) { 1376 spin_unlock(&journal->j_list_lock); 1377 mutex_lock(&journal->j_checkpoint_mutex); 1378 err = log_do_checkpoint(journal); 1379 mutex_unlock(&journal->j_checkpoint_mutex); 1380 spin_lock(&journal->j_list_lock); 1381 } 1382 spin_unlock(&journal->j_list_lock); 1383 1384 if (is_journal_aborted(journal)) 1385 return -EIO; 1386 1387 cleanup_journal_tail(journal); 1388 1389 /* Finally, mark the journal as really needing no recovery. 1390 * This sets s_start==0 in the underlying superblock, which is 1391 * the magic code for a fully-recovered superblock. Any future 1392 * commits of data to the journal will restore the current 1393 * s_start value. 
*/ 1394 spin_lock(&journal->j_state_lock); 1395 old_tail = journal->j_tail; 1396 journal->j_tail = 0; 1397 spin_unlock(&journal->j_state_lock); 1398 journal_update_superblock(journal, 1); 1399 spin_lock(&journal->j_state_lock); 1400 journal->j_tail = old_tail; 1401 1402 J_ASSERT(!journal->j_running_transaction); 1403 J_ASSERT(!journal->j_committing_transaction); 1404 J_ASSERT(!journal->j_checkpoint_transactions); 1405 J_ASSERT(journal->j_head == journal->j_tail); 1406 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1407 spin_unlock(&journal->j_state_lock); 1408 return 0; 1409} 1410 1411/** 1412 * int journal_wipe() - Wipe journal contents 1413 * @journal: Journal to act on. 1414 * @write: flag (see below) 1415 * 1416 * Wipe out all of the contents of a journal, safely. This will produce 1417 * a warning if the journal contains any valid recovery information. 1418 * Must be called between journal_init_*() and journal_load(). 1419 * 1420 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise 1421 * we merely suppress recovery. 1422 */ 1423 1424int journal_wipe(journal_t *journal, int write) 1425{ 1426 journal_superblock_t *sb; 1427 int err = 0; 1428 1429 J_ASSERT (!(journal->j_flags & JFS_LOADED)); 1430 1431 err = load_superblock(journal); 1432 if (err) 1433 return err; 1434 1435 sb = journal->j_superblock; 1436 1437 if (!journal->j_tail) 1438 goto no_recovery; 1439 1440 printk (KERN_WARNING "JBD: %s recovery information on journal\n", 1441 write ? "Clearing" : "Ignoring"); 1442 1443 err = journal_skip_recovery(journal); 1444 if (write) 1445 journal_update_superblock(journal, 1); 1446 1447 no_recovery: 1448 return err; 1449} 1450 1451/* 1452 * journal_dev_name: format a character string to describe on what 1453 * device this journal is present. 
1454 */ 1455 1456static const char *journal_dev_name(journal_t *journal, char *buffer) 1457{ 1458 struct block_device *bdev; 1459 1460 if (journal->j_inode) 1461 bdev = journal->j_inode->i_sb->s_bdev; 1462 else 1463 bdev = journal->j_dev; 1464 1465 return bdevname(bdev, buffer); 1466} 1467 1468/* 1469 * Journal abort has very specific semantics, which we describe 1470 * for journal abort. 1471 * 1472 * Two internal function, which provide abort to te jbd layer 1473 * itself are here. 1474 */ 1475 1476/* 1477 * Quick version for internal journal use (doesn't lock the journal). 1478 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, 1479 * and don't attempt to make any other journal updates. 1480 */ 1481static void __journal_abort_hard(journal_t *journal) 1482{ 1483 transaction_t *transaction; 1484 char b[BDEVNAME_SIZE]; 1485 1486 if (journal->j_flags & JFS_ABORT) 1487 return; 1488 1489 printk(KERN_ERR "Aborting journal on device %s.\n", 1490 journal_dev_name(journal, b)); 1491 1492 spin_lock(&journal->j_state_lock); 1493 journal->j_flags |= JFS_ABORT; 1494 transaction = journal->j_running_transaction; 1495 if (transaction) 1496 __log_start_commit(journal, transaction->t_tid); 1497 spin_unlock(&journal->j_state_lock); 1498} 1499 1500/* Soft abort: record the abort error status in the journal superblock, 1501 * but don't do any other IO. */ 1502static void __journal_abort_soft (journal_t *journal, int errno) 1503{ 1504 if (journal->j_flags & JFS_ABORT) 1505 return; 1506 1507 if (!journal->j_errno) 1508 journal->j_errno = errno; 1509 1510 __journal_abort_hard(journal); 1511 1512 if (errno) 1513 journal_update_superblock(journal, 1); 1514} 1515 1516/** 1517 * void journal_abort () - Shutdown the journal immediately. 1518 * @journal: the journal to shutdown. 1519 * @errno: an error number to record in the journal indicating 1520 * the reason for the shutdown. 
 *
 * Perform a complete, immediate shutdown of the ENTIRE
 * journal (not of a single transaction).  This operation cannot be
 * undone without closing and reopening the journal.
 *
 * The journal_abort function is intended to support higher level error
 * recovery mechanisms such as the ext2/ext3 remount-readonly error
 * mode.
 *
 * Journal abort has very specific semantics.  Any existing dirty,
 * unjournaled buffers in the main filesystem will still be written to
 * disk by bdflush, but the journaling mechanism will be suspended
 * immediately and no further transaction commits will be honoured.
 *
 * Any dirty, journaled buffers will be written back to disk without
 * hitting the journal.  Atomicity cannot be guaranteed on an aborted
 * filesystem, but we _do_ attempt to leave as much data as possible
 * behind for fsck to use for cleanup.
 *
 * Any attempt to get a new transaction handle on a journal which is in
 * ABORT state will just result in an -EROFS error return.  A
 * journal_stop on an existing handle will return -EIO if we have
 * entered abort state during the update.
 *
 * Recursive transactions are not disturbed by journal abort until the
 * final journal_stop, which will receive the -EIO error.
 *
 * Finally, the journal_abort call allows the caller to supply an errno
 * which will be recorded (if possible) in the journal superblock.  This
 * allows a client to record failure conditions in the middle of a
 * transaction without having to complete the transaction to record the
 * failure to disk.  ext3_error, for example, now uses this
 * functionality.
 *
 * Errors which originate from within the journaling layer will NOT
 * supply an errno; a null errno implies that absolutely no further
 * writes are done to the journal (unless there are any already in
 * progress).
1559 * 1560 */ 1561 1562void journal_abort(journal_t *journal, int errno) 1563{ 1564 __journal_abort_soft(journal, errno); 1565} 1566 1567/** 1568 * int journal_errno () - returns the journal's error state. 1569 * @journal: journal to examine. 1570 * 1571 * This is the errno numbet set with journal_abort(), the last 1572 * time the journal was mounted - if the journal was stopped 1573 * without calling abort this will be 0. 1574 * 1575 * If the journal has been aborted on this mount time -EROFS will 1576 * be returned. 1577 */ 1578int journal_errno(journal_t *journal) 1579{ 1580 int err; 1581 1582 spin_lock(&journal->j_state_lock); 1583 if (journal->j_flags & JFS_ABORT) 1584 err = -EROFS; 1585 else 1586 err = journal->j_errno; 1587 spin_unlock(&journal->j_state_lock); 1588 return err; 1589} 1590 1591/** 1592 * int journal_clear_err () - clears the journal's error state 1593 * @journal: journal to act on. 1594 * 1595 * An error must be cleared or Acked to take a FS out of readonly 1596 * mode. 1597 */ 1598int journal_clear_err(journal_t *journal) 1599{ 1600 int err = 0; 1601 1602 spin_lock(&journal->j_state_lock); 1603 if (journal->j_flags & JFS_ABORT) 1604 err = -EROFS; 1605 else 1606 journal->j_errno = 0; 1607 spin_unlock(&journal->j_state_lock); 1608 return err; 1609} 1610 1611/** 1612 * void journal_ack_err() - Ack journal err. 1613 * @journal: journal to act on. 1614 * 1615 * An error must be cleared or Acked to take a FS out of readonly 1616 * mode. 
1617 */ 1618void journal_ack_err(journal_t *journal) 1619{ 1620 spin_lock(&journal->j_state_lock); 1621 if (journal->j_errno) 1622 journal->j_flags |= JFS_ACK_ERR; 1623 spin_unlock(&journal->j_state_lock); 1624} 1625 1626int journal_blocks_per_page(struct inode *inode) 1627{ 1628 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1629} 1630 1631/* 1632 * Journal_head storage management 1633 */ 1634static struct kmem_cache *journal_head_cache; 1635#ifdef CONFIG_JBD_DEBUG 1636static atomic_t nr_journal_heads = ATOMIC_INIT(0); 1637#endif 1638 1639static int journal_init_journal_head_cache(void) 1640{ 1641 int retval; 1642 1643 J_ASSERT(journal_head_cache == NULL); 1644 journal_head_cache = kmem_cache_create("journal_head", 1645 sizeof(struct journal_head), 1646 0, /* offset */ 1647 SLAB_TEMPORARY, /* flags */ 1648 NULL); /* ctor */ 1649 retval = 0; 1650 if (!journal_head_cache) { 1651 retval = -ENOMEM; 1652 printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); 1653 } 1654 return retval; 1655} 1656 1657static void journal_destroy_journal_head_cache(void) 1658{ 1659 if (journal_head_cache) { 1660 kmem_cache_destroy(journal_head_cache); 1661 journal_head_cache = NULL; 1662 } 1663} 1664 1665/* 1666 * journal_head splicing and dicing 1667 */ 1668static struct journal_head *journal_alloc_journal_head(void) 1669{ 1670 struct journal_head *ret; 1671 static unsigned long last_warning; 1672 1673#ifdef CONFIG_JBD_DEBUG 1674 atomic_inc(&nr_journal_heads); 1675#endif 1676 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1677 if (ret == NULL) { 1678 jbd_debug(1, "out of memory for journal_head\n"); 1679 if (time_after(jiffies, last_warning + 5*HZ)) { 1680 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", 1681 __func__); 1682 last_warning = jiffies; 1683 } 1684 while (ret == NULL) { 1685 yield(); 1686 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1687 } 1688 } 1689 return ret; 1690} 1691 1692static void journal_free_journal_head(struct journal_head 
*jh) 1693{ 1694#ifdef CONFIG_JBD_DEBUG 1695 atomic_dec(&nr_journal_heads); 1696 memset(jh, JBD_POISON_FREE, sizeof(*jh)); 1697#endif 1698 kmem_cache_free(journal_head_cache, jh); 1699} 1700 1701/* 1702 * A journal_head is attached to a buffer_head whenever JBD has an 1703 * interest in the buffer. 1704 * 1705 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit 1706 * is set. This bit is tested in core kernel code where we need to take 1707 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable 1708 * there. 1709 * 1710 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. 1711 * 1712 * When a buffer has its BH_JBD bit set it is immune from being released by 1713 * core kernel code, mainly via ->b_count. 1714 * 1715 * A journal_head may be detached from its buffer_head when the journal_head's 1716 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. 1717 * Various places in JBD call journal_remove_journal_head() to indicate that the 1718 * journal_head can be dropped if needed. 1719 * 1720 * Various places in the kernel want to attach a journal_head to a buffer_head 1721 * _before_ attaching the journal_head to a transaction. To protect the 1722 * journal_head in this situation, journal_add_journal_head elevates the 1723 * journal_head's b_jcount refcount by one. The caller must call 1724 * journal_put_journal_head() to undo this. 1725 * 1726 * So the typical usage would be: 1727 * 1728 * (Attach a journal_head if needed. Increments b_jcount) 1729 * struct journal_head *jh = journal_add_journal_head(bh); 1730 * ... 1731 * jh->b_transaction = xxx; 1732 * journal_put_journal_head(jh); 1733 * 1734 * Now, the journal_head's b_jcount is zero, but it is safe from being released 1735 * because it has a non-zero b_transaction. 1736 */ 1737 1738/* 1739 * Give a buffer_head a journal_head. 1740 * 1741 * Doesn't need the journal lock. 1742 * May sleep. 
1743 */ 1744struct journal_head *journal_add_journal_head(struct buffer_head *bh) 1745{ 1746 struct journal_head *jh; 1747 struct journal_head *new_jh = NULL; 1748 1749repeat: 1750 if (!buffer_jbd(bh)) { 1751 new_jh = journal_alloc_journal_head(); 1752 memset(new_jh, 0, sizeof(*new_jh)); 1753 } 1754 1755 jbd_lock_bh_journal_head(bh); 1756 if (buffer_jbd(bh)) { 1757 jh = bh2jh(bh); 1758 } else { 1759 J_ASSERT_BH(bh, 1760 (atomic_read(&bh->b_count) > 0) || 1761 (bh->b_page && bh->b_page->mapping)); 1762 1763 if (!new_jh) { 1764 jbd_unlock_bh_journal_head(bh); 1765 goto repeat; 1766 } 1767 1768 jh = new_jh; 1769 new_jh = NULL; /* We consumed it */ 1770 set_buffer_jbd(bh); 1771 bh->b_private = jh; 1772 jh->b_bh = bh; 1773 get_bh(bh); 1774 BUFFER_TRACE(bh, "added journal_head"); 1775 } 1776 jh->b_jcount++; 1777 jbd_unlock_bh_journal_head(bh); 1778 if (new_jh) 1779 journal_free_journal_head(new_jh); 1780 return bh->b_private; 1781} 1782 1783/* 1784 * Grab a ref against this buffer_head's journal_head. 
If it ended up not 1785 * having a journal_head, return NULL 1786 */ 1787struct journal_head *journal_grab_journal_head(struct buffer_head *bh) 1788{ 1789 struct journal_head *jh = NULL; 1790 1791 jbd_lock_bh_journal_head(bh); 1792 if (buffer_jbd(bh)) { 1793 jh = bh2jh(bh); 1794 jh->b_jcount++; 1795 } 1796 jbd_unlock_bh_journal_head(bh); 1797 return jh; 1798} 1799 1800static void __journal_remove_journal_head(struct buffer_head *bh) 1801{ 1802 struct journal_head *jh = bh2jh(bh); 1803 1804 J_ASSERT_JH(jh, jh->b_jcount >= 0); 1805 1806 get_bh(bh); 1807 if (jh->b_jcount == 0) { 1808 if (jh->b_transaction == NULL && 1809 jh->b_next_transaction == NULL && 1810 jh->b_cp_transaction == NULL) { 1811 J_ASSERT_JH(jh, jh->b_jlist == BJ_None); 1812 J_ASSERT_BH(bh, buffer_jbd(bh)); 1813 J_ASSERT_BH(bh, jh2bh(jh) == bh); 1814 BUFFER_TRACE(bh, "remove journal_head"); 1815 if (jh->b_frozen_data) { 1816 printk(KERN_WARNING "%s: freeing " 1817 "b_frozen_data\n", 1818 __func__); 1819 jbd_free(jh->b_frozen_data, bh->b_size); 1820 } 1821 if (jh->b_committed_data) { 1822 printk(KERN_WARNING "%s: freeing " 1823 "b_committed_data\n", 1824 __func__); 1825 jbd_free(jh->b_committed_data, bh->b_size); 1826 } 1827 bh->b_private = NULL; 1828 jh->b_bh = NULL; /* debug, really */ 1829 clear_buffer_jbd(bh); 1830 __brelse(bh); 1831 journal_free_journal_head(jh); 1832 } else { 1833 BUFFER_TRACE(bh, "journal_head was locked"); 1834 } 1835 } 1836} 1837 1838/* 1839 * journal_remove_journal_head(): if the buffer isn't attached to a transaction 1840 * and has a zero b_jcount then remove and release its journal_head. If we did 1841 * see that the buffer is not used by any transaction we also "logically" 1842 * decrement ->b_count. 1843 * 1844 * We in fact take an additional increment on ->b_count as a convenience, 1845 * because the caller usually wants to do additional things with the bh 1846 * after calling here. 
1847 * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some 1848 * time. Once the caller has run __brelse(), the buffer is eligible for 1849 * reaping by try_to_free_buffers(). 1850 */ 1851void journal_remove_journal_head(struct buffer_head *bh) 1852{ 1853 jbd_lock_bh_journal_head(bh); 1854 __journal_remove_journal_head(bh); 1855 jbd_unlock_bh_journal_head(bh); 1856} 1857 1858/* 1859 * Drop a reference on the passed journal_head. If it fell to zero then try to 1860 * release the journal_head from the buffer_head. 1861 */ 1862void journal_put_journal_head(struct journal_head *jh) 1863{ 1864 struct buffer_head *bh = jh2bh(jh); 1865 1866 jbd_lock_bh_journal_head(bh); 1867 J_ASSERT_JH(jh, jh->b_jcount > 0); 1868 --jh->b_jcount; 1869 if (!jh->b_jcount && !jh->b_transaction) { 1870 __journal_remove_journal_head(bh); 1871 __brelse(bh); 1872 } 1873 jbd_unlock_bh_journal_head(bh); 1874} 1875 1876/* 1877 * debugfs tunables 1878 */ 1879#ifdef CONFIG_JBD_DEBUG 1880 1881u8 journal_enable_debug __read_mostly; 1882EXPORT_SYMBOL(journal_enable_debug); 1883 1884static struct dentry *jbd_debugfs_dir; 1885static struct dentry *jbd_debug; 1886 1887static void __init jbd_create_debugfs_entry(void) 1888{ 1889 jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); 1890 if (jbd_debugfs_dir) 1891 jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO, 1892 jbd_debugfs_dir, 1893 &journal_enable_debug); 1894} 1895 1896static void __exit jbd_remove_debugfs_entry(void) 1897{ 1898 debugfs_remove(jbd_debug); 1899 debugfs_remove(jbd_debugfs_dir); 1900} 1901 1902#else 1903 1904static inline void jbd_create_debugfs_entry(void) 1905{ 1906} 1907 1908static inline void jbd_remove_debugfs_entry(void) 1909{ 1910} 1911 1912#endif 1913 1914struct kmem_cache *jbd_handle_cache; 1915 1916static int __init journal_init_handle_cache(void) 1917{ 1918 jbd_handle_cache = kmem_cache_create("journal_handle", 1919 sizeof(handle_t), 1920 0, /* offset */ 1921 SLAB_TEMPORARY, /* flags */ 1922 NULL); /* 
ctor */ 1923 if (jbd_handle_cache == NULL) { 1924 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 1925 return -ENOMEM; 1926 } 1927 return 0; 1928} 1929 1930static void journal_destroy_handle_cache(void) 1931{ 1932 if (jbd_handle_cache) 1933 kmem_cache_destroy(jbd_handle_cache); 1934} 1935 1936/* 1937 * Module startup and shutdown 1938 */ 1939 1940static int __init journal_init_caches(void) 1941{ 1942 int ret; 1943 1944 ret = journal_init_revoke_caches(); 1945 if (ret == 0) 1946 ret = journal_init_journal_head_cache(); 1947 if (ret == 0) 1948 ret = journal_init_handle_cache(); 1949 return ret; 1950} 1951 1952static void journal_destroy_caches(void) 1953{ 1954 journal_destroy_revoke_caches(); 1955 journal_destroy_journal_head_cache(); 1956 journal_destroy_handle_cache(); 1957} 1958 1959static int __init journal_init(void) 1960{ 1961 int ret; 1962 1963 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); 1964 1965 ret = journal_init_caches(); 1966 if (ret != 0) 1967 journal_destroy_caches(); 1968 jbd_create_debugfs_entry(); 1969 return ret; 1970} 1971 1972static void __exit journal_exit(void) 1973{ 1974#ifdef CONFIG_JBD_DEBUG 1975 int n = atomic_read(&nr_journal_heads); 1976 if (n) 1977 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 1978#endif 1979 jbd_remove_debugfs_entry(); 1980 journal_destroy_caches(); 1981} 1982 1983MODULE_LICENSE("GPL"); 1984module_init(journal_init); 1985module_exit(journal_exit); 1986