Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

jbd2,ext4: add a shrinker to release checkpointed buffers

The current metadata buffer release logic in bdev_try_to_free_page() has
a lot of use-after-free issues when the filesystem is unmounted
concurrently. It is difficult to fix directly because ext4 is the only
user of the s_op->bdev_try_to_free_page callback, so we would have to add
a special refcount or lock, used only by ext4, to the common VFS layer,
which is unacceptable.

A better solution is to remove the bdev_try_to_free_page callback, but
the real problem is that we cannot easily release the journal_head of a
checkpointed buffer, so try_to_free_buffers() cannot release buffers and
pages under memory pressure, which makes an out-of-memory condition more
likely. So we cannot remove the callback until we find another way to
release the journal_head.

This patch introduces a shrinker to free journal_heads on checkpointed
transactions. Once the journal_head has been freed, try_to_free_buffers()
can free the buffer properly.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210610112440.3438139-6-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Zhang Yi and committed by
Theodore Ts'o
4ba3fcdd 214eb5a4

+369
+8
fs/ext4/super.c
··· 1174 1174 ext4_unregister_sysfs(sb); 1175 1175 1176 1176 if (sbi->s_journal) { 1177 + jbd2_journal_unregister_shrinker(sbi->s_journal); 1177 1178 aborted = is_journal_aborted(sbi->s_journal); 1178 1179 err = jbd2_journal_destroy(sbi->s_journal); 1179 1180 sbi->s_journal = NULL; ··· 5187 5186 sbi->s_ea_block_cache = NULL; 5188 5187 5189 5188 if (sbi->s_journal) { 5189 + jbd2_journal_unregister_shrinker(sbi->s_journal); 5190 5190 jbd2_journal_destroy(sbi->s_journal); 5191 5191 sbi->s_journal = NULL; 5192 5192 } ··· 5511 5509 5512 5510 /* Make sure we flush the recovery flag to disk. */ 5513 5511 ext4_commit_super(sb); 5512 + } 5513 + 5514 + err = jbd2_journal_register_shrinker(journal); 5515 + if (err) { 5516 + EXT4_SB(sb)->s_journal = NULL; 5517 + goto err_out; 5514 5518 } 5515 5519 5516 5520 return 0;
+147
fs/jbd2/checkpoint.c
··· 80 80 } 81 81 82 82 /* 83 + * Check a checkpoint buffer could be release or not. 84 + * 85 + * Requires j_list_lock 86 + */ 87 + static inline bool __cp_buffer_busy(struct journal_head *jh) 88 + { 89 + struct buffer_head *bh = jh2bh(jh); 90 + 91 + return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); 92 + } 93 + 94 + /* 83 95 * Try to release a checkpointed buffer from its transaction. 84 96 * Returns 1 if we released it and 2 if we also released the 85 97 * whole transaction. ··· 471 459 } 472 460 473 461 /* 462 + * journal_shrink_one_cp_list 463 + * 464 + * Find 'nr_to_scan' written-back checkpoint buffers in the given list 465 + * and try to release them. If the whole transaction is released, set 466 + * the 'released' parameter. Return the number of released checkpointed 467 + * buffers. 468 + * 469 + * Called with j_list_lock held. 470 + */ 471 + static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, 472 + unsigned long *nr_to_scan, 473 + bool *released) 474 + { 475 + struct journal_head *last_jh; 476 + struct journal_head *next_jh = jh; 477 + unsigned long nr_freed = 0; 478 + int ret; 479 + 480 + if (!jh || *nr_to_scan == 0) 481 + return 0; 482 + 483 + last_jh = jh->b_cpprev; 484 + do { 485 + jh = next_jh; 486 + next_jh = jh->b_cpnext; 487 + 488 + (*nr_to_scan)--; 489 + if (__cp_buffer_busy(jh)) 490 + continue; 491 + 492 + nr_freed++; 493 + ret = __jbd2_journal_remove_checkpoint(jh); 494 + if (ret) { 495 + *released = true; 496 + break; 497 + } 498 + 499 + if (need_resched()) 500 + break; 501 + } while (jh != last_jh && *nr_to_scan); 502 + 503 + return nr_freed; 504 + } 505 + 506 + /* 507 + * jbd2_journal_shrink_checkpoint_list 508 + * 509 + * Find 'nr_to_scan' written-back checkpoint buffers in the journal 510 + * and try to release them. Return the number of released checkpointed 511 + * buffers. 512 + * 513 + * Called with j_list_lock held. 
514 + */ 515 + unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, 516 + unsigned long *nr_to_scan) 517 + { 518 + transaction_t *transaction, *last_transaction, *next_transaction; 519 + bool released; 520 + tid_t first_tid = 0, last_tid = 0, next_tid = 0; 521 + tid_t tid = 0; 522 + unsigned long nr_freed = 0; 523 + unsigned long nr_scanned = *nr_to_scan; 524 + 525 + again: 526 + spin_lock(&journal->j_list_lock); 527 + if (!journal->j_checkpoint_transactions) { 528 + spin_unlock(&journal->j_list_lock); 529 + goto out; 530 + } 531 + 532 + /* 533 + * Get next shrink transaction, resume previous scan or start 534 + * over again. If some others do checkpoint and drop transaction 535 + * from the checkpoint list, we ignore saved j_shrink_transaction 536 + * and start over unconditionally. 537 + */ 538 + if (journal->j_shrink_transaction) 539 + transaction = journal->j_shrink_transaction; 540 + else 541 + transaction = journal->j_checkpoint_transactions; 542 + 543 + if (!first_tid) 544 + first_tid = transaction->t_tid; 545 + last_transaction = journal->j_checkpoint_transactions->t_cpprev; 546 + next_transaction = transaction; 547 + last_tid = last_transaction->t_tid; 548 + do { 549 + transaction = next_transaction; 550 + next_transaction = transaction->t_cpnext; 551 + tid = transaction->t_tid; 552 + released = false; 553 + 554 + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, 555 + nr_to_scan, &released); 556 + if (*nr_to_scan == 0) 557 + break; 558 + if (need_resched() || spin_needbreak(&journal->j_list_lock)) 559 + break; 560 + if (released) 561 + continue; 562 + 563 + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, 564 + nr_to_scan, &released); 565 + if (*nr_to_scan == 0) 566 + break; 567 + if (need_resched() || spin_needbreak(&journal->j_list_lock)) 568 + break; 569 + } while (transaction != last_transaction); 570 + 571 + if (transaction != last_transaction) { 572 + journal->j_shrink_transaction = 
next_transaction; 573 + next_tid = next_transaction->t_tid; 574 + } else { 575 + journal->j_shrink_transaction = NULL; 576 + next_tid = 0; 577 + } 578 + 579 + spin_unlock(&journal->j_list_lock); 580 + cond_resched(); 581 + 582 + if (*nr_to_scan && next_tid) 583 + goto again; 584 + out: 585 + nr_scanned -= *nr_to_scan; 586 + trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, 587 + nr_freed, nr_scanned, next_tid); 588 + 589 + return nr_freed; 590 + } 591 + 592 + /* 474 593 * journal_clean_checkpoint_list 475 594 * 476 595 * Find all the written-back checkpoint buffers in the journal and release them. ··· 723 580 724 581 __buffer_unlink(jh); 725 582 jh->b_cp_transaction = NULL; 583 + percpu_counter_dec(&journal->j_jh_shrink_count); 726 584 jbd2_journal_put_journal_head(jh); 727 585 728 586 /* Is this transaction empty? */ ··· 786 642 jh->b_cpnext->b_cpprev = jh; 787 643 } 788 644 transaction->t_checkpoint_list = jh; 645 + percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count); 789 646 } 790 647 791 648 /* ··· 802 657 void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) 803 658 { 804 659 assert_spin_locked(&journal->j_list_lock); 660 + 661 + journal->j_shrink_transaction = NULL; 805 662 if (transaction->t_cpnext) { 806 663 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 807 664 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
+87
fs/jbd2/journal.c
··· 2051 2051 } 2052 2052 2053 2053 /** 2054 + * jbd2_journal_shrink_scan() 2055 + * 2056 + * Scan the checkpointed buffer on the checkpoint list and release the 2057 + * journal_head. 2058 + */ 2059 + static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, 2060 + struct shrink_control *sc) 2061 + { 2062 + journal_t *journal = container_of(shrink, journal_t, j_shrinker); 2063 + unsigned long nr_to_scan = sc->nr_to_scan; 2064 + unsigned long nr_shrunk; 2065 + unsigned long count; 2066 + 2067 + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); 2068 + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); 2069 + 2070 + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); 2071 + 2072 + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); 2073 + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); 2074 + 2075 + return nr_shrunk; 2076 + } 2077 + 2078 + /** 2079 + * jbd2_journal_shrink_count() 2080 + * 2081 + * Count the number of checkpoint buffers on the checkpoint list. 2082 + */ 2083 + static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, 2084 + struct shrink_control *sc) 2085 + { 2086 + journal_t *journal = container_of(shrink, journal_t, j_shrinker); 2087 + unsigned long count; 2088 + 2089 + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); 2090 + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); 2091 + 2092 + return count; 2093 + } 2094 + 2095 + /** 2096 + * jbd2_journal_register_shrinker() 2097 + * @journal: Journal to act on. 2098 + * 2099 + * Init a percpu counter to record the checkpointed buffers on the checkpoint 2100 + * list and register a shrinker to release their journal_head. 
2101 + */ 2102 + int jbd2_journal_register_shrinker(journal_t *journal) 2103 + { 2104 + int err; 2105 + 2106 + journal->j_shrink_transaction = NULL; 2107 + 2108 + err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL); 2109 + if (err) 2110 + return err; 2111 + 2112 + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; 2113 + journal->j_shrinker.count_objects = jbd2_journal_shrink_count; 2114 + journal->j_shrinker.seeks = DEFAULT_SEEKS; 2115 + journal->j_shrinker.batch = journal->j_max_transaction_buffers; 2116 + 2117 + err = register_shrinker(&journal->j_shrinker); 2118 + if (err) { 2119 + percpu_counter_destroy(&journal->j_jh_shrink_count); 2120 + return err; 2121 + } 2122 + 2123 + return 0; 2124 + } 2125 + 2126 + /** 2127 + * jbd2_journal_unregister_shrinker() 2128 + * @journal: Journal to act on. 2129 + * 2130 + * Unregister the checkpointed buffer shrinker and destroy the percpu counter. 2131 + */ 2132 + void jbd2_journal_unregister_shrinker(journal_t *journal) 2133 + { 2134 + percpu_counter_destroy(&journal->j_jh_shrink_count); 2135 + unregister_shrinker(&journal->j_shrinker); 2136 + } 2137 + 2138 + /** 2054 2139 * jbd2_journal_destroy() - Release a journal_t structure. 2055 2140 * @journal: Journal to act on. 2056 2141 * ··· 2206 2121 err = -EIO; 2207 2122 brelse(journal->j_sb_buffer); 2208 2123 } 2124 + 2125 + jbd2_journal_unregister_shrinker(journal); 2209 2126 2210 2127 if (journal->j_proc_entry) 2211 2128 jbd2_stats_proc_exit(journal);
+26
include/linux/jbd2.h
··· 910 910 struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; 911 911 912 912 /** 913 + * @j_shrinker: 914 + * 915 + * Journal head shrinker, reclaim buffer's journal head which 916 + * has been written back. 917 + */ 918 + struct shrinker j_shrinker; 919 + 920 + /** 921 + * @j_jh_shrink_count: 922 + * 923 + * Number of journal buffers on the checkpoint list. [j_list_lock] 924 + */ 925 + struct percpu_counter j_jh_shrink_count; 926 + 927 + /** 928 + * @j_shrink_transaction: 929 + * 930 + * Record next transaction will shrink on the checkpoint list. 931 + * [j_list_lock] 932 + */ 933 + transaction_t *j_shrink_transaction; 934 + 935 + /** 913 936 * @j_head: 914 937 * 915 938 * Journal head: identifies the first unused block in the journal. ··· 1445 1422 1446 1423 /* Checkpoint list management */ 1447 1424 void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); 1425 + unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); 1448 1426 int __jbd2_journal_remove_checkpoint(struct journal_head *); 1449 1427 void jbd2_journal_destroy_checkpoint(journal_t *journal); 1450 1428 void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); ··· 1556 1532 (journal_t *, unsigned long, unsigned long, unsigned long); 1557 1533 extern void jbd2_journal_clear_features 1558 1534 (journal_t *, unsigned long, unsigned long, unsigned long); 1535 + extern int jbd2_journal_register_shrinker(journal_t *journal); 1536 + extern void jbd2_journal_unregister_shrinker(journal_t *journal); 1559 1537 extern int jbd2_journal_load (journal_t *journal); 1560 1538 extern int jbd2_journal_destroy (journal_t *); 1561 1539 extern int jbd2_journal_recover (journal_t *journal);
+101
include/trace/events/jbd2.h
··· 394 394 __entry->stall_ms) 395 395 ); 396 396 397 + DECLARE_EVENT_CLASS(jbd2_journal_shrink, 398 + 399 + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, 400 + unsigned long count), 401 + 402 + TP_ARGS(journal, nr_to_scan, count), 403 + 404 + TP_STRUCT__entry( 405 + __field(dev_t, dev) 406 + __field(unsigned long, nr_to_scan) 407 + __field(unsigned long, count) 408 + ), 409 + 410 + TP_fast_assign( 411 + __entry->dev = journal->j_fs_dev->bd_dev; 412 + __entry->nr_to_scan = nr_to_scan; 413 + __entry->count = count; 414 + ), 415 + 416 + TP_printk("dev %d,%d nr_to_scan %lu count %lu", 417 + MAJOR(__entry->dev), MINOR(__entry->dev), 418 + __entry->nr_to_scan, __entry->count) 419 + ); 420 + 421 + DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count, 422 + 423 + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), 424 + 425 + TP_ARGS(journal, nr_to_scan, count) 426 + ); 427 + 428 + DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter, 429 + 430 + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), 431 + 432 + TP_ARGS(journal, nr_to_scan, count) 433 + ); 434 + 435 + TRACE_EVENT(jbd2_shrink_scan_exit, 436 + 437 + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, 438 + unsigned long nr_shrunk, unsigned long count), 439 + 440 + TP_ARGS(journal, nr_to_scan, nr_shrunk, count), 441 + 442 + TP_STRUCT__entry( 443 + __field(dev_t, dev) 444 + __field(unsigned long, nr_to_scan) 445 + __field(unsigned long, nr_shrunk) 446 + __field(unsigned long, count) 447 + ), 448 + 449 + TP_fast_assign( 450 + __entry->dev = journal->j_fs_dev->bd_dev; 451 + __entry->nr_to_scan = nr_to_scan; 452 + __entry->nr_shrunk = nr_shrunk; 453 + __entry->count = count; 454 + ), 455 + 456 + TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu", 457 + MAJOR(__entry->dev), MINOR(__entry->dev), 458 + __entry->nr_to_scan, __entry->nr_shrunk, 459 + __entry->count) 460 + ); 461 + 462 + TRACE_EVENT(jbd2_shrink_checkpoint_list, 463 + 464 + 
TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, 465 + unsigned long nr_freed, unsigned long nr_scanned, 466 + tid_t next_tid), 467 + 468 + TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, 469 + nr_scanned, next_tid), 470 + 471 + TP_STRUCT__entry( 472 + __field(dev_t, dev) 473 + __field(tid_t, first_tid) 474 + __field(tid_t, tid) 475 + __field(tid_t, last_tid) 476 + __field(unsigned long, nr_freed) 477 + __field(unsigned long, nr_scanned) 478 + __field(tid_t, next_tid) 479 + ), 480 + 481 + TP_fast_assign( 482 + __entry->dev = journal->j_fs_dev->bd_dev; 483 + __entry->first_tid = first_tid; 484 + __entry->tid = tid; 485 + __entry->last_tid = last_tid; 486 + __entry->nr_freed = nr_freed; 487 + __entry->nr_scanned = nr_scanned; 488 + __entry->next_tid = next_tid; 489 + ), 490 + 491 + TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " 492 + "scanned %lu next transaction %u", 493 + MAJOR(__entry->dev), MINOR(__entry->dev), 494 + __entry->first_tid, __entry->tid, __entry->last_tid, 495 + __entry->nr_freed, __entry->nr_scanned, __entry->next_tid) 496 + ); 497 + 397 498 #endif /* _TRACE_JBD2_H */ 398 499 399 500 /* This part must be outside protection */