Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfs-6.18-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs writeback updates from Christian Brauner:
"This contains work addressing lockups reported by users when a systemd
unit reading lots of files from a filesystem mounted with the lazytime
mount option exits.

With the lazytime mount option enabled we can be switching many dirty
inodes on cgroup exit to the parent cgroup. The numbers observed in
practice when the systemd slice of a large cron job exits can easily reach
hundreds of thousands or millions.

The logic in inode_do_switch_wbs(), which sorts the inode into the
appropriate place in the b_dirty list of the target wb, has linear
complexity in the number of dirty inodes. The overall time complexity
of switching all the inodes is therefore quadratic, leading to workers
being pegged for hours consuming 100% of the CPU while switching inodes
to the parent wb.

Simple reproducer of the issue:

FILES=10000
# Filesystem mounted with lazytime mount option
MNT=/mnt/
echo "Creating files and switching timestamps"
for (( j = 0; j < 50; j ++ )); do
mkdir $MNT/dir$j
for (( i = 0; i < $FILES; i++ )); do
echo "foo" >$MNT/dir$j/file$i
done
touch -a -t 202501010000 $MNT/dir$j/file*
done
wait
echo "Syncing and flushing"
sync
echo 3 >/proc/sys/vm/drop_caches

echo "Reading all files from a cgroup"
mkdir /sys/fs/cgroup/unified/mycg1 || exit
echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit
for (( j = 0; j < 50; j ++ )); do
cat /mnt/dir$j/file* >/dev/null &
done
wait
echo "Switching wbs"
# Now rmdir the cgroup after the script exits

This can be solved by:

- Avoiding contention on the wb->list_lock when switching inodes by
running a single work item per wb and managing a queue of items
switching to the wb

- Allowing rescheduling when switching inodes over to a different
cgroup to avoid softlockups

- Maintaining the b_dirty list ordering instead of sorting it"

* tag 'vfs-6.18-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
writeback: Add tracepoint to track pending inode switches
writeback: Avoid excessively long inode switching times
writeback: Avoid softlockup when switching many inodes
writeback: Avoid contention on wb->list_lock when switching inodes

+126 -47
+86 -47
fs/fs-writeback.c
··· 368 368 } 369 369 370 370 struct inode_switch_wbs_context { 371 - struct rcu_work work; 371 + /* List of queued switching contexts for the wb */ 372 + struct llist_node list; 372 373 373 374 /* 374 375 * Multiple inodes can be switched at once. The switching procedure ··· 379 378 * array embedded into struct inode_switch_wbs_context. Otherwise 380 379 * an inode could be left in a non-consistent state. 381 380 */ 382 - struct bdi_writeback *new_wb; 383 381 struct inode *inodes[]; 384 382 }; 385 383 ··· 445 445 * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, 446 446 * the specific list @inode was on is ignored and the @inode is put on 447 447 * ->b_dirty which is always correct including from ->b_dirty_time. 448 - * The transfer preserves @inode->dirtied_when ordering. If the @inode 449 - * was clean, it means it was on the b_attached list, so move it onto 450 - * the b_attached list of @new_wb. 448 + * If the @inode was clean, it means it was on the b_attached list, so 449 + * move it onto the b_attached list of @new_wb. 451 450 */ 452 451 if (!list_empty(&inode->i_io_list)) { 453 452 inode->i_wb = new_wb; 454 453 455 454 if (inode->i_state & I_DIRTY_ALL) { 456 - struct inode *pos; 457 - 458 - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) 459 - if (time_after_eq(inode->dirtied_when, 460 - pos->dirtied_when)) 461 - break; 455 + /* 456 + * We need to keep b_dirty list sorted by 457 + * dirtied_time_when. However properly sorting the 458 + * inode in the list gets too expensive when switching 459 + * many inodes. So just attach inode at the end of the 460 + * dirty list and clobber the dirtied_time_when. 
461 + */ 462 + inode->dirtied_time_when = jiffies; 462 463 inode_io_list_move_locked(inode, new_wb, 463 - pos->i_io_list.prev); 464 + &new_wb->b_dirty); 464 465 } else { 465 466 inode_cgwb_move_to_attached(inode, new_wb); 466 467 } ··· 487 486 return switched; 488 487 } 489 488 490 - static void inode_switch_wbs_work_fn(struct work_struct *work) 489 + static void process_inode_switch_wbs(struct bdi_writeback *new_wb, 490 + struct inode_switch_wbs_context *isw) 491 491 { 492 - struct inode_switch_wbs_context *isw = 493 - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); 494 492 struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); 495 493 struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; 496 - struct bdi_writeback *new_wb = isw->new_wb; 497 494 unsigned long nr_switched = 0; 498 495 struct inode **inodep; 499 496 ··· 501 502 */ 502 503 down_read(&bdi->wb_switch_rwsem); 503 504 505 + inodep = isw->inodes; 504 506 /* 505 507 * By the time control reaches here, RCU grace period has passed 506 508 * since I_WB_SWITCH assertion and all wb stat update transactions ··· 512 512 * gives us exclusion against all wb related operations on @inode 513 513 * including IO list manipulations and stat updates. 
514 514 */ 515 + relock: 515 516 if (old_wb < new_wb) { 516 517 spin_lock(&old_wb->list_lock); 517 518 spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); ··· 521 520 spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); 522 521 } 523 522 524 - for (inodep = isw->inodes; *inodep; inodep++) { 523 + while (*inodep) { 525 524 WARN_ON_ONCE((*inodep)->i_wb != old_wb); 526 525 if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) 527 526 nr_switched++; 527 + inodep++; 528 + if (*inodep && need_resched()) { 529 + spin_unlock(&new_wb->list_lock); 530 + spin_unlock(&old_wb->list_lock); 531 + cond_resched(); 532 + goto relock; 533 + } 528 534 } 529 535 530 536 spin_unlock(&new_wb->list_lock); ··· 549 541 wb_put(new_wb); 550 542 kfree(isw); 551 543 atomic_dec(&isw_nr_in_flight); 544 + } 545 + 546 + void inode_switch_wbs_work_fn(struct work_struct *work) 547 + { 548 + struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, 549 + switch_work); 550 + struct inode_switch_wbs_context *isw, *next_isw; 551 + struct llist_node *list; 552 + 553 + /* 554 + * Grab out reference to wb so that it cannot get freed under us 555 + * after we process all the isw items. 556 + */ 557 + wb_get(new_wb); 558 + while (1) { 559 + list = llist_del_all(&new_wb->switch_wbs_ctxs); 560 + /* Nothing to do? */ 561 + if (!list) 562 + break; 563 + /* 564 + * In addition to synchronizing among switchers, I_WB_SWITCH 565 + * tells the RCU protected stat update paths to grab the i_page 566 + * lock so that stat transfer can synchronize against them. 567 + * Let's continue after I_WB_SWITCH is guaranteed to be 568 + * visible. 
569 + */ 570 + synchronize_rcu(); 571 + 572 + llist_for_each_entry_safe(isw, next_isw, list, list) 573 + process_inode_switch_wbs(new_wb, isw); 574 + } 575 + wb_put(new_wb); 552 576 } 553 577 554 578 static bool inode_prepare_wbs_switch(struct inode *inode, ··· 612 572 return true; 613 573 } 614 574 575 + static void wb_queue_isw(struct bdi_writeback *wb, 576 + struct inode_switch_wbs_context *isw) 577 + { 578 + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) 579 + queue_work(isw_wq, &wb->switch_work); 580 + } 581 + 615 582 /** 616 583 * inode_switch_wbs - change the wb association of an inode 617 584 * @inode: target inode ··· 632 585 struct backing_dev_info *bdi = inode_to_bdi(inode); 633 586 struct cgroup_subsys_state *memcg_css; 634 587 struct inode_switch_wbs_context *isw; 588 + struct bdi_writeback *new_wb = NULL; 635 589 636 590 /* noop if seems to be already in progress */ 637 591 if (inode->i_state & I_WB_SWITCH) ··· 657 609 if (!memcg_css) 658 610 goto out_free; 659 611 660 - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 612 + new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 661 613 css_put(memcg_css); 662 - if (!isw->new_wb) 614 + if (!new_wb) 663 615 goto out_free; 664 616 665 - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) 617 + if (!inode_prepare_wbs_switch(inode, new_wb)) 666 618 goto out_free; 667 619 668 620 isw->inodes[0] = inode; 669 621 670 - /* 671 - * In addition to synchronizing among switchers, I_WB_SWITCH tells 672 - * the RCU protected stat update paths to grab the i_page 673 - * lock so that stat transfer can synchronize against them. 674 - * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
675 - */ 676 - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); 677 - queue_rcu_work(isw_wq, &isw->work); 622 + trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1); 623 + wb_queue_isw(new_wb, isw); 678 624 return; 679 625 680 626 out_free: 681 627 atomic_dec(&isw_nr_in_flight); 682 - if (isw->new_wb) 683 - wb_put(isw->new_wb); 628 + if (new_wb) 629 + wb_put(new_wb); 684 630 kfree(isw); 685 631 } 686 632 687 - static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, 633 + static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, 634 + struct inode_switch_wbs_context *isw, 688 635 struct list_head *list, int *nr) 689 636 { 690 637 struct inode *inode; 691 638 692 639 list_for_each_entry(inode, list, i_io_list) { 693 - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) 640 + if (!inode_prepare_wbs_switch(inode, new_wb)) 694 641 continue; 695 642 696 643 isw->inodes[*nr] = inode; ··· 709 666 { 710 667 struct cgroup_subsys_state *memcg_css; 711 668 struct inode_switch_wbs_context *isw; 669 + struct bdi_writeback *new_wb; 712 670 int nr; 713 671 bool restart = false; 714 672 ··· 722 678 723 679 for (memcg_css = wb->memcg_css->parent; memcg_css; 724 680 memcg_css = memcg_css->parent) { 725 - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); 726 - if (isw->new_wb) 681 + new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); 682 + if (new_wb) 727 683 break; 728 684 } 729 - if (unlikely(!isw->new_wb)) 730 - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ 685 + if (unlikely(!new_wb)) 686 + new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ 731 687 732 688 nr = 0; 733 689 spin_lock(&wb->list_lock); ··· 739 695 * bandwidth restrictions, as writeback of inode metadata is not 740 696 * accounted for. 
741 697 */ 742 - restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); 698 + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); 743 699 if (!restart) 744 - restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); 700 + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, 701 + &nr); 745 702 spin_unlock(&wb->list_lock); 746 703 747 704 /* no attached inodes? bail out */ 748 705 if (nr == 0) { 749 706 atomic_dec(&isw_nr_in_flight); 750 - wb_put(isw->new_wb); 707 + wb_put(new_wb); 751 708 kfree(isw); 752 709 return restart; 753 710 } 754 711 755 - /* 756 - * In addition to synchronizing among switchers, I_WB_SWITCH tells 757 - * the RCU protected stat update paths to grab the i_page 758 - * lock so that stat transfer can synchronize against them. 759 - * Let's continue after I_WB_SWITCH is guaranteed to be visible. 760 - */ 761 - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); 762 - queue_rcu_work(isw_wq, &isw->work); 712 + trace_inode_switch_wbs_queue(wb, new_wb, nr); 713 + wb_queue_isw(new_wb, isw); 763 714 764 715 return restart; 765 716 }
+4
include/linux/backing-dev-defs.h
··· 152 152 struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ 153 153 struct list_head b_attached; /* attached inodes, protected by list_lock */ 154 154 struct list_head offline_node; /* anchored at offline_cgwbs */ 155 + struct work_struct switch_work; /* work used to perform inode switching 156 + * to this wb */ 157 + struct llist_head switch_wbs_ctxs; /* queued contexts for 158 + * writeback switching */ 155 159 156 160 union { 157 161 struct work_struct release_work;
+2
include/linux/writeback.h
··· 265 265 bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); 266 266 } 267 267 268 + void inode_switch_wbs_work_fn(struct work_struct *work); 269 + 268 270 #else /* CONFIG_CGROUP_WRITEBACK */ 269 271 270 272 static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
+29
include/trace/events/writeback.h
··· 213 213 ) 214 214 ); 215 215 216 + TRACE_EVENT(inode_switch_wbs_queue, 217 + 218 + TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb, 219 + unsigned int count), 220 + 221 + TP_ARGS(old_wb, new_wb, count), 222 + 223 + TP_STRUCT__entry( 224 + __array(char, name, 32) 225 + __field(ino_t, old_cgroup_ino) 226 + __field(ino_t, new_cgroup_ino) 227 + __field(unsigned int, count) 228 + ), 229 + 230 + TP_fast_assign( 231 + strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); 232 + __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); 233 + __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); 234 + __entry->count = count; 235 + ), 236 + 237 + TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u", 238 + __entry->name, 239 + (unsigned long)__entry->old_cgroup_ino, 240 + (unsigned long)__entry->new_cgroup_ino, 241 + __entry->count 242 + ) 243 + ); 244 + 216 245 TRACE_EVENT(inode_switch_wbs, 217 246 218 247 TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
+5
mm/backing-dev.c
··· 633 633 wb_exit(wb); 634 634 bdi_put(bdi); 635 635 WARN_ON_ONCE(!list_empty(&wb->b_attached)); 636 + WARN_ON_ONCE(work_pending(&wb->switch_work)); 636 637 call_rcu(&wb->rcu, cgwb_free_rcu); 637 638 } 638 639 ··· 710 709 wb->memcg_css = memcg_css; 711 710 wb->blkcg_css = blkcg_css; 712 711 INIT_LIST_HEAD(&wb->b_attached); 712 + INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); 713 + init_llist_head(&wb->switch_wbs_ctxs); 713 714 INIT_WORK(&wb->release_work, cgwb_release_workfn); 714 715 set_bit(WB_registered, &wb->state); 715 716 bdi_get(bdi); ··· 842 839 if (!ret) { 843 840 bdi->wb.memcg_css = &root_mem_cgroup->css; 844 841 bdi->wb.blkcg_css = blkcg_root_css; 842 + INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); 843 + init_llist_head(&bdi->wb.switch_wbs_ctxs); 845 844 } 846 845 return ret; 847 846 }