Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

writeback, cgroup: release dying cgwbs by switching attached inodes

Asynchronously try to release dying cgwbs by switching their attached inodes to
the nearest living ancestor wb. This helps to get rid of the per-cgroup
writeback structures themselves and of the pinned memory and block cgroups,
which are significantly larger structures (mostly due to large per-cpu
statistics data). It prevents memory waste and helps to avoid various
scalability problems caused by large piles of dying cgroups.

Reuse the existing inode switching mechanism used for foreign inode
detection. To speed things up, batch up to 115 inode switches into a single
operation (the maximum number is selected so that the resulting struct
inode_switch_wbs_context fits into 1024 bytes). Because every switch
consists of two steps separated by an RCU grace period, it would be too slow
without batching. Please note that the whole batch counts as a single
operation (when increasing/decreasing isw_nr_in_flight). This keeps
umounting working (the switching queue is flushed), while preventing
cleanups from consuming the whole switching quota and effectively blocking
the frn switching.

A cgwb cleanup operation can fail for various reasons (e.g. not enough
memory, the cgwb has in-flight/pending io, an attached inode is in the wrong
state, etc.). In this case the next scheduled cleanup will make a new
attempt. An attempt is made each time a new cgwb is offlined (in other
words, each time a memcg and/or a blkcg is deleted by a user). In the
future, an additional attempt scheduled by a timer can be implemented.

[guro@fb.com: replace open-coded "115" with arithmetic]
Link: https://lkml.kernel.org/r/YMEcSBcq/VXMiPPO@carbon.dhcp.thefacebook.com
[guro@fb.com: add smp_mb() to inode_prepare_wbs_switch()]
Link: https://lkml.kernel.org/r/YMFa+guFw7OFjf3X@carbon.dhcp.thefacebook.com
[willy@infradead.org: fix documentation]
Link: https://lkml.kernel.org/r/20210615200242.1716568-2-willy@infradead.org

Link: https://lkml.kernel.org/r/20210608230225.2078447-9-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Roman Gushchin, committed by Linus Torvalds
Commit c22d70a1 (parent f5fbe6b7)

+165 -12
+101 -10
fs/fs-writeback.c
···
 /* one round can affect upto 5 slots */
 #define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */

+/*
+ * Maximum inodes per isw.  A specific value has been chosen to make
+ * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
+ */
+#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
+                                / sizeof(struct inode *))
+
 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 static struct workqueue_struct *isw_wq;

···
 	atomic_dec(&isw_nr_in_flight);
 }

+static bool inode_prepare_wbs_switch(struct inode *inode,
+				     struct bdi_writeback *new_wb)
+{
+	/*
+	 * Paired with smp_mb() in cgroup_writeback_umount().
+	 * isw_nr_in_flight must be increased before checking SB_ACTIVE and
+	 * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
+	 * in cgroup_writeback_umount() and the isw_wq will not be flushed.
+	 */
+	smp_mb();
+
+	/* while holding I_WB_SWITCH, no one else can update the association */
+	spin_lock(&inode->i_lock);
+	if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
+	    inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
+	    inode_to_wb(inode) == new_wb) {
+		spin_unlock(&inode->i_lock);
+		return false;
+	}
+	inode->i_state |= I_WB_SWITCH;
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+
+	return true;
+}
+
 /**
  * inode_switch_wbs - change the wb association of an inode
  * @inode: target inode
···
 	if (!isw->new_wb)
 		goto out_free;

-	/* while holding I_WB_SWITCH, no one else can update the association */
-	spin_lock(&inode->i_lock);
-	if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
-	    inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
-	    inode_to_wb(inode) == isw->new_wb) {
-		spin_unlock(&inode->i_lock);
+	if (!inode_prepare_wbs_switch(inode, isw->new_wb))
 		goto out_free;
-	}
-	inode->i_state |= I_WB_SWITCH;
-	__iget(inode);
-	spin_unlock(&inode->i_lock);

 	isw->inodes[0] = inode;

···
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
 	kfree(isw);
+}
+
+/**
+ * cleanup_offline_cgwb - detach associated inodes
+ * @wb: target wb
+ *
+ * Switch all inodes attached to @wb to a nearest living ancestor's wb in order
+ * to eventually release the dying @wb.  Returns %true if not all inodes were
+ * switched and the function has to be restarted.
+ */
+bool cleanup_offline_cgwb(struct bdi_writeback *wb)
+{
+	struct cgroup_subsys_state *memcg_css;
+	struct inode_switch_wbs_context *isw;
+	struct inode *inode;
+	int nr;
+	bool restart = false;
+
+	isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
+		      sizeof(struct inode *), GFP_KERNEL);
+	if (!isw)
+		return restart;
+
+	atomic_inc(&isw_nr_in_flight);
+
+	for (memcg_css = wb->memcg_css->parent; memcg_css;
+	     memcg_css = memcg_css->parent) {
+		isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+		if (isw->new_wb)
+			break;
+	}
+	if (unlikely(!isw->new_wb))
+		isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+
+	nr = 0;
+	spin_lock(&wb->list_lock);
+	list_for_each_entry(inode, &wb->b_attached, i_io_list) {
+		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+			continue;
+
+		isw->inodes[nr++] = inode;
+
+		if (nr >= WB_MAX_INODES_PER_ISW - 1) {
+			restart = true;
+			break;
+		}
+	}
+	spin_unlock(&wb->list_lock);
+
+	/* no attached inodes? bail out */
+	if (nr == 0) {
+		atomic_dec(&isw_nr_in_flight);
+		wb_put(isw->new_wb);
+		kfree(isw);
+		return restart;
+	}
+
+	/*
+	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
+	 * the RCU protected stat update paths to grab the i_page
+	 * lock so that stat transfer can synchronize against them.
+	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+	 */
+	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+	queue_rcu_work(isw_wq, &isw->work);
+
+	return restart;
 }

 /**
+1
include/linux/backing-dev-defs.h
···
 	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
 	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
 	struct list_head b_attached;	/* attached inodes, protected by list_lock */
+	struct list_head offline_node;	/* anchored at offline_cgwbs */

 	union {
 		struct work_struct release_work;
+1
include/linux/writeback.h
···
 int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages,
 			   enum wb_reason reason, struct wb_completion *done);
 void cgroup_writeback_umount(void);
+bool cleanup_offline_cgwb(struct bdi_writeback *wb);

 /**
  * inode_attach_wb - associate an inode with its wb
+62 -2
mm/backing-dev.c
···
 #include <linux/memcontrol.h>

 /*
- * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
- * bdi->cgwb_tree is also RCU protected.
+ * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
+ * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
  */
 static DEFINE_SPINLOCK(cgwb_lock);
 static struct workqueue_struct *cgwb_release_wq;
+
+static LIST_HEAD(offline_cgwbs);
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
+static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

 static void cgwb_release_workfn(struct work_struct *work)
 {
···
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
+
+	spin_lock_irq(&cgwb_lock);
+	list_del(&wb->offline_node);
+	spin_unlock_irq(&cgwb_lock);
+
 	wb_exit(wb);
 	WARN_ON_ONCE(!list_empty(&wb->b_attached));
 	kfree_rcu(wb, rcu);
···
 	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
 	list_del(&wb->memcg_node);
 	list_del(&wb->blkcg_node);
+	list_add(&wb->offline_node, &offline_cgwbs);
 	percpu_ref_kill(&wb->refcnt);
 }
···
 	mutex_unlock(&bdi->cgwb_release_mutex);
 }

+/*
+ * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
+ *
+ * Try to release dying cgwbs by switching attached inodes to the nearest
+ * living ancestor's writeback.  Processed wbs are placed at the end
+ * of the list to guarantee the forward progress.
+ */
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
+{
+	struct bdi_writeback *wb;
+	LIST_HEAD(processed);
+
+	spin_lock_irq(&cgwb_lock);
+
+	while (!list_empty(&offline_cgwbs)) {
+		wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
+				      offline_node);
+		list_move(&wb->offline_node, &processed);
+
+		/*
+		 * If wb is dirty, cleaning up the writeback by switching
+		 * attached inodes will result in an effective removal of any
+		 * bandwidth restrictions, which isn't the goal.  Instead,
+		 * it can be postponed until the next time, when all io
+		 * will be likely completed.  If in the meantime some inodes
+		 * will get re-dirtied, they should be eventually switched to
+		 * a new cgwb.
+		 */
+		if (wb_has_dirty_io(wb))
+			continue;
+
+		if (!wb_tryget(wb))
+			continue;
+
+		spin_unlock_irq(&cgwb_lock);
+		while (cleanup_offline_cgwb(wb))
+			cond_resched();
+		spin_lock_irq(&cgwb_lock);
+
+		wb_put(wb);
+	}
+
+	if (!list_empty(&processed))
+		list_splice_tail(&processed, &offline_cgwbs);
+
+	spin_unlock_irq(&cgwb_lock);
+}
+
 /**
  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
  * @memcg: memcg being offlined
···
 	cgwb_kill(wb);
 	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
 	spin_unlock_irq(&cgwb_lock);
+
+	queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
 }

 /**