Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: list_lru: replace linear array with xarray

If we run 10k containers in the system, the size of the
list_lru_memcg->lrus can be ~96KB per list_lru. When we decrease the
number of containers, the size of the array will not be shrunk. It is
not scalable. The xarray is a good choice for this case. We can save a
lot of memory when there are tens of thousands of containers in the
system. If we use an xarray, we can also remove the array-resizing
logic, which simplifies the code.

[akpm@linux-foundation.org: remove unused local]

Link: https://lkml.kernel.org/r/20220228122126.37293-13-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: Anna Schumaker <Anna.Schumaker@Netapp.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kari Argillander <kari.argillander@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Muchun Song and committed by
Linus Torvalds
bbca91cc 1f391eb2

+73 -243
+2 -11
include/linux/list_lru.h
··· 11 11 #include <linux/list.h> 12 12 #include <linux/nodemask.h> 13 13 #include <linux/shrinker.h> 14 + #include <linux/xarray.h> 14 15 15 16 struct mem_cgroup; 16 17 ··· 38 37 struct list_lru_one node[]; 39 38 }; 40 39 41 - struct list_lru_memcg { 42 - struct rcu_head rcu; 43 - /* array of per cgroup lists, indexed by memcg_cache_id */ 44 - struct list_lru_per_memcg __rcu *mlru[]; 45 - }; 46 - 47 40 struct list_lru_node { 48 41 /* protects all lists on the node, including per cgroup */ 49 42 spinlock_t lock; ··· 52 57 struct list_head list; 53 58 int shrinker_id; 54 59 bool memcg_aware; 55 - /* protects ->mlrus->mlru[i] */ 56 - spinlock_t lock; 57 - /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ 58 - struct list_lru_memcg __rcu *mlrus; 60 + struct xarray xa; 59 61 #endif 60 62 }; 61 63 ··· 69 77 70 78 int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, 71 79 gfp_t gfp); 72 - int memcg_update_all_list_lrus(int num_memcgs); 73 80 void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent); 74 81 75 82 /**
-23
include/linux/memcontrol.h
··· 1685 1685 1686 1686 extern struct static_key_false memcg_kmem_enabled_key; 1687 1687 1688 - extern int memcg_nr_cache_ids; 1689 - void memcg_get_cache_ids(void); 1690 - void memcg_put_cache_ids(void); 1691 - 1692 - /* 1693 - * Helper macro to loop through all memcg-specific caches. Callers must still 1694 - * check if the cache is valid (it is either valid or NULL). 1695 - * the slab_mutex must be held when looping through those caches 1696 - */ 1697 - #define for_each_memcg_cache_index(_idx) \ 1698 - for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++) 1699 - 1700 1688 static inline bool memcg_kmem_enabled(void) 1701 1689 { 1702 1690 return static_branch_likely(&memcg_kmem_enabled_key); ··· 1741 1753 { 1742 1754 } 1743 1755 1744 - #define for_each_memcg_cache_index(_idx) \ 1745 - for (; NULL; ) 1746 - 1747 1756 static inline bool memcg_kmem_enabled(void) 1748 1757 { 1749 1758 return false; ··· 1749 1764 static inline int memcg_cache_id(struct mem_cgroup *memcg) 1750 1765 { 1751 1766 return -1; 1752 - } 1753 - 1754 - static inline void memcg_get_cache_ids(void) 1755 - { 1756 - } 1757 - 1758 - static inline void memcg_put_cache_ids(void) 1759 - { 1760 1767 } 1761 1768 1762 1769 static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
+66 -137
mm/list_lru.c
··· 52 52 static inline struct list_lru_one * 53 53 list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) 54 54 { 55 - struct list_lru_memcg *mlrus; 56 - struct list_lru_node *nlru = &lru->node[nid]; 55 + if (list_lru_memcg_aware(lru) && idx >= 0) { 56 + struct list_lru_per_memcg *mlru = xa_load(&lru->xa, idx); 57 57 58 - /* 59 - * Either lock or RCU protects the array of per cgroup lists 60 - * from relocation (see memcg_update_list_lru). 61 - */ 62 - mlrus = rcu_dereference_check(lru->mlrus, lockdep_is_held(&nlru->lock)); 63 - if (mlrus && idx >= 0) { 64 - struct list_lru_per_memcg *mlru; 65 - 66 - mlru = rcu_dereference_check(mlrus->mlru[idx], true); 67 58 return mlru ? &mlru->node[nid] : NULL; 68 59 } 69 - return &nlru->lru; 60 + return &lru->node[nid].lru; 70 61 } 71 62 72 63 static inline struct list_lru_one * ··· 68 77 struct list_lru_one *l = &nlru->lru; 69 78 struct mem_cgroup *memcg = NULL; 70 79 71 - if (!lru->mlrus) 80 + if (!list_lru_memcg_aware(lru)) 72 81 goto out; 73 82 74 83 memcg = mem_cgroup_from_obj(ptr); ··· 300 309 unsigned long *nr_to_walk) 301 310 { 302 311 long isolated = 0; 303 - int memcg_idx; 304 312 305 313 isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, 306 314 nr_to_walk); 315 + 316 + #ifdef CONFIG_MEMCG_KMEM 307 317 if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { 308 - for_each_memcg_cache_index(memcg_idx) { 318 + struct list_lru_per_memcg *mlru; 319 + unsigned long index; 320 + 321 + xa_for_each(&lru->xa, index, mlru) { 309 322 struct list_lru_node *nlru = &lru->node[nid]; 310 323 311 324 spin_lock(&nlru->lock); 312 - isolated += __list_lru_walk_one(lru, nid, memcg_idx, 325 + isolated += __list_lru_walk_one(lru, nid, index, 313 326 isolate, cb_arg, 314 327 nr_to_walk); 315 328 spin_unlock(&nlru->lock); ··· 322 327 break; 323 328 } 324 329 } 330 + #endif 331 + 325 332 return isolated; 326 333 } 327 334 EXPORT_SYMBOL_GPL(list_lru_walk_node); ··· 335 338 } 336 339 337 340 #ifdef CONFIG_MEMCG_KMEM 338 - 
static void memcg_destroy_list_lru_range(struct list_lru_memcg *mlrus, 339 - int begin, int end) 340 - { 341 - int i; 342 - 343 - for (i = begin; i < end; i++) 344 - kfree(mlrus->mlru[i]); 345 - } 346 - 347 341 static struct list_lru_per_memcg *memcg_init_list_lru_one(gfp_t gfp) 348 342 { 349 343 int nid; ··· 352 364 353 365 static void memcg_list_lru_free(struct list_lru *lru, int src_idx) 354 366 { 355 - struct list_lru_memcg *mlrus; 356 - struct list_lru_per_memcg *mlru; 357 - 358 - spin_lock_irq(&lru->lock); 359 - mlrus = rcu_dereference_protected(lru->mlrus, true); 360 - mlru = rcu_dereference_protected(mlrus->mlru[src_idx], true); 361 - rcu_assign_pointer(mlrus->mlru[src_idx], NULL); 362 - spin_unlock_irq(&lru->lock); 367 + struct list_lru_per_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); 363 368 364 369 /* 365 370 * The __list_lru_walk_one() can walk the list of this node. ··· 364 383 kvfree_rcu(mlru, rcu); 365 384 } 366 385 367 - static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 386 + static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 368 387 { 369 - struct list_lru_memcg *mlrus; 370 - int size = memcg_nr_cache_ids; 371 - 388 + if (memcg_aware) 389 + xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); 372 390 lru->memcg_aware = memcg_aware; 373 - if (!memcg_aware) 374 - return 0; 375 - 376 - spin_lock_init(&lru->lock); 377 - 378 - mlrus = kvzalloc(struct_size(mlrus, mlru, size), GFP_KERNEL); 379 - if (!mlrus) 380 - return -ENOMEM; 381 - 382 - RCU_INIT_POINTER(lru->mlrus, mlrus); 383 - 384 - return 0; 385 391 } 386 392 387 393 static void memcg_destroy_list_lru(struct list_lru *lru) 388 394 { 389 - struct list_lru_memcg *mlrus; 395 + XA_STATE(xas, &lru->xa, 0); 396 + struct list_lru_per_memcg *mlru; 390 397 391 398 if (!list_lru_memcg_aware(lru)) 392 399 return; 393 400 394 - /* 395 - * This is called when shrinker has already been unregistered, 396 - * and nobody can use it. So, there is no need to use kvfree_rcu(). 
397 - */ 398 - mlrus = rcu_dereference_protected(lru->mlrus, true); 399 - memcg_destroy_list_lru_range(mlrus, 0, memcg_nr_cache_ids); 400 - kvfree(mlrus); 401 - } 402 - 403 - static int memcg_update_list_lru(struct list_lru *lru, int old_size, int new_size) 404 - { 405 - struct list_lru_memcg *old, *new; 406 - 407 - BUG_ON(old_size > new_size); 408 - 409 - old = rcu_dereference_protected(lru->mlrus, 410 - lockdep_is_held(&list_lrus_mutex)); 411 - new = kvmalloc(struct_size(new, mlru, new_size), GFP_KERNEL); 412 - if (!new) 413 - return -ENOMEM; 414 - 415 - spin_lock_irq(&lru->lock); 416 - memcpy(&new->mlru, &old->mlru, flex_array_size(new, mlru, old_size)); 417 - memset(&new->mlru[old_size], 0, flex_array_size(new, mlru, new_size - old_size)); 418 - rcu_assign_pointer(lru->mlrus, new); 419 - spin_unlock_irq(&lru->lock); 420 - 421 - kvfree_rcu(old, rcu); 422 - return 0; 423 - } 424 - 425 - int memcg_update_all_list_lrus(int new_size) 426 - { 427 - int ret = 0; 428 - struct list_lru *lru; 429 - int old_size = memcg_nr_cache_ids; 430 - 431 - mutex_lock(&list_lrus_mutex); 432 - list_for_each_entry(lru, &memcg_list_lrus, list) { 433 - ret = memcg_update_list_lru(lru, old_size, new_size); 434 - if (ret) 435 - break; 401 + xas_lock_irq(&xas); 402 + xas_for_each(&xas, mlru, ULONG_MAX) { 403 + kfree(mlru); 404 + xas_store(&xas, NULL); 436 405 } 437 - mutex_unlock(&list_lrus_mutex); 438 - return ret; 406 + xas_unlock_irq(&xas); 439 407 } 440 408 441 409 static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, ··· 451 521 struct mem_cgroup *child; 452 522 453 523 child = mem_cgroup_from_css(css); 454 - child->kmemcg_id = parent->kmemcg_id; 524 + WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); 455 525 } 456 526 rcu_read_unlock(); 457 527 ··· 461 531 mutex_unlock(&list_lrus_mutex); 462 532 } 463 533 464 - static bool memcg_list_lru_allocated(struct mem_cgroup *memcg, 465 - struct list_lru *lru) 534 + static inline bool memcg_list_lru_allocated(struct mem_cgroup 
*memcg, 535 + struct list_lru *lru) 466 536 { 467 - bool allocated; 468 - int idx; 537 + int idx = memcg->kmemcg_id; 469 538 470 - idx = memcg->kmemcg_id; 471 - if (unlikely(idx < 0)) 472 - return true; 473 - 474 - rcu_read_lock(); 475 - allocated = !!rcu_access_pointer(rcu_dereference(lru->mlrus)->mlru[idx]); 476 - rcu_read_unlock(); 477 - 478 - return allocated; 539 + return idx < 0 || xa_load(&lru->xa, idx); 479 540 } 480 541 481 542 int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, ··· 474 553 { 475 554 int i; 476 555 unsigned long flags; 477 - struct list_lru_memcg *mlrus; 478 556 struct list_lru_memcg_table { 479 557 struct list_lru_per_memcg *mlru; 480 558 struct mem_cgroup *memcg; 481 559 } *table; 560 + XA_STATE(xas, &lru->xa, 0); 482 561 483 562 if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) 484 563 return 0; ··· 507 586 } 508 587 } 509 588 510 - spin_lock_irqsave(&lru->lock, flags); 511 - mlrus = rcu_dereference_protected(lru->mlrus, true); 589 + xas_lock_irqsave(&xas, flags); 512 590 while (i--) { 513 - int index = table[i].memcg->kmemcg_id; 591 + int index = READ_ONCE(table[i].memcg->kmemcg_id); 514 592 struct list_lru_per_memcg *mlru = table[i].mlru; 515 593 516 - if (index < 0 || rcu_dereference_protected(mlrus->mlru[index], true)) 594 + xas_set(&xas, index); 595 + retry: 596 + if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { 517 597 kfree(mlru); 518 - else 519 - rcu_assign_pointer(mlrus->mlru[index], mlru); 598 + } else { 599 + xas_store(&xas, mlru); 600 + if (xas_error(&xas) == -ENOMEM) { 601 + xas_unlock_irqrestore(&xas, flags); 602 + if (xas_nomem(&xas, gfp)) 603 + xas_set_err(&xas, 0); 604 + xas_lock_irqsave(&xas, flags); 605 + /* 606 + * The xas lock has been released, this memcg 607 + * can be reparented before us. So reload 608 + * memcg id. More details see the comments 609 + * in memcg_reparent_list_lrus(). 
610 + */ 611 + index = READ_ONCE(table[i].memcg->kmemcg_id); 612 + if (index < 0) 613 + xas_set_err(&xas, 0); 614 + else if (!xas_error(&xas) && index != xas.xa_index) 615 + xas_set(&xas, index); 616 + goto retry; 617 + } 618 + } 520 619 } 521 - spin_unlock_irqrestore(&lru->lock, flags); 522 - 620 + /* xas_nomem() is used to free memory instead of memory allocation. */ 621 + if (xas.xa_alloc) 622 + xas_nomem(&xas, gfp); 623 + xas_unlock_irqrestore(&xas, flags); 523 624 kfree(table); 524 625 525 - return 0; 626 + return xas_error(&xas); 526 627 } 527 628 #else 528 - static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 629 + static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 529 630 { 530 - return 0; 531 631 } 532 632 533 633 static void memcg_destroy_list_lru(struct list_lru *lru) ··· 560 618 struct lock_class_key *key, struct shrinker *shrinker) 561 619 { 562 620 int i; 563 - int err = -ENOMEM; 564 621 565 622 #ifdef CONFIG_MEMCG_KMEM 566 623 if (shrinker) ··· 567 626 else 568 627 lru->shrinker_id = -1; 569 628 #endif 570 - memcg_get_cache_ids(); 571 629 572 630 lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); 573 631 if (!lru->node) 574 - goto out; 632 + return -ENOMEM; 575 633 576 634 for_each_node(i) { 577 635 spin_lock_init(&lru->node[i].lock); ··· 579 639 init_one_lru(&lru->node[i].lru); 580 640 } 581 641 582 - err = memcg_init_list_lru(lru, memcg_aware); 583 - if (err) { 584 - kfree(lru->node); 585 - /* Do this so a list_lru_destroy() doesn't crash: */ 586 - lru->node = NULL; 587 - goto out; 588 - } 589 - 642 + memcg_init_list_lru(lru, memcg_aware); 590 643 list_lru_register(lru); 591 - out: 592 - memcg_put_cache_ids(); 593 - return err; 644 + 645 + return 0; 594 646 } 595 647 EXPORT_SYMBOL_GPL(__list_lru_init); 596 648 ··· 591 659 /* Already destroyed or not yet initialized? 
*/ 592 660 if (!lru->node) 593 661 return; 594 - 595 - memcg_get_cache_ids(); 596 662 597 663 list_lru_unregister(lru); 598 664 ··· 601 671 #ifdef CONFIG_MEMCG_KMEM 602 672 lru->shrinker_id = -1; 603 673 #endif 604 - memcg_put_cache_ids(); 605 674 } 606 675 EXPORT_SYMBOL_GPL(list_lru_destroy);
+5 -72
mm/memcontrol.c
··· 351 351 * This will be used as a shrinker list's index. 352 352 * The main reason for not using cgroup id for this: 353 353 * this works better in sparse environments, where we have a lot of memcgs, 354 - * but only a few kmem-limited. Or also, if we have, for instance, 200 355 - * memcgs, and none but the 200th is kmem-limited, we'd have to have a 356 - * 200 entry array for that. 357 - * 358 - * The current size of the caches array is stored in memcg_nr_cache_ids. It 359 - * will double each time we have to increase it. 354 + * but only a few kmem-limited. 360 355 */ 361 356 static DEFINE_IDA(memcg_cache_ida); 362 - int memcg_nr_cache_ids; 363 - 364 - /* Protects memcg_nr_cache_ids */ 365 - static DECLARE_RWSEM(memcg_cache_ids_sem); 366 - 367 - void memcg_get_cache_ids(void) 368 - { 369 - down_read(&memcg_cache_ids_sem); 370 - } 371 - 372 - void memcg_put_cache_ids(void) 373 - { 374 - up_read(&memcg_cache_ids_sem); 375 - } 376 357 377 358 /* 378 - * MIN_SIZE is different than 1, because we would like to avoid going through 379 - * the alloc/free process all the time. In a small machine, 4 kmem-limited 380 - * cgroups is a reasonable guess. In the future, it could be a parameter or 381 - * tunable, but that is strictly not necessary. 382 - * 383 359 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 384 360 * this constant directly from cgroup, but it is understandable that this is 385 361 * better kept as an internal representation in cgroup.c. In any case, the 386 362 * cgrp_id space is not getting any smaller, and we don't have to necessarily 387 363 * increase ours as well if it increases. 
388 364 */ 389 - #define MEMCG_CACHES_MIN_SIZE 4 390 365 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 391 366 392 367 /* ··· 2919 2944 return objcg; 2920 2945 } 2921 2946 2922 - static int memcg_alloc_cache_id(void) 2923 - { 2924 - int id, size; 2925 - int err; 2926 - 2927 - id = ida_simple_get(&memcg_cache_ida, 2928 - 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2929 - if (id < 0) 2930 - return id; 2931 - 2932 - if (id < memcg_nr_cache_ids) 2933 - return id; 2934 - 2935 - /* 2936 - * There's no space for the new id in memcg_caches arrays, 2937 - * so we have to grow them. 2938 - */ 2939 - down_write(&memcg_cache_ids_sem); 2940 - 2941 - size = 2 * (id + 1); 2942 - if (size < MEMCG_CACHES_MIN_SIZE) 2943 - size = MEMCG_CACHES_MIN_SIZE; 2944 - else if (size > MEMCG_CACHES_MAX_SIZE) 2945 - size = MEMCG_CACHES_MAX_SIZE; 2946 - 2947 - err = memcg_update_all_list_lrus(size); 2948 - if (!err) 2949 - memcg_nr_cache_ids = size; 2950 - 2951 - up_write(&memcg_cache_ids_sem); 2952 - 2953 - if (err) { 2954 - ida_simple_remove(&memcg_cache_ida, id); 2955 - return err; 2956 - } 2957 - return id; 2958 - } 2959 - 2960 - static void memcg_free_cache_id(int id) 2961 - { 2962 - ida_simple_remove(&memcg_cache_ida, id); 2963 - } 2964 - 2965 2947 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2966 2948 { 2967 2949 mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); ··· 3605 3673 if (unlikely(mem_cgroup_is_root(memcg))) 3606 3674 return 0; 3607 3675 3608 - memcg_id = memcg_alloc_cache_id(); 3676 + memcg_id = ida_alloc_max(&memcg_cache_ida, MEMCG_CACHES_MAX_SIZE - 1, 3677 + GFP_KERNEL); 3609 3678 if (memcg_id < 0) 3610 3679 return memcg_id; 3611 3680 3612 3681 objcg = obj_cgroup_alloc(); 3613 3682 if (!objcg) { 3614 - memcg_free_cache_id(memcg_id); 3683 + ida_free(&memcg_cache_ida, memcg_id); 3615 3684 return -ENOMEM; 3616 3685 } 3617 3686 objcg->memcg = memcg; ··· 3656 3723 */ 3657 3724 memcg_reparent_list_lrus(memcg, parent); 3658 3725 3659 - memcg_free_cache_id(kmemcg_id); 
3726 + ida_free(&memcg_cache_ida, kmemcg_id); 3660 3727 } 3661 3728 #else 3662 3729 static int memcg_online_kmem(struct mem_cgroup *memcg)