Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/list_lru: simplify reparenting and initial allocation

Currently, there is a lot of code for detecting reparent racing using
kmemcg_id as the synchronization flag. And an intermediate table is
required to record and compare the kmemcg_id.

We can simplify this by just checking the cgroup css status: skip if the
cgroup is being offlined. On the reparenting side, ensure no allocation
is ongoing and no further allocation will occur by using the XArray lock
as a barrier.

Combined with an O(n^2) top-down walk for the allocation, we get rid of
the intermediate table allocation completely. Despite being O(n^2), it
should actually be faster because it's not practical to have a very deep
cgroup hierarchy, and in most cases the parent cgroup will have been
allocated already.

This also avoids changing kmemcg_id before reparenting, so cgroups
have a stable index for list_lru_memcg. After this change it's possible
that a dying cgroup will see a NULL value in the XArray corresponding to
its kmemcg_id, because the kmemcg_id will point to an empty slot. In that
case, just fall back to using its parent.

As a result the code is simpler, and the following test also showed a
very slight performance gain (12 test runs):

prepare() {
mkdir /tmp/test-fs
modprobe brd rd_nr=1 rd_size=16777216
mkfs.xfs -f /dev/ram0
mount -t xfs /dev/ram0 /tmp/test-fs
for i in $(seq 10000); do
seq 8000 > "/tmp/test-fs/$i"
done
mkdir -p /sys/fs/cgroup/system.slice/bench/test/1
echo +memory > /sys/fs/cgroup/system.slice/bench/cgroup.subtree_control
echo +memory > /sys/fs/cgroup/system.slice/bench/test/cgroup.subtree_control
echo +memory > /sys/fs/cgroup/system.slice/bench/test/1/cgroup.subtree_control
echo 768M > /sys/fs/cgroup/system.slice/bench/memory.max
}

do_test() {
read_worker() {
mkdir -p "/sys/fs/cgroup/system.slice/bench/test/1/$1"
echo $BASHPID > "/sys/fs/cgroup/system.slice/bench/test/1/$1/cgroup.procs"
read -r __TMP < "/tmp/test-fs/$1";
}
read_in_all() {
for i in $(seq 10000); do
read_worker "$i" &
done; wait
}
echo 3 > /proc/sys/vm/drop_caches
time read_in_all
for i in $(seq 1 10000); do
rmdir "/sys/fs/cgroup/system.slice/bench/test/1/$i" &>/dev/null
done
}

Before:
real 0m3.498s user 0m11.037s sys 0m35.872s
real 1m33.860s user 0m11.593s sys 3m1.169s
real 1m31.883s user 0m11.265s sys 2m59.198s
real 1m32.394s user 0m11.294s sys 3m1.616s
real 1m31.017s user 0m11.379s sys 3m1.349s
real 1m31.931s user 0m11.295s sys 2m59.863s
real 1m32.758s user 0m11.254s sys 2m59.538s
real 1m35.198s user 0m11.145s sys 3m1.123s
real 1m30.531s user 0m11.393s sys 2m58.089s
real 1m31.142s user 0m11.333s sys 3m0.549s

After:
real 0m3.489s user 0m10.943s sys 0m36.036s
real 1m10.893s user 0m11.495s sys 2m38.545s
real 1m29.129s user 0m11.382s sys 3m1.601s
real 1m29.944s user 0m11.494s sys 3m1.575s
real 1m31.208s user 0m11.451s sys 2m59.693s
real 1m25.944s user 0m11.327s sys 2m56.394s
real 1m28.599s user 0m11.312s sys 3m0.162s
real 1m26.746s user 0m11.538s sys 2m55.462s
real 1m30.668s user 0m11.475s sys 3m2.075s
real 1m29.258s user 0m11.292s sys 3m0.780s

Which is slightly faster in real time.

Link: https://lkml.kernel.org/r/20241104175257.60853-5-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by
Andrew Morton
28e98022 8d42abbf

+77 -108
+74 -104
mm/list_lru.c
··· 59 59 } 60 60 return &lru->node[nid].lru; 61 61 } 62 + 63 + static inline struct list_lru_one * 64 + list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) 65 + { 66 + struct list_lru_one *l; 67 + again: 68 + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); 69 + if (likely(l)) 70 + return l; 71 + 72 + memcg = parent_mem_cgroup(memcg); 73 + VM_WARN_ON(!css_is_dying(&memcg->css)); 74 + goto again; 75 + } 62 76 #else 63 77 static void list_lru_register(struct list_lru *lru) 64 78 { ··· 97 83 { 98 84 return &lru->node[nid].lru; 99 85 } 86 + 87 + static inline struct list_lru_one * 88 + list_lru_from_memcg(struct list_lru *lru, int nid, int idx) 89 + { 90 + return &lru->node[nid].lru; 91 + } 100 92 #endif /* CONFIG_MEMCG */ 101 93 102 94 /* The caller must ensure the memcg lifetime. */ ··· 114 94 115 95 spin_lock(&nlru->lock); 116 96 if (list_empty(item)) { 117 - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); 97 + l = list_lru_from_memcg(lru, nid, memcg); 118 98 list_add_tail(item, &l->list); 119 99 /* Set shrinker bit if the first element was added */ 120 100 if (!l->nr_items++) ··· 153 133 154 134 spin_lock(&nlru->lock); 155 135 if (!list_empty(item)) { 156 - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); 136 + l = list_lru_from_memcg(lru, nid, memcg); 157 137 list_del_init(item); 158 138 l->nr_items--; 159 139 nlru->nr_items--; ··· 375 355 return mlru; 376 356 } 377 357 378 - static void memcg_list_lru_free(struct list_lru *lru, int src_idx) 379 - { 380 - struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); 381 - 382 - /* 383 - * The __list_lru_walk_one() can walk the list of this node. 384 - * We need kvfree_rcu() here. And the walking of the list 385 - * is under lru->node[nid]->lock, which can serve as a RCU 386 - * read-side critical section. 
387 - */ 388 - if (mlru) 389 - kvfree_rcu(mlru, rcu); 390 - } 391 - 392 358 static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 393 359 { 394 360 if (memcg_aware) ··· 399 393 } 400 394 401 395 static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, 402 - int src_idx, struct mem_cgroup *dst_memcg) 396 + struct list_lru_one *src, 397 + struct mem_cgroup *dst_memcg) 403 398 { 404 399 struct list_lru_node *nlru = &lru->node[nid]; 405 - int dst_idx = dst_memcg->kmemcg_id; 406 - struct list_lru_one *src, *dst; 400 + struct list_lru_one *dst; 407 401 408 402 /* 409 403 * Since list_lru_{add,del} may be called under an IRQ-safe lock, 410 404 * we have to use IRQ-safe primitives here to avoid deadlock. 411 405 */ 412 406 spin_lock_irq(&nlru->lock); 413 - 414 - src = list_lru_from_memcg_idx(lru, nid, src_idx); 415 - if (!src) 416 - goto out; 417 - dst = list_lru_from_memcg_idx(lru, nid, dst_idx); 407 + dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); 418 408 419 409 list_splice_init(&src->list, &dst->list); 420 410 ··· 419 417 set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); 420 418 src->nr_items = 0; 421 419 } 422 - out: 423 420 spin_unlock_irq(&nlru->lock); 424 421 } 425 422 426 423 void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) 427 424 { 428 - struct cgroup_subsys_state *css; 429 425 struct list_lru *lru; 430 - int src_idx = memcg->kmemcg_id, i; 426 + int i; 431 427 432 - /* 433 - * Change kmemcg_id of this cgroup and all its descendants to the 434 - * parent's id, and then move all entries from this cgroup's list_lrus 435 - * to ones of the parent. 
436 - */ 437 - rcu_read_lock(); 438 - css_for_each_descendant_pre(css, &memcg->css) { 439 - struct mem_cgroup *child; 440 - 441 - child = mem_cgroup_from_css(css); 442 - WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); 443 - } 444 - rcu_read_unlock(); 445 - 446 - /* 447 - * With kmemcg_id set to parent, holding the lock of each list_lru_node 448 - * below can prevent list_lru_{add,del,isolate} from touching the lru, 449 - * safe to reparent. 450 - */ 451 428 mutex_lock(&list_lrus_mutex); 452 429 list_for_each_entry(lru, &memcg_list_lrus, list) { 430 + struct list_lru_memcg *mlru; 431 + XA_STATE(xas, &lru->xa, memcg->kmemcg_id); 432 + 433 + /* 434 + * Lock the Xarray to ensure no on going list_lru_memcg 435 + * allocation and further allocation will see css_is_dying(). 436 + */ 437 + xas_lock_irq(&xas); 438 + mlru = xas_store(&xas, NULL); 439 + xas_unlock_irq(&xas); 440 + if (!mlru) 441 + continue; 442 + 443 + /* 444 + * With Xarray value set to NULL, holding the lru lock below 445 + * prevents list_lru_{add,del,isolate} from touching the lru, 446 + * safe to reparent. 447 + */ 453 448 for_each_node(i) 454 - memcg_reparent_list_lru_node(lru, i, src_idx, parent); 449 + memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); 455 450 456 451 /* 457 452 * Here all list_lrus corresponding to the cgroup are guaranteed 458 453 * to remain empty, we can safely free this lru, any further 459 454 * memcg_list_lru_alloc() call will simply bail out. 
460 455 */ 461 - memcg_list_lru_free(lru, src_idx); 456 + kvfree_rcu(mlru, rcu); 462 457 } 463 458 mutex_unlock(&list_lrus_mutex); 464 459 } ··· 471 472 int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, 472 473 gfp_t gfp) 473 474 { 474 - int i; 475 475 unsigned long flags; 476 - struct list_lru_memcg_table { 477 - struct list_lru_memcg *mlru; 478 - struct mem_cgroup *memcg; 479 - } *table; 476 + struct list_lru_memcg *mlru; 477 + struct mem_cgroup *pos, *parent; 480 478 XA_STATE(xas, &lru->xa, 0); 481 479 482 480 if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) 483 481 return 0; 484 482 485 483 gfp &= GFP_RECLAIM_MASK; 486 - table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); 487 - if (!table) 488 - return -ENOMEM; 489 - 490 484 /* 491 485 * Because the list_lru can be reparented to the parent cgroup's 492 486 * list_lru, we should make sure that this cgroup and all its 493 487 * ancestors have allocated list_lru_memcg. 494 488 */ 495 - for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { 496 - if (memcg_list_lru_allocated(memcg, lru)) 497 - break; 489 + do { 490 + /* 491 + * Keep finding the farest parent that wasn't populated 492 + * until found memcg itself. 
493 + */ 494 + pos = memcg; 495 + parent = parent_mem_cgroup(pos); 496 + while (!memcg_list_lru_allocated(parent, lru)) { 497 + pos = parent; 498 + parent = parent_mem_cgroup(pos); 499 + } 498 500 499 - table[i].memcg = memcg; 500 - table[i].mlru = memcg_init_list_lru_one(gfp); 501 - if (!table[i].mlru) { 502 - while (i--) 503 - kfree(table[i].mlru); 504 - kfree(table); 501 + mlru = memcg_init_list_lru_one(gfp); 502 + if (!mlru) 505 503 return -ENOMEM; 506 - } 507 - } 508 - 509 - xas_lock_irqsave(&xas, flags); 510 - while (i--) { 511 - int index = READ_ONCE(table[i].memcg->kmemcg_id); 512 - struct list_lru_memcg *mlru = table[i].mlru; 513 - 514 - xas_set(&xas, index); 515 - retry: 516 - if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { 517 - kfree(mlru); 518 - } else { 519 - xas_store(&xas, mlru); 520 - if (xas_error(&xas) == -ENOMEM) { 521 - xas_unlock_irqrestore(&xas, flags); 522 - if (xas_nomem(&xas, gfp)) 523 - xas_set_err(&xas, 0); 524 - xas_lock_irqsave(&xas, flags); 525 - /* 526 - * The xas lock has been released, this memcg 527 - * can be reparented before us. So reload 528 - * memcg id. More details see the comments 529 - * in memcg_reparent_list_lrus(). 530 - */ 531 - index = READ_ONCE(table[i].memcg->kmemcg_id); 532 - if (index < 0) 533 - xas_set_err(&xas, 0); 534 - else if (!xas_error(&xas) && index != xas.xa_index) 535 - xas_set(&xas, index); 536 - goto retry; 504 + xas_set(&xas, pos->kmemcg_id); 505 + do { 506 + xas_lock_irqsave(&xas, flags); 507 + if (!xas_load(&xas) && !css_is_dying(&pos->css)) { 508 + xas_store(&xas, mlru); 509 + if (!xas_error(&xas)) 510 + mlru = NULL; 537 511 } 538 - } 539 - } 540 - /* xas_nomem() is used to free memory instead of memory allocation. 
*/ 541 - if (xas.xa_alloc) 542 - xas_nomem(&xas, gfp); 543 - xas_unlock_irqrestore(&xas, flags); 544 - kfree(table); 512 + xas_unlock_irqrestore(&xas, flags); 513 + } while (xas_nomem(&xas, gfp)); 514 + if (mlru) 515 + kfree(mlru); 516 + } while (pos != memcg && !css_is_dying(&pos->css)); 545 517 546 518 return xas_error(&xas); 547 519 }
+3 -4
mm/zswap.c
··· 709 709 710 710 /* 711 711 * Note that it is safe to use rcu_read_lock() here, even in the face of 712 - * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection 713 - * used in list_lru lookup, only two scenarios are possible: 712 + * concurrent memcg offlining: 714 713 * 715 - * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The 714 + * 1. list_lru_add() is called before list_lru_memcg is erased. The 716 715 * new entry will be reparented to memcg's parent's list_lru. 717 - * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The 716 + * 2. list_lru_add() is called after list_lru_memcg is erased. The 718 717 * new entry will be added directly to memcg's parent's list_lru. 719 718 * 720 719 * Similar reasoning holds for list_lru_del().