mm/list_lru: split the lock to per-cgroup scope

-1

drivers/android/binder_alloc.c

··· 1106 1106 mmput_async(mm); 1107 1107 __free_page(page_to_free); 1108 1108 1109 - spin_lock(lock); 1110 1109 return LRU_REMOVED_RETRY; 1111 1110 1112 1111 err_invalid_vma:

-1

fs/inode.c

··· 934 934 mm_account_reclaimed_pages(reap); 935 935 } 936 936 inode_unpin_lru_isolating(inode); 937 - spin_lock(lru_lock); 938 937 return LRU_RETRY; 939 938 } 940 939

-1

fs/xfs/xfs_qm.c

··· 496 496 trace_xfs_dqreclaim_busy(dqp); 497 497 XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); 498 498 xfs_dqunlock(dqp); 499 - spin_lock(lru_lock); 500 499 return LRU_RETRY; 501 500 } 502 501

+3 -3

include/linux/list_lru.h

··· 32 32 struct list_head list; 33 33 /* may become negative during memcg reparenting */ 34 34 long nr_items; 35 + /* protects all fields above */ 36 + spinlock_t lock; 35 37 }; 36 38 37 39 struct list_lru_memcg { ··· 43 41 }; 44 42 45 43 struct list_lru_node { 46 - /* protects all lists on the node, including per cgroup */ 47 - spinlock_t lock; 48 44 /* global list, used for the root cgroup in cgroup aware lrus */ 49 45 struct list_lru_one lru; 50 - long nr_items; 46 + atomic_long_t nr_items; 51 47 } ____cacheline_aligned_in_smp; 52 48 53 49 struct list_lru {

+124 -92

mm/list_lru.c

··· 61 61 } 62 62 63 63 static inline struct list_lru_one * 64 - list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) 64 + lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, 65 + bool irq, bool skip_empty) 65 66 { 66 67 struct list_lru_one *l; 68 + long nr_items; 69 + 70 + rcu_read_lock(); 67 71 again: 68 72 l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); 69 - if (likely(l)) 70 - return l; 71 - 72 - memcg = parent_mem_cgroup(memcg); 73 + if (likely(l)) { 74 + if (irq) 75 + spin_lock_irq(&l->lock); 76 + else 77 + spin_lock(&l->lock); 78 + nr_items = READ_ONCE(l->nr_items); 79 + if (likely(nr_items != LONG_MIN)) { 80 + WARN_ON(nr_items < 0); 81 + rcu_read_unlock(); 82 + return l; 83 + } 84 + if (irq) 85 + spin_unlock_irq(&l->lock); 86 + else 87 + spin_unlock(&l->lock); 88 + } 89 + /* 90 + * Caller may simply bail out if raced with reparenting or 91 + * may iterate through the list_lru and expect empty slots. 92 + */ 93 + if (skip_empty) { 94 + rcu_read_unlock(); 95 + return NULL; 96 + } 73 97 VM_WARN_ON(!css_is_dying(&memcg->css)); 98 + memcg = parent_mem_cgroup(memcg); 74 99 goto again; 100 + } 101 + 102 + static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) 103 + { 104 + if (irq_off) 105 + spin_unlock_irq(&l->lock); 106 + else 107 + spin_unlock(&l->lock); 75 108 } 76 109 #else 77 110 static void list_lru_register(struct list_lru *lru) ··· 132 99 } 133 100 134 101 static inline struct list_lru_one * 135 - list_lru_from_memcg(struct list_lru *lru, int nid, int idx) 102 + lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, 103 + bool irq, bool skip_empty) 136 104 { 137 - return &lru->node[nid].lru; 105 + struct list_lru_one *l = &lru->node[nid].lru; 106 + 107 + if (irq) 108 + spin_lock_irq(&l->lock); 109 + else 110 + spin_lock(&l->lock); 111 + 112 + return l; 113 + } 114 + 115 + static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) 116 + { 117 + if (irq_off) 118 + spin_unlock_irq(&l->lock); 119 + else 120 + spin_unlock(&l->lock); 138 121 } 139 122 #endif /* CONFIG_MEMCG */ 140 123 141 124 /* The caller must ensure the memcg lifetime. */ 142 125 bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, 143 - struct mem_cgroup *memcg) 126 + struct mem_cgroup *memcg) 144 127 { 145 128 struct list_lru_node *nlru = &lru->node[nid]; 146 129 struct list_lru_one *l; 147 130 148 - spin_lock(&nlru->lock); 131 + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); 132 + if (!l) 133 + return false; 149 134 if (list_empty(item)) { 150 - l = list_lru_from_memcg(lru, nid, memcg); 151 135 list_add_tail(item, &l->list); 152 136 /* Set shrinker bit if the first element was added */ 153 137 if (!l->nr_items++) 154 138 set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); 155 - nlru->nr_items++; 156 - spin_unlock(&nlru->lock); 139 + unlock_list_lru(l, false); 140 + atomic_long_inc(&nlru->nr_items); 157 141 return true; 158 142 } 159 - spin_unlock(&nlru->lock); 143 + unlock_list_lru(l, false); 160 144 return false; 161 145 } 162 146 ··· 196 146 197 147 /* The caller must ensure the memcg lifetime. */ 198 148 bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, 199 - struct mem_cgroup *memcg) 149 + struct mem_cgroup *memcg) 200 150 { 201 151 struct list_lru_node *nlru = &lru->node[nid]; 202 152 struct list_lru_one *l; 203 - 204 - spin_lock(&nlru->lock); 153 + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); 154 + if (!l) 155 + return false; 205 156 if (!list_empty(item)) { 206 - l = list_lru_from_memcg(lru, nid, memcg); 207 157 list_del_init(item); 208 158 l->nr_items--; 209 - nlru->nr_items--; 210 - spin_unlock(&nlru->lock); 159 + unlock_list_lru(l, false); 160 + atomic_long_dec(&nlru->nr_items); 211 161 return true; 212 162 } 213 - spin_unlock(&nlru->lock); 163 + unlock_list_lru(l, false); 214 164 return false; 215 165 } 216 - EXPORT_SYMBOL_GPL(list_lru_del); 217 166 218 167 bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) 219 168 { ··· 269 220 struct list_lru_node *nlru; 270 221 271 222 nlru = &lru->node[nid]; 272 - return nlru->nr_items; 223 + return atomic_long_read(&nlru->nr_items); 273 224 } 274 225 EXPORT_SYMBOL_GPL(list_lru_count_node); 275 226 276 227 static unsigned long 277 - __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, 228 + __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, 278 229 list_lru_walk_cb isolate, void *cb_arg, 279 - unsigned long *nr_to_walk) 230 + unsigned long *nr_to_walk, bool irq_off) 280 231 { 281 232 struct list_lru_node *nlru = &lru->node[nid]; 282 - struct list_lru_one *l; 233 + struct list_lru_one *l = NULL; 283 234 struct list_head *item, *n; 284 235 unsigned long isolated = 0; 285 236 286 237 restart: 287 - l = list_lru_from_memcg_idx(lru, nid, memcg_idx); 238 + l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); 288 239 if (!l) 289 - goto out; 290 - 240 + return isolated; 291 241 list_for_each_safe(item, n, &l->list) { 292 242 enum lru_status ret; 293 243 ··· 298 250 break; 299 251 --*nr_to_walk; 300 252 301 - ret = isolate(item, l, &nlru->lock, cb_arg); 253 + ret = isolate(item, l, &l->lock, cb_arg); 302 254 switch (ret) { 255 + /* 256 + * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru 257 + * lock. List traversal will have to restart from scratch. 258 + */ 259 + case LRU_RETRY: 260 + goto restart; 303 261 case LRU_REMOVED_RETRY: 304 - assert_spin_locked(&nlru->lock); 305 262 fallthrough; 306 263 case LRU_REMOVED: 307 264 isolated++; 308 - nlru->nr_items--; 309 - /* 310 - * If the lru lock has been dropped, our list 311 - * traversal is now invalid and so we have to 312 - * restart from scratch. 313 - */ 265 + atomic_long_dec(&nlru->nr_items); 314 266 if (ret == LRU_REMOVED_RETRY) 315 267 goto restart; 316 268 break; ··· 319 271 break; 320 272 case LRU_SKIP: 321 273 break; 322 - case LRU_RETRY: 323 - /* 324 - * The lru lock has been dropped, our list traversal is 325 - * now invalid and so we have to restart from scratch. 326 - */ 327 - assert_spin_locked(&nlru->lock); 328 - goto restart; 329 274 case LRU_STOP: 330 - assert_spin_locked(&nlru->lock); 331 275 goto out; 332 276 default: 333 277 BUG(); 334 278 } 335 279 } 280 + unlock_list_lru(l, irq_off); 336 281 out: 337 282 return isolated; 338 283 } ··· 335 294 list_lru_walk_cb isolate, void *cb_arg, 336 295 unsigned long *nr_to_walk) 337 296 { 338 - struct list_lru_node *nlru = &lru->node[nid]; 339 - unsigned long ret; 340 - 341 - spin_lock(&nlru->lock); 342 - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, 343 - cb_arg, nr_to_walk); 344 - spin_unlock(&nlru->lock); 345 - return ret; 297 + return __list_lru_walk_one(lru, nid, memcg, isolate, 298 + cb_arg, nr_to_walk, false); 346 299 } 347 300 EXPORT_SYMBOL_GPL(list_lru_walk_one); 348 301 ··· 345 310 list_lru_walk_cb isolate, void *cb_arg, 346 311 unsigned long *nr_to_walk) 347 312 { 348 - struct list_lru_node *nlru = &lru->node[nid]; 349 - unsigned long ret; 350 - 351 - spin_lock_irq(&nlru->lock); 352 - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, 353 - cb_arg, nr_to_walk); 354 - spin_unlock_irq(&nlru->lock); 355 - return ret; 313 + return __list_lru_walk_one(lru, nid, memcg, isolate, 314 + cb_arg, nr_to_walk, true); 356 315 } 357 316 358 317 unsigned long list_lru_walk_node(struct list_lru *lru, int nid, ··· 361 332 #ifdef CONFIG_MEMCG 362 333 if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { 363 334 struct list_lru_memcg *mlru; 335 + struct mem_cgroup *memcg; 364 336 unsigned long index; 365 337 366 338 xa_for_each(&lru->xa, index, mlru) { 367 - struct list_lru_node *nlru = &lru->node[nid]; 368 - 369 - spin_lock(&nlru->lock); 370 - isolated += __list_lru_walk_one(lru, nid, index, 339 + rcu_read_lock(); 340 + memcg = mem_cgroup_from_id(index); 341 + if (!mem_cgroup_tryget(memcg)) { 342 + rcu_read_unlock(); 343 + continue; 344 + } 345 + rcu_read_unlock(); 346 + isolated += __list_lru_walk_one(lru, nid, memcg, 371 347 isolate, cb_arg, 372 - nr_to_walk); 373 - spin_unlock(&nlru->lock); 348 + nr_to_walk, false); 349 + mem_cgroup_put(memcg); 374 350 375 351 if (*nr_to_walk <= 0) 376 352 break; ··· 387 353 } 388 354 EXPORT_SYMBOL_GPL(list_lru_walk_node); 389 355 390 - static void init_one_lru(struct list_lru_one *l) 356 + static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) 391 357 { 392 358 INIT_LIST_HEAD(&l->list); 359 + spin_lock_init(&l->lock); 393 360 l->nr_items = 0; 361 + #ifdef CONFIG_LOCKDEP 362 + if (lru->key) 363 + lockdep_set_class(&l->lock, lru->key); 364 + #endif 394 365 } 395 366 396 367 #ifdef CONFIG_MEMCG 397 - static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) 368 + static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) 398 369 { 399 370 int nid; 400 371 struct list_lru_memcg *mlru; ··· 409 370 return NULL; 410 371 411 372 for_each_node(nid) 412 - init_one_lru(&mlru->node[nid]); 373 + init_one_lru(lru, &mlru->node[nid]); 413 374 414 375 return mlru; 415 376 } ··· 437 398 xas_unlock_irq(&xas); 438 399 } 439 400 440 - static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, 441 - struct list_lru_one *src, 442 - struct mem_cgroup *dst_memcg) 401 + static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, 402 + struct list_lru_one *src, 403 + struct mem_cgroup *dst_memcg) 443 404 { 444 - struct list_lru_node *nlru = &lru->node[nid]; 405 + int dst_idx = dst_memcg->kmemcg_id; 445 406 struct list_lru_one *dst; 446 407 447 - /* 448 - * Since list_lru_{add,del} may be called under an IRQ-safe lock, 449 - * we have to use IRQ-safe primitives here to avoid deadlock. 450 - */ 451 - spin_lock_irq(&nlru->lock); 452 - dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); 408 + spin_lock_irq(&src->lock); 409 + dst = list_lru_from_memcg_idx(lru, nid, dst_idx); 410 + spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); 453 411 454 412 list_splice_init(&src->list, &dst->list); 455 - 456 413 if (src->nr_items) { 457 414 dst->nr_items += src->nr_items; 458 415 set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); 459 - src->nr_items = 0; 460 416 } 461 - spin_unlock_irq(&nlru->lock); 417 + /* Mark the list_lru_one dead */ 418 + src->nr_items = LONG_MIN; 419 + 420 + spin_unlock(&dst->lock); 421 + spin_unlock_irq(&src->lock); 462 422 } 463 423 464 424 void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) ··· 486 448 * safe to reparent. 487 449 */ 488 450 for_each_node(i) 489 - memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); 451 + memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); 490 452 491 453 /* 492 454 * Here all list_lrus corresponding to the cgroup are guaranteed ··· 535 497 parent = parent_mem_cgroup(pos); 536 498 } 537 499 538 - mlru = memcg_init_list_lru_one(gfp); 500 + mlru = memcg_init_list_lru_one(lru, gfp); 539 501 if (!mlru) 540 502 return -ENOMEM; 541 503 xas_set(&xas, pos->kmemcg_id); ··· 582 544 if (!lru->node) 583 545 return -ENOMEM; 584 546 585 - for_each_node(i) { 586 - spin_lock_init(&lru->node[i].lock); 587 - #ifdef CONFIG_LOCKDEP 588 - if (lru->key) 589 - lockdep_set_class(&lru->node[i].lock, lru->key); 590 - #endif 591 - init_one_lru(&lru->node[i].lru); 592 - } 547 + for_each_node(i) 548 + init_one_lru(lru, &lru->node[i].lru); 593 549 594 550 memcg_init_list_lru(lru, memcg_aware); 595 551 list_lru_register(lru);

+6 -1

mm/memcontrol.c

··· 3110 3110 if (!parent) 3111 3111 parent = root_mem_cgroup; 3112 3112 3113 - memcg_reparent_objcgs(memcg, parent); 3114 3113 memcg_reparent_list_lrus(memcg, parent); 3114 + 3115 + /* 3116 + * Objcg's reparenting must be after list_lru's, make sure list_lru 3117 + * helpers won't use parent's list_lru until child is drained. 3118 + */ 3119 + memcg_reparent_objcgs(memcg, parent); 3115 3120 } 3116 3121 3117 3122 #ifdef CONFIG_CGROUP_WRITEBACK

-1

mm/workingset.c

··· 767 767 ret = LRU_REMOVED_RETRY; 768 768 out: 769 769 cond_resched(); 770 - spin_lock_irq(lru_lock); 771 770 return ret; 772 771 } 773 772

+2 -3

mm/zswap.c

··· 711 711 * Note that it is safe to use rcu_read_lock() here, even in the face of 712 712 * concurrent memcg offlining: 713 713 * 714 - * 1. list_lru_add() is called before list_lru_memcg is erased. The 714 + * 1. list_lru_add() is called before list_lru_one is dead. The 715 715 * new entry will be reparented to memcg's parent's list_lru. 716 - * 2. list_lru_add() is called after list_lru_memcg is erased. The 716 + * 2. list_lru_add() is called after list_lru_one is dead. The 717 717 * new entry will be added directly to memcg's parent's list_lru. 718 718 * 719 719 * Similar reasoning holds for list_lru_del(). ··· 1179 1179 zswap_written_back_pages++; 1180 1180 } 1181 1181 1182 - spin_lock(lock); 1183 1182 return ret; 1184 1183 } 1185 1184