
mm/vmalloc: rework the drain logic

The current "lazy drain" model suffers from at least two issues.

The first one is related to the unsorted list of vmap areas: in order
to identify the [min:max] range of areas to be drained, a full list
scan is required, which is time consuming when the list is long.
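For illustration, this is the scan the current __purge_vmap_area_lazy()
performs (a fragment of the code this patch removes; see the mm/vmalloc.c
diff further down):

<snip>
	/*
	 * The purge list is unsorted, so finding the [min:max]
	 * flush range means visiting every outstanding area.
	 */
	llist_for_each_entry(va, valist, purge_list) {
		if (va->va_start < start)
			start = va->va_start;
		if (va->va_end > end)
			end = va->va_end;
	}
<snip>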

The second one, which happens as the next step, is merging all
fragments back into the free space. This is also time consuming
because it has to iterate over the entire list holding the outstanding
lazy areas, as sketched below.
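A simplified fragment of the current merge loop (the real loop in
__purge_vmap_area_lazy() also handles KASAN release, the lazy-page
accounting and rescheduling; see the diff below):

<snip>
	spin_lock(&free_vmap_area_lock);
	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
		/*
		 * Each drained area is merged back into the free
		 * tree/list one by one, all under a single lock.
		 */
		va = merge_or_add_vmap_area(va, &free_vmap_area_root,
					    &free_vmap_area_list);
	}
	spin_unlock(&free_vmap_area_lock);
<snip>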

The "preemptirqsoff" tracer output below illustrates the resulting high
latency, ~24676us. Our workloads, like audio and video, are affected by
such long latencies:

<snip>
tracer: preemptirqsoff

preemptirqsoff latency trace v1.1.5 on 4.9.186-perf+
--------------------------------------------------------------------
latency: 24676 us, #4/4, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 P:8)
-----------------
| task: crtc_commit:112-261 (uid:0 nice:0 policy:1 rt_prio:16)
-----------------
=> started at: __purge_vmap_area_lazy
=> ended at: __purge_vmap_area_lazy

_------=> CPU#
/ _-----=> irqs-off
| / _----=> need-resched
|| / _---=> hardirq/softirq
||| / _--=> preempt-depth
|||| / delay
cmd pid ||||| time | caller
\ / ||||| \ | /
crtc_com-261 1...1 1us*: _raw_spin_lock <-__purge_vmap_area_lazy
[...]
crtc_com-261 1...1 24675us : _raw_spin_unlock <-__purge_vmap_area_lazy
crtc_com-261 1...1 24677us : trace_preempt_on <-__purge_vmap_area_lazy
crtc_com-261 1...1 24683us : <stack trace>
=> free_vmap_area_noflush
=> remove_vm_area
=> __vunmap
=> vfree
=> drm_property_free_blob
=> drm_mode_object_unreference
=> drm_property_unreference_blob
=> __drm_atomic_helper_crtc_destroy_state
=> sde_crtc_destroy_state
=> drm_atomic_state_default_clear
=> drm_atomic_state_clear
=> drm_atomic_state_free
=> complete_commit
=> _msm_drm_commit_work_cb
=> kthread_worker_fn
=> kthread
=> ret_from_fork
<snip>

To address those two issues we can redesign the purging of the
outstanding lazy areas. Instead of queuing vmap areas onto a list,
they are kept in a separate rb-tree with an accompanying address-sorted
list, so an area is located in the tree/list in ascending order. This
gives us the following advantages:

a) Outstanding vmap areas are merged creating bigger coalesced blocks,
thus the space becomes less fragmented.

b) It is possible to calculate the flush range [min:max] without
scanning all elements, i.e. in O(1) time (see the fragment after
this list).

c) The final merge of areas with the rb-tree that represents the
free space is faster because of (a). As a result the lock contention
is also reduced.
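
As an example of (b), the reworked __purge_vmap_area_lazy() derives the
flush range from the first and last entries of the detached,
address-sorted purge list alone (fragment taken from the diff below;
local_pure_list is the detached purge list):

<snip>
	/* O(1): the list is address sorted, so min/max are its ends. */
	start = min(start,
		list_first_entry(&local_pure_list,
			struct vmap_area, list)->va_start);

	end = max(end,
		list_last_entry(&local_pure_list,
			struct vmap_area, list)->va_end);
<snip>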

Link: https://lkml.kernel.org/r/20201116220033.1837-2-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: huang ying <huang.ying.caritas@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by: Uladzislau Rezki (Sony)
Committed by: Linus Torvalds
96e2db45 8945a723

---
 include/linux/vmalloc.h | +3 -5
 mm/vmalloc.c            | +50 -40
 2 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -72,16 +72,14 @@
 	struct list_head list;          /* address sorted list */
 
 	/*
-	 * The following three variables can be packed, because
-	 * a vmap_area object is always one of the three states:
+	 * The following two variables can be packed, because
+	 * a vmap_area object can be either:
 	 *    1) in "free" tree (root is vmap_area_root)
-	 *    2) in "busy" tree (root is free_vmap_area_root)
-	 *    3) in purge list  (head is vmap_purge_list)
+	 *    2) or "busy" tree (root is free_vmap_area_root)
 	 */
 	union {
 		unsigned long subtree_max_size; /* in "free" tree */
 		struct vm_struct *vm;           /* in "busy" tree */
-		struct llist_node purge_list;   /* in purge list */
 	};
 };
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,9 +413,12 @@
 static DEFINE_SPINLOCK(free_vmap_area_lock);
 /* Export for kexec only */
 LIST_HEAD(vmap_area_list);
-static LLIST_HEAD(vmap_purge_list);
 static struct rb_root vmap_area_root = RB_ROOT;
 static bool vmap_initialized __read_mostly;
+
+static struct rb_root purge_vmap_area_root = RB_ROOT;
+static LIST_HEAD(purge_vmap_area_list);
+static DEFINE_SPINLOCK(purge_vmap_area_lock);
 
 /*
  * This kmem_cache is used for vmap_area objects. Instead of
@@ -823,9 +820,16 @@
 	if (!merged)
 		link_va(va, root, parent, link, head);
 
-	/*
-	 * Last step is to check and update the tree.
-	 */
-	augment_tree_propagate_from(va);
+	return va;
+}
+
+static __always_inline struct vmap_area *
+merge_or_add_vmap_area_augment(struct vmap_area *va,
+	struct rb_root *root, struct list_head *head)
+{
+	va = merge_or_add_vmap_area(va, root, head);
+	if (va)
+		augment_tree_propagate_from(va);
+
 	return va;
 }
@@ -1148,6 +1138,6 @@
 	 * Insert/Merge it back to the free tree/list.
 	 */
 	spin_lock(&free_vmap_area_lock);
-	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
 	spin_unlock(&free_vmap_area_lock);
 }
@@ -1336,32 +1326,32 @@
 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 {
 	unsigned long resched_threshold;
-	struct llist_node *valist;
-	struct vmap_area *va;
-	struct vmap_area *n_va;
+	struct list_head local_pure_list;
+	struct vmap_area *va, *n_va;
 
 	lockdep_assert_held(&vmap_purge_lock);
 
-	valist = llist_del_all(&vmap_purge_list);
-	if (unlikely(valist == NULL))
+	spin_lock(&purge_vmap_area_lock);
+	purge_vmap_area_root = RB_ROOT;
+	list_replace_init(&purge_vmap_area_list, &local_pure_list);
+	spin_unlock(&purge_vmap_area_lock);
+
+	if (unlikely(list_empty(&local_pure_list)))
 		return false;
 
-	/*
-	 * TODO: to calculate a flush range without looping.
-	 * The list can be up to lazy_max_pages() elements.
-	 */
-	llist_for_each_entry(va, valist, purge_list) {
-		if (va->va_start < start)
-			start = va->va_start;
-		if (va->va_end > end)
-			end = va->va_end;
-	}
+	start = min(start,
+		list_first_entry(&local_pure_list,
+			struct vmap_area, list)->va_start);
+
+	end = max(end,
+		list_last_entry(&local_pure_list,
+			struct vmap_area, list)->va_end);
 
 	flush_tlb_kernel_range(start, end);
 	resched_threshold = lazy_max_pages() << 1;
 
 	spin_lock(&free_vmap_area_lock);
-	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+	list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 		unsigned long orig_start = va->va_start;
 		unsigned long orig_end = va->va_end;
@@ -1371,8 +1361,8 @@
 		 * detached and there is no need to "unlink" it from
 		 * anything.
 		 */
-		va = merge_or_add_vmap_area(va, &free_vmap_area_root,
-					    &free_vmap_area_list);
+		va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
+			&free_vmap_area_list);
 
 		if (!va)
 			continue;
@@ -1429,9 +1419,15 @@
 	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
 				PAGE_SHIFT, &vmap_lazy_nr);
 
-	/* After this point, we may free va at any time */
-	llist_add(&va->purge_list, &vmap_purge_list);
+	/*
+	 * Merge or place it to the purge tree/list.
+	 */
+	spin_lock(&purge_vmap_area_lock);
+	merge_or_add_vmap_area(va,
+		&purge_vmap_area_root, &purge_vmap_area_list);
+	spin_unlock(&purge_vmap_area_lock);
 
+	/* After this point, we may free va at any time */
 	if (unlikely(nr_lazy > lazy_max_pages()))
 		try_purge_vmap_area_lazy();
 }
@@ -3367,8 +3351,8 @@
 	while (area--) {
 		orig_start = vas[area]->va_start;
 		orig_end = vas[area]->va_end;
-		va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
-					    &free_vmap_area_list);
+		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
 				va->va_start, va->va_end);
@@ -3417,8 +3401,8 @@
 	for (area = 0; area < nr_vms; area++) {
 		orig_start = vas[area]->va_start;
 		orig_end = vas[area]->va_end;
-		va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
-					    &free_vmap_area_list);
+		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
 				va->va_start, va->va_end);
@@ -3498,16 +3482,13 @@
 
 static void show_purge_info(struct seq_file *m)
 {
-	struct llist_node *head;
 	struct vmap_area *va;
 
-	head = READ_ONCE(vmap_purge_list.first);
-	if (head == NULL)
-		return;
-
-	llist_for_each_entry(va, head, purge_list) {
+	spin_lock(&purge_vmap_area_lock);
+	list_for_each_entry(va, &purge_vmap_area_list, list) {
 		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
 			(void *)va->va_start, (void *)va->va_end,
 			va->va_end - va->va_start);
 	}
+	spin_unlock(&purge_vmap_area_lock);
 }
@@ -3564,10 +3551,7 @@
 	seq_putc(m, '\n');
 
 	/*
-	 * As a final step, dump "unpurged" areas. Note,
-	 * that entire "/proc/vmallocinfo" output will not
-	 * be address sorted, because the purge list is not
-	 * sorted.
+	 * As a final step, dump "unpurged" areas.
 	 */
 	if (list_is_last(&va->list, &vmap_area_list))
 		show_purge_info(m);