Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: vmalloc: improve description of vmap node layer

This patch adds extra explanation of recently added vmap node layer based
on community feedback. No functional change.

Link: https://lkml.kernel.org/r/20240124180920.50725-1-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Uladzislau Rezki (Sony) and committed by
Andrew Morton
15e02a39 7679ba6b

+46 -14
+46 -14
mm/vmalloc.c
··· 765 765 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); 766 766 767 767 /* 768 - * An effective vmap-node logic. Users make use of nodes instead 769 - * of a global heap. It allows to balance an access and mitigate 770 - * contention. 768 + * This structure defines a single, solid model where a list and 769 + * rb-tree are part of one entity protected by the lock. Nodes are 770 + * sorted in ascending order, thus for O(1) access to left/right 771 + * neighbors a list is used as well as for sequential traversal. 771 772 */ 772 773 struct rb_list { 773 774 struct rb_root root; ··· 776 775 spinlock_t lock; 777 776 }; 778 777 778 + /* 779 + * A fast size storage contains VAs up to 1M size. A pool consists 780 + * of linked between each other ready to go VAs of certain sizes. 781 + * An index in the pool-array corresponds to number of pages + 1. 782 + */ 783 + #define MAX_VA_SIZE_PAGES 256 784 + 779 785 struct vmap_pool { 780 786 struct list_head head; 781 787 unsigned long len; 782 788 }; 783 789 784 790 /* 785 - * A fast size storage contains VAs up to 1M size. 791 + * An effective vmap-node logic. Users make use of nodes instead 792 + * of a global heap. It allows to balance an access and mitigate 793 + * contention. 786 794 */ 787 - #define MAX_VA_SIZE_PAGES 256 788 - 789 795 static struct vmap_node { 790 796 /* Simple size segregated storage. */ 791 797 struct vmap_pool pool[MAX_VA_SIZE_PAGES]; ··· 811 803 unsigned long nr_purged; 812 804 } single; 813 805 806 + /* 807 + * Initial setup consists of one single node, i.e. a balancing 808 + * is fully disabled. Later on, after vmap is initialized these 809 + * parameters are updated based on a system capacity. 810 + */ 814 811 static struct vmap_node *vmap_nodes = &single; 815 812 static __read_mostly unsigned int nr_vmap_nodes = 1; 816 813 static __read_mostly unsigned int vmap_zone_size = 1; ··· 2061 2048 } 2062 2049 } 2063 2050 2064 - /* Attach the pool back if it has been partly decayed. */ 2051 + /* 2052 + * Attach the pool back if it has been partly decayed. 2053 + * Please note, it is supposed that nobody(other contexts) 2054 + * can populate the pool therefore a simple list replace 2055 + * operation takes place here. 2056 + */ 2065 2057 if (!full_decay && !list_empty(&tmp_list)) { 2066 2058 spin_lock(&vn->pool_lock); 2067 2059 list_replace_init(&tmp_list, &vn->pool[i].head); ··· 2275 2257 * An addr_to_node_id(addr) converts an address to a node index 2276 2258 * where a VA is located. If VA spans several zones and passed 2277 2259 * addr is not the same as va->va_start, what is not common, we 2278 - * may need to scan an extra nodes. See an example: 2260 + * may need to scan extra nodes. See an example: 2279 2261 * 2280 - * <--va--> 2262 + * <----va----> 2281 2263 * -|-----|-----|-----|-----|- 2282 2264 * 1 2 0 1 2283 2265 * 2284 - * VA resides in node 1 whereas it spans 1 and 2. If passed 2285 - * addr is within a second node we should do extra work. We 2286 - * should mention that it is rare and is a corner case from 2287 - * the other hand it has to be covered. 2266 + * VA resides in node 1 whereas it spans 1, 2 an 0. If passed 2267 + * addr is within 2 or 0 nodes we should do extra work. 2288 2268 */ 2289 2269 i = j = addr_to_node_id(addr); 2290 2270 do { ··· 2305 2289 struct vmap_area *va; 2306 2290 int i, j; 2307 2291 2292 + /* 2293 + * Check the comment in the find_vmap_area() about the loop. 2294 + */ 2308 2295 i = j = addr_to_node_id(addr); 2309 2296 do { 2310 2297 vn = &vmap_nodes[i]; ··· 4901 4882 int i, n; 4902 4883 4903 4884 #if BITS_PER_LONG == 64 4904 - /* A high threshold of max nodes is fixed and bound to 128. */ 4885 + /* 4886 + * A high threshold of max nodes is fixed and bound to 128, 4887 + * thus a scale factor is 1 for systems where number of cores 4888 + * are less or equal to specified threshold. 4889 + * 4890 + * As for NUMA-aware notes. For bigger systems, for example 4891 + * NUMA with multi-sockets, where we can end-up with thousands 4892 + * of cores in total, a "sub-numa-clustering" should be added. 4893 + * 4894 + * In this case a NUMA domain is considered as a single entity 4895 + * with dedicated sub-nodes in it which describe one group or 4896 + * set of cores. Therefore a per-domain purging is supposed to 4897 + * be added as well as a per-domain balancing. 4898 + */ 4905 4899 n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); 4906 4900 4907 4901 if (n > 1) {