Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

hugetlb: add hugetlb.*.numa_stat file

For hugetlb backed jobs/VMs it's critical to understand the numa
information for the memory backing these jobs to deliver optimal
performance.

Currently this technically can be queried from /proc/self/numa_maps, but
there are significant issues with that. Namely:

1. Memory can be mapped or unmapped.

2. numa_maps are per process and need to be aggregated across all
processes in the cgroup. For shared memory this is more involved as
the userspace needs to make sure it doesn't double count shared
mappings.

3. I believe querying numa_maps needs to hold the mmap_lock which adds
to the contention on this lock.

For these reasons I propose simply adding hugetlb.*.numa_stat file,
which shows the numa information of the cgroup similarly to
memory.numa_stat.

On cgroup-v2:
cat /sys/fs/cgroup/unified/test/hugetlb.2MB.numa_stat
total=2097152 N0=2097152 N1=0

On cgroup-v1:
cat /sys/fs/cgroup/hugetlb/test/hugetlb.2MB.numa_stat
total=2097152 N0=2097152 N1=0
hierarchical_total=2097152 N0=2097152 N1=0

This patch was tested manually by allocating hugetlb memory and querying
the hugetlb.*.numa_stat file of the cgroup and its parents.

[colin.i.king@googlemail.com: fix spelling mistake "hierarichal" -> "hierarchical"]
Link: https://lkml.kernel.org/r/20211125090635.23508-1-colin.i.king@gmail.com
[keescook@chromium.org: fix copy/paste array assignment]
Link: https://lkml.kernel.org/r/20211203065647.2819707-1-keescook@chromium.org

Link: https://lkml.kernel.org/r/20211123001020.4083653-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Jue Wang <juew@google.com>
Cc: Yang Yao <ygyao@google.com>
Cc: Joanna Li <joannali@google.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Mina Almasry and committed by
Linus Torvalds
f4776199 c4dc63f0

+141 -12
+4
Documentation/admin-guide/cgroup-v1/hugetlb.rst
··· 29 29 hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded 30 30 hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb 31 31 hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB usage limit 32 + hugetlb.<hugepagesize>.numa_stat # show the numa information of the hugetlb memory charged to this cgroup 32 33 33 34 For a system supporting three hugepage sizes (64k, 32M and 1G), the control 34 35 files include:: 35 36 36 37 hugetlb.1GB.limit_in_bytes 37 38 hugetlb.1GB.max_usage_in_bytes 39 + hugetlb.1GB.numa_stat 38 40 hugetlb.1GB.usage_in_bytes 39 41 hugetlb.1GB.failcnt 40 42 hugetlb.1GB.rsvd.limit_in_bytes ··· 45 43 hugetlb.1GB.rsvd.failcnt 46 44 hugetlb.64KB.limit_in_bytes 47 45 hugetlb.64KB.max_usage_in_bytes 46 + hugetlb.64KB.numa_stat 48 47 hugetlb.64KB.usage_in_bytes 49 48 hugetlb.64KB.failcnt 50 49 hugetlb.64KB.rsvd.limit_in_bytes ··· 54 51 hugetlb.64KB.rsvd.failcnt 55 52 hugetlb.32MB.limit_in_bytes 56 53 hugetlb.32MB.max_usage_in_bytes 54 + hugetlb.32MB.numa_stat 57 55 hugetlb.32MB.usage_in_bytes 58 56 hugetlb.32MB.failcnt 59 57 hugetlb.32MB.rsvd.limit_in_bytes
+5
Documentation/admin-guide/cgroup-v2.rst
··· 2266 2266 are local to the cgroup i.e. not hierarchical. The file modified event 2267 2267 generated on this file reflects only the local events. 2268 2268 2269 + hugetlb.<hugepagesize>.numa_stat 2270 + Similar to memory.numa_stat, it shows the numa information of the 2271 + hugetlb pages of <hugepagesize> in this cgroup. Only active in 2272 + use hugetlb pages are included. The per-node values are in bytes. 2273 + 2269 2274 Misc 2270 2275 ---- 2271 2276
+2 -2
include/linux/hugetlb.h
··· 622 622 #endif 623 623 #ifdef CONFIG_CGROUP_HUGETLB 624 624 /* cgroup control files */ 625 - struct cftype cgroup_files_dfl[7]; 626 - struct cftype cgroup_files_legacy[9]; 625 + struct cftype cgroup_files_dfl[8]; 626 + struct cftype cgroup_files_legacy[10]; 627 627 #endif 628 628 char name[HSTATE_NAME_LEN]; 629 629 };
+7
include/linux/hugetlb_cgroup.h
··· 36 36 HUGETLB_NR_MEMORY_EVENTS, 37 37 }; 38 38 39 + struct hugetlb_cgroup_per_node { 40 + /* hugetlb usage in pages over all hstates. */ 41 + unsigned long usage[HUGE_MAX_HSTATE]; 42 + }; 43 + 39 44 struct hugetlb_cgroup { 40 45 struct cgroup_subsys_state css; 41 46 ··· 62 57 63 58 /* Handle for "hugetlb.events.local" */ 64 59 struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; 60 + 61 + struct hugetlb_cgroup_per_node *nodeinfo[]; 65 62 }; 66 63 67 64 static inline struct hugetlb_cgroup *
+123 -10
mm/hugetlb_cgroup.c
··· 123 123 } 124 124 } 125 125 126 + static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) 127 + { 128 + int node; 129 + 130 + for_each_node(node) 131 + kfree(h_cgroup->nodeinfo[node]); 132 + kfree(h_cgroup); 133 + } 134 + 126 135 static struct cgroup_subsys_state * 127 136 hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 128 137 { 129 138 struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); 130 139 struct hugetlb_cgroup *h_cgroup; 140 + int node; 131 141 132 - h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); 142 + h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), 143 + GFP_KERNEL); 144 + 133 145 if (!h_cgroup) 134 146 return ERR_PTR(-ENOMEM); 135 147 136 148 if (!parent_h_cgroup) 137 149 root_h_cgroup = h_cgroup; 138 150 151 + /* 152 + * TODO: this routine can waste much memory for nodes which will 153 + * never be onlined. It's better to use memory hotplug callback 154 + * function. 155 + */ 156 + for_each_node(node) { 157 + /* Set node_to_alloc to -1 for offline nodes. */ 158 + int node_to_alloc = 159 + node_state(node, N_NORMAL_MEMORY) ? 
node : -1; 160 + h_cgroup->nodeinfo[node] = 161 + kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), 162 + GFP_KERNEL, node_to_alloc); 163 + if (!h_cgroup->nodeinfo[node]) 164 + goto fail_alloc_nodeinfo; 165 + } 166 + 139 167 hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); 140 168 return &h_cgroup->css; 169 + 170 + fail_alloc_nodeinfo: 171 + hugetlb_cgroup_free(h_cgroup); 172 + return ERR_PTR(-ENOMEM); 141 173 } 142 174 143 175 static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) 144 176 { 145 - struct hugetlb_cgroup *h_cgroup; 146 - 147 - h_cgroup = hugetlb_cgroup_from_css(css); 148 - kfree(h_cgroup); 177 + hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); 149 178 } 150 179 151 180 /* ··· 318 289 return; 319 290 320 291 __set_hugetlb_cgroup(page, h_cg, rsvd); 321 - return; 292 + if (!rsvd) { 293 + unsigned long usage = 294 + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; 295 + /* 296 + * This write is not atomic due to fetching usage and writing 297 + * to it, but that's fine because we call this with 298 + * hugetlb_lock held anyway. 299 + */ 300 + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], 301 + usage + nr_pages); 302 + } 322 303 } 323 304 324 305 void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, ··· 367 328 368 329 if (rsvd) 369 330 css_put(&h_cg->css); 370 - 371 - return; 331 + else { 332 + unsigned long usage = 333 + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; 334 + /* 335 + * This write is not atomic due to fetching usage and writing 336 + * to it, but that's fine because we call this with 337 + * hugetlb_lock held anyway. 
338 + */ 339 + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], 340 + usage - nr_pages); 341 + } 372 342 } 373 343 374 344 void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, ··· 465 417 RES_FAILCNT, 466 418 RES_RSVD_FAILCNT, 467 419 }; 420 + 421 + static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) 422 + { 423 + int nid; 424 + struct cftype *cft = seq_cft(seq); 425 + int idx = MEMFILE_IDX(cft->private); 426 + bool legacy = MEMFILE_ATTR(cft->private); 427 + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 428 + struct cgroup_subsys_state *css; 429 + unsigned long usage; 430 + 431 + if (legacy) { 432 + /* Add up usage across all nodes for the non-hierarchical total. */ 433 + usage = 0; 434 + for_each_node_state(nid, N_MEMORY) 435 + usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); 436 + seq_printf(seq, "total=%lu", usage * PAGE_SIZE); 437 + 438 + /* Simply print the per-node usage for the non-hierarchical total. */ 439 + for_each_node_state(nid, N_MEMORY) 440 + seq_printf(seq, " N%d=%lu", nid, 441 + READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * 442 + PAGE_SIZE); 443 + seq_putc(seq, '\n'); 444 + } 445 + 446 + /* 447 + * The hierarchical total is pretty much the value recorded by the 448 + * counter, so use that. 449 + */ 450 + seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", 451 + page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); 452 + 453 + /* 454 + * For each node, transverse the css tree to obtain the hierarchical 455 + * node usage. 
456 + */ 457 + for_each_node_state(nid, N_MEMORY) { 458 + usage = 0; 459 + rcu_read_lock(); 460 + css_for_each_descendant_pre(css, &h_cg->css) { 461 + usage += READ_ONCE(hugetlb_cgroup_from_css(css) 462 + ->nodeinfo[nid] 463 + ->usage[idx]); 464 + } 465 + rcu_read_unlock(); 466 + seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); 467 + } 468 + 469 + seq_putc(seq, '\n'); 470 + 471 + return 0; 472 + } 468 473 469 474 static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, 470 475 struct cftype *cft) ··· 769 668 events_local_file[idx]); 770 669 cft->flags = CFTYPE_NOT_ON_ROOT; 771 670 772 - /* NULL terminate the last cft */ 671 + /* Add the numa stat file */ 773 672 cft = &h->cgroup_files_dfl[6]; 673 + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); 674 + cft->seq_show = hugetlb_cgroup_read_numa_stat; 675 + cft->flags = CFTYPE_NOT_ON_ROOT; 676 + 677 + /* NULL terminate the last cft */ 678 + cft = &h->cgroup_files_dfl[7]; 774 679 memset(cft, 0, sizeof(*cft)); 775 680 776 681 WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, ··· 846 739 cft->write = hugetlb_cgroup_reset; 847 740 cft->read_u64 = hugetlb_cgroup_read_u64; 848 741 849 - /* NULL terminate the last cft */ 742 + /* Add the numa stat file */ 850 743 cft = &h->cgroup_files_legacy[8]; 744 + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); 745 + cft->private = MEMFILE_PRIVATE(idx, 1); 746 + cft->seq_show = hugetlb_cgroup_read_numa_stat; 747 + 748 + /* NULL terminate the last cft */ 749 + cft = &h->cgroup_files_legacy[9]; 851 750 memset(cft, 0, sizeof(*cft)); 852 751 853 752 WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,