Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

hugetlb_cgroup: add accounting for shared mappings

For shared mappings, the pointer to the hugetlb_cgroup to uncharge lives
in the resv_map entries, in file_region->reservation_counter.

After a call to region_chg, we charge the appropriate hugetlb_cgroup, and
if successful, we pass on the hugetlb_cgroup info to a follow up
region_add call. When a file_region entry is added to the resv_map via
region_add, we put the pointer to that cgroup in
file_region->reservation_counter. If charging doesn't succeed, we report
the error to the caller, so that the kernel fails the reservation.

On region_del, which is when the hugetlb memory is unreserved, we also
uncharge the file_region->reservation_counter.

[akpm@linux-foundation.org: forward declare struct file_region]
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Sandipan Das <sandipan@linux.ibm.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Link: http://lkml.kernel.org/r/20200211213128.73302-5-almasrymina@google.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Mina Almasry and committed by
Linus Torvalds
075a61d0 0db9d74e

+155 -54
+35
include/linux/hugetlb.h
··· 57 57 struct cgroup_subsys_state *css; 58 58 #endif 59 59 }; 60 + 61 + /* 62 + * Region tracking -- allows tracking of reservations and instantiated pages 63 + * across the pages in a mapping. 64 + * 65 + * The region data structures are embedded into a resv_map and protected 66 + * by a resv_map's lock. The set of regions within the resv_map represent 67 + * reservations for huge pages, or huge pages that have already been 68 + * instantiated within the map. The from and to elements are huge page 69 + * indicies into the associated mapping. from indicates the starting index 70 + * of the region. to represents the first index past the end of the region. 71 + * 72 + * For example, a file region structure with from == 0 and to == 4 represents 73 + * four huge pages in a mapping. It is important to note that the to element 74 + * represents the first element past the end of the region. This is used in 75 + * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. 76 + * 77 + * Interval notation of the form [from, to) will be used to indicate that 78 + * the endpoint from is inclusive and to is exclusive. 79 + */ 80 + struct file_region { 81 + struct list_head link; 82 + long from; 83 + long to; 84 + #ifdef CONFIG_CGROUP_HUGETLB 85 + /* 86 + * On shared mappings, each reserved region appears as a struct 87 + * file_region in resv_map. These fields hold the info needed to 88 + * uncharge each reservation. 89 + */ 90 + struct page_counter *reservation_counter; 91 + struct cgroup_subsys_state *css; 92 + #endif 93 + }; 94 + 60 95 extern struct resv_map *resv_map_alloc(void); 61 96 void resv_map_release(struct kref *ref); 62 97
+11
include/linux/hugetlb_cgroup.h
··· 19 19 20 20 struct hugetlb_cgroup; 21 21 struct resv_map; 22 + struct file_region; 22 23 23 24 /* 24 25 * Minimum page order trackable by hugetlb cgroup. ··· 136 135 unsigned long start, 137 136 unsigned long end); 138 137 138 + extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, 139 + struct file_region *rg, 140 + unsigned long nr_pages); 141 + 139 142 extern void hugetlb_cgroup_file_init(void) __init; 140 143 extern void hugetlb_cgroup_migrate(struct page *oldhpage, 141 144 struct page *newhpage); 142 145 143 146 #else 147 + static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, 148 + struct file_region *rg, 149 + unsigned long nr_pages) 150 + { 151 + } 152 + 144 153 static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) 145 154 { 146 155 return NULL;
+94 -54
mm/hugetlb.c
··· 220 220 return subpool_inode(file_inode(vma->vm_file)); 221 221 } 222 222 223 - /* 224 - * Region tracking -- allows tracking of reservations and instantiated pages 225 - * across the pages in a mapping. 226 - * 227 - * The region data structures are embedded into a resv_map and protected 228 - * by a resv_map's lock. The set of regions within the resv_map represent 229 - * reservations for huge pages, or huge pages that have already been 230 - * instantiated within the map. The from and to elements are huge page 231 - * indicies into the associated mapping. from indicates the starting index 232 - * of the region. to represents the first index past the end of the region. 233 - * 234 - * For example, a file region structure with from == 0 and to == 4 represents 235 - * four huge pages in a mapping. It is important to note that the to element 236 - * represents the first element past the end of the region. This is used in 237 - * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. 238 - * 239 - * Interval notation of the form [from, to) will be used to indicate that 240 - * the endpoint from is inclusive and to is exclusive. 241 - */ 242 - struct file_region { 243 - struct list_head link; 244 - long from; 245 - long to; 246 - }; 247 - 248 223 /* Helper that removes a struct file_region from the resv_map cache and returns 249 224 * it for use. 250 225 */ ··· 241 266 return nrg; 242 267 } 243 268 269 + static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, 270 + struct file_region *rg) 271 + { 272 + #ifdef CONFIG_CGROUP_HUGETLB 273 + nrg->reservation_counter = rg->reservation_counter; 274 + nrg->css = rg->css; 275 + if (rg->css) 276 + css_get(rg->css); 277 + #endif 278 + } 279 + 280 + /* Helper that records hugetlb_cgroup uncharge info. 
*/ 281 + static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, 282 + struct hstate *h, 283 + struct resv_map *resv, 284 + struct file_region *nrg) 285 + { 286 + #ifdef CONFIG_CGROUP_HUGETLB 287 + if (h_cg) { 288 + nrg->reservation_counter = 289 + &h_cg->rsvd_hugepage[hstate_index(h)]; 290 + nrg->css = &h_cg->css; 291 + if (!resv->pages_per_hpage) 292 + resv->pages_per_hpage = pages_per_huge_page(h); 293 + /* pages_per_hpage should be the same for all entries in 294 + * a resv_map. 295 + */ 296 + VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); 297 + } else { 298 + nrg->reservation_counter = NULL; 299 + nrg->css = NULL; 300 + } 301 + #endif 302 + } 303 + 244 304 /* Must be called with resv->lock held. Calling this with count_only == true 245 305 * will count the number of pages to be added but will not modify the linked 246 306 * list. If regions_needed != NULL and count_only == true, then regions_needed ··· 283 273 * add the regions for this range. 284 274 */ 285 275 static long add_reservation_in_range(struct resv_map *resv, long f, long t, 286 - long *regions_needed, bool count_only) 276 + struct hugetlb_cgroup *h_cg, 277 + struct hstate *h, long *regions_needed, 278 + bool count_only) 287 279 { 288 280 long add = 0; 289 281 struct list_head *head = &resv->regions; ··· 324 312 if (!count_only) { 325 313 nrg = get_file_region_entry_from_cache( 326 314 resv, last_accounted_offset, rg->from); 315 + record_hugetlb_cgroup_uncharge_info(h_cg, h, 316 + resv, nrg); 327 317 list_add(&nrg->link, rg->link.prev); 328 318 } else if (regions_needed) 329 319 *regions_needed += 1; ··· 342 328 if (!count_only) { 343 329 nrg = get_file_region_entry_from_cache( 344 330 resv, last_accounted_offset, t); 331 + record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); 345 332 list_add(&nrg->link, rg->link.prev); 346 333 } else if (regions_needed) 347 334 *regions_needed += 1; ··· 431 416 * 1 page will only require at most 1 entry. 
432 417 */ 433 418 static long region_add(struct resv_map *resv, long f, long t, 434 - long in_regions_needed) 419 + long in_regions_needed, struct hstate *h, 420 + struct hugetlb_cgroup *h_cg) 435 421 { 436 422 long add = 0, actual_regions_needed = 0; 437 423 ··· 440 424 retry: 441 425 442 426 /* Count how many regions are actually needed to execute this add. */ 443 - add_reservation_in_range(resv, f, t, &actual_regions_needed, true); 427 + add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed, 428 + true); 444 429 445 430 /* 446 431 * Check for sufficient descriptors in the cache to accommodate ··· 469 452 goto retry; 470 453 } 471 454 472 - add = add_reservation_in_range(resv, f, t, NULL, false); 455 + add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false); 473 456 474 457 resv->adds_in_progress -= in_regions_needed; 475 458 ··· 506 489 spin_lock(&resv->lock); 507 490 508 491 /* Count how many hugepages in this range are NOT respresented. */ 509 - chg = add_reservation_in_range(resv, f, t, out_regions_needed, true); 492 + chg = add_reservation_in_range(resv, f, t, NULL, NULL, 493 + out_regions_needed, true); 510 494 511 495 if (*out_regions_needed == 0) 512 496 *out_regions_needed = 1; ··· 607 589 /* New entry for end of split region */ 608 590 nrg->from = t; 609 591 nrg->to = rg->to; 592 + 593 + copy_hugetlb_cgroup_uncharge_info(nrg, rg); 594 + 610 595 INIT_LIST_HEAD(&nrg->link); 611 596 612 597 /* Original entry is trimmed */ 613 598 rg->to = f; 599 + 600 + hugetlb_cgroup_uncharge_file_region( 601 + resv, rg, nrg->to - nrg->from); 614 602 615 603 list_add(&nrg->link, &rg->link); 616 604 nrg = NULL; ··· 625 601 626 602 if (f <= rg->from && t >= rg->to) { /* Remove entire region */ 627 603 del += rg->to - rg->from; 604 + hugetlb_cgroup_uncharge_file_region(resv, rg, 605 + rg->to - rg->from); 628 606 list_del(&rg->link); 629 607 kfree(rg); 630 608 continue; ··· 635 609 if (f <= rg->from) { /* Trim beginning of region */ 636 610 del += 
t - rg->from; 637 611 rg->from = t; 612 + 613 + hugetlb_cgroup_uncharge_file_region(resv, rg, 614 + t - rg->from); 638 615 } else { /* Trim end of region */ 639 616 del += rg->to - f; 640 617 rg->to = f; 618 + 619 + hugetlb_cgroup_uncharge_file_region(resv, rg, 620 + rg->to - f); 641 621 } 642 622 } 643 623 ··· 2156 2124 VM_BUG_ON(dummy_out_regions_needed != 1); 2157 2125 break; 2158 2126 case VMA_COMMIT_RESV: 2159 - ret = region_add(resv, idx, idx + 1, 1); 2127 + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2160 2128 /* region_add calls of range 1 should never fail. */ 2161 2129 VM_BUG_ON(ret < 0); 2162 2130 break; ··· 2166 2134 break; 2167 2135 case VMA_ADD_RESV: 2168 2136 if (vma->vm_flags & VM_MAYSHARE) { 2169 - ret = region_add(resv, idx, idx + 1, 1); 2137 + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); 2170 2138 /* region_add calls of range 1 should never fail. */ 2171 2139 VM_BUG_ON(ret < 0); 2172 2140 } else { ··· 4862 4830 struct hstate *h = hstate_inode(inode); 4863 4831 struct hugepage_subpool *spool = subpool_inode(inode); 4864 4832 struct resv_map *resv_map; 4865 - struct hugetlb_cgroup *h_cg; 4833 + struct hugetlb_cgroup *h_cg = NULL; 4866 4834 long gbl_reserve, regions_needed = 0; 4867 4835 4868 4836 /* This should never happen */ ··· 4903 4871 4904 4872 chg = to - from; 4905 4873 4906 - if (hugetlb_cgroup_charge_cgroup_rsvd( 4907 - hstate_index(h), chg * pages_per_huge_page(h), 4908 - &h_cg)) { 4909 - kref_put(&resv_map->refs, resv_map_release); 4910 - return -ENOMEM; 4911 - } 4912 - 4913 - /* 4914 - * Since this branch handles private mappings, we attach the 4915 - * counter to uncharge for this reservation off resv_map. 
4916 - */ 4917 - resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); 4918 - 4919 4874 set_vma_resv_map(vma, resv_map); 4920 4875 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 4921 4876 } ··· 4910 4891 if (chg < 0) { 4911 4892 ret = chg; 4912 4893 goto out_err; 4894 + } 4895 + 4896 + ret = hugetlb_cgroup_charge_cgroup_rsvd( 4897 + hstate_index(h), chg * pages_per_huge_page(h), &h_cg); 4898 + 4899 + if (ret < 0) { 4900 + ret = -ENOMEM; 4901 + goto out_err; 4902 + } 4903 + 4904 + if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { 4905 + /* For private mappings, the hugetlb_cgroup uncharge info hangs 4906 + * of the resv_map. 4907 + */ 4908 + resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); 4913 4909 } 4914 4910 4915 4911 /* ··· 4935 4901 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 4936 4902 if (gbl_reserve < 0) { 4937 4903 ret = -ENOSPC; 4938 - goto out_err; 4904 + goto out_uncharge_cgroup; 4939 4905 } 4940 4906 4941 4907 /* ··· 4944 4910 */ 4945 4911 ret = hugetlb_acct_memory(h, gbl_reserve); 4946 4912 if (ret < 0) { 4947 - /* put back original number of pages, chg */ 4948 - (void)hugepage_subpool_put_pages(spool, chg); 4949 - goto out_err; 4913 + goto out_put_pages; 4950 4914 } 4951 4915 4952 4916 /* ··· 4959 4927 * else has to be done for private mappings here 4960 4928 */ 4961 4929 if (!vma || vma->vm_flags & VM_MAYSHARE) { 4962 - add = region_add(resv_map, from, to, regions_needed); 4930 + add = region_add(resv_map, from, to, regions_needed, h, h_cg); 4963 4931 4964 4932 if (unlikely(add < 0)) { 4965 4933 hugetlb_acct_memory(h, -gbl_reserve); 4966 - /* put back original number of pages, chg */ 4967 - (void)hugepage_subpool_put_pages(spool, chg); 4968 - goto out_err; 4934 + goto out_put_pages; 4969 4935 } else if (unlikely(chg > add)) { 4970 4936 /* 4971 4937 * pages in this range were added to the reserve ··· 4974 4944 */ 4975 4945 long rsv_adjust; 4976 4946 4947 + hugetlb_cgroup_uncharge_cgroup_rsvd( 4948 + hstate_index(h), 4949 + 
(chg - add) * pages_per_huge_page(h), h_cg); 4950 + 4977 4951 rsv_adjust = hugepage_subpool_put_pages(spool, 4978 4952 chg - add); 4979 4953 hugetlb_acct_memory(h, -rsv_adjust); 4980 4954 } 4981 4955 } 4982 4956 return 0; 4957 + out_put_pages: 4958 + /* put back original number of pages, chg */ 4959 + (void)hugepage_subpool_put_pages(spool, chg); 4960 + out_uncharge_cgroup: 4961 + hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), 4962 + chg * pages_per_huge_page(h), h_cg); 4983 4963 out_err: 4984 4964 if (!vma || vma->vm_flags & VM_MAYSHARE) 4985 4965 /* Only call region_abort if the region_chg succeeded but the
+15
mm/hugetlb_cgroup.c
··· 391 391 css_put(resv->css); 392 392 } 393 393 394 + void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, 395 + struct file_region *rg, 396 + unsigned long nr_pages) 397 + { 398 + if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) 399 + return; 400 + 401 + if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 && 402 + !resv->reservation_counter) { 403 + page_counter_uncharge(rg->reservation_counter, 404 + nr_pages * resv->pages_per_hpage); 405 + css_put(rg->css); 406 + } 407 + } 408 + 394 409 enum { 395 410 RES_USAGE, 396 411 RES_RSVD_USAGE,