Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

hugetlb: disable region_add file_region coalescing

A follow-up patch in this series adds hugetlb cgroup uncharge info to the
file_region entries in resv->regions. The cgroup uncharge info may differ
for different regions, so they can no longer be coalesced at region_add
time. So, disable region coalescing in region_add in this patch.

Behavior change:

Say a resv_map exists like this [0->1], [2->3], and [5->6].

Then a region_chg/add call comes in region_chg/add(f=0, t=5).

Old code would generate resv->regions: [0->5], [5->6].
New code would generate resv->regions: [0->1], [1->2], [2->3], [3->5],
[5->6].

Special care needs to be taken to handle the resv->adds_in_progress
variable correctly. In the past, only 1 region would be added for every
region_chg and region_add call. But now, each call may add multiple
regions, so we can no longer increment adds_in_progress by 1 in
region_chg, or decrement adds_in_progress by 1 after region_add or
region_abort. Instead, region_chg calls add_reservation_in_range() to
count the number of regions needed and allocates those, and that info is
passed to region_add and region_abort to decrement adds_in_progress
correctly.

We've also modified the assumption that region_add after region_chg never
fails. region_chg now pre-allocates at least 1 region for region_add. If
region_add needs more regions than region_chg has allocated for it, then
it may fail.

[almasrymina@google.com: fix file_region entry allocations]
Link: http://lkml.kernel.org/r/20200219012736.20363-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Sandipan Das <sandipan@linux.ibm.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Link: http://lkml.kernel.org/r/20200211213128.73302-4-almasrymina@google.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Mina Almasry and committed by
Linus Torvalds
0db9d74e e9fe92ae

+227 -107
+227 -107
mm/hugetlb.c
··· 245 245 long to; 246 246 }; 247 247 248 + /* Helper that removes a struct file_region from the resv_map cache and returns 249 + * it for use. 250 + */ 251 + static struct file_region * 252 + get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) 253 + { 254 + struct file_region *nrg = NULL; 255 + 256 + VM_BUG_ON(resv->region_cache_count <= 0); 257 + 258 + resv->region_cache_count--; 259 + nrg = list_first_entry(&resv->region_cache, struct file_region, link); 260 + VM_BUG_ON(!nrg); 261 + list_del(&nrg->link); 262 + 263 + nrg->from = from; 264 + nrg->to = to; 265 + 266 + return nrg; 267 + } 268 + 248 269 /* Must be called with resv->lock held. Calling this with count_only == true 249 270 * will count the number of pages to be added but will not modify the linked 250 - * list. 271 + * list. If regions_needed != NULL and count_only == true, then regions_needed 272 + * will indicate the number of file_regions needed in the cache to carry out to 273 + * add the regions for this range. 251 274 */ 252 275 static long add_reservation_in_range(struct resv_map *resv, long f, long t, 253 - bool count_only) 276 + long *regions_needed, bool count_only) 254 277 { 255 - long chg = 0; 278 + long add = 0; 256 279 struct list_head *head = &resv->regions; 280 + long last_accounted_offset = f; 257 281 struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; 258 282 259 - /* Locate the region we are before or in. */ 260 - list_for_each_entry(rg, head, link) 261 - if (f <= rg->to) 262 - break; 283 + if (regions_needed) 284 + *regions_needed = 0; 263 285 264 - /* Round our left edge to the current segment if it encloses us. */ 265 - if (f > rg->from) 266 - f = rg->from; 286 + /* In this loop, we essentially handle an entry for the range 287 + * [last_accounted_offset, rg->from), at every iteration, with some 288 + * bounds checking. 289 + */ 290 + list_for_each_entry_safe(rg, trg, head, link) { 291 + /* Skip irrelevant regions that start before our range. 
*/ 292 + if (rg->from < f) { 293 + /* If this region ends after the last accounted offset, 294 + * then we need to update last_accounted_offset. 295 + */ 296 + if (rg->to > last_accounted_offset) 297 + last_accounted_offset = rg->to; 298 + continue; 299 + } 267 300 268 - chg = t - f; 269 - 270 - /* Check for and consume any regions we now overlap with. */ 271 - nrg = rg; 272 - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 273 - if (&rg->link == head) 274 - break; 301 + /* When we find a region that starts beyond our range, we've 302 + * finished. 303 + */ 275 304 if (rg->from > t) 276 305 break; 277 306 278 - /* We overlap with this area, if it extends further than 279 - * us then we must extend ourselves. Account for its 280 - * existing reservation. 307 + /* Add an entry for last_accounted_offset -> rg->from, and 308 + * update last_accounted_offset. 281 309 */ 282 - if (rg->to > t) { 283 - chg += rg->to - t; 284 - t = rg->to; 310 + if (rg->from > last_accounted_offset) { 311 + add += rg->from - last_accounted_offset; 312 + if (!count_only) { 313 + nrg = get_file_region_entry_from_cache( 314 + resv, last_accounted_offset, rg->from); 315 + list_add(&nrg->link, rg->link.prev); 316 + } else if (regions_needed) 317 + *regions_needed += 1; 285 318 } 286 - chg -= rg->to - rg->from; 287 319 288 - if (!count_only && rg != nrg) { 320 + last_accounted_offset = rg->to; 321 + } 322 + 323 + /* Handle the case where our range extends beyond 324 + * last_accounted_offset. 325 + */ 326 + if (last_accounted_offset < t) { 327 + add += t - last_accounted_offset; 328 + if (!count_only) { 329 + nrg = get_file_region_entry_from_cache( 330 + resv, last_accounted_offset, t); 331 + list_add(&nrg->link, rg->link.prev); 332 + } else if (regions_needed) 333 + *regions_needed += 1; 334 + } 335 + 336 + VM_BUG_ON(add < 0); 337 + return add; 338 + } 339 + 340 + /* Must be called with resv->lock acquired. Will drop lock to allocate entries. 
341 + */ 342 + static int allocate_file_region_entries(struct resv_map *resv, 343 + int regions_needed) 344 + __must_hold(&resv->lock) 345 + { 346 + struct list_head allocated_regions; 347 + int to_allocate = 0, i = 0; 348 + struct file_region *trg = NULL, *rg = NULL; 349 + 350 + VM_BUG_ON(regions_needed < 0); 351 + 352 + INIT_LIST_HEAD(&allocated_regions); 353 + 354 + /* 355 + * Check for sufficient descriptors in the cache to accommodate 356 + * the number of in progress add operations plus regions_needed. 357 + * 358 + * This is a while loop because when we drop the lock, some other call 359 + * to region_add or region_del may have consumed some region_entries, 360 + * so we keep looping here until we finally have enough entries for 361 + * (adds_in_progress + regions_needed). 362 + */ 363 + while (resv->region_cache_count < 364 + (resv->adds_in_progress + regions_needed)) { 365 + to_allocate = resv->adds_in_progress + regions_needed - 366 + resv->region_cache_count; 367 + 368 + /* At this point, we should have enough entries in the cache 369 + * for all the existing adds_in_progress. We should only be 370 + * needing to allocate for regions_needed. 
371 + */ 372 + VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); 373 + 374 + spin_unlock(&resv->lock); 375 + for (i = 0; i < to_allocate; i++) { 376 + trg = kmalloc(sizeof(*trg), GFP_KERNEL); 377 + if (!trg) 378 + goto out_of_memory; 379 + list_add(&trg->link, &allocated_regions); 380 + } 381 + 382 + spin_lock(&resv->lock); 383 + 384 + list_for_each_entry_safe(rg, trg, &allocated_regions, link) { 289 385 list_del(&rg->link); 290 - kfree(rg); 386 + list_add(&rg->link, &resv->region_cache); 387 + resv->region_cache_count++; 291 388 } 292 389 } 293 390 294 - if (!count_only) { 295 - nrg->from = f; 296 - nrg->to = t; 297 - } 391 + return 0; 298 392 299 - return chg; 393 + out_of_memory: 394 + list_for_each_entry_safe(rg, trg, &allocated_regions, link) { 395 + list_del(&rg->link); 396 + kfree(rg); 397 + } 398 + return -ENOMEM; 300 399 } 301 400 302 401 /* 303 402 * Add the huge page range represented by [f, t) to the reserve 304 - * map. Existing regions will be expanded to accommodate the specified 305 - * range, or a region will be taken from the cache. Sufficient regions 306 - * must exist in the cache due to the previous call to region_chg with 307 - * the same range. 403 + * map. Regions will be taken from the cache to fill in this range. 404 + * Sufficient regions should exist in the cache due to the previous 405 + * call to region_chg with the same range, but in some cases the cache will not 406 + * have sufficient entries due to races with other code doing region_add or 407 + * region_del. The extra needed entries will be allocated. 308 408 * 309 - * Return the number of new huge pages added to the map. This 310 - * number is greater than or equal to zero. 409 + * regions_needed is the out value provided by a previous call to region_chg. 410 + * 411 + * Return the number of new huge pages added to the map. This number is greater 412 + * than or equal to zero. 
If file_region entries needed to be allocated for 413 + * this operation and we were not able to allocate, it returns -ENOMEM. 414 + * region_add of regions of length 1 never allocate file_regions and cannot 415 + * fail; region_chg will always allocate at least 1 entry and a region_add for 416 + * 1 page will only require at most 1 entry. 311 417 */ 312 - static long region_add(struct resv_map *resv, long f, long t) 418 + static long region_add(struct resv_map *resv, long f, long t, 419 + long in_regions_needed) 313 420 { 314 - struct list_head *head = &resv->regions; 315 - struct file_region *rg, *nrg; 316 - long add = 0; 421 + long add = 0, actual_regions_needed = 0; 317 422 318 423 spin_lock(&resv->lock); 319 - /* Locate the region we are either in or before. */ 320 - list_for_each_entry(rg, head, link) 321 - if (f <= rg->to) 322 - break; 424 + retry: 425 + 426 + /* Count how many regions are actually needed to execute this add. */ 427 + add_reservation_in_range(resv, f, t, &actual_regions_needed, true); 323 428 324 429 /* 325 - * If no region exists which can be expanded to include the 326 - * specified range, pull a region descriptor from the cache 327 - * and use it for this range. 430 + * Check for sufficient descriptors in the cache to accommodate 431 + * this add operation. Note that actual_regions_needed may be greater 432 + * than in_regions_needed, as the resv_map may have been modified since 433 + * the region_chg call. In this case, we need to make sure that we 434 + * allocate extra entries, such that we have enough for all the 435 + * existing adds_in_progress, plus the excess needed for this 436 + * operation. 
328 437 */ 329 - if (&rg->link == head || t < rg->from) { 330 - VM_BUG_ON(resv->region_cache_count <= 0); 438 + if (actual_regions_needed > in_regions_needed && 439 + resv->region_cache_count < 440 + resv->adds_in_progress + 441 + (actual_regions_needed - in_regions_needed)) { 442 + /* region_add operation of range 1 should never need to 443 + * allocate file_region entries. 444 + */ 445 + VM_BUG_ON(t - f <= 1); 331 446 332 - resv->region_cache_count--; 333 - nrg = list_first_entry(&resv->region_cache, struct file_region, 334 - link); 335 - list_del(&nrg->link); 447 + if (allocate_file_region_entries( 448 + resv, actual_regions_needed - in_regions_needed)) { 449 + return -ENOMEM; 450 + } 336 451 337 - nrg->from = f; 338 - nrg->to = t; 339 - list_add(&nrg->link, rg->link.prev); 340 - 341 - add += t - f; 342 - goto out_locked; 452 + goto retry; 343 453 } 344 454 345 - add = add_reservation_in_range(resv, f, t, false); 455 + add = add_reservation_in_range(resv, f, t, NULL, false); 346 456 347 - out_locked: 348 - resv->adds_in_progress--; 457 + resv->adds_in_progress -= in_regions_needed; 458 + 349 459 spin_unlock(&resv->lock); 350 460 VM_BUG_ON(add < 0); 351 461 return add; ··· 468 358 * call to region_add that will actually modify the reserve 469 359 * map to add the specified range [f, t). region_chg does 470 360 * not change the number of huge pages represented by the 471 - * map. A new file_region structure is added to the cache 472 - * as a placeholder, so that the subsequent region_add 473 - * call will have all the regions it needs and will not fail. 361 + * map. A number of new file_region structures is added to the cache as a 362 + * placeholder, for the subsequent region_add call to use. At least 1 363 + * file_region structure is added. 364 + * 365 + * out_regions_needed is the number of regions added to the 366 + * resv->adds_in_progress. This value needs to be provided to a follow up call 367 + * to region_add or region_abort for proper accounting. 
474 368 * 475 369 * Returns the number of huge pages that need to be added to the existing 476 370 * reservation map for the range [f, t). This number is greater or equal to 477 371 * zero. -ENOMEM is returned if a new file_region structure or cache entry 478 372 * is needed and can not be allocated. 479 373 */ 480 - static long region_chg(struct resv_map *resv, long f, long t) 374 + static long region_chg(struct resv_map *resv, long f, long t, 375 + long *out_regions_needed) 481 376 { 482 377 long chg = 0; 483 378 484 379 spin_lock(&resv->lock); 485 - retry_locked: 486 - resv->adds_in_progress++; 487 380 488 - /* 489 - * Check for sufficient descriptors in the cache to accommodate 490 - * the number of in progress add operations. 491 - */ 492 - if (resv->adds_in_progress > resv->region_cache_count) { 493 - struct file_region *trg; 381 + /* Count how many hugepages in this range are NOT represented. */ 382 + chg = add_reservation_in_range(resv, f, t, out_regions_needed, true); 494 383 495 - VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1); 496 - /* Must drop lock to allocate a new descriptor. */ 497 - resv->adds_in_progress--; 498 - spin_unlock(&resv->lock); 384 + if (*out_regions_needed == 0) 385 + *out_regions_needed = 1; 499 386 500 - trg = kmalloc(sizeof(*trg), GFP_KERNEL); 501 - if (!trg) 502 - return -ENOMEM; 387 + if (allocate_file_region_entries(resv, *out_regions_needed)) 388 + return -ENOMEM; 503 389 504 - spin_lock(&resv->lock); 505 - list_add(&trg->link, &resv->region_cache); 506 - resv->region_cache_count++; 507 - goto retry_locked; 508 - } 509 - 510 - chg = add_reservation_in_range(resv, f, t, true); 390 + resv->adds_in_progress += *out_regions_needed; 511 391 512 392 spin_unlock(&resv->lock); 513 393 return chg; ··· 508 408 * of the resv_map keeps track of the operations in progress between 509 409 * calls to region_chg and region_add. Operations are sometimes 510 410 * aborted after the call to region_chg. 
In such cases, region_abort 511 - * is called to decrement the adds_in_progress counter. 411 + * is called to decrement the adds_in_progress counter. regions_needed 412 + * is the value returned by the region_chg call, it is used to decrement 413 + * the adds_in_progress counter. 512 414 * 513 415 * NOTE: The range arguments [f, t) are not needed or used in this 514 416 * routine. They are kept to make reading the calling code easier as 515 417 * arguments will match the associated region_chg call. 516 418 */ 517 - static void region_abort(struct resv_map *resv, long f, long t) 419 + static void region_abort(struct resv_map *resv, long f, long t, 420 + long regions_needed) 518 421 { 519 422 spin_lock(&resv->lock); 520 423 VM_BUG_ON(!resv->region_cache_count); 521 - resv->adds_in_progress--; 424 + resv->adds_in_progress -= regions_needed; 522 425 spin_unlock(&resv->lock); 523 426 } 524 427 ··· 2107 2004 struct resv_map *resv; 2108 2005 pgoff_t idx; 2109 2006 long ret; 2007 + long dummy_out_regions_needed; 2110 2008 2111 2009 resv = vma_resv_map(vma); 2112 2010 if (!resv) ··· 2116 2012 idx = vma_hugecache_offset(h, vma, addr); 2117 2013 switch (mode) { 2118 2014 case VMA_NEEDS_RESV: 2119 - ret = region_chg(resv, idx, idx + 1); 2015 + ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); 2016 + /* We assume that vma_reservation_* routines always operate on 2017 + * 1 page, and that adding to resv map a 1 page entry can only 2018 + * ever require 1 region. 2019 + */ 2020 + VM_BUG_ON(dummy_out_regions_needed != 1); 2120 2021 break; 2121 2022 case VMA_COMMIT_RESV: 2122 - ret = region_add(resv, idx, idx + 1); 2023 + ret = region_add(resv, idx, idx + 1, 1); 2024 + /* region_add calls of range 1 should never fail. 
*/ 2025 + VM_BUG_ON(ret < 0); 2123 2026 break; 2124 2027 case VMA_END_RESV: 2125 - region_abort(resv, idx, idx + 1); 2028 + region_abort(resv, idx, idx + 1, 1); 2126 2029 ret = 0; 2127 2030 break; 2128 2031 case VMA_ADD_RESV: 2129 - if (vma->vm_flags & VM_MAYSHARE) 2130 - ret = region_add(resv, idx, idx + 1); 2131 - else { 2132 - region_abort(resv, idx, idx + 1); 2032 + if (vma->vm_flags & VM_MAYSHARE) { 2033 + ret = region_add(resv, idx, idx + 1, 1); 2034 + /* region_add calls of range 1 should never fail. */ 2035 + VM_BUG_ON(ret < 0); 2036 + } else { 2037 + region_abort(resv, idx, idx + 1, 1); 2133 2038 ret = region_del(resv, idx, idx + 1); 2134 2039 } 2135 2040 break; ··· 4826 4713 struct vm_area_struct *vma, 4827 4714 vm_flags_t vm_flags) 4828 4715 { 4829 - long ret, chg; 4716 + long ret, chg, add = -1; 4830 4717 struct hstate *h = hstate_inode(inode); 4831 4718 struct hugepage_subpool *spool = subpool_inode(inode); 4832 4719 struct resv_map *resv_map; 4833 4720 struct hugetlb_cgroup *h_cg; 4834 - long gbl_reserve; 4721 + long gbl_reserve, regions_needed = 0; 4835 4722 4836 4723 /* This should never happen */ 4837 4724 if (from > to) { ··· 4861 4748 */ 4862 4749 resv_map = inode_resv_map(inode); 4863 4750 4864 - chg = region_chg(resv_map, from, to); 4751 + chg = region_chg(resv_map, from, to, &regions_needed); 4865 4752 4866 4753 } else { 4867 4754 /* Private mapping. 
*/ ··· 4927 4814 * else has to be done for private mappings here 4928 4815 */ 4929 4816 if (!vma || vma->vm_flags & VM_MAYSHARE) { 4930 - long add = region_add(resv_map, from, to); 4817 + add = region_add(resv_map, from, to, regions_needed); 4931 4818 4932 - if (unlikely(chg > add)) { 4819 + if (unlikely(add < 0)) { 4820 + hugetlb_acct_memory(h, -gbl_reserve); 4821 + /* put back original number of pages, chg */ 4822 + (void)hugepage_subpool_put_pages(spool, chg); 4823 + goto out_err; 4824 + } else if (unlikely(chg > add)) { 4933 4825 /* 4934 4826 * pages in this range were added to the reserve 4935 4827 * map between region_chg and region_add. This ··· 4952 4834 return 0; 4953 4835 out_err: 4954 4836 if (!vma || vma->vm_flags & VM_MAYSHARE) 4955 - /* Don't call region_abort if region_chg failed */ 4956 - if (chg >= 0) 4957 - region_abort(resv_map, from, to); 4837 + /* Only call region_abort if the region_chg succeeded but the 4838 + * region_add failed or didn't run. 4839 + */ 4840 + if (chg >= 0 && add < 0) 4841 + region_abort(resv_map, from, to, regions_needed); 4958 4842 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 4959 4843 kref_put(&resv_map->refs, resv_map_release); 4960 4844 return ret;