Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: munlock: manual pte walk in fast path instead of follow_page_mask()

Currently munlock_vma_pages_range() calls follow_page_mask() to obtain
each individual struct page. This entails repeated full page table
translations and page table lock taken for each page separately.

This patch avoids the costly follow_page_mask() where possible, by
iterating over ptes within single pmd under single page table lock. The
first pte is obtained by get_locked_pte() for non-THP page acquired by the
initial follow_page_mask(). The rest of the on-stack pagevec for munlock
is filled up using pte_walk as long as pte_present() and vm_normal_page()
are sufficient to obtain the struct page.

After this patch, a 14% speedup was measured for munlocking a 56GB large
memory area with THP disabled.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jörn Engel <joern@logfs.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michel Lespinasse <walken@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Vlastimil Babka and committed by Linus Torvalds
(commit 7a8010cd, parent 5b40998a)

+85 -37
+6 -6
include/linux/mm.h
··· 643 643 #endif 644 644 645 645 /* 646 - * The identification function is only used by the buddy allocator for 647 - * determining if two pages could be buddies. We are not really 648 - * identifying a zone since we could be using a the section number 649 - * id if we have not node id available in page flags. 650 - * We guarantee only that it will return the same value for two 651 - * combinable pages in a zone. 646 + * The identification function is mainly used by the buddy allocator for 647 + * determining if two pages could be buddies. We are not really identifying 648 + * the zone since we could be using the section number id if we do not have 649 + * node id available in page flags. 650 + * We only guarantee that it will return the same value for two combinable 651 + * pages in a zone. 652 652 */ 653 653 static inline int page_zone_id(struct page *page) 654 654 {
+79 -31
mm/mlock.c
··· 280 280 * The second phase finishes the munlock only for pages where isolation 281 281 * succeeded. 282 282 * 283 - * Note that pvec is modified during the process. Before returning 284 - * pagevec_reinit() is called on it. 283 + * Note that the pagevec may be modified during the process. 285 284 */ 286 285 static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) 287 286 { ··· 355 356 */ 356 357 if (pagevec_count(&pvec_putback)) 357 358 __putback_lru_fast(&pvec_putback, pgrescued); 359 + } 358 360 359 - pagevec_reinit(pvec); 361 + /* 362 + * Fill up pagevec for __munlock_pagevec using pte walk 363 + * 364 + * The function expects that the struct page corresponding to @start address is 365 + * a non-THP page already pinned and in the @pvec, and that it belongs to @zone. 366 + * 367 + * The rest of @pvec is filled by subsequent pages within the same pmd and same 368 + * zone, as long as the pte's are present and vm_normal_page() succeeds. These 369 + * pages also get pinned. 370 + * 371 + * Returns the address of the next page that should be scanned. This equals 372 + * @start + PAGE_SIZE when no page could be added by the pte walk. 373 + */ 374 + static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, 375 + struct vm_area_struct *vma, int zoneid, unsigned long start, 376 + unsigned long end) 377 + { 378 + pte_t *pte; 379 + spinlock_t *ptl; 380 + 381 + /* 382 + * Initialize pte walk starting at the already pinned page where we 383 + * are sure that there is a pte. 
384 + */ 385 + pte = get_locked_pte(vma->vm_mm, start, &ptl); 386 + end = min(end, pmd_addr_end(start, end)); 387 + 388 + /* The page next to the pinned page is the first we will try to get */ 389 + start += PAGE_SIZE; 390 + while (start < end) { 391 + struct page *page = NULL; 392 + pte++; 393 + if (pte_present(*pte)) 394 + page = vm_normal_page(vma, start, *pte); 395 + /* 396 + * Break if page could not be obtained or the page's node+zone does not 397 + * match 398 + */ 399 + if (!page || page_zone_id(page) != zoneid) 400 + break; 401 + 402 + get_page(page); 403 + /* 404 + * Increase the address that will be returned *before* the 405 + * eventual break due to pvec becoming full by adding the page 406 + */ 407 + start += PAGE_SIZE; 408 + if (pagevec_add(pvec, page) == 0) 409 + break; 410 + } 411 + pte_unmap_unlock(pte, ptl); 412 + return start; 360 413 } 361 414 362 415 /* ··· 432 381 void munlock_vma_pages_range(struct vm_area_struct *vma, 433 382 unsigned long start, unsigned long end) 434 383 { 435 - struct pagevec pvec; 436 - struct zone *zone = NULL; 437 - 438 - pagevec_init(&pvec, 0); 439 384 vma->vm_flags &= ~VM_LOCKED; 440 385 441 386 while (start < end) { 442 - struct page *page; 387 + struct page *page = NULL; 443 388 unsigned int page_mask, page_increm; 444 - struct zone *pagezone; 389 + struct pagevec pvec; 390 + struct zone *zone; 391 + int zoneid; 445 392 393 + pagevec_init(&pvec, 0); 446 394 /* 447 395 * Although FOLL_DUMP is intended for get_dump_page(), 448 396 * it just so happens that its special treatment of the ··· 450 400 * has sneaked into the range, we won't oops here: great). 
451 401 */ 452 402 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, 453 - &page_mask); 403 + &page_mask); 404 + 454 405 if (page && !IS_ERR(page)) { 455 - pagezone = page_zone(page); 456 - /* The whole pagevec must be in the same zone */ 457 - if (pagezone != zone) { 458 - if (pagevec_count(&pvec)) 459 - __munlock_pagevec(&pvec, zone); 460 - zone = pagezone; 461 - } 462 406 if (PageTransHuge(page)) { 463 - /* 464 - * THP pages are not handled by pagevec due 465 - * to their possible split (see below). 466 - */ 467 - if (pagevec_count(&pvec)) 468 - __munlock_pagevec(&pvec, zone); 469 407 lock_page(page); 470 408 /* 471 409 * Any THP page found by follow_page_mask() may ··· 466 428 put_page(page); /* follow_page_mask() */ 467 429 } else { 468 430 /* 469 - * Non-huge pages are handled in batches 470 - * via pagevec. The pin from 471 - * follow_page_mask() prevents them from 472 - * collapsing by THP. 431 + * Non-huge pages are handled in batches via 432 + * pagevec. The pin from follow_page_mask() 433 + * prevents them from collapsing by THP. 473 434 */ 474 - if (pagevec_add(&pvec, page) == 0) 475 - __munlock_pagevec(&pvec, zone); 435 + pagevec_add(&pvec, page); 436 + zone = page_zone(page); 437 + zoneid = page_zone_id(page); 438 + 439 + /* 440 + * Try to fill the rest of pagevec using fast 441 + * pte walk. This will also update start to 442 + * the next page to process. Then munlock the 443 + * pagevec. 444 + */ 445 + start = __munlock_pagevec_fill(&pvec, vma, 446 + zoneid, start, end); 447 + __munlock_pagevec(&pvec, zone); 448 + goto next; 476 449 } 477 450 } 478 451 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); 479 452 start += page_increm * PAGE_SIZE; 453 + next: 480 454 cond_resched(); 481 455 } 482 - if (pagevec_count(&pvec)) 483 - __munlock_pagevec(&pvec, zone); 484 456 } 485 457 486 458 /*