Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

SHM_UNLOCK: fix Unevictable pages stranded after swap

Commit cc39c6a9bbde ("mm: account skipped entries to avoid looping in
find_get_pages") correctly fixed an infinite loop; but left a problem
that find_get_pages() on shmem would return 0 (appearing to callers to
mean end of tree) when it meets a run of nr_pages swap entries.

The only uses of find_get_pages() on shmem are via pagevec_lookup(),
called from invalidate_mapping_pages(), and from shmctl SHM_UNLOCK's
scan_mapping_unevictable_pages(). The first is already commented, and
not worth worrying about; but the second can leave pages on the
Unevictable list after an unusual sequence of swapping and locking.

Fix that by using shmem_find_get_pages_and_swap() (then ignoring the
swap) instead of pagevec_lookup().

But I don't want to contaminate vmscan.c with shmem internals, nor
shmem.c with LRU locking. So move scan_mapping_unevictable_pages() into
shmem.c, renaming it shmem_unlock_mapping(); and rename
check_move_unevictable_page() to check_move_unevictable_pages(), looping
down an array of pages, oftentimes under the same lock.

Leave out the "rotate unevictable list" block: that's a leftover from
when this was used for /proc/sys/vm/scan_unevictable_pages, whose flawed
handling involved looking at pages at tail of LRU.

Was there significance to the sequence first ClearPageUnevictable, then
test page_evictable, then SetPageUnevictable here? I think not, we're
under LRU lock, and have no barriers between those.

Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: <stable@vger.kernel.org> [back to 3.1 but will need respins]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Hugh Dickins; committed by Linus Torvalds.
24513264 85046579

+85 -98
+1
include/linux/shmem_fs.h
··· 48 48 loff_t size, unsigned long flags); 49 49 extern int shmem_zero_setup(struct vm_area_struct *); 50 50 extern int shmem_lock(struct file *file, int lock, struct user_struct *user); 51 + extern void shmem_unlock_mapping(struct address_space *mapping); 51 52 extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 52 53 pgoff_t index, gfp_t gfp_mask); 53 54 extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
+1 -1
include/linux/swap.h
··· 273 273 #endif 274 274 275 275 extern int page_evictable(struct page *page, struct vm_area_struct *vma); 276 - extern void scan_mapping_unevictable_pages(struct address_space *); 276 + extern void check_move_unevictable_pages(struct page **, int nr_pages); 277 277 278 278 extern unsigned long scan_unevictable_pages; 279 279 extern int scan_unevictable_handler(struct ctl_table *, int,
+1 -1
ipc/shm.c
··· 916 916 shp->mlock_user = NULL; 917 917 get_file(shm_file); 918 918 shm_unlock(shp); 919 - scan_mapping_unevictable_pages(shm_file->f_mapping); 919 + shmem_unlock_mapping(shm_file->f_mapping); 920 920 fput(shm_file); 921 921 goto out; 922 922 }
+41 -5
mm/shmem.c
··· 379 379 /* 380 380 * Pagevec may contain swap entries, so shuffle up pages before releasing. 381 381 */ 382 - static void shmem_pagevec_release(struct pagevec *pvec) 382 + static void shmem_deswap_pagevec(struct pagevec *pvec) 383 383 { 384 384 int i, j; 385 385 ··· 389 389 pvec->pages[j++] = page; 390 390 } 391 391 pvec->nr = j; 392 - pagevec_release(pvec); 392 + } 393 + 394 + /* 395 + * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. 396 + */ 397 + void shmem_unlock_mapping(struct address_space *mapping) 398 + { 399 + struct pagevec pvec; 400 + pgoff_t indices[PAGEVEC_SIZE]; 401 + pgoff_t index = 0; 402 + 403 + pagevec_init(&pvec, 0); 404 + /* 405 + * Minor point, but we might as well stop if someone else SHM_LOCKs it. 406 + */ 407 + while (!mapping_unevictable(mapping)) { 408 + /* 409 + * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it 410 + * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 411 + */ 412 + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 413 + PAGEVEC_SIZE, pvec.pages, indices); 414 + if (!pvec.nr) 415 + break; 416 + index = indices[pvec.nr - 1] + 1; 417 + shmem_deswap_pagevec(&pvec); 418 + check_move_unevictable_pages(pvec.pages, pvec.nr); 419 + pagevec_release(&pvec); 420 + cond_resched(); 421 + } 393 422 } 394 423 395 424 /* ··· 469 440 } 470 441 unlock_page(page); 471 442 } 472 - shmem_pagevec_release(&pvec); 443 + shmem_deswap_pagevec(&pvec); 444 + pagevec_release(&pvec); 473 445 mem_cgroup_uncharge_end(); 474 446 cond_resched(); 475 447 index++; ··· 500 470 continue; 501 471 } 502 472 if (index == start && indices[0] > end) { 503 - shmem_pagevec_release(&pvec); 473 + shmem_deswap_pagevec(&pvec); 474 + pagevec_release(&pvec); 504 475 break; 505 476 } 506 477 mem_cgroup_uncharge_start(); ··· 525 494 } 526 495 unlock_page(page); 527 496 } 528 - shmem_pagevec_release(&pvec); 497 + shmem_deswap_pagevec(&pvec); 498 + pagevec_release(&pvec); 529 499 mem_cgroup_uncharge_end(); 530 500 
index++; 531 501 } ··· 2468 2436 int shmem_lock(struct file *file, int lock, struct user_struct *user) 2469 2437 { 2470 2438 return 0; 2439 + } 2440 + 2441 + void shmem_unlock_mapping(struct address_space *mapping) 2442 + { 2471 2443 } 2472 2444 2473 2445 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+41 -91
mm/vmscan.c
··· 26 26 #include <linux/buffer_head.h> /* for try_to_release_page(), 27 27 buffer_heads_over_limit */ 28 28 #include <linux/mm_inline.h> 29 - #include <linux/pagevec.h> 30 29 #include <linux/backing-dev.h> 31 30 #include <linux/rmap.h> 32 31 #include <linux/topology.h> ··· 660 661 * When racing with an mlock or AS_UNEVICTABLE clearing 661 662 * (page is unlocked) make sure that if the other thread 662 663 * does not observe our setting of PG_lru and fails 663 - * isolation/check_move_unevictable_page, 664 + * isolation/check_move_unevictable_pages, 664 665 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move 665 666 * the page back to the evictable list. 666 667 * ··· 3500 3501 3501 3502 #ifdef CONFIG_SHMEM 3502 3503 /** 3503 - * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list 3504 - * @page: page to check evictability and move to appropriate lru list 3505 - * @zone: zone page is in 3504 + * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list 3505 + * @pages: array of pages to check 3506 + * @nr_pages: number of pages to check 3506 3507 * 3507 - * Checks a page for evictability and moves the page to the appropriate 3508 - * zone lru list. 3509 - * 3510 - * Restrictions: zone->lru_lock must be held, page must be on LRU and must 3511 - * have PageUnevictable set. 3508 + * Checks pages for evictability and moves them to the appropriate lru list. 3512 3509 * 3513 3510 * This function is only used for SysV IPC SHM_UNLOCK. 
3514 3511 */ 3515 - static void check_move_unevictable_page(struct page *page, struct zone *zone) 3512 + void check_move_unevictable_pages(struct page **pages, int nr_pages) 3516 3513 { 3517 3514 struct lruvec *lruvec; 3515 + struct zone *zone = NULL; 3516 + int pgscanned = 0; 3517 + int pgrescued = 0; 3518 + int i; 3518 3519 3519 - VM_BUG_ON(PageActive(page)); 3520 - retry: 3521 - ClearPageUnevictable(page); 3522 - if (page_evictable(page, NULL)) { 3523 - enum lru_list l = page_lru_base_type(page); 3520 + for (i = 0; i < nr_pages; i++) { 3521 + struct page *page = pages[i]; 3522 + struct zone *pagezone; 3524 3523 3525 - __dec_zone_state(zone, NR_UNEVICTABLE); 3526 - lruvec = mem_cgroup_lru_move_lists(zone, page, 3527 - LRU_UNEVICTABLE, l); 3528 - list_move(&page->lru, &lruvec->lists[l]); 3529 - __inc_zone_state(zone, NR_INACTIVE_ANON + l); 3530 - __count_vm_event(UNEVICTABLE_PGRESCUED); 3531 - } else { 3532 - /* 3533 - * rotate unevictable list 3534 - */ 3535 - SetPageUnevictable(page); 3536 - lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE, 3537 - LRU_UNEVICTABLE); 3538 - list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]); 3539 - if (page_evictable(page, NULL)) 3540 - goto retry; 3541 - } 3542 - } 3543 - 3544 - /** 3545 - * scan_mapping_unevictable_pages - scan an address space for evictable pages 3546 - * @mapping: struct address_space to scan for evictable pages 3547 - * 3548 - * Scan all pages in mapping. Check unevictable pages for 3549 - * evictability and move them to the appropriate zone lru list. 3550 - * 3551 - * This function is only used for SysV IPC SHM_UNLOCK. 
3552 - */ 3553 - void scan_mapping_unevictable_pages(struct address_space *mapping) 3554 - { 3555 - pgoff_t next = 0; 3556 - pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> 3557 - PAGE_CACHE_SHIFT; 3558 - struct zone *zone; 3559 - struct pagevec pvec; 3560 - 3561 - if (mapping->nrpages == 0) 3562 - return; 3563 - 3564 - pagevec_init(&pvec, 0); 3565 - while (next < end && 3566 - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 3567 - int i; 3568 - int pg_scanned = 0; 3569 - 3570 - zone = NULL; 3571 - 3572 - for (i = 0; i < pagevec_count(&pvec); i++) { 3573 - struct page *page = pvec.pages[i]; 3574 - pgoff_t page_index = page->index; 3575 - struct zone *pagezone = page_zone(page); 3576 - 3577 - pg_scanned++; 3578 - if (page_index > next) 3579 - next = page_index; 3580 - next++; 3581 - 3582 - if (pagezone != zone) { 3583 - if (zone) 3584 - spin_unlock_irq(&zone->lru_lock); 3585 - zone = pagezone; 3586 - spin_lock_irq(&zone->lru_lock); 3587 - } 3588 - 3589 - if (PageLRU(page) && PageUnevictable(page)) 3590 - check_move_unevictable_page(page, zone); 3524 + pgscanned++; 3525 + pagezone = page_zone(page); 3526 + if (pagezone != zone) { 3527 + if (zone) 3528 + spin_unlock_irq(&zone->lru_lock); 3529 + zone = pagezone; 3530 + spin_lock_irq(&zone->lru_lock); 3591 3531 } 3592 - if (zone) 3593 - spin_unlock_irq(&zone->lru_lock); 3594 - pagevec_release(&pvec); 3595 3532 3596 - count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); 3597 - cond_resched(); 3533 + if (!PageLRU(page) || !PageUnevictable(page)) 3534 + continue; 3535 + 3536 + if (page_evictable(page, NULL)) { 3537 + enum lru_list lru = page_lru_base_type(page); 3538 + 3539 + VM_BUG_ON(PageActive(page)); 3540 + ClearPageUnevictable(page); 3541 + __dec_zone_state(zone, NR_UNEVICTABLE); 3542 + lruvec = mem_cgroup_lru_move_lists(zone, page, 3543 + LRU_UNEVICTABLE, lru); 3544 + list_move(&page->lru, &lruvec->lists[lru]); 3545 + __inc_zone_state(zone, NR_INACTIVE_ANON + lru); 3546 + pgrescued++; 3547 + 
} 3598 3548 } 3599 - } 3600 - #else 3601 - void scan_mapping_unevictable_pages(struct address_space *mapping) 3602 - { 3549 + 3550 + if (zone) { 3551 + __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); 3552 + __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 3553 + spin_unlock_irq(&zone->lru_lock); 3554 + } 3603 3555 } 3604 3556 #endif /* CONFIG_SHMEM */ 3605 3557