Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6

* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (22 commits)
Add _addr_lsb field to ia64 siginfo
Fix migration.c compilation on s390
HWPOISON: Remove retry loop for try_to_unmap
HWPOISON: Turn addr_valid from bitfield into char
HWPOISON: Disable DEBUG by default
HWPOISON: Convert pr_debugs to pr_info
HWPOISON: Improve comments in memory-failure.c
x86: HWPOISON: Report correct address granularity for huge hwpoison faults
Encode huge page size for VM_FAULT_HWPOISON errors
Fix build error with !CONFIG_MIGRATION
hugepage: move is_hugepage_on_freelist inside ifdef to avoid warning
Clean up __page_set_anon_rmap
HWPOISON, hugetlb: fix unpoison for hugepage
HWPOISON, hugetlb: soft offlining for hugepage
HWPOISON, hugetlb: recover from free hugepage error when !MF_COUNT_INCREASED
hugetlb: move refcounting in hugepage allocation inside hugetlb_lock
HWPOISON, hugetlb: add free check to dequeue_hwpoison_huge_page()
hugetlb: hugepage migration core
hugetlb: redefine hugepage copy functions
hugetlb: add allocate function for hugepage migration
...

+596 -167
+1
arch/ia64/include/asm/siginfo.h
··· 62 62 int _imm; /* immediate value for "break" */ 63 63 unsigned int _flags; /* see below */ 64 64 unsigned long _isr; /* isr */ 65 + short _addr_lsb; /* lsb of faulting address */ 65 66 } _sigfault; 66 67 67 68 /* SIGPOLL */
+13 -6
arch/x86/mm/fault.c
··· 11 11 #include <linux/kprobes.h> /* __kprobes, ... */ 12 12 #include <linux/mmiotrace.h> /* kmmio_handler, ... */ 13 13 #include <linux/perf_event.h> /* perf_sw_event */ 14 + #include <linux/hugetlb.h> /* hstate_index_to_shift */ 14 15 15 16 #include <asm/traps.h> /* dotraplinkage, ... */ 16 17 #include <asm/pgalloc.h> /* pgd_*(), ... */ ··· 161 160 162 161 static void 163 162 force_sig_info_fault(int si_signo, int si_code, unsigned long address, 164 - struct task_struct *tsk) 163 + struct task_struct *tsk, int fault) 165 164 { 165 + unsigned lsb = 0; 166 166 siginfo_t info; 167 167 168 168 info.si_signo = si_signo; 169 169 info.si_errno = 0; 170 170 info.si_code = si_code; 171 171 info.si_addr = (void __user *)address; 172 - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; 172 + if (fault & VM_FAULT_HWPOISON_LARGE) 173 + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); 174 + if (fault & VM_FAULT_HWPOISON) 175 + lsb = PAGE_SHIFT; 176 + info.si_addr_lsb = lsb; 173 177 174 178 force_sig_info(si_signo, &info, tsk); 175 179 } ··· 728 722 tsk->thread.error_code = error_code | (address >= TASK_SIZE); 729 723 tsk->thread.trap_no = 14; 730 724 731 - force_sig_info_fault(SIGSEGV, si_code, address, tsk); 725 + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); 732 726 733 727 return; 734 728 } ··· 813 807 tsk->thread.trap_no = 14; 814 808 815 809 #ifdef CONFIG_MEMORY_FAILURE 816 - if (fault & VM_FAULT_HWPOISON) { 810 + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { 817 811 printk(KERN_ERR 818 812 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 819 813 tsk->comm, tsk->pid, address); 820 814 code = BUS_MCEERR_AR; 821 815 } 822 816 #endif 823 - force_sig_info_fault(SIGBUS, code, address, tsk); 817 + force_sig_info_fault(SIGBUS, code, address, tsk, fault); 824 818 } 825 819 826 820 static noinline void ··· 830 824 if (fault & VM_FAULT_OOM) { 831 825 out_of_memory(regs, error_code, address); 832 826 } else { 833 - 
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) 827 + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| 828 + VM_FAULT_HWPOISON_LARGE)) 834 829 do_sigbus(regs, error_code, address, fault); 835 830 else 836 831 BUG();
+15
fs/hugetlbfs/inode.c
··· 31 31 #include <linux/statfs.h> 32 32 #include <linux/security.h> 33 33 #include <linux/magic.h> 34 + #include <linux/migrate.h> 34 35 35 36 #include <asm/uaccess.h> 36 37 ··· 574 573 return 0; 575 574 } 576 575 576 + static int hugetlbfs_migrate_page(struct address_space *mapping, 577 + struct page *newpage, struct page *page) 578 + { 579 + int rc; 580 + 581 + rc = migrate_huge_page_move_mapping(mapping, newpage, page); 582 + if (rc) 583 + return rc; 584 + migrate_page_copy(newpage, page); 585 + 586 + return 0; 587 + } 588 + 577 589 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 578 590 { 579 591 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); ··· 673 659 .write_begin = hugetlbfs_write_begin, 674 660 .write_end = hugetlbfs_write_end, 675 661 .set_page_dirty = hugetlbfs_set_page_dirty, 662 + .migratepage = hugetlbfs_migrate_page, 676 663 }; 677 664 678 665
+10
fs/signalfd.c
··· 99 99 #ifdef __ARCH_SI_TRAPNO 100 100 err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); 101 101 #endif 102 + #ifdef BUS_MCEERR_AO 103 + /* 104 + * Other callers might not initialize the si_lsb field, 105 + * so check explicitly for the right codes here. 106 + */ 107 + if (kinfo->si_code == BUS_MCEERR_AR || 108 + kinfo->si_code == BUS_MCEERR_AO) 109 + err |= __put_user((short) kinfo->si_addr_lsb, 110 + &uinfo->ssi_addr_lsb); 111 + #endif 102 112 break; 103 113 case __SI_CHLD: 104 114 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+15 -2
include/linux/hugetlb.h
··· 43 43 struct vm_area_struct *vma, 44 44 int acctflags); 45 45 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); 46 - void __isolate_hwpoisoned_huge_page(struct page *page); 46 + int dequeue_hwpoisoned_huge_page(struct page *page); 47 + void copy_huge_page(struct page *dst, struct page *src); 47 48 48 49 extern unsigned long hugepages_treat_as_movable; 49 50 extern const unsigned long hugetlb_zero, hugetlb_infinity; ··· 102 101 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) 103 102 #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) 104 103 #define huge_pte_offset(mm, address) 0 105 - #define __isolate_hwpoisoned_huge_page(page) 0 104 + #define dequeue_hwpoisoned_huge_page(page) 0 105 + static inline void copy_huge_page(struct page *dst, struct page *src) 106 + { 107 + } 106 108 107 109 #define hugetlb_change_protection(vma, address, end, newprot) 108 110 ··· 232 228 struct hstate *hstate; 233 229 }; 234 230 231 + struct page *alloc_huge_page_node(struct hstate *h, int nid); 232 + 235 233 /* arch callback */ 236 234 int __init alloc_bootmem_huge_page(struct hstate *h); 237 235 ··· 307 301 return size_to_hstate(PAGE_SIZE << compound_order(page)); 308 302 } 309 303 304 + static inline unsigned hstate_index_to_shift(unsigned index) 305 + { 306 + return hstates[index].order + PAGE_SHIFT; 307 + } 308 + 310 309 #else 311 310 struct hstate {}; 311 + #define alloc_huge_page_node(h, nid) NULL 312 312 #define alloc_bootmem_huge_page(h) NULL 313 313 #define hstate_file(f) NULL 314 314 #define hstate_vma(v) NULL ··· 329 317 { 330 318 return 1; 331 319 } 320 + #define hstate_index_to_shift(index) 0 332 321 #endif 333 322 334 323 #endif /* _LINUX_HUGETLB_H */
+16
include/linux/migrate.h
··· 14 14 struct page *, struct page *); 15 15 extern int migrate_pages(struct list_head *l, new_page_t x, 16 16 unsigned long private, int offlining); 17 + extern int migrate_huge_pages(struct list_head *l, new_page_t x, 18 + unsigned long private, int offlining); 17 19 18 20 extern int fail_migrate_page(struct address_space *, 19 21 struct page *, struct page *); ··· 25 23 extern int migrate_vmas(struct mm_struct *mm, 26 24 const nodemask_t *from, const nodemask_t *to, 27 25 unsigned long flags); 26 + extern void migrate_page_copy(struct page *newpage, struct page *page); 27 + extern int migrate_huge_page_move_mapping(struct address_space *mapping, 28 + struct page *newpage, struct page *page); 28 29 #else 29 30 #define PAGE_MIGRATION 0 30 31 31 32 static inline void putback_lru_pages(struct list_head *l) {} 32 33 static inline int migrate_pages(struct list_head *l, new_page_t x, 34 + unsigned long private, int offlining) { return -ENOSYS; } 35 + static inline int migrate_huge_pages(struct list_head *l, new_page_t x, 33 36 unsigned long private, int offlining) { return -ENOSYS; } 34 37 35 38 static inline int migrate_prep(void) { return -ENOSYS; } ··· 43 36 static inline int migrate_vmas(struct mm_struct *mm, 44 37 const nodemask_t *from, const nodemask_t *to, 45 38 unsigned long flags) 39 + { 40 + return -ENOSYS; 41 + } 42 + 43 + static inline void migrate_page_copy(struct page *newpage, 44 + struct page *page) {} 45 + 46 + static inline int migrate_huge_page_move_mapping(struct address_space *mapping, 47 + struct page *newpage, struct page *page) 46 48 { 47 49 return -ENOSYS; 48 50 }
+10 -2
include/linux/mm.h
··· 718 718 #define VM_FAULT_SIGBUS 0x0002 719 719 #define VM_FAULT_MAJOR 0x0004 720 720 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ 721 - #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */ 721 + #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ 722 + #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ 722 723 723 724 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ 724 725 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ 725 726 726 - #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON) 727 + #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ 728 + 729 + #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ 730 + VM_FAULT_HWPOISON_LARGE) 731 + 732 + /* Encode hstate index for a hwpoisoned large page */ 733 + #define VM_FAULT_SET_HINDEX(x) ((x) << 12) 734 + #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) 727 735 728 736 /* 729 737 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
+2 -1
include/linux/signalfd.h
··· 33 33 __u64 ssi_utime; 34 34 __u64 ssi_stime; 35 35 __u64 ssi_addr; 36 + __u16 ssi_addr_lsb; 36 37 37 38 /* 38 39 * Pad strcture to 128 bytes. Remember to update the ··· 44 43 * comes out of a read(2) and we really don't want to have 45 44 * a compat on read(2). 46 45 */ 47 - __u8 __pad[48]; 46 + __u8 __pad[46]; 48 47 }; 49 48 50 49
+163 -70
mm/hugetlb.c
··· 423 423 } 424 424 } 425 425 426 - static void copy_gigantic_page(struct page *dst, struct page *src, 426 + static void copy_user_gigantic_page(struct page *dst, struct page *src, 427 427 unsigned long addr, struct vm_area_struct *vma) 428 428 { 429 429 int i; 430 430 struct hstate *h = hstate_vma(vma); 431 431 struct page *dst_base = dst; 432 432 struct page *src_base = src; 433 - might_sleep(); 433 + 434 434 for (i = 0; i < pages_per_huge_page(h); ) { 435 435 cond_resched(); 436 436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); ··· 440 440 src = mem_map_next(src, src_base, i); 441 441 } 442 442 } 443 - static void copy_huge_page(struct page *dst, struct page *src, 443 + 444 + static void copy_user_huge_page(struct page *dst, struct page *src, 444 445 unsigned long addr, struct vm_area_struct *vma) 445 446 { 446 447 int i; 447 448 struct hstate *h = hstate_vma(vma); 448 449 449 450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 450 - copy_gigantic_page(dst, src, addr, vma); 451 + copy_user_gigantic_page(dst, src, addr, vma); 451 452 return; 452 453 } 453 454 ··· 456 455 for (i = 0; i < pages_per_huge_page(h); i++) { 457 456 cond_resched(); 458 457 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 458 + } 459 + } 460 + 461 + static void copy_gigantic_page(struct page *dst, struct page *src) 462 + { 463 + int i; 464 + struct hstate *h = page_hstate(src); 465 + struct page *dst_base = dst; 466 + struct page *src_base = src; 467 + 468 + for (i = 0; i < pages_per_huge_page(h); ) { 469 + cond_resched(); 470 + copy_highpage(dst, src); 471 + 472 + i++; 473 + dst = mem_map_next(dst, dst_base, i); 474 + src = mem_map_next(src, src_base, i); 475 + } 476 + } 477 + 478 + void copy_huge_page(struct page *dst, struct page *src) 479 + { 480 + int i; 481 + struct hstate *h = page_hstate(src); 482 + 483 + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 484 + copy_gigantic_page(dst, src); 485 + return; 486 + } 487 + 488 + 
might_sleep(); 489 + for (i = 0; i < pages_per_huge_page(h); i++) { 490 + cond_resched(); 491 + copy_highpage(dst + i, src + i); 459 492 } 460 493 } 461 494 ··· 501 466 h->free_huge_pages_node[nid]++; 502 467 } 503 468 469 + static struct page *dequeue_huge_page_node(struct hstate *h, int nid) 470 + { 471 + struct page *page; 472 + 473 + if (list_empty(&h->hugepage_freelists[nid])) 474 + return NULL; 475 + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); 476 + list_del(&page->lru); 477 + set_page_refcounted(page); 478 + h->free_huge_pages--; 479 + h->free_huge_pages_node[nid]--; 480 + return page; 481 + } 482 + 504 483 static struct page *dequeue_huge_page_vma(struct hstate *h, 505 484 struct vm_area_struct *vma, 506 485 unsigned long address, int avoid_reserve) 507 486 { 508 - int nid; 509 487 struct page *page = NULL; 510 488 struct mempolicy *mpol; 511 489 nodemask_t *nodemask; ··· 544 496 545 497 for_each_zone_zonelist_nodemask(zone, z, zonelist, 546 498 MAX_NR_ZONES - 1, nodemask) { 547 - nid = zone_to_nid(zone); 548 - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 549 - !list_empty(&h->hugepage_freelists[nid])) { 550 - page = list_entry(h->hugepage_freelists[nid].next, 551 - struct page, lru); 552 - list_del(&page->lru); 553 - h->free_huge_pages--; 554 - h->free_huge_pages_node[nid]--; 555 - 556 - if (!avoid_reserve) 557 - decrement_hugepage_resv_vma(h, vma); 558 - 559 - break; 499 + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { 500 + page = dequeue_huge_page_node(h, zone_to_nid(zone)); 501 + if (page) { 502 + if (!avoid_reserve) 503 + decrement_hugepage_resv_vma(h, vma); 504 + break; 505 + } 560 506 } 561 507 } 562 508 err: ··· 812 770 return ret; 813 771 } 814 772 815 - static struct page *alloc_buddy_huge_page(struct hstate *h, 816 - struct vm_area_struct *vma, unsigned long address) 773 + static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) 817 774 { 818 775 struct page *page; 819 - unsigned 
int nid; 776 + unsigned int r_nid; 820 777 821 778 if (h->order >= MAX_ORDER) 822 779 return NULL; ··· 853 812 } 854 813 spin_unlock(&hugetlb_lock); 855 814 856 - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 857 - __GFP_REPEAT|__GFP_NOWARN, 858 - huge_page_order(h)); 815 + if (nid == NUMA_NO_NODE) 816 + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 817 + __GFP_REPEAT|__GFP_NOWARN, 818 + huge_page_order(h)); 819 + else 820 + page = alloc_pages_exact_node(nid, 821 + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 822 + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); 859 823 860 824 if (page && arch_prepare_hugepage(page)) { 861 825 __free_pages(page, huge_page_order(h)); ··· 869 823 870 824 spin_lock(&hugetlb_lock); 871 825 if (page) { 872 - /* 873 - * This page is now managed by the hugetlb allocator and has 874 - * no users -- drop the buddy allocator's reference. 875 - */ 876 - put_page_testzero(page); 877 - VM_BUG_ON(page_count(page)); 878 - nid = page_to_nid(page); 826 + r_nid = page_to_nid(page); 879 827 set_compound_page_dtor(page, free_huge_page); 880 828 /* 881 829 * We incremented the global counters already 882 830 */ 883 - h->nr_huge_pages_node[nid]++; 884 - h->surplus_huge_pages_node[nid]++; 831 + h->nr_huge_pages_node[r_nid]++; 832 + h->surplus_huge_pages_node[r_nid]++; 885 833 __count_vm_event(HTLB_BUDDY_PGALLOC); 886 834 } else { 887 835 h->nr_huge_pages--; ··· 883 843 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 884 844 } 885 845 spin_unlock(&hugetlb_lock); 846 + 847 + return page; 848 + } 849 + 850 + /* 851 + * This allocation function is useful in the context where vma is irrelevant. 852 + * E.g. soft-offlining uses this function because it only cares physical 853 + * address of error page. 
854 + */ 855 + struct page *alloc_huge_page_node(struct hstate *h, int nid) 856 + { 857 + struct page *page; 858 + 859 + spin_lock(&hugetlb_lock); 860 + page = dequeue_huge_page_node(h, nid); 861 + spin_unlock(&hugetlb_lock); 862 + 863 + if (!page) 864 + page = alloc_buddy_huge_page(h, nid); 886 865 887 866 return page; 888 867 } ··· 930 871 retry: 931 872 spin_unlock(&hugetlb_lock); 932 873 for (i = 0; i < needed; i++) { 933 - page = alloc_buddy_huge_page(h, NULL, 0); 934 - if (!page) { 874 + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 875 + if (!page) 935 876 /* 936 877 * We were not able to allocate enough pages to 937 878 * satisfy the entire reservation so we free what 938 879 * we've allocated so far. 939 880 */ 940 - spin_lock(&hugetlb_lock); 941 - needed = 0; 942 881 goto free; 943 - } 944 882 945 883 list_add(&page->lru, &surplus_list); 946 884 } ··· 964 908 needed += allocated; 965 909 h->resv_huge_pages += delta; 966 910 ret = 0; 967 - free: 911 + 912 + spin_unlock(&hugetlb_lock); 968 913 /* Free the needed pages to the hugetlb pool */ 969 914 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 970 915 if ((--needed) < 0) 971 916 break; 972 917 list_del(&page->lru); 918 + /* 919 + * This page is now managed by the hugetlb allocator and has 920 + * no users -- drop the buddy allocator's reference. 921 + */ 922 + put_page_testzero(page); 923 + VM_BUG_ON(page_count(page)); 973 924 enqueue_huge_page(h, page); 974 925 } 975 926 976 927 /* Free unnecessary surplus pages to the buddy allocator */ 928 + free: 977 929 if (!list_empty(&surplus_list)) { 978 - spin_unlock(&hugetlb_lock); 979 930 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 980 931 list_del(&page->lru); 981 - /* 982 - * The page has a reference count of zero already, so 983 - * call free_huge_page directly instead of using 984 - * put_page. 
This must be done with hugetlb_lock 985 - * unlocked which is safe because free_huge_page takes 986 - * hugetlb_lock before deciding how to free the page. 987 - */ 988 - free_huge_page(page); 932 + put_page(page); 989 933 } 990 - spin_lock(&hugetlb_lock); 991 934 } 935 + spin_lock(&hugetlb_lock); 992 936 993 937 return ret; 994 938 } ··· 1108 1052 spin_unlock(&hugetlb_lock); 1109 1053 1110 1054 if (!page) { 1111 - page = alloc_buddy_huge_page(h, vma, addr); 1055 + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1112 1056 if (!page) { 1113 1057 hugetlb_put_quota(inode->i_mapping, chg); 1114 1058 return ERR_PTR(-VM_FAULT_SIGBUS); 1115 1059 } 1116 1060 } 1117 1061 1118 - set_page_refcounted(page); 1119 1062 set_page_private(page, (unsigned long) mapping); 1120 1063 1121 1064 vma_commit_reservation(h, vma, addr); ··· 2208 2153 return -ENOMEM; 2209 2154 } 2210 2155 2156 + static int is_hugetlb_entry_migration(pte_t pte) 2157 + { 2158 + swp_entry_t swp; 2159 + 2160 + if (huge_pte_none(pte) || pte_present(pte)) 2161 + return 0; 2162 + swp = pte_to_swp_entry(pte); 2163 + if (non_swap_entry(swp) && is_migration_entry(swp)) { 2164 + return 1; 2165 + } else 2166 + return 0; 2167 + } 2168 + 2211 2169 static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2212 2170 { 2213 2171 swp_entry_t swp; ··· 2451 2383 if (unlikely(anon_vma_prepare(vma))) 2452 2384 return VM_FAULT_OOM; 2453 2385 2454 - copy_huge_page(new_page, old_page, address, vma); 2386 + copy_user_huge_page(new_page, old_page, address, vma); 2455 2387 __SetPageUptodate(new_page); 2456 2388 2457 2389 /* ··· 2583 2515 hugepage_add_new_anon_rmap(page, vma, address); 2584 2516 } 2585 2517 } else { 2518 + /* 2519 + * If memory error occurs between mmap() and fault, some process 2520 + * don't have hwpoisoned swap entry for errored virtual address. 2521 + * So we need to block hugepage fault by PG_hwpoison bit check. 
2522 + */ 2523 + if (unlikely(PageHWPoison(page))) { 2524 + ret = VM_FAULT_HWPOISON | 2525 + VM_FAULT_SET_HINDEX(h - hstates); 2526 + goto backout_unlocked; 2527 + } 2586 2528 page_dup_rmap(page); 2587 - } 2588 - 2589 - /* 2590 - * Since memory error handler replaces pte into hwpoison swap entry 2591 - * at the time of error handling, a process which reserved but not have 2592 - * the mapping to the error hugepage does not have hwpoison swap entry. 2593 - * So we need to block accesses from such a process by checking 2594 - * PG_hwpoison bit here. 2595 - */ 2596 - if (unlikely(PageHWPoison(page))) { 2597 - ret = VM_FAULT_HWPOISON; 2598 - goto backout_unlocked; 2599 2529 } 2600 2530 2601 2531 /* ··· 2653 2587 ptep = huge_pte_offset(mm, address); 2654 2588 if (ptep) { 2655 2589 entry = huge_ptep_get(ptep); 2656 - if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2657 - return VM_FAULT_HWPOISON; 2590 + if (unlikely(is_hugetlb_entry_migration(entry))) { 2591 + migration_entry_wait(mm, (pmd_t *)ptep, address); 2592 + return 0; 2593 + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2594 + return VM_FAULT_HWPOISON_LARGE | 2595 + VM_FAULT_SET_HINDEX(h - hstates); 2658 2596 } 2659 2597 2660 2598 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); ··· 2948 2878 hugetlb_acct_memory(h, -(chg - freed)); 2949 2879 } 2950 2880 2881 + #ifdef CONFIG_MEMORY_FAILURE 2882 + 2883 + /* Should be called in hugetlb_lock */ 2884 + static int is_hugepage_on_freelist(struct page *hpage) 2885 + { 2886 + struct page *page; 2887 + struct page *tmp; 2888 + struct hstate *h = page_hstate(hpage); 2889 + int nid = page_to_nid(hpage); 2890 + 2891 + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) 2892 + if (page == hpage) 2893 + return 1; 2894 + return 0; 2895 + } 2896 + 2951 2897 /* 2952 2898 * This function is called from memory failure code. 2953 2899 * Assume the caller holds page lock of the head page. 
2954 2900 */ 2955 - void __isolate_hwpoisoned_huge_page(struct page *hpage) 2901 + int dequeue_hwpoisoned_huge_page(struct page *hpage) 2956 2902 { 2957 2903 struct hstate *h = page_hstate(hpage); 2958 2904 int nid = page_to_nid(hpage); 2905 + int ret = -EBUSY; 2959 2906 2960 2907 spin_lock(&hugetlb_lock); 2961 - list_del(&hpage->lru); 2962 - h->free_huge_pages--; 2963 - h->free_huge_pages_node[nid]--; 2908 + if (is_hugepage_on_freelist(hpage)) { 2909 + list_del(&hpage->lru); 2910 + set_page_refcounted(hpage); 2911 + h->free_huge_pages--; 2912 + h->free_huge_pages_node[nid]--; 2913 + ret = 0; 2914 + } 2964 2915 spin_unlock(&hugetlb_lock); 2916 + return ret; 2965 2917 } 2918 + #endif
+125 -50
mm/memory-failure.c
··· 7 7 * Free Software Foundation. 8 8 * 9 9 * High level machine check handler. Handles pages reported by the 10 - * hardware as being corrupted usually due to a 2bit ECC memory or cache 10 + * hardware as being corrupted usually due to a multi-bit ECC memory or cache 11 11 * failure. 12 + * 13 + * In addition there is a "soft offline" entry point that allows stop using 14 + * not-yet-corrupted-by-suspicious pages without killing anything. 12 15 * 13 16 * Handles page cache pages in various states. The tricky part 14 - * here is that we can access any page asynchronous to other VM 15 - * users, because memory failures could happen anytime and anywhere, 16 - * possibly violating some of their assumptions. This is why this code 17 - * has to be extremely careful. Generally it tries to use normal locking 18 - * rules, as in get the standard locks, even if that means the 19 - * error handling takes potentially a long time. 20 - * 21 - * The operation to map back from RMAP chains to processes has to walk 22 - * the complete process list and has non linear complexity with the number 23 - * mappings. In short it can be quite slow. But since memory corruptions 24 - * are rare we hope to get away with this. 17 + * here is that we can access any page asynchronously in respect to 18 + * other VM users, because memory failures could happen anytime and 19 + * anywhere. This could violate some of their assumptions. This is why 20 + * this code has to be extremely careful. Generally it tries to use 21 + * normal locking rules, as in get the standard locks, even if that means 22 + * the error handling takes potentially a long time. 23 + * 24 + * There are several operations here with exponential complexity because 25 + * of unsuitable VM data structures. For example the operation to map back 26 + * from RMAP chains to processes has to walk the complete process list and 27 + * has non linear complexity with the number. 
But since memory corruptions 28 + * are rare we hope to get away with this. This avoids impacting the core 29 + * VM. 25 30 */ 26 31 27 32 /* ··· 35 30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages 36 31 * - pass bad pages to kdump next kernel 37 32 */ 38 - #define DEBUG 1 /* remove me in 2.6.34 */ 39 33 #include <linux/kernel.h> 40 34 #include <linux/mm.h> 41 35 #include <linux/page-flags.h> ··· 82 78 return 0; 83 79 84 80 /* 85 - * page_mapping() does not accept slab page 81 + * page_mapping() does not accept slab pages. 86 82 */ 87 83 if (PageSlab(p)) 88 84 return -EINVAL; ··· 272 268 struct list_head nd; 273 269 struct task_struct *tsk; 274 270 unsigned long addr; 275 - unsigned addr_valid:1; 271 + char addr_valid; 276 272 }; 277 273 278 274 /* ··· 313 309 * a SIGKILL because the error is not contained anymore. 314 310 */ 315 311 if (tk->addr == -EFAULT) { 316 - pr_debug("MCE: Unable to find user space address %lx in %s\n", 312 + pr_info("MCE: Unable to find user space address %lx in %s\n", 317 313 page_to_pfn(p), tsk->comm); 318 314 tk->addr_valid = 0; 319 315 } ··· 581 577 pfn, err); 582 578 } else if (page_has_private(p) && 583 579 !try_to_release_page(p, GFP_NOIO)) { 584 - pr_debug("MCE %#lx: failed to release buffers\n", pfn); 580 + pr_info("MCE %#lx: failed to release buffers\n", pfn); 585 581 } else { 586 582 ret = RECOVERED; 587 583 } ··· 697 693 * Issues: 698 694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.) 699 695 * To narrow down kill region to one page, we need to break up pmd. 700 - * - To support soft-offlining for hugepage, we need to support hugepage 701 - * migration. 702 696 */ 703 697 static int me_huge_page(struct page *p, unsigned long pfn) 704 698 { 699 + int res = 0; 705 700 struct page *hpage = compound_head(p); 706 701 /* 707 702 * We can safely recover from error on free or reserved (i.e. ··· 713 710 * so there is no race between isolation and mapping/unmapping. 
714 711 */ 715 712 if (!(page_mapping(hpage) || PageAnon(hpage))) { 716 - __isolate_hwpoisoned_huge_page(hpage); 717 - return RECOVERED; 713 + res = dequeue_hwpoisoned_huge_page(hpage); 714 + if (!res) 715 + return RECOVERED; 718 716 } 719 717 return DELAYED; 720 718 } ··· 840 836 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 841 837 } 842 838 843 - #define N_UNMAP_TRIES 5 844 - 845 839 /* 846 840 * Do all that is necessary to remove user space mappings. Unmap 847 841 * the pages and send SIGBUS to the processes if the data was dirty. ··· 851 849 struct address_space *mapping; 852 850 LIST_HEAD(tokill); 853 851 int ret; 854 - int i; 855 852 int kill = 1; 856 853 struct page *hpage = compound_head(p); 857 854 ··· 904 903 if (kill) 905 904 collect_procs(hpage, &tokill); 906 905 907 - /* 908 - * try_to_unmap can fail temporarily due to races. 909 - * Try a few times (RED-PEN better strategy?) 910 - */ 911 - for (i = 0; i < N_UNMAP_TRIES; i++) { 912 - ret = try_to_unmap(hpage, ttu); 913 - if (ret == SWAP_SUCCESS) 914 - break; 915 - pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); 916 - } 917 - 906 + ret = try_to_unmap(hpage, ttu); 918 907 if (ret != SWAP_SUCCESS) 919 908 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 920 909 pfn, page_mapcount(hpage)); ··· 972 981 * We need/can do nothing about count=0 pages. 973 982 * 1) it's a free page, and therefore in safe hand: 974 983 * prep_new_page() will be the gate keeper. 975 - * 2) it's part of a non-compound high order page. 984 + * 2) it's a free hugepage, which is also safe: 985 + * an affected hugepage will be dequeued from hugepage freelist, 986 + * so there's no concern about reusing it ever after. 987 + * 3) it's part of a non-compound high order page. 976 988 * Implies some kernel user: cannot stop them from 977 989 * R/W the page; let's pray that the page has been 978 990 * used and will be freed some time later. 
··· 987 993 if (is_free_buddy_page(p)) { 988 994 action_result(pfn, "free buddy", DELAYED); 989 995 return 0; 996 + } else if (PageHuge(hpage)) { 997 + /* 998 + * Check "just unpoisoned", "filter hit", and 999 + * "race with other subpage." 1000 + */ 1001 + lock_page_nosync(hpage); 1002 + if (!PageHWPoison(hpage) 1003 + || (hwpoison_filter(p) && TestClearPageHWPoison(p)) 1004 + || (p != hpage && TestSetPageHWPoison(hpage))) { 1005 + atomic_long_sub(nr_pages, &mce_bad_pages); 1006 + return 0; 1007 + } 1008 + set_page_hwpoison_huge_page(hpage); 1009 + res = dequeue_hwpoisoned_huge_page(hpage); 1010 + action_result(pfn, "free huge", 1011 + res ? IGNORED : DELAYED); 1012 + unlock_page(hpage); 1013 + return res; 990 1014 } else { 991 1015 action_result(pfn, "high order kernel", IGNORED); 992 1016 return -EBUSY; ··· 1159 1147 page = compound_head(p); 1160 1148 1161 1149 if (!PageHWPoison(p)) { 1162 - pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 1150 + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); 1163 1151 return 0; 1164 1152 } 1165 1153 1166 1154 nr_pages = 1 << compound_order(page); 1167 1155 1168 1156 if (!get_page_unless_zero(page)) { 1157 + /* 1158 + * Since HWPoisoned hugepage should have non-zero refcount, 1159 + * race between memory failure and unpoison seems to happen. 1160 + * In such case unpoison fails and memory failure runs 1161 + * to the end. 1162 + */ 1163 + if (PageHuge(page)) { 1164 + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); 1165 + return 0; 1166 + } 1169 1167 if (TestClearPageHWPoison(p)) 1170 1168 atomic_long_sub(nr_pages, &mce_bad_pages); 1171 - pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1169 + pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1172 1170 return 0; 1173 1171 } 1174 1172 ··· 1190 1168 * the free buddy page pool. 
1191 1169 */ 1192 1170 if (TestClearPageHWPoison(page)) { 1193 - pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1171 + pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); 1194 1172 atomic_long_sub(nr_pages, &mce_bad_pages); 1195 1173 freeit = 1; 1174 + if (PageHuge(page)) 1175 + clear_page_hwpoison_huge_page(page); 1196 1176 } 1197 - if (PageHuge(p)) 1198 - clear_page_hwpoison_huge_page(page); 1199 1177 unlock_page(page); 1200 1178 1201 1179 put_page(page); ··· 1209 1187 static struct page *new_page(struct page *p, unsigned long private, int **x) 1210 1188 { 1211 1189 int nid = page_to_nid(p); 1212 - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1190 + if (PageHuge(p)) 1191 + return alloc_huge_page_node(page_hstate(compound_head(p)), 1192 + nid); 1193 + else 1194 + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1213 1195 } 1214 1196 1215 1197 /* ··· 1241 1215 * was free. 1242 1216 */ 1243 1217 set_migratetype_isolate(p); 1218 + /* 1219 + * When the target page is a free hugepage, just remove it 1220 + * from free hugepage list. 
1221 + */ 1244 1222 if (!get_page_unless_zero(compound_head(p))) { 1245 - if (is_free_buddy_page(p)) { 1246 - pr_debug("get_any_page: %#lx free buddy page\n", pfn); 1223 + if (PageHuge(p)) { 1224 + pr_info("get_any_page: %#lx free huge page\n", pfn); 1225 + ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1226 + } else if (is_free_buddy_page(p)) { 1227 + pr_info("get_any_page: %#lx free buddy page\n", pfn); 1247 1228 /* Set hwpoison bit while page is still isolated */ 1248 1229 SetPageHWPoison(p); 1249 1230 ret = 0; 1250 1231 } else { 1251 - pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1232 + pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1252 1233 pfn, p->flags); 1253 1234 ret = -EIO; 1254 1235 } ··· 1265 1232 } 1266 1233 unset_migratetype_isolate(p); 1267 1234 unlock_system_sleep(); 1235 + return ret; 1236 + } 1237 + 1238 + static int soft_offline_huge_page(struct page *page, int flags) 1239 + { 1240 + int ret; 1241 + unsigned long pfn = page_to_pfn(page); 1242 + struct page *hpage = compound_head(page); 1243 + LIST_HEAD(pagelist); 1244 + 1245 + ret = get_any_page(page, pfn, flags); 1246 + if (ret < 0) 1247 + return ret; 1248 + if (ret == 0) 1249 + goto done; 1250 + 1251 + if (PageHWPoison(hpage)) { 1252 + put_page(hpage); 1253 + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); 1254 + return -EBUSY; 1255 + } 1256 + 1257 + /* Keep page count to indicate a given hugepage is isolated. 
*/ 1258 + 1259 + list_add(&hpage->lru, &pagelist); 1260 + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1261 + if (ret) { 1262 + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1263 + pfn, ret, page->flags); 1264 + if (ret > 0) 1265 + ret = -EIO; 1266 + return ret; 1267 + } 1268 + done: 1269 + if (!PageHWPoison(hpage)) 1270 + atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); 1271 + set_page_hwpoison_huge_page(hpage); 1272 + dequeue_hwpoisoned_huge_page(hpage); 1273 + /* keep elevated page count for bad page */ 1268 1274 return ret; 1269 1275 } 1270 1276 ··· 1334 1262 int ret; 1335 1263 unsigned long pfn = page_to_pfn(page); 1336 1264 1265 + if (PageHuge(page)) 1266 + return soft_offline_huge_page(page, flags); 1267 + 1337 1268 ret = get_any_page(page, pfn, flags); 1338 1269 if (ret < 0) 1339 1270 return ret; ··· 1363 1288 goto done; 1364 1289 } 1365 1290 if (!PageLRU(page)) { 1366 - pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 1291 + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", 1367 1292 pfn, page->flags); 1368 1293 return -EIO; 1369 1294 } ··· 1377 1302 if (PageHWPoison(page)) { 1378 1303 unlock_page(page); 1379 1304 put_page(page); 1380 - pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1305 + pr_info("soft offline: %#lx page already poisoned\n", pfn); 1381 1306 return -EBUSY; 1382 1307 } 1383 1308 ··· 1398 1323 put_page(page); 1399 1324 if (ret == 1) { 1400 1325 ret = 0; 1401 - pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1326 + pr_info("soft_offline: %#lx: invalidated\n", pfn); 1402 1327 goto done; 1403 1328 } 1404 1329 ··· 1414 1339 list_add(&page->lru, &pagelist); 1415 1340 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1416 1341 if (ret) { 1417 - pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1342 + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1418 1343 pfn, ret, page->flags); 1419 1344 if (ret > 0) 1420 
1345 ret = -EIO; 1421 1346 } 1422 1347 } else { 1423 - pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1348 + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1424 1349 pfn, ret, page_count(page), page->flags); 1425 1350 } 1426 1351 if (ret)
+2 -1
mm/memory.c
··· 1450 1450 if (ret & VM_FAULT_OOM) 1451 1451 return i ? i : -ENOMEM; 1452 1452 if (ret & 1453 - (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) 1453 + (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| 1454 + VM_FAULT_SIGBUS)) 1454 1455 return i ? i : -EFAULT; 1455 1456 BUG(); 1456 1457 }
+216 -18
mm/migrate.c
··· 32 32 #include <linux/security.h> 33 33 #include <linux/memcontrol.h> 34 34 #include <linux/syscalls.h> 35 + #include <linux/hugetlb.h> 35 36 #include <linux/gfp.h> 36 37 37 38 #include "internal.h" ··· 96 95 pte_t *ptep, pte; 97 96 spinlock_t *ptl; 98 97 99 - pgd = pgd_offset(mm, addr); 100 - if (!pgd_present(*pgd)) 101 - goto out; 98 + if (unlikely(PageHuge(new))) { 99 + ptep = huge_pte_offset(mm, addr); 100 + if (!ptep) 101 + goto out; 102 + ptl = &mm->page_table_lock; 103 + } else { 104 + pgd = pgd_offset(mm, addr); 105 + if (!pgd_present(*pgd)) 106 + goto out; 102 107 103 - pud = pud_offset(pgd, addr); 104 - if (!pud_present(*pud)) 105 - goto out; 108 + pud = pud_offset(pgd, addr); 109 + if (!pud_present(*pud)) 110 + goto out; 106 111 107 - pmd = pmd_offset(pud, addr); 108 - if (!pmd_present(*pmd)) 109 - goto out; 112 + pmd = pmd_offset(pud, addr); 113 + if (!pmd_present(*pmd)) 114 + goto out; 110 115 111 - ptep = pte_offset_map(pmd, addr); 116 + ptep = pte_offset_map(pmd, addr); 112 117 113 - if (!is_swap_pte(*ptep)) { 114 - pte_unmap(ptep); 115 - goto out; 116 - } 118 + if (!is_swap_pte(*ptep)) { 119 + pte_unmap(ptep); 120 + goto out; 121 + } 117 122 118 - ptl = pte_lockptr(mm, pmd); 123 + ptl = pte_lockptr(mm, pmd); 124 + } 125 + 119 126 spin_lock(ptl); 120 127 pte = *ptep; 121 128 if (!is_swap_pte(pte)) ··· 139 130 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 140 131 if (is_write_migration_entry(entry)) 141 132 pte = pte_mkwrite(pte); 133 + #ifdef CONFIG_HUGETLB_PAGE 134 + if (PageHuge(new)) 135 + pte = pte_mkhuge(pte); 136 + #endif 142 137 flush_cache_page(vma, addr, pte_pfn(pte)); 143 138 set_pte_at(mm, addr, ptep, pte); 144 139 145 - if (PageAnon(new)) 140 + if (PageHuge(new)) { 141 + if (PageAnon(new)) 142 + hugepage_add_anon_rmap(new, vma, addr); 143 + else 144 + page_dup_rmap(new); 145 + } else if (PageAnon(new)) 146 146 page_add_anon_rmap(new, vma, addr); 147 147 else 148 148 page_add_file_rmap(new); ··· 294 276 } 295 277 296 278 /* 279 + * 
The expected number of remaining references is the same as that 280 + * of migrate_page_move_mapping(). 281 + */ 282 + int migrate_huge_page_move_mapping(struct address_space *mapping, 283 + struct page *newpage, struct page *page) 284 + { 285 + int expected_count; 286 + void **pslot; 287 + 288 + if (!mapping) { 289 + if (page_count(page) != 1) 290 + return -EAGAIN; 291 + return 0; 292 + } 293 + 294 + spin_lock_irq(&mapping->tree_lock); 295 + 296 + pslot = radix_tree_lookup_slot(&mapping->page_tree, 297 + page_index(page)); 298 + 299 + expected_count = 2 + page_has_private(page); 300 + if (page_count(page) != expected_count || 301 + (struct page *)radix_tree_deref_slot(pslot) != page) { 302 + spin_unlock_irq(&mapping->tree_lock); 303 + return -EAGAIN; 304 + } 305 + 306 + if (!page_freeze_refs(page, expected_count)) { 307 + spin_unlock_irq(&mapping->tree_lock); 308 + return -EAGAIN; 309 + } 310 + 311 + get_page(newpage); 312 + 313 + radix_tree_replace_slot(pslot, newpage); 314 + 315 + page_unfreeze_refs(page, expected_count); 316 + 317 + __put_page(page); 318 + 319 + spin_unlock_irq(&mapping->tree_lock); 320 + return 0; 321 + } 322 + 323 + /* 297 324 * Copy the page to its new location 298 325 */ 299 - static void migrate_page_copy(struct page *newpage, struct page *page) 326 + void migrate_page_copy(struct page *newpage, struct page *page) 300 327 { 301 - copy_highpage(newpage, page); 328 + if (PageHuge(page)) 329 + copy_huge_page(newpage, page); 330 + else 331 + copy_highpage(newpage, page); 302 332 303 333 if (PageError(page)) 304 334 SetPageError(newpage); ··· 790 724 } 791 725 792 726 /* 727 + * Counterpart of unmap_and_move_page() for hugepage migration. 728 + * 729 + * This function doesn't wait the completion of hugepage I/O 730 + * because there is no race between I/O and migration for hugepage. 
731 + * Note that currently hugepage I/O occurs only in direct I/O 732 + * where no lock is held and PG_writeback is irrelevant, 733 + * and writeback status of all subpages are counted in the reference 734 + * count of the head page (i.e. if all subpages of a 2MB hugepage are 735 + * under direct I/O, the reference of the head page is 512 and a bit more.) 736 + * This means that when we try to migrate hugepage whose subpages are 737 + * doing direct I/O, some references remain after try_to_unmap() and 738 + * hugepage migration fails without data corruption. 739 + * 740 + * There is also no race when direct I/O is issued on the page under migration, 741 + * because then pte is replaced with migration swap entry and direct I/O code 742 + * will wait in the page fault for migration to complete. 743 + */ 744 + static int unmap_and_move_huge_page(new_page_t get_new_page, 745 + unsigned long private, struct page *hpage, 746 + int force, int offlining) 747 + { 748 + int rc = 0; 749 + int *result = NULL; 750 + struct page *new_hpage = get_new_page(hpage, private, &result); 751 + int rcu_locked = 0; 752 + struct anon_vma *anon_vma = NULL; 753 + 754 + if (!new_hpage) 755 + return -ENOMEM; 756 + 757 + rc = -EAGAIN; 758 + 759 + if (!trylock_page(hpage)) { 760 + if (!force) 761 + goto out; 762 + lock_page(hpage); 763 + } 764 + 765 + if (PageAnon(hpage)) { 766 + rcu_read_lock(); 767 + rcu_locked = 1; 768 + 769 + if (page_mapped(hpage)) { 770 + anon_vma = page_anon_vma(hpage); 771 + atomic_inc(&anon_vma->external_refcount); 772 + } 773 + } 774 + 775 + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 776 + 777 + if (!page_mapped(hpage)) 778 + rc = move_to_new_page(new_hpage, hpage, 1); 779 + 780 + if (rc) 781 + remove_migration_ptes(hpage, hpage); 782 + 783 + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, 784 + &anon_vma->lock)) { 785 + int empty = list_empty(&anon_vma->head); 786 + spin_unlock(&anon_vma->lock); 787 + if (empty) 788 + 
anon_vma_free(anon_vma); 789 + } 790 + 791 + if (rcu_locked) 792 + rcu_read_unlock(); 793 + out: 794 + unlock_page(hpage); 795 + 796 + if (rc != -EAGAIN) { 797 + list_del(&hpage->lru); 798 + put_page(hpage); 799 + } 800 + 801 + put_page(new_hpage); 802 + 803 + if (result) { 804 + if (rc) 805 + *result = rc; 806 + else 807 + *result = page_to_nid(new_hpage); 808 + } 809 + return rc; 810 + } 811 + 812 + /* 793 813 * migrate_pages 794 814 * 795 815 * The function takes one list of pages to migrate and a function ··· 933 781 current->flags &= ~PF_SWAPWRITE; 934 782 935 783 putback_lru_pages(from); 784 + 785 + if (rc) 786 + return rc; 787 + 788 + return nr_failed + retry; 789 + } 790 + 791 + int migrate_huge_pages(struct list_head *from, 792 + new_page_t get_new_page, unsigned long private, int offlining) 793 + { 794 + int retry = 1; 795 + int nr_failed = 0; 796 + int pass = 0; 797 + struct page *page; 798 + struct page *page2; 799 + int rc; 800 + 801 + for (pass = 0; pass < 10 && retry; pass++) { 802 + retry = 0; 803 + 804 + list_for_each_entry_safe(page, page2, from, lru) { 805 + cond_resched(); 806 + 807 + rc = unmap_and_move_huge_page(get_new_page, 808 + private, page, pass > 2, offlining); 809 + 810 + switch(rc) { 811 + case -ENOMEM: 812 + goto out; 813 + case -EAGAIN: 814 + retry++; 815 + break; 816 + case 0: 817 + break; 818 + default: 819 + /* Permanent failure */ 820 + nr_failed++; 821 + break; 822 + } 823 + } 824 + } 825 + rc = 0; 826 + out: 827 + 828 + list_for_each_entry_safe(page, page2, from, lru) 829 + put_page(page); 936 830 937 831 if (rc) 938 832 return rc;
+8 -17
mm/rmap.c
··· 780 780 } 781 781 782 782 /** 783 - * __page_set_anon_rmap - setup new anonymous rmap 784 - * @page: the page to add the mapping to 785 - * @vma: the vm area in which the mapping is added 786 - * @address: the user virtual address mapped 783 + * __page_set_anon_rmap - set up new anonymous rmap 784 + * @page: Page to add to rmap 785 + * @vma: VM area to add page to. 786 + * @address: User virtual address of the mapping 787 787 * @exclusive: the page is exclusively owned by the current process 788 788 */ 789 789 static void __page_set_anon_rmap(struct page *page, ··· 793 793 794 794 BUG_ON(!anon_vma); 795 795 796 + if (PageAnon(page)) 797 + return; 798 + 796 799 /* 797 800 * If the page isn't exclusively mapped into this vma, 798 801 * we must use the _oldest_ possible anon_vma for the 799 802 * page mapping! 800 803 */ 801 - if (!exclusive) { 802 - if (PageAnon(page)) 803 - return; 804 + if (!exclusive) 804 805 anon_vma = anon_vma->root; 805 - } else { 806 - /* 807 - * In this case, swapped-out-but-not-discarded swap-cache 808 - * is remapped. So, no need to update page->mapping here. 809 - * We convice anon_vma poitned by page->mapping is not obsolete 810 - * because vma->anon_vma is necessary to be a family of it. 811 - */ 812 - if (PageAnon(page)) 813 - return; 814 - } 815 806 816 807 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 817 808 page->mapping = (struct address_space *) anon_vma;