Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

mm, hugetlb: convert hugetlbfs to use split pmd lock

Hugetlb supports multiple page sizes. We use the split lock only for the
PMD level, not for the PUD level.
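
The conversion funnels every hugetlb page-table access through two new
helpers: huge_pte_lockptr() returns the split PMD lock for PMD-sized pages
and falls back to mm->page_table_lock otherwise, and huge_pte_lock()
additionally acquires it. A minimal sketch of the caller-side change (the
names are taken from the hunks below):

	/* before: one lock for all huge page tables in the mm */
	spin_lock(&mm->page_table_lock);
	/* ... operate on the huge PTE at *ptep ... */
	spin_unlock(&mm->page_table_lock);

	/* after: take the lock that actually covers this huge PTE */
	spinlock_t *ptl = huge_pte_lock(h, mm, ptep);	/* h is the vma's hstate */
	/* ... operate on the huge PTE at *ptep ... */
	spin_unlock(ptl);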

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Alex Thorlton <athorlton@sgi.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Robin Holt <robinmholt@gmail.com>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by Kirill A. Shutemov, committed by Linus Torvalds
commit cb900f41, parent c389a250

+105 -54
+1 -1
fs/proc/meminfo.c
···
 #include <linux/fs.h>
-#include <linux/hugetlb.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/mmzone.h>
 #include <linux/proc_fs.h>
+26
include/linux/hugetlb.h
···
 	return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT);
 }
 
+static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
+					struct mm_struct *mm, pte_t *pte)
+{
+	if (huge_page_size(h) == PMD_SIZE)
+		return pmd_lockptr(mm, (pmd_t *) pte);
+	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
+	return &mm->page_table_lock;
+}
+
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 #define alloc_huge_page_node(h, nid) NULL
···
 #define hstate_sizelog(s) NULL
 #define hstate_vma(v) NULL
 #define hstate_inode(i) NULL
+#define page_hstate(page) NULL
 #define huge_page_size(h) PAGE_SIZE
 #define huge_page_mask(h) PAGE_MASK
 #define vma_kernel_pagesize(v) PAGE_SIZE
···
 #define dissolve_free_huge_pages(s, e)	do {} while (0)
 #define pmd_huge_support()	0
 #define hugepage_migration_support(h)	0
+
+static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
+					struct mm_struct *mm, pte_t *pte)
+{
+	return &mm->page_table_lock;
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
+
+static inline spinlock_t *huge_pte_lock(struct hstate *h,
+					struct mm_struct *mm, pte_t *pte)
+{
+	spinlock_t *ptl;
+
+	ptl = huge_pte_lockptr(h, mm, pte);
+	spin_lock(ptl);
+	return ptl;
+}
 
 #endif /* _LINUX_HUGETLB_H */
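
One wrinkle the helpers leave to the caller: when two huge page tables must
be held at once, as in copy_hugetlb_page_range() in the mm/hugetlb.c hunks
below, the second lock is fetched with huge_pte_lockptr() rather than
huge_pte_lock() so that it can be acquired with spin_lock_nested() and keep
lockdep satisfied. In sketch form (names as in the diff):

	dst_ptl = huge_pte_lock(h, dst, dst_pte);		/* outer lock */
	src_ptl = huge_pte_lockptr(h, src, src_pte);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);	/* inner lock */
	/* ... copy the huge PTE from src to dst ... */
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);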
+4 -3
include/linux/swapops.h
···
 
 extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address);
-extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte);
+extern void migration_entry_wait_huge(struct vm_area_struct *vma,
+		struct mm_struct *mm, pte_t *pte);
 #else
 
 #define make_migration_entry(page, write) swp_entry(0, 0)
···
 static inline void make_migration_entry_read(swp_entry_t *entryp) { }
 static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					 unsigned long address) { }
-static inline void migration_entry_wait_huge(struct mm_struct *mm,
-					pte_t *pte) { }
+static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
+		struct mm_struct *mm, pte_t *pte) { }
 static inline int is_write_migration_entry(swp_entry_t entry)
 {
 	return 0;
+66 -44
mm/hugetlb.c
···
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+		spinlock_t *src_ptl, *dst_ptl;
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
 			continue;
···
 		if (dst_pte == src_pte)
 			continue;
 
-		spin_lock(&dst->page_table_lock);
-		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
+		dst_ptl = huge_pte_lock(h, dst, dst_pte);
+		src_ptl = huge_pte_lockptr(h, src, src_pte);
+		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		if (!huge_pte_none(huge_ptep_get(src_pte))) {
 			if (cow)
 				huge_ptep_set_wrprotect(src, addr, src_pte);
···
 			page_dup_rmap(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
-		spin_unlock(&src->page_table_lock);
-		spin_unlock(&dst->page_table_lock);
+		spin_unlock(src_ptl);
+		spin_unlock(dst_ptl);
 	}
 	return 0;
 
···
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
+	spinlock_t *ptl;
 	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
···
 	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 again:
-	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
 
+		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, &address, ptep))
-			continue;
+			goto unlock;
 
 		pte = huge_ptep_get(ptep);
 		if (huge_pte_none(pte))
-			continue;
+			goto unlock;
 
 		/*
 		 * HWPoisoned hugepage is already unmapped and dropped reference
 		 */
 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
 			huge_pte_clear(mm, address, ptep);
-			continue;
+			goto unlock;
 		}
 
 		page = pte_page(pte);
···
 		 */
 		if (ref_page) {
 			if (page != ref_page)
-				continue;
+				goto unlock;
 
 			/*
 			 * Mark the VMA as having unmapped its page so that
···
 
 		page_remove_rmap(page);
 		force_flush = !__tlb_remove_page(tlb, page);
-		if (force_flush)
+		if (force_flush) {
+			spin_unlock(ptl);
 			break;
+		}
 		/* Bail out after unmapping reference page if supplied */
-		if (ref_page)
+		if (ref_page) {
+			spin_unlock(ptl);
 			break;
+		}
+unlock:
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	/*
 	 * mmu_gather ran out of room to batch pages, we break out of
 	 * the PTE lock to avoid doing the potential expensive TLB invalidate
···
  */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte,
-			struct page *pagecache_page)
+			struct page *pagecache_page, spinlock_t *ptl)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct page *old_page, *new_page;
···
 
 	page_cache_get(old_page);
 
-	/* Drop page_table_lock as buddy allocator may be called */
-	spin_unlock(&mm->page_table_lock);
+	/* Drop page table lock as buddy allocator may be called */
+	spin_unlock(ptl);
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
···
 			BUG_ON(huge_pte_none(pte));
 			if (unmap_ref_private(mm, vma, old_page, address)) {
 				BUG_ON(huge_pte_none(pte));
-				spin_lock(&mm->page_table_lock);
+				spin_lock(ptl);
 				ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 				if (likely(pte_same(huge_ptep_get(ptep), pte)))
 					goto retry_avoidcopy;
 				/*
-				 * race occurs while re-acquiring page_table_lock, and
-				 * our job is done.
+				 * race occurs while re-acquiring page table
+				 * lock, and our job is done.
 				 */
 				return 0;
 			}
···
 		}
 
 		/* Caller expects lock to be held */
-		spin_lock(&mm->page_table_lock);
+		spin_lock(ptl);
 		if (err == -ENOMEM)
 			return VM_FAULT_OOM;
 		else
···
 		page_cache_release(new_page);
 		page_cache_release(old_page);
 		/* Caller expects lock to be held */
-		spin_lock(&mm->page_table_lock);
+		spin_lock(ptl);
 		return VM_FAULT_OOM;
 	}
 
···
 	mmun_end = mmun_start + huge_page_size(h);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	/*
-	 * Retake the page_table_lock to check for racing updates
+	 * Retake the page table lock to check for racing updates
 	 * before the page tables are altered
 	 */
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 		ClearPagePrivate(new_page);
···
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 
 	/* Caller expects lock to be held */
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	return 0;
 }
 
···
 	struct page *page;
 	struct address_space *mapping;
 	pte_t new_pte;
+	spinlock_t *ptl;
 
 	/*
 	 * Currently, we are forced to kill the process in the event the
···
 			goto backout_unlocked;
 	}
 
-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(h, mm, ptep);
+	spin_lock(ptl);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
 		goto backout;
···
 
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
 	}
 
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	unlock_page(page);
 out:
 	return ret;
 
 backout:
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 backout_unlocked:
 	unlock_page(page);
 	put_page(page);
···
 {
 	pte_t *ptep;
 	pte_t entry;
+	spinlock_t *ptl;
 	int ret;
 	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
···
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
-			migration_entry_wait_huge(mm, ptep);
+			migration_entry_wait_huge(vma, mm, ptep);
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
···
 	if (page != pagecache_page)
 		lock_page(page);
 
-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(h, mm, ptep);
+	spin_lock(ptl);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
-		goto out_page_table_lock;
+		goto out_ptl;
 
 
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!huge_pte_write(entry)) {
 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
-					pagecache_page);
-			goto out_page_table_lock;
+					pagecache_page, ptl);
+			goto out_ptl;
 		}
 		entry = huge_pte_mkdirty(entry);
 	}
···
 						flags & FAULT_FLAG_WRITE))
 		update_mmu_cache(vma, address, ptep);
 
-out_page_table_lock:
-	spin_unlock(&mm->page_table_lock);
+out_ptl:
+	spin_unlock(ptl);
 
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
···
 	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);
 
-	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
+		spinlock_t *ptl = NULL;
 		int absent;
 		struct page *page;
 
···
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
 		 * each hugepage. We have to make sure we get the
 		 * first, for the page indexing below to work.
+		 *
+		 * Note that page table lock is not held when pte is null.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+		if (pte)
+			ptl = huge_pte_lock(h, mm, pte);
 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
 
 		/*
···
 		 */
 		if (absent && (flags & FOLL_DUMP) &&
 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+			if (pte)
+				spin_unlock(ptl);
 			remainder = 0;
 			break;
 		}
···
 		      !huge_pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
-			spin_unlock(&mm->page_table_lock);
+			if (pte)
+				spin_unlock(ptl);
 			ret = hugetlb_fault(mm, vma, vaddr,
 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
-			spin_lock(&mm->page_table_lock);
 			if (!(ret & VM_FAULT_ERROR))
 				continue;
 
···
 			 */
 			goto same_page;
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	*nr_pages = remainder;
 	*position = vaddr;
 
···
 	flush_cache_range(vma, address, end);
 
 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += huge_page_size(h)) {
+		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
+		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, &address, ptep)) {
 			pages++;
+			spin_unlock(ptl);
 			continue;
 		}
 		if (!huge_pte_none(huge_ptep_get(ptep))) {
···
 			set_huge_pte_at(mm, address, ptep, pte);
 			pages++;
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	/*
 	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
···
 	unsigned long saddr;
 	pte_t *spte = NULL;
 	pte_t *pte;
+	spinlock_t *ptl;
 
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
···
 	if (!spte)
 		goto out;
 
-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
+	spin_lock(ptl);
 	if (pud_none(*pud))
 		pud_populate(mm, pud,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
 	else
 		put_page(virt_to_page(spte));
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
 	mutex_unlock(&mapping->i_mmap_mutex);
···
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * called with vma->vm_mm->page_table_lock held.
+ * called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *	    0 the underlying pte page is not shared, or it is the last user
+3 -2
mm/mempolicy.c
···
 #ifdef CONFIG_HUGETLB_PAGE
 	int nid;
 	struct page *page;
+	spinlock_t *ptl;
 
-	spin_lock(&vma->vm_mm->page_table_lock);
+	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
 	page = pte_page(huge_ptep_get((pte_t *)pmd));
 	nid = page_to_nid(page);
 	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
···
 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 		isolate_huge_page(page, private);
 unlock:
-	spin_unlock(&vma->vm_mm->page_table_lock);
+	spin_unlock(ptl);
 #else
 	BUG();
 #endif
+4 -3
mm/migrate.c
···
 		ptep = huge_pte_offset(mm, addr);
 		if (!ptep)
 			goto out;
-		ptl = &mm->page_table_lock;
+		ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
 	} else {
 		pmd = mm_find_pmd(mm, addr);
 		if (!pmd)
···
 	__migration_entry_wait(mm, ptep, ptl);
 }
 
-void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
+void migration_entry_wait_huge(struct vm_area_struct *vma,
+		struct mm_struct *mm, pte_t *pte)
 {
-	spinlock_t *ptl = &(mm)->page_table_lock;
+	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
 	__migration_entry_wait(mm, pte, ptl);
 }
 
+1 -1
mm/rmap.c
···
 
 	if (unlikely(PageHuge(page))) {
 		pte = huge_pte_offset(mm, address);
-		ptl = &mm->page_table_lock;
+		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
 		goto check;
 	}
 
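
Taken together, the call sites show three ways the patch recovers the hstate
(and hence the right lock) depending on what the caller has in hand; this is
a summary of the diff, not code from the commit itself:

	huge_pte_lockptr(h, mm, ptep);			/* hstate already known (mm/hugetlb.c) */
	huge_pte_lockptr(hstate_vma(vma), mm, ptep);	/* derived from the VMA (mm/migrate.c) */
	huge_pte_lockptr(page_hstate(page), mm, pte);	/* derived from the page (mm/rmap.c) */

This is also why migration_entry_wait_huge() grows a vma argument and why a
!CONFIG_HUGETLB_PAGE stub for page_hstate() is added.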