
pagemap: fix pfn calculation for hugepage

When we look into pagemap using page-types with option -p, the pfn
value reported for hugepages looks wrong (see below). This is because
the pte was evaluated only once per vma, although it should be re-read
for each hugepage. This patch fixes it.
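
In outline, the old walker resolved the pte once and reused it across
the whole vma, while the fixed walker re-resolves it at every hugepage
boundary. A condensed before/after sketch (report() and report_range()
are hypothetical stand-ins for add_to_pagemap()):

	/* before: one lookup served the whole vma, so every hugepage
	 * in the range reported the pfn of the first one */
	pte = huge_pte_offset(walk->mm, vma->vm_start);
	for (addr = vma->vm_start; addr != vma->vm_end; addr += PAGE_SIZE)
		report(addr, pte);	/* pte goes stale past the first hugepage */

	/* after: the lookup is redone at each hugepage boundary, so
	 * each hugepage reports its own pfn */
	for (addr = vma->vm_start; addr != vma->vm_end; addr += huge_page_size(h))
		report_range(addr, huge_pte_offset(walk->mm, addr & huge_page_mask(h)));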

$ page-types -p 3277 -Nl -b huge
voffset   offset  len  flags
7f21e8a00 11e400  1    ___U___________H_G________________
7f21e8a01 11e401  1ff  ________________TG________________
             ^^^
7f21e8c00 11e400  1    ___U___________H_G________________
7f21e8c01 11e401  1ff  ________________TG________________
             ^^^

On x86_64 one hugepage contains 1 head page and 511 tail pages, and
each pair of lines above represents one hugepage. voffset and offset
are the virtual and physical addresses in page units, respectively.
Different hugepages should not share the same offset value, yet above
they do.
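
As a cross-check of the numbers above: with 2MB hugepages on x86_64,
2MB / 4KB = 512 base pages per hugepage, i.e. 1 head page plus 511
(0x1ff) tail pages, which is why the len column reads 1 and 1ff. The
page offset inside a hugepage is just the low bits of the virtual
address, exactly as the patch computes it:

	/* offset (in base pages) of the "raw" page within its hugepage */
	int offset = (addr & ~hmask) >> PAGE_SHIFT;
	pfn = huge_pte_to_pagemap_entry(*pte, offset);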

With this patch applied:

$ page-types -p 3386 -Nl -b huge
voffset   offset  len  flags
7fec7a600 112c00  1    ___UD__________H_G________________
7fec7a601 112c01  1ff  ________________TG________________
             ^^^
7fec7a800 113200  1    ___UD__________H_G________________
7fec7a801 113201  1ff  ________________TG________________
             ^^^
OK

More info:

- This patch modifies walk_page_range()'s hugepage walker, but the
  change only affects pagemap_read(), which is the only caller of the
  hugepage callback.

- Without this patch, the hugetlb_entry() callback is called once per
  vma, which does not match the natural expectation raised by its name.

- With this patch, hugetlb_entry() is called once per huge pte entry,
  so the callback can become much simpler (see the sketch after this
  list).
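
To illustrate the simpler shape, here is a hypothetical hugetlb_entry()
callback under the new signature (an example for this note, not part of
the patch); it only has to handle the range covered by the single huge
pte it is handed:

	/* hypothetical example: count the mapped hugetlb base pages */
	static int count_hugetlb(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
	{
		unsigned long *pages = walk->private;

		if (!huge_pte_none(huge_ptep_get(pte)))
			*pages += (end - addr) >> PAGE_SHIFT;
		return 0;
	}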

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

commit 116354d1 (parent 57119c34)
authored by Naoya Horiguchi, committed by Linus Torvalds

3 files changed, 46 insertions(+), 32 deletions(-)

fs/proc/task_mmu.c (+7 -20)
@@ -662,31 +662,18 @@
 	return pme;
 }
 
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
-				 unsigned long end, struct mm_walk *walk)
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+				 unsigned long addr, unsigned long end,
+				 struct mm_walk *walk)
 {
-	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
-	struct hstate *hs = NULL;
 	int err = 0;
+	u64 pfn;
 
-	vma = find_vma(walk->mm, addr);
-	if (vma)
-		hs = hstate_vma(vma);
 	for (; addr != end; addr += PAGE_SIZE) {
-		u64 pfn = PM_NOT_PRESENT;
-
-		if (vma && (addr >= vma->vm_end)) {
-			vma = find_vma(walk->mm, addr);
-			if (vma)
-				hs = hstate_vma(vma);
-		}
-
-		if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
-			/* calculate pfn of the "raw" page in the hugepage. */
-			int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
-			pfn = huge_pte_to_pagemap_entry(*pte, offset);
-		}
+		int offset = (addr & ~hmask) >> PAGE_SHIFT;
+		pfn = huge_pte_to_pagemap_entry(*pte, offset);
 		err = add_to_pagemap(addr, pfn, pm);
 		if (err)
 			return err;
include/linux/mm.h (+2 -2)
@@ -783,8 +783,8 @@
 	int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *);
 	int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *);
 	int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *);
-	int (*hugetlb_entry)(pte_t *, unsigned long, unsigned long,
-			     struct mm_walk *);
+	int (*hugetlb_entry)(pte_t *, unsigned long,
+			     unsigned long, unsigned long, struct mm_walk *);
 	struct mm_struct *mm;
 	void *private;
 };
mm/pagewalk.c (+37 -10)
@@ -80,6 +80,37 @@
 	return err;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
+				       unsigned long end)
+{
+	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
+	return boundary < end ? boundary : end;
+}
+
+static int walk_hugetlb_range(struct vm_area_struct *vma,
+			      unsigned long addr, unsigned long end,
+			      struct mm_walk *walk)
+{
+	struct hstate *h = hstate_vma(vma);
+	unsigned long next;
+	unsigned long hmask = huge_page_mask(h);
+	pte_t *pte;
+	int err = 0;
+
+	do {
+		next = hugetlb_entry_end(h, addr, end);
+		pte = huge_pte_offset(walk->mm, addr & hmask);
+		if (pte && walk->hugetlb_entry)
+			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
+		if (err)
+			return err;
+	} while (addr = next, addr != end);
+
+	return 0;
+}
+#endif
+
 /**
  * walk_page_range - walk a memory map's page tables with a callback
  * @mm: memory map to walk
@@ -128,20 +159,16 @@
 		vma = find_vma(walk->mm, addr);
 #ifdef CONFIG_HUGETLB_PAGE
 		if (vma && is_vm_hugetlb_page(vma)) {
-			pte_t *pte;
-			struct hstate *hs;
-
 			if (vma->vm_end < next)
 				next = vma->vm_end;
-			hs = hstate_vma(vma);
-			pte = huge_pte_offset(walk->mm,
-					      addr & huge_page_mask(hs));
-			if (pte && !huge_pte_none(huge_ptep_get(pte))
-			    && walk->hugetlb_entry)
-				err = walk->hugetlb_entry(pte, addr,
-							  next, walk);
+			/*
+			 * Hugepage is very tightly coupled with vma, so
+			 * walk through hugetlb entries within a given vma.
+			 */
+			err = walk_hugetlb_range(vma, addr, next, walk);
 			if (err)
 				break;
+			pgd = pgd_offset(walk->mm, next);
 			continue;
 		}
 #endif
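
For reference, a minimal usage sketch of the reworked walker (assuming
the walk_page_range() calling convention of this kernel, with mm and
callbacks carried in struct mm_walk; count_hugetlb() is the hypothetical
callback sketched earlier):

	static unsigned long count_hugetlb_pages(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
	{
		unsigned long pages = 0;
		struct mm_walk walk = {
			.hugetlb_entry	= count_hugetlb,
			.mm		= mm,
			.private	= &pages,
		};

		/* hugetlb_entry is now invoked once per huge pte entry */
		walk_page_range(start, end, &walk);
		return pages;
	}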