[PATCH] unpaged: VM_UNPAGED

Although we tend to associate VM_RESERVED with remap_pfn_range, quite a few
drivers set VM_RESERVED on areas which are then populated by nopage. The
PageReserved removal in 2.6.15-rc1 changed zap_pte_range not to free pages in
VM_RESERVED areas, without changing those drivers not to set the flag: so
their pages just leak away.
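
To illustrate the pattern (a minimal sketch, all identifiers hypothetical):
the driver populates the area from its nopage handler, yet marks the vma
VM_RESERVED in its mmap, so zap_pte_range now skips freeing those pages.

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical driver, for illustration only: pages come from nopage,
 * but the vma is VM_RESERVED, so since 2.6.15-rc1 each page returned
 * below is leaked when the area is unmapped.
 */
static struct page *exdrv_nopage(struct vm_area_struct *vma,
				 unsigned long address, int *type)
{
	struct page *page = alloc_page(GFP_KERNEL);	/* populate on fault */

	if (!page)
		return NOPAGE_OOM;
	if (type)
		*type = VM_FAULT_MINOR;
	return page;
}

static struct vm_operations_struct exdrv_vm_ops = {
	.nopage	= exdrv_nopage,
};

static int exdrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_RESERVED;	/* the flag at issue */
	vma->vm_ops = &exdrv_vm_ops;
	return 0;
}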

Let's not change miscellaneous drivers now: introduce VM_UNPAGED at the core,
to flag the special areas where the ptes may have no struct page, or, if they
do, one that is not to be touched. Replace most instances of VM_RESERVED in
core mm with VM_UNPAGED. Force it on in remap_pfn_range, and in the sparc and
sparc64 io_remap_pfn_range.
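
Drivers which go through remap_pfn_range itself need no change: the flag is
forced on for them. A sketch of the usual call (driver identifiers
hypothetical), whose vma ends up with VM_IO | VM_RESERVED | VM_UNPAGED set:

static int exdrv_io_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* remap_pfn_range now sets VM_IO | VM_RESERVED | VM_UNPAGED on
	 * the vma itself, so zap_pte_range leaves these ptes alone.
	 */
	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       size, vma->vm_page_prot);
}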

Revert the addition of VM_RESERVED to the powerpc vdso; it's not needed
there. Is it needed anywhere? It still governs the mm->reserved_vm statistic,
marks special vmas not to be merged, and areas not to be core dumped; but it
could probably be eliminated later (the drivers are probably specifying it
because in 2.4 it kept swapout off the vma, but in 2.6 we work from the LRU,
which these pages never get onto).

Use the VM_SHM slot for VM_UNPAGED, and define VM_SHM to 0: it serves no
purpose whatsoever, and should be removed from drivers when we clean up.
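
Once VM_SHM is 0, a driver still OR-ing it in executes a no-op, so such lines
can simply be deleted during that cleanup (a hypothetical example):

	vma->vm_flags |= VM_SHM;	/* VM_SHM is now 0: has no effect */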

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: William Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by Hugh Dickins and committed by Linus Torvalds (0b14c179, 664beed0)

+30 -24
+1 -2
arch/powerpc/kernel/vdso.c
@@ -285,8 +285,7 @@
 	 * It's fine to use that for setting breakpoints in the vDSO code
 	 * pages though
 	 */
-	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE |
-			VM_MAYEXEC | VM_RESERVED;
+	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 	vma->vm_flags |= mm->def_flags;
 	vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
 	vma->vm_ops = &vdso_vmops;
+1 -1
arch/sparc/mm/generic.c
@@ -74,7 +74,7 @@
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
 	/* See comment in mm/memory.c remap_pfn_range */
-	vma->vm_flags |= VM_IO | VM_RESERVED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED;
 
 	prot = __pgprot(pg_iobits);
 	offset -= from;
+1 -1
arch/sparc64/mm/generic.c
@@ -128,7 +128,7 @@
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
 	/* See comment in mm/memory.c remap_pfn_range */
-	vma->vm_flags |= VM_IO | VM_RESERVED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED;
 
 	prot = __pgprot(pg_iobits);
 	offset -= from;
+3 -2
include/linux/mm.h
@@ -144,7 +144,8 @@
 
 #define VM_GROWSDOWN	0x00000100	/* general info on the segment */
 #define VM_GROWSUP	0x00000200
-#define VM_SHM		0x00000400	/* shared memory area, don't swap out */
+#define VM_SHM		0x00000000	/* Means nothing: delete it later */
+#define VM_UNPAGED	0x00000400	/* Pages managed without map count */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 
 #define VM_EXECUTABLE	0x00001000
@@ -157,7 +158,7 @@
 
 #define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
-#define VM_RESERVED	0x00080000	/* Pages managed in a special way */
+#define VM_RESERVED	0x00080000	/* Count as reserved_vm like IO */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
+2 -2
mm/fremap.c
@@ -65,7 +65,7 @@
 	pte_t pte_val;
 	spinlock_t *ptl;
 
-	BUG_ON(vma->vm_flags & VM_RESERVED);
+	BUG_ON(vma->vm_flags & VM_UNPAGED);
 
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
@@ -122,7 +122,7 @@
 	pte_t pte_val;
 	spinlock_t *ptl;
 
-	BUG_ON(vma->vm_flags & VM_RESERVED);
+	BUG_ON(vma->vm_flags & VM_UNPAGED);
 
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
+1 -1
mm/madvise.c
@@ -126,7 +126,7 @@
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
-	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
+	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED))
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+18 -12
mm/memory.c
@@ -334,7 +334,7 @@
 
 /*
  * This function is called to print an error when a pte in a
- * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * !VM_UNPAGED region is found pointing to an invalid pfn (which
  * is an error.
  *
  * The calling function must still handle the error.
@@ -381,15 +381,15 @@
 		goto out_set_pte;
 	}
 
-	/* If the region is VM_RESERVED, the mapping is not
+	/* If the region is VM_UNPAGED, the mapping is not
 	 * mapped via rmap - duplicate the pte as is.
 	 */
-	if (vm_flags & VM_RESERVED)
+	if (vm_flags & VM_UNPAGED)
 		goto out_set_pte;
 
 	pfn = pte_pfn(pte);
 	/* If the pte points outside of valid memory but
-	 * the region is not VM_RESERVED, we have a problem.
+	 * the region is not VM_UNPAGED, we have a problem.
 	 */
 	if (unlikely(!pfn_valid(pfn))) {
 		print_bad_pte(vma, pte, addr);
@@ -528,7 +528,7 @@
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) {
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -572,7 +572,7 @@
 
 		(*zap_work) -= PAGE_SIZE;
 
-		if (!(vma->vm_flags & VM_RESERVED)) {
+		if (!(vma->vm_flags & VM_UNPAGED)) {
 			unsigned long pfn = pte_pfn(ptent);
 			if (unlikely(!pfn_valid(pfn)))
 				print_bad_pte(vma, ptent, addr);
@@ -1191,10 +1191,16 @@
 	 * rest of the world about it:
 	 *   VM_IO tells people not to look at these pages
 	 *	(accesses can have side effects).
-	 *   VM_RESERVED tells the core MM not to "manage" these pages
-	 *	(e.g. refcount, mapcount, try to swap them out).
+	 *   VM_RESERVED is specified all over the place, because
+	 *	in 2.4 it kept swapout's vma scan off this vma; but
+	 *	in 2.6 the LRU scan won't even find its pages, so this
+	 *	flag means no more than count its pages in reserved_vm,
+	 *	and omit it from core dump, even when VM_IO turned off.
+	 *   VM_UNPAGED tells the core MM not to "manage" these pages
+	 *	(e.g. refcount, mapcount, try to swap them out): in
+	 *	particular, zap_pte_range does not try to free them.
 	 */
-	vma->vm_flags |= VM_IO | VM_RESERVED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED;
 
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
@@ -1276,7 +1282,7 @@
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;
 
-	BUG_ON(vma->vm_flags & VM_RESERVED);
+	BUG_ON(vma->vm_flags & VM_UNPAGED);
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1924,7 +1930,7 @@
 		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(new_page);
 		page_add_anon_rmap(new_page, vma, address);
-	} else if (!(vma->vm_flags & VM_RESERVED)) {
+	} else if (!(vma->vm_flags & VM_UNPAGED)) {
 		inc_mm_counter(mm, file_rss);
 		page_add_file_rmap(new_page);
 	}
@@ -2203,7 +2209,7 @@
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = VM_RESERVED;
+	gate_vma.vm_flags = 0;
 	return 0;
 }
 __initcall(gate_vma_init);
+1 -1
mm/mempolicy.c
@@ -269,7 +269,7 @@
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
-	if (first->vm_flags & VM_RESERVED)
+	if (first->vm_flags & VM_UNPAGED)
 		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+2 -2
mm/msync.c
@@ -97,9 +97,9 @@
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
 	 * to do anything more on an msync().
-	 * Can't do anything with VM_RESERVED regions either.
+	 * Can't do anything with VM_UNPAGED regions either.
 	 */
-	if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
+	if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED))
 		return;
 
 	BUG_ON(addr >= end);