Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'vm' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull munmap/truncate race fixes from Al Viro:
"Fixes for racy use of unmap_vmas() on truncate-related codepaths"

* 'vm' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
VM: make zap_page_range() callers that act on a single VMA use separate helper
VM: make unmap_vmas() return void
VM: don't bother with feeding upper limit to tlb_finish_mmu() in exit_mmap()
VM: make zap_page_range() return void
VM: can't go through the inner loop in unmap_vmas() more than once...
VM: unmap_page_range() can return void

+84 -58
+2 -2
include/linux/mm.h
··· 893 893 894 894 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 895 895 unsigned long size); 896 - unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, 896 + void zap_page_range(struct vm_area_struct *vma, unsigned long address, 897 897 unsigned long size, struct zap_details *); 898 - unsigned long unmap_vmas(struct mmu_gather *tlb, 898 + void unmap_vmas(struct mmu_gather *tlb, 899 899 struct vm_area_struct *start_vma, unsigned long start_addr, 900 900 unsigned long end_addr, unsigned long *nr_accounted, 901 901 struct zap_details *);
+80 -53
mm/memory.c
··· 1282 1282 return addr; 1283 1283 } 1284 1284 1285 - static unsigned long unmap_page_range(struct mmu_gather *tlb, 1286 - struct vm_area_struct *vma, 1287 - unsigned long addr, unsigned long end, 1288 - struct zap_details *details) 1285 + static void unmap_page_range(struct mmu_gather *tlb, 1286 + struct vm_area_struct *vma, 1287 + unsigned long addr, unsigned long end, 1288 + struct zap_details *details) 1289 1289 { 1290 1290 pgd_t *pgd; 1291 1291 unsigned long next; ··· 1305 1305 } while (pgd++, addr = next, addr != end); 1306 1306 tlb_end_vma(tlb, vma); 1307 1307 mem_cgroup_uncharge_end(); 1308 + } 1308 1309 1309 - return addr; 1310 + 1311 + static void unmap_single_vma(struct mmu_gather *tlb, 1312 + struct vm_area_struct *vma, unsigned long start_addr, 1313 + unsigned long end_addr, unsigned long *nr_accounted, 1314 + struct zap_details *details) 1315 + { 1316 + unsigned long start = max(vma->vm_start, start_addr); 1317 + unsigned long end; 1318 + 1319 + if (start >= vma->vm_end) 1320 + return; 1321 + end = min(vma->vm_end, end_addr); 1322 + if (end <= vma->vm_start) 1323 + return; 1324 + 1325 + if (vma->vm_flags & VM_ACCOUNT) 1326 + *nr_accounted += (end - start) >> PAGE_SHIFT; 1327 + 1328 + if (unlikely(is_pfn_mapping(vma))) 1329 + untrack_pfn_vma(vma, 0, 0); 1330 + 1331 + if (start != end) { 1332 + if (unlikely(is_vm_hugetlb_page(vma))) { 1333 + /* 1334 + * It is undesirable to test vma->vm_file as it 1335 + * should be non-null for valid hugetlb area. 1336 + * However, vm_file will be NULL in the error 1337 + * cleanup path of do_mmap_pgoff. When 1338 + * hugetlbfs ->mmap method fails, 1339 + * do_mmap_pgoff() nullifies vma->vm_file 1340 + * before calling this function to clean up. 1341 + * Since no pte has actually been setup, it is 1342 + * safe to do nothing in this case. 1343 + */ 1344 + if (vma->vm_file) 1345 + unmap_hugepage_range(vma, start, end, NULL); 1346 + } else 1347 + unmap_page_range(tlb, vma, start, end, details); 1348 + } 1310 1349 } 1311 1350 1312 1351 /** ··· 1356 1317 * @end_addr: virtual address at which to end unmapping 1357 1318 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here 1358 1319 * @details: details of nonlinear truncation or shared cache invalidation 1359 - * 1360 - * Returns the end address of the unmapping (restart addr if interrupted). 1361 1320 * 1362 1321 * Unmap all pages in the vma list. 1363 1322 * ··· 1368 1331 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1369 1332 * drops the lock and schedules. 1370 1333 */ 1371 - unsigned long unmap_vmas(struct mmu_gather *tlb, 1334 + void unmap_vmas(struct mmu_gather *tlb, 1372 1335 struct vm_area_struct *vma, unsigned long start_addr, 1373 1336 unsigned long end_addr, unsigned long *nr_accounted, 1374 1337 struct zap_details *details) 1375 1338 { 1376 - unsigned long start = start_addr; 1377 1339 struct mm_struct *mm = vma->vm_mm; 1378 1340 1379 1341 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1380 - for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 1381 - unsigned long end; 1382 - 1383 - start = max(vma->vm_start, start_addr); 1384 - if (start >= vma->vm_end) 1385 - continue; 1386 - end = min(vma->vm_end, end_addr); 1387 - if (end <= vma->vm_start) 1388 - continue; 1389 - 1390 - if (vma->vm_flags & VM_ACCOUNT) 1391 - *nr_accounted += (end - start) >> PAGE_SHIFT; 1392 - 1393 - if (unlikely(is_pfn_mapping(vma))) 1394 - untrack_pfn_vma(vma, 0, 0); 1395 - 1396 - while (start != end) { 1397 - if (unlikely(is_vm_hugetlb_page(vma))) { 1398 - /* 1399 - * It is undesirable to test vma->vm_file as it 1400 - * should be non-null for valid hugetlb area. 1401 - * However, vm_file will be NULL in the error 1402 - * cleanup path of do_mmap_pgoff. When 1403 - * hugetlbfs ->mmap method fails, 1404 - * do_mmap_pgoff() nullifies vma->vm_file 1405 - * before calling this function to clean up. 1406 - * Since no pte has actually been setup, it is 1407 - * safe to do nothing in this case. 1408 - */ 1409 - if (vma->vm_file) 1410 - unmap_hugepage_range(vma, start, end, NULL); 1411 - 1412 - start = end; 1413 - } else 1414 - start = unmap_page_range(tlb, vma, start, end, details); 1415 - } 1416 - } 1417 - 1342 + for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1343 + unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, 1344 + details); 1418 1345 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1419 - return start; /* which is now the end (or restart) address */ 1420 1346 } 1421 1347 1422 1348 /** ··· 1388 1388 * @address: starting address of pages to zap 1389 1389 * @size: number of bytes to zap 1390 1390 * @details: details of nonlinear truncation or shared cache invalidation 1391 + * 1392 + * Caller must protect the VMA list 1391 1393 */ 1392 - unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, 1394 + void zap_page_range(struct vm_area_struct *vma, unsigned long address, 1393 1395 unsigned long size, struct zap_details *details) 1394 1396 { 1395 1397 struct mm_struct *mm = vma->vm_mm; ··· 1402 1400 lru_add_drain(); 1403 1401 tlb_gather_mmu(&tlb, mm, 0); 1404 1402 update_hiwater_rss(mm); 1405 - end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1403 + unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1406 1404 tlb_finish_mmu(&tlb, address, end); 1407 - return end; 1405 + } 1406 + 1407 + /** 1408 + * zap_page_range_single - remove user pages in a given range 1409 + * @vma: vm_area_struct holding the applicable pages 1410 + * @address: starting address of pages to zap 1411 + * @size: number of bytes to zap 1412 + * @details: details of nonlinear truncation or shared cache invalidation 1413 + * 1414 + * The range must fit into one VMA. 1415 + */ 1416 + static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1417 + unsigned long size, struct zap_details *details) 1418 + { 1419 + struct mm_struct *mm = vma->vm_mm; 1420 + struct mmu_gather tlb; 1421 + unsigned long end = address + size; 1422 + unsigned long nr_accounted = 0; 1423 + 1424 + lru_add_drain(); 1425 + tlb_gather_mmu(&tlb, mm, 0); 1426 + update_hiwater_rss(mm); 1427 + mmu_notifier_invalidate_range_start(mm, address, end); 1428 + unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); 1429 + mmu_notifier_invalidate_range_end(mm, address, end); 1430 + tlb_finish_mmu(&tlb, address, end); 1408 1431 } 1409 1432 1410 1433 /** ··· 1450 1423 if (address < vma->vm_start || address + size > vma->vm_end || 1451 1424 !(vma->vm_flags & VM_PFNMAP)) 1452 1425 return -1; 1453 - zap_page_range(vma, address, size, NULL); 1426 + zap_page_range_single(vma, address, size, NULL); 1454 1427 return 0; 1455 1428 } 1456 1429 EXPORT_SYMBOL_GPL(zap_vma_ptes); ··· 2797 2770 unsigned long start_addr, unsigned long end_addr, 2798 2771 struct zap_details *details) 2799 2772 { 2800 - zap_page_range(vma, start_addr, end_addr - start_addr, details); 2773 + zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2801 2774 } 2802 2775 2803 2776 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+2 -3
mm/mmap.c
··· 2237 2237 struct mmu_gather tlb; 2238 2238 struct vm_area_struct *vma; 2239 2239 unsigned long nr_accounted = 0; 2240 - unsigned long end; 2241 2240 2242 2241 /* mm's last user has gone, and its about to be pulled down */ 2243 2242 mmu_notifier_release(mm); ··· 2261 2262 tlb_gather_mmu(&tlb, mm, 1); 2262 2263 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2263 2264 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2264 - end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2265 + unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2265 2266 vm_unacct_memory(nr_accounted); 2266 2267 2267 2268 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2268 - tlb_finish_mmu(&tlb, 0, end); 2269 + tlb_finish_mmu(&tlb, 0, -1); 2269 2270 2270 2271 /* 2271 2272 * Walk the list again, actually closing and freeing it,