Merge branch 'akpm' (incoming from Andrew)

+9

Documentation/filesystems/proc.txt

··· 767 767 768 768 MemTotal: 16344972 kB 769 769 MemFree: 13634064 kB 770 + MemAvailable: 14836172 kB 770 771 Buffers: 3656 kB 771 772 Cached: 1195708 kB 772 773 SwapCached: 0 kB ··· 800 799 MemTotal: Total usable ram (i.e. physical ram minus a few reserved 801 800 bits and the kernel binary code) 802 801 MemFree: The sum of LowFree+HighFree 802 + MemAvailable: An estimate of how much memory is available for starting new 803 + applications, without swapping. Calculated from MemFree, 804 + SReclaimable, the size of the file LRU lists, and the low 805 + watermarks in each zone. 806 + The estimate takes into account that the system needs some 807 + page cache to function well, and that not all reclaimable 808 + slab will be reclaimable, due to items being in use. The 809 + impact of those factors will vary from system to system. 803 810 Buffers: Relatively temporary storage for raw disk blocks 804 811 shouldn't get tremendously large (20MB or so) 805 812 Cached: in-memory cache for files read from the disk (the

+12

Documentation/sysctl/vm.txt

··· 47 47 - numa_zonelist_order 48 48 - oom_dump_tasks 49 49 - oom_kill_allocating_task 50 + - overcommit_kbytes 50 51 - overcommit_memory 51 52 - overcommit_ratio 52 53 - page-cluster ··· 572 571 is used in oom_kill_allocating_task. 573 572 574 573 The default value is 0. 574 + 575 + ============================================================== 576 + 577 + overcommit_kbytes: 578 + 579 + When overcommit_memory is set to 2, the committed address space is not 580 + permitted to exceed swap plus this amount of physical RAM. See below. 581 + 582 + Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one 583 + of them may be specified at a time. Setting one disables the other (which 584 + then appears as 0 when read). 575 585 576 586 ============================================================== 577 587

+4 -3

Documentation/vm/overcommit-accounting

··· 14 14 15 15 2 - Don't overcommit. The total address space commit 16 16 for the system is not permitted to exceed swap + a 17 - configurable percentage (default is 50) of physical RAM. 18 - Depending on the percentage you use, in most situations 17 + configurable amount (default is 50%) of physical RAM. 18 + Depending on the amount you use, in most situations 19 19 this means a process will not be killed while accessing 20 20 pages but will receive errors on memory allocation as 21 21 appropriate. ··· 26 26 27 27 The overcommit policy is set via the sysctl `vm.overcommit_memory'. 28 28 29 - The overcommit percentage is set via `vm.overcommit_ratio'. 29 + The overcommit amount can be set via `vm.overcommit_ratio' (percentage) 30 + or `vm.overcommit_kbytes' (absolute value). 30 31 31 32 The current overcommit limit and amount committed are viewable in 32 33 /proc/meminfo as CommitLimit and Committed_AS respectively.

+2 -2

arch/arm/include/asm/dma.h

··· 8 8 #define MAX_DMA_ADDRESS 0xffffffffUL 9 9 #else 10 10 #define MAX_DMA_ADDRESS ({ \ 11 - extern unsigned long arm_dma_zone_size; \ 12 - arm_dma_zone_size ? \ 11 + extern phys_addr_t arm_dma_zone_size; \ 12 + arm_dma_zone_size && arm_dma_zone_size < (0x10000000 - PAGE_OFFSET) ? \ 13 13 (PAGE_OFFSET + arm_dma_zone_size) : 0xffffffffUL; }) 14 14 #endif 15 15

+1 -1

arch/arm/kernel/devtree.c

··· 33 33 34 34 void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) 35 35 { 36 - return alloc_bootmem_align(size, align); 36 + return memblock_virt_alloc(size, align); 37 37 } 38 38 39 39 void __init arm_dt_memblock_reserve(void)

+1 -1

arch/arm/kernel/setup.c

··· 717 717 kernel_data.end = virt_to_phys(_end - 1); 718 718 719 719 for_each_memblock(memory, region) { 720 - res = alloc_bootmem_low(sizeof(*res)); 720 + res = memblock_virt_alloc(sizeof(*res), 0); 721 721 res->name = "System RAM"; 722 722 res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); 723 723 res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;

+2 -6

arch/arm/mach-omap2/omap_hwmod.c

··· 2791 2791 sz = sizeof(struct omap_hwmod_link) * LINKS_PER_OCP_IF; 2792 2792 2793 2793 *sl = NULL; 2794 - *ml = alloc_bootmem(sz); 2795 - 2796 - memset(*ml, 0, sz); 2794 + *ml = memblock_virt_alloc(sz, 0); 2797 2795 2798 2796 *sl = (void *)(*ml) + sizeof(struct omap_hwmod_link); 2799 2797 ··· 2910 2912 pr_debug("omap_hwmod: %s: allocating %d byte linkspace (%d links)\n", 2911 2913 __func__, sz, max_ls); 2912 2914 2913 - linkspace = alloc_bootmem(sz); 2914 - 2915 - memset(linkspace, 0, sz); 2915 + linkspace = memblock_virt_alloc(sz, 0); 2916 2916 2917 2917 return 0; 2918 2918 }

+1 -4

arch/arm/mm/init.c

··· 92 92 printk("Mem-info:\n"); 93 93 show_free_areas(filter); 94 94 95 - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) 96 - return; 97 - 98 95 for_each_bank (i, mi) { 99 96 struct membank *bank = &mi->bank[i]; 100 97 unsigned int pfn1, pfn2; ··· 458 461 * free the section of the memmap array. 459 462 */ 460 463 if (pg < pgend) 461 - free_bootmem(pg, pgend - pg); 464 + memblock_free_early(pg, pgend - pg); 462 465 } 463 466 464 467 /*

-68

arch/ia64/mm/contig.c

··· 31 31 static unsigned long max_gap; 32 32 #endif 33 33 34 - /** 35 - * show_mem - give short summary of memory stats 36 - * 37 - * Shows a simple page count of reserved and used pages in the system. 38 - * For discontig machines, it does this on a per-pgdat basis. 39 - */ 40 - void show_mem(unsigned int filter) 41 - { 42 - int i, total_reserved = 0; 43 - int total_shared = 0, total_cached = 0; 44 - unsigned long total_present = 0; 45 - pg_data_t *pgdat; 46 - 47 - printk(KERN_INFO "Mem-info:\n"); 48 - show_free_areas(filter); 49 - printk(KERN_INFO "Node memory in pages:\n"); 50 - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) 51 - return; 52 - for_each_online_pgdat(pgdat) { 53 - unsigned long present; 54 - unsigned long flags; 55 - int shared = 0, cached = 0, reserved = 0; 56 - int nid = pgdat->node_id; 57 - 58 - if (skip_free_areas_node(filter, nid)) 59 - continue; 60 - pgdat_resize_lock(pgdat, &flags); 61 - present = pgdat->node_present_pages; 62 - for(i = 0; i < pgdat->node_spanned_pages; i++) { 63 - struct page *page; 64 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) 65 - touch_nmi_watchdog(); 66 - if (pfn_valid(pgdat->node_start_pfn + i)) 67 - page = pfn_to_page(pgdat->node_start_pfn + i); 68 - else { 69 - #ifdef CONFIG_VIRTUAL_MEM_MAP 70 - if (max_gap < LARGE_GAP) 71 - continue; 72 - #endif 73 - i = vmemmap_find_next_valid_pfn(nid, i) - 1; 74 - continue; 75 - } 76 - if (PageReserved(page)) 77 - reserved++; 78 - else if (PageSwapCache(page)) 79 - cached++; 80 - else if (page_count(page)) 81 - shared += page_count(page)-1; 82 - } 83 - pgdat_resize_unlock(pgdat, &flags); 84 - total_present += present; 85 - total_reserved += reserved; 86 - total_cached += cached; 87 - total_shared += shared; 88 - printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, " 89 - "shrd: %10d, swpd: %10d\n", nid, 90 - present, reserved, shared, cached); 91 - } 92 - printk(KERN_INFO "%ld pages of RAM\n", total_present); 93 - printk(KERN_INFO "%d reserved pages\n", total_reserved); 94 - 
printk(KERN_INFO "%d pages shared\n", total_shared); 95 - printk(KERN_INFO "%d pages swap cached\n", total_cached); 96 - printk(KERN_INFO "Total of %ld pages in page table cache\n", 97 - quicklist_total_size()); 98 - printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); 99 - } 100 - 101 - 102 34 /* physical address where the bootmem map is located */ 103 35 unsigned long bootmap_start; 104 36

-63

arch/ia64/mm/discontig.c

··· 608 608 #endif /* CONFIG_SMP */ 609 609 610 610 /** 611 - * show_mem - give short summary of memory stats 612 - * 613 - * Shows a simple page count of reserved and used pages in the system. 614 - * For discontig machines, it does this on a per-pgdat basis. 615 - */ 616 - void show_mem(unsigned int filter) 617 - { 618 - int i, total_reserved = 0; 619 - int total_shared = 0, total_cached = 0; 620 - unsigned long total_present = 0; 621 - pg_data_t *pgdat; 622 - 623 - printk(KERN_INFO "Mem-info:\n"); 624 - show_free_areas(filter); 625 - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) 626 - return; 627 - printk(KERN_INFO "Node memory in pages:\n"); 628 - for_each_online_pgdat(pgdat) { 629 - unsigned long present; 630 - unsigned long flags; 631 - int shared = 0, cached = 0, reserved = 0; 632 - int nid = pgdat->node_id; 633 - 634 - if (skip_free_areas_node(filter, nid)) 635 - continue; 636 - pgdat_resize_lock(pgdat, &flags); 637 - present = pgdat->node_present_pages; 638 - for(i = 0; i < pgdat->node_spanned_pages; i++) { 639 - struct page *page; 640 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) 641 - touch_nmi_watchdog(); 642 - if (pfn_valid(pgdat->node_start_pfn + i)) 643 - page = pfn_to_page(pgdat->node_start_pfn + i); 644 - else { 645 - i = vmemmap_find_next_valid_pfn(nid, i) - 1; 646 - continue; 647 - } 648 - if (PageReserved(page)) 649 - reserved++; 650 - else if (PageSwapCache(page)) 651 - cached++; 652 - else if (page_count(page)) 653 - shared += page_count(page)-1; 654 - } 655 - pgdat_resize_unlock(pgdat, &flags); 656 - total_present += present; 657 - total_reserved += reserved; 658 - total_cached += cached; 659 - total_shared += shared; 660 - printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, " 661 - "shrd: %10d, swpd: %10d\n", nid, 662 - present, reserved, shared, cached); 663 - } 664 - printk(KERN_INFO "%ld pages of RAM\n", total_present); 665 - printk(KERN_INFO "%d reserved pages\n", total_reserved); 666 - printk(KERN_INFO "%d pages shared\n", total_shared); 667 - 
printk(KERN_INFO "%d pages swap cached\n", total_cached); 668 - printk(KERN_INFO "Total of %ld pages in page table cache\n", 669 - quicklist_total_size()); 670 - printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); 671 - } 672 - 673 - /** 674 611 * call_pernode_memory - use SRAT to call callback functions with node info 675 612 * @start: physical start of range 676 613 * @len: length of range

+48

arch/ia64/mm/init.c

··· 684 684 } 685 685 686 686 __initcall(per_linux32_init); 687 + 688 + /** 689 + * show_mem - give short summary of memory stats 690 + * 691 + * Shows a simple page count of reserved and used pages in the system. 692 + * For discontig machines, it does this on a per-pgdat basis. 693 + */ 694 + void show_mem(unsigned int filter) 695 + { 696 + int total_reserved = 0; 697 + unsigned long total_present = 0; 698 + pg_data_t *pgdat; 699 + 700 + printk(KERN_INFO "Mem-info:\n"); 701 + show_free_areas(filter); 702 + printk(KERN_INFO "Node memory in pages:\n"); 703 + for_each_online_pgdat(pgdat) { 704 + unsigned long present; 705 + unsigned long flags; 706 + int reserved = 0; 707 + int nid = pgdat->node_id; 708 + int zoneid; 709 + 710 + if (skip_free_areas_node(filter, nid)) 711 + continue; 712 + pgdat_resize_lock(pgdat, &flags); 713 + 714 + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 715 + struct zone *zone = &pgdat->node_zones[zoneid]; 716 + if (!populated_zone(zone)) 717 + continue; 718 + 719 + reserved += zone->present_pages - zone->managed_pages; 720 + } 721 + present = pgdat->node_present_pages; 722 + 723 + pgdat_resize_unlock(pgdat, &flags); 724 + total_present += present; 725 + total_reserved += reserved; 726 + printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, ", 727 + nid, present, reserved); 728 + } 729 + printk(KERN_INFO "%ld pages of RAM\n", total_present); 730 + printk(KERN_INFO "%d reserved pages\n", total_reserved); 731 + printk(KERN_INFO "Total of %ld pages in page table cache\n", 732 + quicklist_total_size()); 733 + printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); 734 + }

+2 -1

arch/metag/mm/init.c

··· 204 204 start_pfn = memblock_region_memory_base_pfn(reg); 205 205 end_pfn = memblock_region_memory_end_pfn(reg); 206 206 memblock_set_node(PFN_PHYS(start_pfn), 207 - PFN_PHYS(end_pfn - start_pfn), 0); 207 + PFN_PHYS(end_pfn - start_pfn), 208 + &memblock.memory, 0); 208 209 } 209 210 210 211 /* All of system RAM sits in node 0 for the non-NUMA case */

+2 -1

arch/metag/mm/numa.c

··· 42 42 memblock_add(start, end - start); 43 43 44 44 memblock_set_node(PFN_PHYS(start_pfn), 45 - PFN_PHYS(end_pfn - start_pfn), nid); 45 + PFN_PHYS(end_pfn - start_pfn), 46 + &memblock.memory, nid); 46 47 47 48 /* Node-local pgdat */ 48 49 pgdat_paddr = memblock_alloc_base(sizeof(struct pglist_data),

+2 -1

arch/microblaze/mm/init.c

··· 192 192 start_pfn = memblock_region_memory_base_pfn(reg); 193 193 end_pfn = memblock_region_memory_end_pfn(reg); 194 194 memblock_set_node(start_pfn << PAGE_SHIFT, 195 - (end_pfn - start_pfn) << PAGE_SHIFT, 0); 195 + (end_pfn - start_pfn) << PAGE_SHIFT, 196 + &memblock.memory, 0); 196 197 } 197 198 198 199 /* free bootmem is whole main memory */

+17 -42

arch/parisc/mm/init.c

··· 645 645 646 646 void show_mem(unsigned int filter) 647 647 { 648 - int i,free = 0,total = 0,reserved = 0; 649 - int shared = 0, cached = 0; 648 + int total = 0,reserved = 0; 649 + pg_data_t *pgdat; 650 650 651 651 printk(KERN_INFO "Mem-info:\n"); 652 652 show_free_areas(filter); 653 - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) 654 - return; 655 - #ifndef CONFIG_DISCONTIGMEM 656 - i = max_mapnr; 657 - while (i-- > 0) { 658 - total++; 659 - if (PageReserved(mem_map+i)) 660 - reserved++; 661 - else if (PageSwapCache(mem_map+i)) 662 - cached++; 663 - else if (!page_count(&mem_map[i])) 664 - free++; 665 - else 666 - shared += page_count(&mem_map[i]) - 1; 653 + 654 + for_each_online_pgdat(pgdat) { 655 + unsigned long flags; 656 + int zoneid; 657 + 658 + pgdat_resize_lock(pgdat, &flags); 659 + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 660 + struct zone *zone = &pgdat->node_zones[zoneid]; 661 + if (!populated_zone(zone)) 662 + continue; 663 + 664 + total += zone->present_pages; 665 + reserved = zone->present_pages - zone->managed_pages; 666 + } 667 + pgdat_resize_unlock(pgdat, &flags); 667 668 } 668 - #else 669 - for (i = 0; i < npmem_ranges; i++) { 670 - int j; 671 669 672 - for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { 673 - struct page *p; 674 - unsigned long flags; 675 - 676 - pgdat_resize_lock(NODE_DATA(i), &flags); 677 - p = nid_page_nr(i, j) - node_start_pfn(i); 678 - 679 - total++; 680 - if (PageReserved(p)) 681 - reserved++; 682 - else if (PageSwapCache(p)) 683 - cached++; 684 - else if (!page_count(p)) 685 - free++; 686 - else 687 - shared += page_count(p) - 1; 688 - pgdat_resize_unlock(NODE_DATA(i), &flags); 689 - } 690 - } 691 - #endif 692 670 printk(KERN_INFO "%d pages of RAM\n", total); 693 671 printk(KERN_INFO "%d reserved pages\n", reserved); 694 - printk(KERN_INFO "%d pages shared\n", shared); 695 - printk(KERN_INFO "%d pages swap cached\n", cached); 696 - 697 672 698 673 #ifdef CONFIG_DISCONTIGMEM 699 674 {

+1 -1

arch/powerpc/mm/mem.c

··· 209 209 /* Place all memblock_regions in the same node and merge contiguous 210 210 * memblock_regions 211 211 */ 212 - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 212 + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); 213 213 214 214 /* Add all physical memory to the bootmem map, mark each area 215 215 * present.

+5 -3

arch/powerpc/mm/numa.c

··· 670 670 node_set_online(nid); 671 671 sz = numa_enforce_memory_limit(base, size); 672 672 if (sz) 673 - memblock_set_node(base, sz, nid); 673 + memblock_set_node(base, sz, 674 + &memblock.memory, nid); 674 675 } while (--ranges); 675 676 } 676 677 } ··· 761 760 continue; 762 761 } 763 762 764 - memblock_set_node(start, size, nid); 763 + memblock_set_node(start, size, &memblock.memory, nid); 765 764 766 765 if (--ranges) 767 766 goto new_range; ··· 798 797 799 798 fake_numa_create_new_node(end_pfn, &nid); 800 799 memblock_set_node(PFN_PHYS(start_pfn), 801 - PFN_PHYS(end_pfn - start_pfn), nid); 800 + PFN_PHYS(end_pfn - start_pfn), 801 + &memblock.memory, nid); 802 802 node_set_online(nid); 803 803 } 804 804 }

-1

arch/score/Kconfig

··· 2 2 3 3 config SCORE 4 4 def_bool y 5 - select HAVE_GENERIC_HARDIRQS 6 5 select GENERIC_IRQ_SHOW 7 6 select GENERIC_IOMAP 8 7 select GENERIC_ATOMIC64

+1

arch/sh/kernel/kgdb.c

··· 13 13 #include <linux/kdebug.h> 14 14 #include <linux/irq.h> 15 15 #include <linux/io.h> 16 + #include <linux/sched.h> 16 17 #include <asm/cacheflush.h> 17 18 #include <asm/traps.h> 18 19

+2 -2

arch/sh/kernel/setup.c

··· 230 230 pmb_bolt_mapping((unsigned long)__va(start), start, end - start, 231 231 PAGE_KERNEL); 232 232 233 - memblock_set_node(PFN_PHYS(start_pfn), 234 - PFN_PHYS(end_pfn - start_pfn), nid); 233 + memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), 234 + &memblock.memory, nid); 235 235 } 236 236 237 237 void __init __weak plat_early_device_setup(void)

+3 -2

arch/sparc/mm/init_64.c

··· 1021 1021 "start[%lx] end[%lx]\n", 1022 1022 nid, start, this_end); 1023 1023 1024 - memblock_set_node(start, this_end - start, nid); 1024 + memblock_set_node(start, this_end - start, 1025 + &memblock.memory, nid); 1025 1026 start = this_end; 1026 1027 } 1027 1028 } ··· 1326 1325 (top_of_ram - total_ram) >> 20); 1327 1326 1328 1327 init_node_masks_nonnuma(); 1329 - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 1328 + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); 1330 1329 allocate_node_data(0); 1331 1330 node_set_online(0); 1332 1331 }

-3

arch/unicore32/mm/init.c

··· 66 66 printk(KERN_DEFAULT "Mem-info:\n"); 67 67 show_free_areas(filter); 68 68 69 - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) 70 - return; 71 - 72 69 for_each_bank(i, mi) { 73 70 struct membank *bank = &mi->bank[i]; 74 71 unsigned int pfn1, pfn2;

+2 -2

arch/x86/include/asm/page_types.h

··· 51 51 extern unsigned long max_low_pfn_mapped; 52 52 extern unsigned long max_pfn_mapped; 53 53 54 - static inline phys_addr_t get_max_mapped(void) 54 + static inline phys_addr_t get_max_low_mapped(void) 55 55 { 56 - return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; 56 + return (phys_addr_t)max_low_pfn_mapped << PAGE_SHIFT; 57 57 } 58 58 59 59 bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);

+1 -1

arch/x86/kernel/check.c

··· 91 91 92 92 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); 93 93 94 - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { 94 + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) { 95 95 start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), 96 96 PAGE_SIZE, corruption_check_size); 97 97 end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),

+1 -1

arch/x86/kernel/e820.c

··· 1120 1120 nr_pages += end_pfn - start_pfn; 1121 1121 } 1122 1122 1123 - for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { 1123 + for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) { 1124 1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); 1125 1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); 1126 1126 if (start_pfn < end_pfn)

+1 -1

arch/x86/kernel/setup.c

··· 1119 1119 1120 1120 setup_real_mode(); 1121 1121 1122 - memblock_set_current_limit(get_max_mapped()); 1122 + memblock_set_current_limit(get_max_low_mapped()); 1123 1123 dma_contiguous_reserve(0); 1124 1124 1125 1125 /*

+1 -1

arch/x86/mm/init_32.c

··· 665 665 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 666 666 #endif 667 667 668 - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 668 + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); 669 669 sparse_memory_present_with_active_regions(0); 670 670 671 671 #ifdef CONFIG_FLATMEM

+1 -1

arch/x86/mm/init_64.c

··· 643 643 #ifndef CONFIG_NUMA 644 644 void __init initmem_init(void) 645 645 { 646 - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 646 + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); 647 647 } 648 648 #endif 649 649

+1 -1

arch/x86/mm/memtest.c

··· 74 74 u64 i; 75 75 phys_addr_t this_start, this_end; 76 76 77 - for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { 77 + for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { 78 78 this_start = clamp_t(phys_addr_t, this_start, start, end); 79 79 this_end = clamp_t(phys_addr_t, this_end, start, end); 80 80 if (this_start < this_end) {

+50 -2

arch/x86/mm/numa.c

··· 491 491 492 492 for (i = 0; i < mi->nr_blks; i++) { 493 493 struct numa_memblk *mb = &mi->blk[i]; 494 - memblock_set_node(mb->start, mb->end - mb->start, mb->nid); 494 + memblock_set_node(mb->start, mb->end - mb->start, 495 + &memblock.memory, mb->nid); 496 + 497 + /* 498 + * At this time, all memory regions reserved by memblock are 499 + * used by the kernel. Set the nid in memblock.reserved will 500 + * mark out all the nodes the kernel resides in. 501 + */ 502 + memblock_set_node(mb->start, mb->end - mb->start, 503 + &memblock.reserved, mb->nid); 495 504 } 496 505 497 506 /* ··· 562 553 } 563 554 } 564 555 556 + static void __init numa_clear_kernel_node_hotplug(void) 557 + { 558 + int i, nid; 559 + nodemask_t numa_kernel_nodes; 560 + unsigned long start, end; 561 + struct memblock_type *type = &memblock.reserved; 562 + 563 + /* Mark all kernel nodes. */ 564 + for (i = 0; i < type->cnt; i++) 565 + node_set(type->regions[i].nid, numa_kernel_nodes); 566 + 567 + /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ 568 + for (i = 0; i < numa_meminfo.nr_blks; i++) { 569 + nid = numa_meminfo.blk[i].nid; 570 + if (!node_isset(nid, numa_kernel_nodes)) 571 + continue; 572 + 573 + start = numa_meminfo.blk[i].start; 574 + end = numa_meminfo.blk[i].end; 575 + 576 + memblock_clear_hotplug(start, end - start); 577 + } 578 + } 579 + 565 580 static int __init numa_init(int (*init_func)(void)) 566 581 { 567 582 int i; ··· 598 565 nodes_clear(node_possible_map); 599 566 nodes_clear(node_online_map); 600 567 memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 601 - WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); 568 + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, 569 + MAX_NUMNODES)); 570 + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, 571 + MAX_NUMNODES)); 572 + /* In case that parsing SRAT failed. 
*/ 573 + WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); 602 574 numa_reset_distance(); 603 575 604 576 ret = init_func(); ··· 639 601 numa_clear_node(i); 640 602 } 641 603 numa_init_array(); 604 + 605 + /* 606 + * At very early time, the kernel have to use some memory such as 607 + * loading the kernel image. We cannot prevent this anyway. So any 608 + * node the kernel resides in should be un-hotpluggable. 609 + * 610 + * And when we come here, numa_init() won't fail. 611 + */ 612 + numa_clear_kernel_node_hotplug(); 613 + 642 614 return 0; 643 615 } 644 616

+5

arch/x86/mm/srat.c

··· 181 181 (unsigned long long) start, (unsigned long long) end - 1, 182 182 hotpluggable ? " hotplug" : ""); 183 183 184 + /* Mark hotplug range in memblock. */ 185 + if (hotpluggable && memblock_mark_hotplug(start, ma->length)) 186 + pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", 187 + (unsigned long long)start, (unsigned long long)end - 1); 188 + 184 189 return 0; 185 190 out_err_bad_srat: 186 191 bad_srat();

-1

drivers/char/mem.c

··· 22 22 #include <linux/device.h> 23 23 #include <linux/highmem.h> 24 24 #include <linux/backing-dev.h> 25 - #include <linux/bootmem.h> 26 25 #include <linux/splice.h> 27 26 #include <linux/pfn.h> 28 27 #include <linux/export.h>

+1 -1

drivers/firmware/memmap.c

··· 324 324 { 325 325 struct firmware_map_entry *entry; 326 326 327 - entry = alloc_bootmem(sizeof(struct firmware_map_entry)); 327 + entry = memblock_virt_alloc(sizeof(struct firmware_map_entry), 0); 328 328 if (WARN_ON(!entry)) 329 329 return -ENOMEM; 330 330

+1 -1

drivers/iommu/intel-iommu.c

··· 917 917 918 918 /* If range covers entire pagetable, free it */ 919 919 if (!(start_pfn > level_pfn || 920 - last_pfn < level_pfn + level_size(level))) { 920 + last_pfn < level_pfn + level_size(level) - 1)) { 921 921 dma_clear_pte(pte); 922 922 domain_flush_cache(domain, pte, sizeof(*pte)); 923 923 free_pgtable_page(level_pte);

+2 -1

fs/compat_ioctl.c

··· 680 680 struct i2c_msg __user *tmsgs; 681 681 struct i2c_msg32 __user *umsgs; 682 682 compat_caddr_t datap; 683 - int nmsgs, i; 683 + u32 nmsgs; 684 + int i; 684 685 685 686 if (get_user(nmsgs, &udata->nmsgs)) 686 687 return -EFAULT;

+8 -26

fs/notify/dnotify/dnotify.c

··· 82 82 * events. 83 83 */ 84 84 static int dnotify_handle_event(struct fsnotify_group *group, 85 + struct inode *inode, 85 86 struct fsnotify_mark *inode_mark, 86 87 struct fsnotify_mark *vfsmount_mark, 87 - struct fsnotify_event *event) 88 + u32 mask, void *data, int data_type, 89 + const unsigned char *file_name) 88 90 { 89 91 struct dnotify_mark *dn_mark; 90 - struct inode *to_tell; 91 92 struct dnotify_struct *dn; 92 93 struct dnotify_struct **prev; 93 94 struct fown_struct *fown; 94 - __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; 95 + __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; 96 + 97 + /* not a dir, dnotify doesn't care */ 98 + if (!S_ISDIR(inode->i_mode)) 99 + return 0; 95 100 96 101 BUG_ON(vfsmount_mark); 97 - 98 - to_tell = event->to_tell; 99 102 100 103 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); 101 104 ··· 125 122 return 0; 126 123 } 127 124 128 - /* 129 - * Given an inode and mask determine if dnotify would be interested in sending 130 - * userspace notification for that pair. 131 - */ 132 - static bool dnotify_should_send_event(struct fsnotify_group *group, 133 - struct inode *inode, 134 - struct fsnotify_mark *inode_mark, 135 - struct fsnotify_mark *vfsmount_mark, 136 - __u32 mask, void *data, int data_type) 137 - { 138 - /* not a dir, dnotify doesn't care */ 139 - if (!S_ISDIR(inode->i_mode)) 140 - return false; 141 - 142 - return true; 143 - } 144 - 145 125 static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) 146 126 { 147 127 struct dnotify_mark *dn_mark = container_of(fsn_mark, ··· 138 152 139 153 static struct fsnotify_ops dnotify_fsnotify_ops = { 140 154 .handle_event = dnotify_handle_event, 141 - .should_send_event = dnotify_should_send_event, 142 - .free_group_priv = NULL, 143 - .freeing_mark = NULL, 144 - .free_event_priv = NULL, 145 155 }; 146 156 147 157 /*

+112 -114

fs/notify/fanotify/fanotify.c

··· 9 9 #include <linux/types.h> 10 10 #include <linux/wait.h> 11 11 12 - static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) 13 - { 14 - pr_debug("%s: old=%p new=%p\n", __func__, old, new); 12 + #include "fanotify.h" 15 13 16 - if (old->to_tell == new->to_tell && 17 - old->data_type == new->data_type && 18 - old->tgid == new->tgid) { 19 - switch (old->data_type) { 20 - case (FSNOTIFY_EVENT_PATH): 14 + static bool should_merge(struct fsnotify_event *old_fsn, 15 + struct fsnotify_event *new_fsn) 16 + { 17 + struct fanotify_event_info *old, *new; 18 + 21 19 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 22 - /* dont merge two permission events */ 23 - if ((old->mask & FAN_ALL_PERM_EVENTS) && 24 - (new->mask & FAN_ALL_PERM_EVENTS)) 25 - return false; 20 + /* dont merge two permission events */ 21 + if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) && 22 + (new_fsn->mask & FAN_ALL_PERM_EVENTS)) 23 + return false; 26 24 #endif 27 - if ((old->path.mnt == new->path.mnt) && 28 - (old->path.dentry == new->path.dentry)) 29 - return true; 30 - break; 31 - case (FSNOTIFY_EVENT_NONE): 32 - return true; 33 - default: 34 - BUG(); 35 - }; 36 - } 25 + pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); 26 + old = FANOTIFY_E(old_fsn); 27 + new = FANOTIFY_E(new_fsn); 28 + 29 + if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && 30 + old->path.mnt == new->path.mnt && 31 + old->path.dentry == new->path.dentry) 32 + return true; 37 33 return false; 38 34 } 39 35 ··· 37 41 static struct fsnotify_event *fanotify_merge(struct list_head *list, 38 42 struct fsnotify_event *event) 39 43 { 40 - struct fsnotify_event_holder *test_holder; 41 - struct fsnotify_event *test_event = NULL; 42 - struct fsnotify_event *new_event; 44 + struct fsnotify_event *test_event; 45 + bool do_merge = false; 43 46 44 47 pr_debug("%s: list=%p event=%p\n", __func__, list, event); 45 48 46 - 47 - list_for_each_entry_reverse(test_holder, list, event_list) { 48 - if 
(should_merge(test_holder->event, event)) { 49 - test_event = test_holder->event; 49 + list_for_each_entry_reverse(test_event, list, list) { 50 + if (should_merge(test_event, event)) { 51 + do_merge = true; 50 52 break; 51 53 } 52 54 } 53 55 54 - if (!test_event) 56 + if (!do_merge) 55 57 return NULL; 56 58 57 - fsnotify_get_event(test_event); 58 - 59 - /* if they are exactly the same we are done */ 60 - if (test_event->mask == event->mask) 61 - return test_event; 62 - 63 - /* 64 - * if the refcnt == 2 this is the only queue 65 - * for this event and so we can update the mask 66 - * in place. 67 - */ 68 - if (atomic_read(&test_event->refcnt) == 2) { 69 - test_event->mask |= event->mask; 70 - return test_event; 71 - } 72 - 73 - new_event = fsnotify_clone_event(test_event); 74 - 75 - /* done with test_event */ 76 - fsnotify_put_event(test_event); 77 - 78 - /* couldn't allocate memory, merge was not possible */ 79 - if (unlikely(!new_event)) 80 - return ERR_PTR(-ENOMEM); 81 - 82 - /* build new event and replace it on the list */ 83 - new_event->mask = (test_event->mask | event->mask); 84 - fsnotify_replace_event(test_holder, new_event); 85 - 86 - /* we hold a reference on new_event from clone_event */ 87 - return new_event; 59 + test_event->mask |= event->mask; 60 + return test_event; 88 61 } 89 62 90 63 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 91 64 static int fanotify_get_response_from_access(struct fsnotify_group *group, 92 - struct fsnotify_event *event) 65 + struct fanotify_event_info *event) 93 66 { 94 67 int ret; 95 68 ··· 71 106 return 0; 72 107 73 108 /* userspace responded, convert to something usable */ 74 - spin_lock(&event->lock); 75 109 switch (event->response) { 76 110 case FAN_ALLOW: 77 111 ret = 0; ··· 80 116 ret = -EPERM; 81 117 } 82 118 event->response = 0; 83 - spin_unlock(&event->lock); 84 119 85 120 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, 86 121 group, event, ret); ··· 88 125 } 89 126 #endif 90 127 91 - static int 
fanotify_handle_event(struct fsnotify_group *group, 92 - struct fsnotify_mark *inode_mark, 93 - struct fsnotify_mark *fanotify_mark, 94 - struct fsnotify_event *event) 95 - { 96 - int ret = 0; 97 - struct fsnotify_event *notify_event = NULL; 98 - 99 - BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); 100 - BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); 101 - BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); 102 - BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); 103 - BUILD_BUG_ON(FAN_OPEN != FS_OPEN); 104 - BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); 105 - BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); 106 - BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); 107 - BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); 108 - BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); 109 - 110 - pr_debug("%s: group=%p event=%p\n", __func__, group, event); 111 - 112 - notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); 113 - if (IS_ERR(notify_event)) 114 - return PTR_ERR(notify_event); 115 - 116 - #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 117 - if (event->mask & FAN_ALL_PERM_EVENTS) { 118 - /* if we merged we need to wait on the new event */ 119 - if (notify_event) 120 - event = notify_event; 121 - ret = fanotify_get_response_from_access(group, event); 122 - } 123 - #endif 124 - 125 - if (notify_event) 126 - fsnotify_put_event(notify_event); 127 - 128 - return ret; 129 - } 130 - 131 - static bool fanotify_should_send_event(struct fsnotify_group *group, 132 - struct inode *to_tell, 133 - struct fsnotify_mark *inode_mark, 128 + static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, 134 129 struct fsnotify_mark *vfsmnt_mark, 135 - __u32 event_mask, void *data, int data_type) 130 + u32 event_mask, 131 + void *data, int data_type) 136 132 { 137 133 __u32 marks_mask, marks_ignored_mask; 138 134 struct path *path = data; 139 135 140 - pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " 141 - "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 142 - 
inode_mark, vfsmnt_mark, event_mask, data, data_type); 136 + pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" 137 + " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, 138 + event_mask, data, data_type); 143 139 144 140 /* if we don't have enough info to send an event to userspace say no */ 145 141 if (data_type != FSNOTIFY_EVENT_PATH) ··· 139 217 return false; 140 218 } 141 219 220 + static int fanotify_handle_event(struct fsnotify_group *group, 221 + struct inode *inode, 222 + struct fsnotify_mark *inode_mark, 223 + struct fsnotify_mark *fanotify_mark, 224 + u32 mask, void *data, int data_type, 225 + const unsigned char *file_name) 226 + { 227 + int ret = 0; 228 + struct fanotify_event_info *event; 229 + struct fsnotify_event *fsn_event; 230 + struct fsnotify_event *notify_fsn_event; 231 + 232 + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); 233 + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); 234 + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); 235 + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); 236 + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); 237 + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); 238 + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); 239 + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); 240 + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); 241 + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); 242 + 243 + if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, 244 + data_type)) 245 + return 0; 246 + 247 + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, 248 + mask); 249 + 250 + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); 251 + if (unlikely(!event)) 252 + return -ENOMEM; 253 + 254 + fsn_event = &event->fse; 255 + fsnotify_init_event(fsn_event, inode, mask); 256 + event->tgid = get_pid(task_tgid(current)); 257 + if (data_type == FSNOTIFY_EVENT_PATH) { 258 + struct path *path = data; 259 + event->path = *path; 260 + path_get(&event->path); 261 + } else { 262 + event->path.mnt = NULL; 263 + event->path.dentry 
= NULL; 264 + } 265 + #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 266 + event->response = 0; 267 + #endif 268 + 269 + notify_fsn_event = fsnotify_add_notify_event(group, fsn_event, 270 + fanotify_merge); 271 + if (notify_fsn_event) { 272 + /* Our event wasn't used in the end. Free it. */ 273 + fsnotify_destroy_event(group, fsn_event); 274 + if (IS_ERR(notify_fsn_event)) 275 + return PTR_ERR(notify_fsn_event); 276 + /* We need to ask about a different events after a merge... */ 277 + event = FANOTIFY_E(notify_fsn_event); 278 + fsn_event = notify_fsn_event; 279 + } 280 + 281 + #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 282 + if (fsn_event->mask & FAN_ALL_PERM_EVENTS) 283 + ret = fanotify_get_response_from_access(group, event); 284 + #endif 285 + return ret; 286 + } 287 + 142 288 static void fanotify_free_group_priv(struct fsnotify_group *group) 143 289 { 144 290 struct user_struct *user; ··· 216 226 free_uid(user); 217 227 } 218 228 229 + static void fanotify_free_event(struct fsnotify_event *fsn_event) 230 + { 231 + struct fanotify_event_info *event; 232 + 233 + event = FANOTIFY_E(fsn_event); 234 + path_put(&event->path); 235 + put_pid(event->tgid); 236 + kmem_cache_free(fanotify_event_cachep, event); 237 + } 238 + 219 239 const struct fsnotify_ops fanotify_fsnotify_ops = { 220 240 .handle_event = fanotify_handle_event, 221 - .should_send_event = fanotify_should_send_event, 222 241 .free_group_priv = fanotify_free_group_priv, 223 - .free_event_priv = NULL, 224 - .freeing_mark = NULL, 242 + .free_event = fanotify_free_event, 225 243 };

+23

fs/notify/fanotify/fanotify.h

··· 1 + #include <linux/fsnotify_backend.h> 2 + #include <linux/path.h> 3 + #include <linux/slab.h> 4 + 5 + extern struct kmem_cache *fanotify_event_cachep; 6 + 7 + struct fanotify_event_info { 8 + struct fsnotify_event fse; 9 + /* 10 + * We hold ref to this path so it may be dereferenced at any point 11 + * during this object's lifetime 12 + */ 13 + struct path path; 14 + struct pid *tgid; 15 + #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 16 + u32 response; /* userspace answer to question */ 17 + #endif 18 + }; 19 + 20 + static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) 21 + { 22 + return container_of(fse, struct fanotify_event_info, fse); 23 + }

+20 -21

fs/notify/fanotify/fanotify_user.c

··· 19 19 20 20 #include "../../mount.h" 21 21 #include "../fdinfo.h" 22 + #include "fanotify.h" 22 23 23 24 #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 24 25 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 ··· 29 28 30 29 static struct kmem_cache *fanotify_mark_cache __read_mostly; 31 30 static struct kmem_cache *fanotify_response_event_cache __read_mostly; 31 + struct kmem_cache *fanotify_event_cachep __read_mostly; 32 32 33 33 struct fanotify_response_event { 34 34 struct list_head list; 35 35 __s32 fd; 36 - struct fsnotify_event *event; 36 + struct fanotify_event_info *event; 37 37 }; 38 38 39 39 /* ··· 63 61 } 64 62 65 63 static int create_fd(struct fsnotify_group *group, 66 - struct fsnotify_event *event, 67 - struct file **file) 64 + struct fanotify_event_info *event, 65 + struct file **file) 68 66 { 69 67 int client_fd; 70 68 struct file *new_file; ··· 74 72 client_fd = get_unused_fd(); 75 73 if (client_fd < 0) 76 74 return client_fd; 77 - 78 - if (event->data_type != FSNOTIFY_EVENT_PATH) { 79 - WARN_ON(1); 80 - put_unused_fd(client_fd); 81 - return -EINVAL; 82 - } 83 75 84 76 /* 85 77 * we need a new file handle for the userspace program so it can read even if it was ··· 105 109 } 106 110 107 111 static int fill_event_metadata(struct fsnotify_group *group, 108 - struct fanotify_event_metadata *metadata, 109 - struct fsnotify_event *event, 110 - struct file **file) 112 + struct fanotify_event_metadata *metadata, 113 + struct fsnotify_event *fsn_event, 114 + struct file **file) 111 115 { 112 116 int ret = 0; 117 + struct fanotify_event_info *event; 113 118 114 119 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 115 - group, metadata, event); 120 + group, metadata, fsn_event); 116 121 117 122 *file = NULL; 123 + event = container_of(fsn_event, struct fanotify_event_info, fse); 118 124 metadata->event_len = FAN_EVENT_METADATA_LEN; 119 125 metadata->metadata_len = FAN_EVENT_METADATA_LEN; 120 126 metadata->vers = FANOTIFY_METADATA_VERSION; 121 127 
metadata->reserved = 0; 122 - metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 128 + metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; 123 129 metadata->pid = pid_vnr(event->tgid); 124 - if (unlikely(event->mask & FAN_Q_OVERFLOW)) 130 + if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) 125 131 metadata->fd = FAN_NOFD; 126 132 else { 127 133 metadata->fd = create_fd(group, event, file); ··· 207 209 if (!re) 208 210 return -ENOMEM; 209 211 210 - re->event = event; 212 + re->event = FANOTIFY_E(event); 211 213 re->fd = fd; 212 214 213 215 mutex_lock(&group->fanotify_data.access_mutex); ··· 215 217 if (atomic_read(&group->fanotify_data.bypass_perm)) { 216 218 mutex_unlock(&group->fanotify_data.access_mutex); 217 219 kmem_cache_free(fanotify_response_event_cache, re); 218 - event->response = FAN_ALLOW; 220 + FANOTIFY_E(event)->response = FAN_ALLOW; 219 221 return 0; 220 222 } 221 223 ··· 271 273 out: 272 274 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 273 275 if (event->mask & FAN_ALL_PERM_EVENTS) { 274 - event->response = FAN_DENY; 276 + FANOTIFY_E(event)->response = FAN_DENY; 275 277 wake_up(&group->fanotify_data.access_waitq); 276 278 } 277 279 #endif ··· 319 321 if (IS_ERR(kevent)) 320 322 break; 321 323 ret = copy_event_to_user(group, kevent, buf); 322 - fsnotify_put_event(kevent); 324 + fsnotify_destroy_event(group, kevent); 323 325 if (ret < 0) 324 326 break; 325 327 buf += ret; ··· 407 409 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 408 410 { 409 411 struct fsnotify_group *group; 410 - struct fsnotify_event_holder *holder; 412 + struct fsnotify_event *fsn_event; 411 413 void __user *p; 412 414 int ret = -ENOTTY; 413 415 size_t send_len = 0; ··· 419 421 switch (cmd) { 420 422 case FIONREAD: 421 423 mutex_lock(&group->notification_mutex); 422 - list_for_each_entry(holder, &group->notification_list, event_list) 424 + list_for_each_entry(fsn_event, &group->notification_list, list) 423 425 send_len += 
FAN_EVENT_METADATA_LEN; 424 426 mutex_unlock(&group->notification_mutex); 425 427 ret = put_user(send_len, (int __user *) p); ··· 904 906 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); 905 907 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, 906 908 SLAB_PANIC); 909 + fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); 907 910 908 911 return 0; 909 912 }

+12 -30

fs/notify/fsnotify.c

··· 128 128 struct fsnotify_mark *vfsmount_mark, 129 129 __u32 mask, void *data, 130 130 int data_is, u32 cookie, 131 - const unsigned char *file_name, 132 - struct fsnotify_event **event) 131 + const unsigned char *file_name) 133 132 { 134 133 struct fsnotify_group *group = NULL; 135 134 __u32 inode_test_mask = 0; ··· 169 170 170 171 pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" 171 172 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" 172 - " data=%p data_is=%d cookie=%d event=%p\n", 173 + " data=%p data_is=%d cookie=%d\n", 173 174 __func__, group, to_tell, mask, inode_mark, 174 175 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, 175 - data_is, cookie, *event); 176 + data_is, cookie); 176 177 177 178 if (!inode_test_mask && !vfsmount_test_mask) 178 179 return 0; 179 180 180 - if (group->ops->should_send_event(group, to_tell, inode_mark, 181 - vfsmount_mark, mask, data, 182 - data_is) == false) 183 - return 0; 184 - 185 - if (!*event) { 186 - *event = fsnotify_create_event(to_tell, mask, data, 187 - data_is, file_name, 188 - cookie, GFP_KERNEL); 189 - if (!*event) 190 - return -ENOMEM; 191 - } 192 - return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); 181 + return group->ops->handle_event(group, to_tell, inode_mark, 182 + vfsmount_mark, mask, data, data_is, 183 + file_name); 193 184 } 194 185 195 186 /* ··· 194 205 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; 195 206 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; 196 207 struct fsnotify_group *inode_group, *vfsmount_group; 197 - struct fsnotify_event *event = NULL; 198 208 struct mount *mnt; 199 209 int idx, ret = 0; 200 210 /* global tests shouldn't care about events on child only the specific event */ ··· 246 258 247 259 if (inode_group > vfsmount_group) { 248 260 /* handle inode */ 249 - ret = send_to_group(to_tell, inode_mark, NULL, mask, data, 250 - data_is, cookie, file_name, &event); 261 + ret = send_to_group(to_tell, 
inode_mark, NULL, mask, 262 + data, data_is, cookie, file_name); 251 263 /* we didn't use the vfsmount_mark */ 252 264 vfsmount_group = NULL; 253 265 } else if (vfsmount_group > inode_group) { 254 - ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, 255 - data_is, cookie, file_name, &event); 266 + ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, 267 + data, data_is, cookie, file_name); 256 268 inode_group = NULL; 257 269 } else { 258 270 ret = send_to_group(to_tell, inode_mark, vfsmount_mark, 259 - mask, data, data_is, cookie, file_name, 260 - &event); 271 + mask, data, data_is, cookie, 272 + file_name); 261 273 } 262 274 263 275 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) ··· 273 285 ret = 0; 274 286 out: 275 287 srcu_read_unlock(&fsnotify_mark_srcu, idx); 276 - /* 277 - * fsnotify_create_event() took a reference so the event can't be cleaned 278 - * up while we are still trying to add it to lists, drop that one. 279 - */ 280 - if (event) 281 - fsnotify_put_event(event); 282 288 283 289 return ret; 284 290 }

+1

fs/notify/group.c

··· 99 99 INIT_LIST_HEAD(&group->marks_list); 100 100 101 101 group->ops = ops; 102 + fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW); 102 103 103 104 return group; 104 105 }

+16 -5

fs/notify/inotify/inotify.h

··· 2 2 #include <linux/inotify.h> 3 3 #include <linux/slab.h> /* struct kmem_cache */ 4 4 5 - extern struct kmem_cache *event_priv_cachep; 6 - 7 - struct inotify_event_private_data { 8 - struct fsnotify_event_private_data fsnotify_event_priv_data; 5 + struct inotify_event_info { 6 + struct fsnotify_event fse; 9 7 int wd; 8 + u32 sync_cookie; 9 + int name_len; 10 + char name[]; 10 11 }; 11 12 12 13 struct inotify_inode_mark { ··· 15 14 int wd; 16 15 }; 17 16 17 + static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) 18 + { 19 + return container_of(fse, struct inotify_event_info, fse); 20 + } 21 + 18 22 extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, 19 23 struct fsnotify_group *group); 20 - extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); 24 + extern int inotify_handle_event(struct fsnotify_group *group, 25 + struct inode *inode, 26 + struct fsnotify_mark *inode_mark, 27 + struct fsnotify_mark *vfsmount_mark, 28 + u32 mask, void *data, int data_type, 29 + const unsigned char *file_name); 21 30 22 31 extern const struct fsnotify_ops inotify_fsnotify_ops;

+56 -93

fs/notify/inotify/inotify_fsnotify.c

··· 34 34 #include "inotify.h" 35 35 36 36 /* 37 - * Check if 2 events contain the same information. We do not compare private data 38 - * but at this moment that isn't a problem for any know fsnotify listeners. 37 + * Check if 2 events contain the same information. 39 38 */ 40 - static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) 39 + static bool event_compare(struct fsnotify_event *old_fsn, 40 + struct fsnotify_event *new_fsn) 41 41 { 42 - if ((old->mask == new->mask) && 43 - (old->to_tell == new->to_tell) && 44 - (old->data_type == new->data_type) && 45 - (old->name_len == new->name_len)) { 46 - switch (old->data_type) { 47 - case (FSNOTIFY_EVENT_INODE): 48 - /* remember, after old was put on the wait_q we aren't 49 - * allowed to look at the inode any more, only thing 50 - * left to check was if the file_name is the same */ 51 - if (!old->name_len || 52 - !strcmp(old->file_name, new->file_name)) 53 - return true; 54 - break; 55 - case (FSNOTIFY_EVENT_PATH): 56 - if ((old->path.mnt == new->path.mnt) && 57 - (old->path.dentry == new->path.dentry)) 58 - return true; 59 - break; 60 - case (FSNOTIFY_EVENT_NONE): 61 - if (old->mask & FS_Q_OVERFLOW) 62 - return true; 63 - else if (old->mask & FS_IN_IGNORED) 64 - return false; 65 - return true; 66 - }; 67 - } 42 + struct inotify_event_info *old, *new; 43 + 44 + if (old_fsn->mask & FS_IN_IGNORED) 45 + return false; 46 + old = INOTIFY_E(old_fsn); 47 + new = INOTIFY_E(new_fsn); 48 + if ((old_fsn->mask == new_fsn->mask) && 49 + (old_fsn->inode == new_fsn->inode) && 50 + (old->name_len == new->name_len) && 51 + (!old->name_len || !strcmp(old->name, new->name))) 52 + return true; 68 53 return false; 69 54 } 70 55 71 56 static struct fsnotify_event *inotify_merge(struct list_head *list, 72 57 struct fsnotify_event *event) 73 58 { 74 - struct fsnotify_event_holder *last_holder; 75 59 struct fsnotify_event *last_event; 76 60 77 - /* and the list better be locked by something too */ 78 - 
spin_lock(&event->lock); 79 - 80 - last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); 81 - last_event = last_holder->event; 82 - if (event_compare(last_event, event)) 83 - fsnotify_get_event(last_event); 84 - else 85 - last_event = NULL; 86 - 87 - spin_unlock(&event->lock); 88 - 61 + last_event = list_entry(list->prev, struct fsnotify_event, list); 62 + if (!event_compare(last_event, event)) 63 + return NULL; 89 64 return last_event; 90 65 } 91 66 92 - static int inotify_handle_event(struct fsnotify_group *group, 93 - struct fsnotify_mark *inode_mark, 94 - struct fsnotify_mark *vfsmount_mark, 95 - struct fsnotify_event *event) 67 + int inotify_handle_event(struct fsnotify_group *group, 68 + struct inode *inode, 69 + struct fsnotify_mark *inode_mark, 70 + struct fsnotify_mark *vfsmount_mark, 71 + u32 mask, void *data, int data_type, 72 + const unsigned char *file_name) 96 73 { 97 74 struct inotify_inode_mark *i_mark; 98 - struct inode *to_tell; 99 - struct inotify_event_private_data *event_priv; 100 - struct fsnotify_event_private_data *fsn_event_priv; 75 + struct inotify_event_info *event; 101 76 struct fsnotify_event *added_event; 102 - int wd, ret = 0; 77 + struct fsnotify_event *fsn_event; 78 + int ret = 0; 79 + int len = 0; 80 + int alloc_len = sizeof(struct inotify_event_info); 103 81 104 82 BUG_ON(vfsmount_mark); 105 83 106 - pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, 107 - event, event->to_tell, event->mask); 84 + if ((inode_mark->mask & FS_EXCL_UNLINK) && 85 + (data_type == FSNOTIFY_EVENT_PATH)) { 86 + struct path *path = data; 108 87 109 - to_tell = event->to_tell; 88 + if (d_unlinked(path->dentry)) 89 + return 0; 90 + } 91 + if (file_name) { 92 + len = strlen(file_name); 93 + alloc_len += len + 1; 94 + } 95 + 96 + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, 97 + mask); 110 98 111 99 i_mark = container_of(inode_mark, struct inotify_inode_mark, 112 100 fsn_mark); 113 - wd = 
i_mark->wd; 114 101 115 - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 116 - if (unlikely(!event_priv)) 102 + event = kmalloc(alloc_len, GFP_KERNEL); 103 + if (unlikely(!event)) 117 104 return -ENOMEM; 118 105 119 - fsn_event_priv = &event_priv->fsnotify_event_priv_data; 106 + fsn_event = &event->fse; 107 + fsnotify_init_event(fsn_event, inode, mask); 108 + event->wd = i_mark->wd; 109 + event->name_len = len; 110 + if (len) 111 + strcpy(event->name, file_name); 120 112 121 - fsnotify_get_group(group); 122 - fsn_event_priv->group = group; 123 - event_priv->wd = wd; 124 - 125 - added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); 113 + added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge); 126 114 if (added_event) { 127 - inotify_free_event_priv(fsn_event_priv); 128 - if (!IS_ERR(added_event)) 129 - fsnotify_put_event(added_event); 130 - else 115 + /* Our event wasn't used in the end. Free it. */ 116 + fsnotify_destroy_event(group, fsn_event); 117 + if (IS_ERR(added_event)) 131 118 ret = PTR_ERR(added_event); 132 119 } 133 120 ··· 127 140 static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) 128 141 { 129 142 inotify_ignored_and_remove_idr(fsn_mark, group); 130 - } 131 - 132 - static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, 133 - struct fsnotify_mark *inode_mark, 134 - struct fsnotify_mark *vfsmount_mark, 135 - __u32 mask, void *data, int data_type) 136 - { 137 - if ((inode_mark->mask & FS_EXCL_UNLINK) && 138 - (data_type == FSNOTIFY_EVENT_PATH)) { 139 - struct path *path = data; 140 - 141 - if (d_unlinked(path->dentry)) 142 - return false; 143 - } 144 - 145 - return true; 146 143 } 147 144 148 145 /* ··· 173 202 free_uid(group->inotify_data.user); 174 203 } 175 204 176 - void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) 205 + static void inotify_free_event(struct fsnotify_event 
*fsn_event) 177 206 { 178 - struct inotify_event_private_data *event_priv; 179 - 180 - 181 - event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, 182 - fsnotify_event_priv_data); 183 - 184 - fsnotify_put_group(fsn_event_priv->group); 185 - kmem_cache_free(event_priv_cachep, event_priv); 207 + kfree(INOTIFY_E(fsn_event)); 186 208 } 187 209 188 210 const struct fsnotify_ops inotify_fsnotify_ops = { 189 211 .handle_event = inotify_handle_event, 190 - .should_send_event = inotify_should_send_event, 191 212 .free_group_priv = inotify_free_group_priv, 192 - .free_event_priv = inotify_free_event_priv, 213 + .free_event = inotify_free_event, 193 214 .freeing_mark = inotify_freeing_mark, 194 215 };

+38 -81

fs/notify/inotify/inotify_user.c

··· 50 50 static int inotify_max_user_watches __read_mostly; 51 51 52 52 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 53 - struct kmem_cache *event_priv_cachep __read_mostly; 54 53 55 54 #ifdef CONFIG_SYSCTL 56 55 ··· 123 124 return ret; 124 125 } 125 126 127 + static int round_event_name_len(struct fsnotify_event *fsn_event) 128 + { 129 + struct inotify_event_info *event; 130 + 131 + event = INOTIFY_E(fsn_event); 132 + if (!event->name_len) 133 + return 0; 134 + return roundup(event->name_len + 1, sizeof(struct inotify_event)); 135 + } 136 + 126 137 /* 127 138 * Get an inotify_kernel_event if one exists and is small 128 139 * enough to fit in "count". Return an error pointer if ··· 153 144 154 145 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 155 146 156 - if (event->name_len) 157 - event_size += roundup(event->name_len + 1, event_size); 158 - 147 + event_size += round_event_name_len(event); 159 148 if (event_size > count) 160 149 return ERR_PTR(-EINVAL); 161 150 ··· 171 164 * buffer we had in "get_one_event()" above. 
172 165 */ 173 166 static ssize_t copy_event_to_user(struct fsnotify_group *group, 174 - struct fsnotify_event *event, 167 + struct fsnotify_event *fsn_event, 175 168 char __user *buf) 176 169 { 177 170 struct inotify_event inotify_event; 178 - struct fsnotify_event_private_data *fsn_priv; 179 - struct inotify_event_private_data *priv; 171 + struct inotify_event_info *event; 180 172 size_t event_size = sizeof(struct inotify_event); 181 - size_t name_len = 0; 173 + size_t name_len; 174 + size_t pad_name_len; 182 175 183 - pr_debug("%s: group=%p event=%p\n", __func__, group, event); 176 + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); 184 177 185 - /* we get the inotify watch descriptor from the event private data */ 186 - spin_lock(&event->lock); 187 - fsn_priv = fsnotify_remove_priv_from_event(group, event); 188 - spin_unlock(&event->lock); 189 - 190 - if (!fsn_priv) 191 - inotify_event.wd = -1; 192 - else { 193 - priv = container_of(fsn_priv, struct inotify_event_private_data, 194 - fsnotify_event_priv_data); 195 - inotify_event.wd = priv->wd; 196 - inotify_free_event_priv(fsn_priv); 197 - } 198 - 178 + event = INOTIFY_E(fsn_event); 179 + name_len = event->name_len; 199 180 /* 200 - * round up event->name_len so it is a multiple of event_size 181 + * round up name length so it is a multiple of event_size 201 182 * plus an extra byte for the terminating '\0'. 
202 183 */ 203 - if (event->name_len) 204 - name_len = roundup(event->name_len + 1, event_size); 205 - inotify_event.len = name_len; 206 - 207 - inotify_event.mask = inotify_mask_to_arg(event->mask); 184 + pad_name_len = round_event_name_len(fsn_event); 185 + inotify_event.len = pad_name_len; 186 + inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); 187 + inotify_event.wd = event->wd; 208 188 inotify_event.cookie = event->sync_cookie; 209 189 210 190 /* send the main event */ ··· 203 209 /* 204 210 * fsnotify only stores the pathname, so here we have to send the pathname 205 211 * and then pad that pathname out to a multiple of sizeof(inotify_event) 206 - * with zeros. I get my zeros from the nul_inotify_event. 212 + * with zeros. 207 213 */ 208 - if (name_len) { 209 - unsigned int len_to_zero = name_len - event->name_len; 214 + if (pad_name_len) { 210 215 /* copy the path name */ 211 - if (copy_to_user(buf, event->file_name, event->name_len)) 216 + if (copy_to_user(buf, event->name, name_len)) 212 217 return -EFAULT; 213 - buf += event->name_len; 218 + buf += name_len; 214 219 215 220 /* fill userspace with 0's */ 216 - if (clear_user(buf, len_to_zero)) 221 + if (clear_user(buf, pad_name_len - name_len)) 217 222 return -EFAULT; 218 - buf += len_to_zero; 219 - event_size += name_len; 223 + event_size += pad_name_len; 220 224 } 221 225 222 226 return event_size; ··· 246 254 if (IS_ERR(kevent)) 247 255 break; 248 256 ret = copy_event_to_user(group, kevent, buf); 249 - fsnotify_put_event(kevent); 257 + fsnotify_destroy_event(group, kevent); 250 258 if (ret < 0) 251 259 break; 252 260 buf += ret; ··· 289 297 unsigned long arg) 290 298 { 291 299 struct fsnotify_group *group; 292 - struct fsnotify_event_holder *holder; 293 - struct fsnotify_event *event; 300 + struct fsnotify_event *fsn_event; 294 301 void __user *p; 295 302 int ret = -ENOTTY; 296 303 size_t send_len = 0; ··· 302 311 switch (cmd) { 303 312 case FIONREAD: 304 313 
mutex_lock(&group->notification_mutex); 305 - list_for_each_entry(holder, &group->notification_list, event_list) { 306 - event = holder->event; 314 + list_for_each_entry(fsn_event, &group->notification_list, 315 + list) { 307 316 send_len += sizeof(struct inotify_event); 308 - if (event->name_len) 309 - send_len += roundup(event->name_len + 1, 310 - sizeof(struct inotify_event)); 317 + send_len += round_event_name_len(fsn_event); 311 318 } 312 319 mutex_unlock(&group->notification_mutex); 313 320 ret = put_user(send_len, (int __user *) p); ··· 492 503 struct fsnotify_group *group) 493 504 { 494 505 struct inotify_inode_mark *i_mark; 495 - struct fsnotify_event *ignored_event, *notify_event; 496 - struct inotify_event_private_data *event_priv; 497 - struct fsnotify_event_private_data *fsn_event_priv; 498 - int ret; 506 + 507 + /* Queue ignore event for the watch */ 508 + inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, 509 + NULL, FSNOTIFY_EVENT_NONE, NULL); 499 510 500 511 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); 501 - 502 - ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, 503 - FSNOTIFY_EVENT_NONE, NULL, 0, 504 - GFP_NOFS); 505 - if (!ignored_event) 506 - goto skip_send_ignore; 507 - 508 - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); 509 - if (unlikely(!event_priv)) 510 - goto skip_send_ignore; 511 - 512 - fsn_event_priv = &event_priv->fsnotify_event_priv_data; 513 - 514 - fsnotify_get_group(group); 515 - fsn_event_priv->group = group; 516 - event_priv->wd = i_mark->wd; 517 - 518 - notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); 519 - if (notify_event) { 520 - if (IS_ERR(notify_event)) 521 - ret = PTR_ERR(notify_event); 522 - else 523 - fsnotify_put_event(notify_event); 524 - inotify_free_event_priv(fsn_event_priv); 525 - } 526 - 527 - skip_send_ignore: 528 - /* matches the reference taken when the event was created */ 529 - if (ignored_event) 530 - 
fsnotify_put_event(ignored_event); 531 - 532 512 /* remove this mark from the idr */ 533 513 inotify_remove_from_idr(group, i_mark); 534 514 ··· 794 836 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); 795 837 796 838 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); 797 - event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 798 839 799 840 inotify_max_queued_events = 16384; 800 841 inotify_max_user_instances = 128;

+28 -306

fs/notify/notification.c

··· 48 48 #include <linux/fsnotify_backend.h> 49 49 #include "fsnotify.h" 50 50 51 - static struct kmem_cache *fsnotify_event_cachep; 52 - static struct kmem_cache *fsnotify_event_holder_cachep; 53 - /* 54 - * This is a magic event we send when the q is too full. Since it doesn't 55 - * hold real event information we just keep one system wide and use it any time 56 - * it is needed. It's refcnt is set 1 at kernel init time and will never 57 - * get set to 0 so it will never get 'freed' 58 - */ 59 - static struct fsnotify_event *q_overflow_event; 60 51 static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); 61 52 62 53 /** ··· 67 76 return list_empty(&group->notification_list) ? true : false; 68 77 } 69 78 70 - void fsnotify_get_event(struct fsnotify_event *event) 79 + void fsnotify_destroy_event(struct fsnotify_group *group, 80 + struct fsnotify_event *event) 71 81 { 72 - atomic_inc(&event->refcnt); 73 - } 74 - 75 - void fsnotify_put_event(struct fsnotify_event *event) 76 - { 77 - if (!event) 82 + /* Overflow events are per-group and we don't want to free them */ 83 + if (!event || event->mask == FS_Q_OVERFLOW) 78 84 return; 79 85 80 - if (atomic_dec_and_test(&event->refcnt)) { 81 - pr_debug("%s: event=%p\n", __func__, event); 82 - 83 - if (event->data_type == FSNOTIFY_EVENT_PATH) 84 - path_put(&event->path); 85 - 86 - BUG_ON(!list_empty(&event->private_data_list)); 87 - 88 - kfree(event->file_name); 89 - put_pid(event->tgid); 90 - kmem_cache_free(fsnotify_event_cachep, event); 91 - } 92 - } 93 - 94 - struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) 95 - { 96 - return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); 97 - } 98 - 99 - void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) 100 - { 101 - if (holder) 102 - kmem_cache_free(fsnotify_event_holder_cachep, holder); 103 - } 104 - 105 - /* 106 - * Find the private data that the group previously attached to this event when 107 - * the group added the event to the 
notification queue (fsnotify_add_notify_event) 108 - */ 109 - struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) 110 - { 111 - struct fsnotify_event_private_data *lpriv; 112 - struct fsnotify_event_private_data *priv = NULL; 113 - 114 - assert_spin_locked(&event->lock); 115 - 116 - list_for_each_entry(lpriv, &event->private_data_list, event_list) { 117 - if (lpriv->group == group) { 118 - priv = lpriv; 119 - list_del(&priv->event_list); 120 - break; 121 - } 122 - } 123 - return priv; 86 + group->ops->free_event(event); 124 87 } 125 88 126 89 /* ··· 82 137 * event off the queue to deal with. If the event is successfully added to the 83 138 * group's notification queue, a reference is taken on event. 84 139 */ 85 - struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 86 - struct fsnotify_event_private_data *priv, 140 + struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, 141 + struct fsnotify_event *event, 87 142 struct fsnotify_event *(*merge)(struct list_head *, 88 143 struct fsnotify_event *)) 89 144 { 90 145 struct fsnotify_event *return_event = NULL; 91 - struct fsnotify_event_holder *holder = NULL; 92 146 struct list_head *list = &group->notification_list; 93 147 94 - pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); 95 - 96 - /* 97 - * There is one fsnotify_event_holder embedded inside each fsnotify_event. 98 - * Check if we expect to be able to use that holder. If not alloc a new 99 - * holder. 100 - * For the overflow event it's possible that something will use the in 101 - * event holder before we get the lock so we may need to jump back and 102 - * alloc a new holder, this can't happen for most events... 
103 - */ 104 - if (!list_empty(&event->holder.event_list)) { 105 - alloc_holder: 106 - holder = fsnotify_alloc_event_holder(); 107 - if (!holder) 108 - return ERR_PTR(-ENOMEM); 109 - } 148 + pr_debug("%s: group=%p event=%p\n", __func__, group, event); 110 149 111 150 mutex_lock(&group->notification_mutex); 112 151 113 152 if (group->q_len >= group->max_events) { 114 - event = q_overflow_event; 115 - 116 - /* 117 - * we need to return the overflow event 118 - * which means we need a ref 119 - */ 120 - fsnotify_get_event(event); 153 + /* Queue overflow event only if it isn't already queued */ 154 + if (list_empty(&group->overflow_event.list)) 155 + event = &group->overflow_event; 121 156 return_event = event; 122 - 123 - /* sorry, no private data on the overflow event */ 124 - priv = NULL; 125 157 } 126 158 127 159 if (!list_empty(list) && merge) { 128 - struct fsnotify_event *tmp; 129 - 130 - tmp = merge(list, event); 131 - if (tmp) { 132 - mutex_unlock(&group->notification_mutex); 133 - 134 - if (return_event) 135 - fsnotify_put_event(return_event); 136 - if (holder != &event->holder) 137 - fsnotify_destroy_event_holder(holder); 138 - return tmp; 139 - } 140 - } 141 - 142 - spin_lock(&event->lock); 143 - 144 - if (list_empty(&event->holder.event_list)) { 145 - if (unlikely(holder)) 146 - fsnotify_destroy_event_holder(holder); 147 - holder = &event->holder; 148 - } else if (unlikely(!holder)) { 149 - /* between the time we checked above and got the lock the in 150 - * event holder was used, go back and get a new one */ 151 - spin_unlock(&event->lock); 152 - mutex_unlock(&group->notification_mutex); 153 - 160 + return_event = merge(list, event); 154 161 if (return_event) { 155 - fsnotify_put_event(return_event); 156 - return_event = NULL; 162 + mutex_unlock(&group->notification_mutex); 163 + return return_event; 157 164 } 158 - 159 - goto alloc_holder; 160 165 } 161 166 162 167 group->q_len++; 163 - holder->event = event; 164 - 165 - fsnotify_get_event(event); 166 - 
list_add_tail(&holder->event_list, list); 167 - if (priv) 168 - list_add_tail(&priv->event_list, &event->private_data_list); 169 - spin_unlock(&event->lock); 168 + list_add_tail(&event->list, list); 170 169 mutex_unlock(&group->notification_mutex); 171 170 172 171 wake_up(&group->notification_waitq); ··· 119 230 } 120 231 121 232 /* 122 - * Remove and return the first event from the notification list. There is a 123 - * reference held on this event since it was on the list. It is the responsibility 124 - * of the caller to drop this reference. 233 + * Remove and return the first event from the notification list. It is the 234 + * responsibility of the caller to destroy the obtained event 125 235 */ 126 236 struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) 127 237 { 128 238 struct fsnotify_event *event; 129 - struct fsnotify_event_holder *holder; 130 239 131 240 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 132 241 133 242 pr_debug("%s: group=%p\n", __func__, group); 134 243 135 - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 136 - 137 - event = holder->event; 138 - 139 - spin_lock(&event->lock); 140 - holder->event = NULL; 141 - list_del_init(&holder->event_list); 142 - spin_unlock(&event->lock); 143 - 144 - /* event == holder means we are referenced through the in event holder */ 145 - if (holder != &event->holder) 146 - fsnotify_destroy_event_holder(holder); 147 - 244 + event = list_first_entry(&group->notification_list, 245 + struct fsnotify_event, list); 246 + list_del(&event->list); 148 247 group->q_len--; 149 248 150 249 return event; ··· 143 266 */ 144 267 struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) 145 268 { 146 - struct fsnotify_event *event; 147 - struct fsnotify_event_holder *holder; 148 - 149 269 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 150 270 151 - holder = list_first_entry(&group->notification_list, struct 
fsnotify_event_holder, event_list); 152 - event = holder->event; 153 - 154 - return event; 271 + return list_first_entry(&group->notification_list, 272 + struct fsnotify_event, list); 155 273 } 156 274 157 275 /* ··· 156 284 void fsnotify_flush_notify(struct fsnotify_group *group) 157 285 { 158 286 struct fsnotify_event *event; 159 - struct fsnotify_event_private_data *priv; 160 287 161 288 mutex_lock(&group->notification_mutex); 162 289 while (!fsnotify_notify_queue_is_empty(group)) { 163 290 event = fsnotify_remove_notify_event(group); 164 - /* if they don't implement free_event_priv they better not have attached any */ 165 - if (group->ops->free_event_priv) { 166 - spin_lock(&event->lock); 167 - priv = fsnotify_remove_priv_from_event(group, event); 168 - spin_unlock(&event->lock); 169 - if (priv) 170 - group->ops->free_event_priv(priv); 171 - } 172 - fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ 291 + fsnotify_destroy_event(group, event); 173 292 } 174 293 mutex_unlock(&group->notification_mutex); 175 - } 176 - 177 - static void initialize_event(struct fsnotify_event *event) 178 - { 179 - INIT_LIST_HEAD(&event->holder.event_list); 180 - atomic_set(&event->refcnt, 1); 181 - 182 - spin_lock_init(&event->lock); 183 - 184 - INIT_LIST_HEAD(&event->private_data_list); 185 - } 186 - 187 - /* 188 - * Caller damn well better be holding whatever mutex is protecting the 189 - * old_holder->event_list and the new_event must be a clean event which 190 - * cannot be found anywhere else in the kernel. 
191 - */ 192 - int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, 193 - struct fsnotify_event *new_event) 194 - { 195 - struct fsnotify_event *old_event = old_holder->event; 196 - struct fsnotify_event_holder *new_holder = &new_event->holder; 197 - 198 - enum event_spinlock_class { 199 - SPINLOCK_OLD, 200 - SPINLOCK_NEW, 201 - }; 202 - 203 - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); 204 - 205 - /* 206 - * if the new_event's embedded holder is in use someone 207 - * screwed up and didn't give us a clean new event. 208 - */ 209 - BUG_ON(!list_empty(&new_holder->event_list)); 210 - 211 - spin_lock_nested(&old_event->lock, SPINLOCK_OLD); 212 - spin_lock_nested(&new_event->lock, SPINLOCK_NEW); 213 - 214 - new_holder->event = new_event; 215 - list_replace_init(&old_holder->event_list, &new_holder->event_list); 216 - 217 - spin_unlock(&new_event->lock); 218 - spin_unlock(&old_event->lock); 219 - 220 - /* event == holder means we are referenced through the in event holder */ 221 - if (old_holder != &old_event->holder) 222 - fsnotify_destroy_event_holder(old_holder); 223 - 224 - fsnotify_get_event(new_event); /* on the list take reference */ 225 - fsnotify_put_event(old_event); /* off the list, drop reference */ 226 - 227 - return 0; 228 - } 229 - 230 - struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) 231 - { 232 - struct fsnotify_event *event; 233 - 234 - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); 235 - if (!event) 236 - return NULL; 237 - 238 - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); 239 - 240 - memcpy(event, old_event, sizeof(*event)); 241 - initialize_event(event); 242 - 243 - if (event->name_len) { 244 - event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); 245 - if (!event->file_name) { 246 - kmem_cache_free(fsnotify_event_cachep, event); 247 - return NULL; 248 - } 249 - } 250 - event->tgid = get_pid(old_event->tgid); 251 - if 
(event->data_type == FSNOTIFY_EVENT_PATH) 252 - path_get(&event->path); 253 - 254 - return event; 255 294 } 256 295 257 296 /* ··· 170 387 * group's handle_event function if the group was interested in this 171 388 * particular event. 172 389 * 173 - * @to_tell the inode which is supposed to receive the event (sometimes a 390 + * @inode the inode which is supposed to receive the event (sometimes a 174 391 * parent of the inode to which the event happened. 175 392 * @mask what actually happened. 176 393 * @data pointer to the object which was actually affected 177 394 * @data_type flag indication if the data is a file, path, inode, nothing... 178 395 * @name the filename, if available 179 396 */ 180 - struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 181 - int data_type, const unsigned char *name, 182 - u32 cookie, gfp_t gfp) 397 + void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, 398 + u32 mask) 183 399 { 184 - struct fsnotify_event *event; 185 - 186 - event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); 187 - if (!event) 188 - return NULL; 189 - 190 - pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", 191 - __func__, event, to_tell, mask, data, data_type); 192 - 193 - initialize_event(event); 194 - 195 - if (name) { 196 - event->file_name = kstrdup(name, gfp); 197 - if (!event->file_name) { 198 - kmem_cache_free(fsnotify_event_cachep, event); 199 - return NULL; 200 - } 201 - event->name_len = strlen(event->file_name); 202 - } 203 - 204 - event->tgid = get_pid(task_tgid(current)); 205 - event->sync_cookie = cookie; 206 - event->to_tell = to_tell; 207 - event->data_type = data_type; 208 - 209 - switch (data_type) { 210 - case FSNOTIFY_EVENT_PATH: { 211 - struct path *path = data; 212 - event->path.dentry = path->dentry; 213 - event->path.mnt = path->mnt; 214 - path_get(&event->path); 215 - break; 216 - } 217 - case FSNOTIFY_EVENT_INODE: 218 - event->inode = data; 219 - break; 220 - 
case FSNOTIFY_EVENT_NONE: 221 - event->inode = NULL; 222 - event->path.dentry = NULL; 223 - event->path.mnt = NULL; 224 - break; 225 - default: 226 - BUG(); 227 - } 228 - 400 + INIT_LIST_HEAD(&event->list); 401 + event->inode = inode; 229 402 event->mask = mask; 230 - 231 - return event; 232 403 } 233 - 234 - static __init int fsnotify_notification_init(void) 235 - { 236 - fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); 237 - fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); 238 - 239 - q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, 240 - FSNOTIFY_EVENT_NONE, NULL, 0, 241 - GFP_KERNEL); 242 - if (!q_overflow_event) 243 - panic("unable to allocate fsnotify q_overflow_event\n"); 244 - 245 - return 0; 246 - } 247 - subsys_initcall(fsnotify_notification_init);

-1

fs/ocfs2/Makefile

··· 38 38 symlink.o \ 39 39 sysfile.o \ 40 40 uptodate.o \ 41 - ver.o \ 42 41 quota_local.o \ 43 42 quota_global.o \ 44 43 xattr.o \

+3 -7

fs/ocfs2/alloc.c

··· 7260 7260 start = range->start >> osb->s_clustersize_bits; 7261 7261 len = range->len >> osb->s_clustersize_bits; 7262 7262 minlen = range->minlen >> osb->s_clustersize_bits; 7263 - trimmed = 0; 7264 7263 7265 - if (!len) { 7266 - range->len = 0; 7267 - return 0; 7268 - } 7269 - 7270 - if (minlen >= osb->bitmap_cpg) 7264 + if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) 7271 7265 return -EINVAL; 7272 7266 7273 7267 main_bm_inode = ocfs2_get_system_file_inode(osb, ··· 7287 7293 goto out_unlock; 7288 7294 } 7289 7295 7296 + len = range->len >> osb->s_clustersize_bits; 7290 7297 if (start + len > le32_to_cpu(main_bm->i_clusters)) 7291 7298 len = le32_to_cpu(main_bm->i_clusters) - start; 7292 7299 ··· 7302 7307 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); 7303 7308 last_bit = osb->bitmap_cpg; 7304 7309 7310 + trimmed = 0; 7305 7311 for (group = first_group; group <= last_group;) { 7306 7312 if (first_bit + len >= osb->bitmap_cpg) 7307 7313 last_bit = osb->bitmap_cpg;

+1 -1

fs/ocfs2/cluster/Makefile

··· 1 1 obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o 2 2 3 3 ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ 4 - quorum.o tcp.o netdebug.o ver.o 4 + quorum.o tcp.o netdebug.o

+1 -3

fs/ocfs2/cluster/nodemanager.c

··· 29 29 #include "heartbeat.h" 30 30 #include "masklog.h" 31 31 #include "sys.h" 32 - #include "ver.h" 33 32 34 33 /* for now we operate under the assertion that there can be only one 35 34 * cluster active at a time. Changing this will require trickling ··· 944 945 { 945 946 int ret = -1; 946 947 947 - cluster_print_version(); 948 - 949 948 ret = o2hb_init(); 950 949 if (ret) 951 950 goto out; ··· 981 984 982 985 MODULE_AUTHOR("Oracle"); 983 986 MODULE_LICENSE("GPL"); 987 + MODULE_DESCRIPTION("OCFS2 cluster management"); 984 988 985 989 module_init(init_o2nm) 986 990 module_exit(exit_o2nm)

-42

fs/ocfs2/cluster/ver.c

··· 1 - /* -*- mode: c; c-basic-offset: 8; -*- 2 - * vim: noexpandtab sw=8 ts=8 sts=0: 3 - * 4 - * ver.c 5 - * 6 - * version string 7 - * 8 - * Copyright (C) 2002, 2005 Oracle. All rights reserved. 9 - * 10 - * This program is free software; you can redistribute it and/or 11 - * modify it under the terms of the GNU General Public 12 - * License as published by the Free Software Foundation; either 13 - * version 2 of the License, or (at your option) any later version. 14 - * 15 - * This program is distributed in the hope that it will be useful, 16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 - * General Public License for more details. 19 - * 20 - * You should have received a copy of the GNU General Public 21 - * License along with this program; if not, write to the 22 - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 - * Boston, MA 021110-1307, USA. 24 - */ 25 - 26 - #include <linux/module.h> 27 - #include <linux/kernel.h> 28 - 29 - #include "ver.h" 30 - 31 - #define CLUSTER_BUILD_VERSION "1.5.0" 32 - 33 - #define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION 34 - 35 - void cluster_print_version(void) 36 - { 37 - printk(KERN_INFO "%s\n", VERSION_STR); 38 - } 39 - 40 - MODULE_DESCRIPTION(VERSION_STR); 41 - 42 - MODULE_VERSION(CLUSTER_BUILD_VERSION);

-31

fs/ocfs2/cluster/ver.h

··· 1 - /* -*- mode: c; c-basic-offset: 8; -*- 2 - * vim: noexpandtab sw=8 ts=8 sts=0: 3 - * 4 - * ver.h 5 - * 6 - * Function prototypes 7 - * 8 - * Copyright (C) 2005 Oracle. All rights reserved. 9 - * 10 - * This program is free software; you can redistribute it and/or 11 - * modify it under the terms of the GNU General Public 12 - * License as published by the Free Software Foundation; either 13 - * version 2 of the License, or (at your option) any later version. 14 - * 15 - * This program is distributed in the hope that it will be useful, 16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 - * General Public License for more details. 19 - * 20 - * You should have received a copy of the GNU General Public 21 - * License along with this program; if not, write to the 22 - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 - * Boston, MA 021110-1307, USA. 24 - */ 25 - 26 - #ifndef O2CLUSTER_VER_H 27 - #define O2CLUSTER_VER_H 28 - 29 - void cluster_print_version(void); 30 - 31 - #endif /* O2CLUSTER_VER_H */

+1 -1

fs/ocfs2/dlm/Makefile

··· 3 3 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o 4 4 5 5 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 6 - dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o 7 7

+1 -4

fs/ocfs2/dlm/dlmdomain.c

··· 43 43 #include "dlmdomain.h" 44 44 #include "dlmdebug.h" 45 45 46 - #include "dlmver.h" 47 - 48 46 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) 49 47 #include "cluster/masklog.h" 50 48 ··· 2326 2328 { 2327 2329 int status; 2328 2330 2329 - dlm_print_version(); 2330 - 2331 2331 status = dlm_init_mle_cache(); 2332 2332 if (status) { 2333 2333 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); ··· 2375 2379 2376 2380 MODULE_AUTHOR("Oracle"); 2377 2381 MODULE_LICENSE("GPL"); 2382 + MODULE_DESCRIPTION("OCFS2 Distributed Lock Management"); 2378 2383 2379 2384 module_init(dlm_init); 2380 2385 module_exit(dlm_exit);

-42

fs/ocfs2/dlm/dlmver.c

··· 1 - /* -*- mode: c; c-basic-offset: 8; -*- 2 - * vim: noexpandtab sw=8 ts=8 sts=0: 3 - * 4 - * dlmver.c 5 - * 6 - * version string 7 - * 8 - * Copyright (C) 2002, 2005 Oracle. All rights reserved. 9 - * 10 - * This program is free software; you can redistribute it and/or 11 - * modify it under the terms of the GNU General Public 12 - * License as published by the Free Software Foundation; either 13 - * version 2 of the License, or (at your option) any later version. 14 - * 15 - * This program is distributed in the hope that it will be useful, 16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 - * General Public License for more details. 19 - * 20 - * You should have received a copy of the GNU General Public 21 - * License along with this program; if not, write to the 22 - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 - * Boston, MA 021110-1307, USA. 24 - */ 25 - 26 - #include <linux/module.h> 27 - #include <linux/kernel.h> 28 - 29 - #include "dlmver.h" 30 - 31 - #define DLM_BUILD_VERSION "1.5.0" 32 - 33 - #define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION 34 - 35 - void dlm_print_version(void) 36 - { 37 - printk(KERN_INFO "%s\n", VERSION_STR); 38 - } 39 - 40 - MODULE_DESCRIPTION(VERSION_STR); 41 - 42 - MODULE_VERSION(DLM_BUILD_VERSION);

-31

fs/ocfs2/dlm/dlmver.h

··· 1 - /* -*- mode: c; c-basic-offset: 8; -*- 2 - * vim: noexpandtab sw=8 ts=8 sts=0: 3 - * 4 - * dlmfsver.h 5 - * 6 - * Function prototypes 7 - * 8 - * Copyright (C) 2005 Oracle. All rights reserved. 9 - * 10 - * This program is free software; you can redistribute it and/or 11 - * modify it under the terms of the GNU General Public 12 - * License as published by the Free Software Foundation; either 13 - * version 2 of the License, or (at your option) any later version. 14 - * 15 - * This program is distributed in the hope that it will be useful, 16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 - * General Public License for more details. 19 - * 20 - * You should have received a copy of the GNU General Public 21 - * License along with this program; if not, write to the 22 - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 - * Boston, MA 021110-1307, USA. 24 - */ 25 - 26 - #ifndef DLM_VER_H 27 - #define DLM_VER_H 28 - 29 - void dlm_print_version(void); 30 - 31 - #endif /* DLM_VER_H */

+1 -1

fs/ocfs2/dlmfs/Makefile

··· 2 2 3 3 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o 4 4 5 - ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o 5 + ocfs2_dlmfs-objs := userdlm.o dlmfs.o

+1 -3

fs/ocfs2/dlmfs/dlmfs.c

··· 49 49 50 50 #include "stackglue.h" 51 51 #include "userdlm.h" 52 - #include "dlmfsver.h" 53 52 54 53 #define MLOG_MASK_PREFIX ML_DLMFS 55 54 #include "cluster/masklog.h" ··· 643 644 int status; 644 645 int cleanup_inode = 0, cleanup_worker = 0; 645 646 646 - dlmfs_print_version(); 647 - 648 647 status = bdi_init(&dlmfs_backing_dev_info); 649 648 if (status) 650 649 return status; ··· 698 701 699 702 MODULE_AUTHOR("Oracle"); 700 703 MODULE_LICENSE("GPL"); 704 + MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); 701 705 702 706 module_init(init_dlmfs_fs) 703 707 module_exit(exit_dlmfs_fs)

-42

fs/ocfs2/dlmfs/dlmfsver.c

··· 1 - /* -*- mode: c; c-basic-offset: 8; -*- 2 - * vim: noexpandtab sw=8 ts=8 sts=0: 3 - * 4 - * dlmfsver.c 5 - * 6 - * version string 7 - * 8 - * Copyright (C) 2002, 2005 Oracle. All rights reserved. 9 - * 10 - * This program is free software; you can redistribute it and/or 11 - * modify it under the terms of the GNU General Public 12 - * License as published by the Free Software Foundation; either 13 - * version 2 of the License, or (at your option) any later version. 14 - * 15 - * This program is distributed in the hope that it will be useful, 16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 - * General Public License for more details. 19 - * 20 - * You should have received a copy of the GNU General Public 21 - * License along with this program; if not, write to the 22 - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 - * Boston, MA 021110-1307, USA. 24 - */ 25 - 26 - #include <linux/module.h> 27 - #include <linux/kernel.h> 28 - 29 - #include "dlmfsver.h" 30 - 31 - #define DLM_BUILD_VERSION "1.5.0" 32 - 33 - #define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION 34 - 35 - void dlmfs_print_version(void) 36 - { 37 - printk(KERN_INFO "%s\n", VERSION_STR); 38 - } 39 - 40 - MODULE_DESCRIPTION(VERSION_STR); 41 - 42 - MODULE_VERSION(DLM_BUILD_VERSION);

-31

fs/ocfs2/dlmfs/dlmfsver.h

··· 1 - /* -*- mode: c; c-basic-offset: 8; -*- 2 - * vim: noexpandtab sw=8 ts=8 sts=0: 3 - * 4 - * dlmver.h 5 - * 6 - * Function prototypes 7 - * 8 - * Copyright (C) 2005 Oracle. All rights reserved. 9 - * 10 - * This program is free software; you can redistribute it and/or 11 - * modify it under the terms of the GNU General Public 12 - * License as published by the Free Software Foundation; either 13 - * version 2 of the License, or (at your option) any later version. 14 - * 15 - * This program is distributed in the hope that it will be useful, 16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 - * General Public License for more details. 19 - * 20 - * You should have received a copy of the GNU General Public 21 - * License along with this program; if not, write to the 22 - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 - * Boston, MA 021110-1307, USA. 24 - */ 25 - 26 - #ifndef DLMFS_VER_H 27 - #define DLMFS_VER_H 28 - 29 - void dlmfs_print_version(void); 30 - 31 - #endif /* DLMFS_VER_H */

+3 -1

fs/ocfs2/dlmglue.c

··· 2996 2996 2997 2997 /* for now, uuid == domain */ 2998 2998 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 2999 + osb->osb_cluster_name, 3000 + strlen(osb->osb_cluster_name), 2999 3001 osb->uuid_str, 3000 3002 strlen(osb->uuid_str), 3001 3003 &lproto, ocfs2_do_node_down, osb, ··· 3007 3005 goto bail; 3008 3006 } 3009 3007 3010 - status = ocfs2_cluster_this_node(&osb->node_num); 3008 + status = ocfs2_cluster_this_node(conn, &osb->node_num); 3011 3009 if (status < 0) { 3012 3010 mlog_errno(status); 3013 3011 mlog(ML_ERROR,

+2 -1

fs/ocfs2/file.c

··· 1869 1869 } 1870 1870 size = sr->l_start + sr->l_len; 1871 1871 1872 - if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1872 + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || 1873 + cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) { 1873 1874 if (sr->l_len <= 0) { 1874 1875 ret = -EINVAL; 1875 1876 goto out_inode_unlock;

+7

fs/ocfs2/ioctl.c

··· 7 7 8 8 #include <linux/fs.h> 9 9 #include <linux/mount.h> 10 + #include <linux/blkdev.h> 10 11 #include <linux/compat.h> 11 12 12 13 #include <cluster/masklog.h> ··· 967 966 case FITRIM: 968 967 { 969 968 struct super_block *sb = inode->i_sb; 969 + struct request_queue *q = bdev_get_queue(sb->s_bdev); 970 970 struct fstrim_range range; 971 971 int ret = 0; 972 972 973 973 if (!capable(CAP_SYS_ADMIN)) 974 974 return -EPERM; 975 975 976 + if (!blk_queue_discard(q)) 977 + return -EOPNOTSUPP; 978 + 976 979 if (copy_from_user(&range, argp, sizeof(range))) 977 980 return -EFAULT; 978 981 982 + range.minlen = max_t(u64, q->limits.discard_granularity, 983 + range.minlen); 979 984 ret = ocfs2_trim_fs(sb, &range); 980 985 if (ret < 0) 981 986 return ret;

-77

fs/ocfs2/move_extents.c

··· 561 561 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); 562 562 } 563 563 564 - static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 565 - handle_t *handle, 566 - struct buffer_head *di_bh, 567 - u32 num_bits, 568 - u16 chain) 569 - { 570 - int ret; 571 - u32 tmp_used; 572 - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 573 - struct ocfs2_chain_list *cl = 574 - (struct ocfs2_chain_list *) &di->id2.i_chain; 575 - 576 - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 577 - OCFS2_JOURNAL_ACCESS_WRITE); 578 - if (ret < 0) { 579 - mlog_errno(ret); 580 - goto out; 581 - } 582 - 583 - tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 584 - di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 585 - le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 586 - ocfs2_journal_dirty(handle, di_bh); 587 - 588 - out: 589 - return ret; 590 - } 591 - 592 - static inline int ocfs2_block_group_set_bits(handle_t *handle, 593 - struct inode *alloc_inode, 594 - struct ocfs2_group_desc *bg, 595 - struct buffer_head *group_bh, 596 - unsigned int bit_off, 597 - unsigned int num_bits) 598 - { 599 - int status; 600 - void *bitmap = bg->bg_bitmap; 601 - int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 602 - 603 - /* All callers get the descriptor via 604 - * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ 605 - BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 606 - BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 607 - 608 - mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 609 - num_bits); 610 - 611 - if (ocfs2_is_cluster_bitmap(alloc_inode)) 612 - journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 613 - 614 - status = ocfs2_journal_access_gd(handle, 615 - INODE_CACHE(alloc_inode), 616 - group_bh, 617 - journal_type); 618 - if (status < 0) { 619 - mlog_errno(status); 620 - goto bail; 621 - } 622 - 623 - le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 624 - if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 625 - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" 626 - " count %u but claims %u are freed. num_bits %d", 627 - (unsigned long long)le64_to_cpu(bg->bg_blkno), 628 - le16_to_cpu(bg->bg_bits), 629 - le16_to_cpu(bg->bg_free_bits_count), num_bits); 630 - return -EROFS; 631 - } 632 - while (num_bits--) 633 - ocfs2_set_bit(bit_off++, bitmap); 634 - 635 - ocfs2_journal_dirty(handle, group_bh); 636 - 637 - bail: 638 - return status; 639 - } 640 - 641 564 static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, 642 565 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, 643 566 u32 len, int ext_flags)

+1

fs/ocfs2/ocfs2.h

··· 387 387 u8 osb_stackflags; 388 388 389 389 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 390 + char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; 390 391 struct ocfs2_cluster_connection *cconn; 391 392 struct ocfs2_lock_res osb_super_lockres; 392 393 struct ocfs2_lock_res osb_rename_lockres;

+2 -1

fs/ocfs2/stack_o2cb.c

··· 398 398 return 0; 399 399 } 400 400 401 - static int o2cb_cluster_this_node(unsigned int *node) 401 + static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, 402 + unsigned int *node) 402 403 { 403 404 int node_num; 404 405

+268 -40

fs/ocfs2/stack_user.c

··· 23 23 #include <linux/mutex.h> 24 24 #include <linux/slab.h> 25 25 #include <linux/reboot.h> 26 + #include <linux/sched.h> 26 27 #include <asm/uaccess.h> 27 28 28 29 #include "stackglue.h" ··· 103 102 #define OCFS2_TEXT_UUID_LEN 32 104 103 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 105 104 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 105 + #define VERSION_LOCK "version_lock" 106 + 107 + enum ocfs2_connection_type { 108 + WITH_CONTROLD, 109 + NO_CONTROLD 110 + }; 106 111 107 112 /* 108 113 * ocfs2_live_connection is refcounted because the filesystem and ··· 117 110 struct ocfs2_live_connection { 118 111 struct list_head oc_list; 119 112 struct ocfs2_cluster_connection *oc_conn; 113 + enum ocfs2_connection_type oc_type; 114 + atomic_t oc_this_node; 115 + int oc_our_slot; 116 + struct dlm_lksb oc_version_lksb; 117 + char oc_lvb[DLM_LVB_LEN]; 118 + struct completion oc_sync_wait; 119 + wait_queue_head_t oc_wait; 120 120 }; 121 121 122 122 struct ocfs2_control_private { ··· 212 198 * mount path. Since the VFS prevents multiple calls to 213 199 * fill_super(), we can't get dupes here. 
214 200 */ 215 - static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, 216 - struct ocfs2_live_connection **c_ret) 201 + static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, 202 + struct ocfs2_live_connection *c) 217 203 { 218 204 int rc = 0; 219 - struct ocfs2_live_connection *c; 220 - 221 - c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); 222 - if (!c) 223 - return -ENOMEM; 224 205 225 206 mutex_lock(&ocfs2_control_lock); 226 207 c->oc_conn = conn; 227 208 228 - if (atomic_read(&ocfs2_control_opened)) 209 + if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) 229 210 list_add(&c->oc_list, &ocfs2_live_connection_list); 230 211 else { 231 212 printk(KERN_ERR ··· 229 220 } 230 221 231 222 mutex_unlock(&ocfs2_control_lock); 232 - 233 - if (!rc) 234 - *c_ret = c; 235 - else 236 - kfree(c); 237 - 238 223 return rc; 239 224 } 240 225 ··· 802 799 return 0; 803 800 } 804 801 802 + static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) 803 + { 804 + struct ocfs2_protocol_version *pv = 805 + (struct ocfs2_protocol_version *)lvb; 806 + /* 807 + * ocfs2_protocol_version has two u8 variables, so we don't 808 + * need any endian conversion. 809 + */ 810 + ver->pv_major = pv->pv_major; 811 + ver->pv_minor = pv->pv_minor; 812 + } 813 + 814 + static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) 815 + { 816 + struct ocfs2_protocol_version *pv = 817 + (struct ocfs2_protocol_version *)lvb; 818 + /* 819 + * ocfs2_protocol_version has two u8 variables, so we don't 820 + * need any endian conversion. 
821 + */ 822 + pv->pv_major = ver->pv_major; 823 + pv->pv_minor = ver->pv_minor; 824 + } 825 + 826 + static void sync_wait_cb(void *arg) 827 + { 828 + struct ocfs2_cluster_connection *conn = arg; 829 + struct ocfs2_live_connection *lc = conn->cc_private; 830 + complete(&lc->oc_sync_wait); 831 + } 832 + 833 + static int sync_unlock(struct ocfs2_cluster_connection *conn, 834 + struct dlm_lksb *lksb, char *name) 835 + { 836 + int error; 837 + struct ocfs2_live_connection *lc = conn->cc_private; 838 + 839 + error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); 840 + if (error) { 841 + printk(KERN_ERR "%s lkid %x error %d\n", 842 + name, lksb->sb_lkid, error); 843 + return error; 844 + } 845 + 846 + wait_for_completion(&lc->oc_sync_wait); 847 + 848 + if (lksb->sb_status != -DLM_EUNLOCK) { 849 + printk(KERN_ERR "%s lkid %x status %d\n", 850 + name, lksb->sb_lkid, lksb->sb_status); 851 + return -1; 852 + } 853 + return 0; 854 + } 855 + 856 + static int sync_lock(struct ocfs2_cluster_connection *conn, 857 + int mode, uint32_t flags, 858 + struct dlm_lksb *lksb, char *name) 859 + { 860 + int error, status; 861 + struct ocfs2_live_connection *lc = conn->cc_private; 862 + 863 + error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, 864 + name, strlen(name), 865 + 0, sync_wait_cb, conn, NULL); 866 + if (error) { 867 + printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", 868 + name, lksb->sb_lkid, flags, mode, error); 869 + return error; 870 + } 871 + 872 + wait_for_completion(&lc->oc_sync_wait); 873 + 874 + status = lksb->sb_status; 875 + 876 + if (status && status != -EAGAIN) { 877 + printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", 878 + name, lksb->sb_lkid, flags, mode, status); 879 + } 880 + 881 + return status; 882 + } 883 + 884 + 885 + static int version_lock(struct ocfs2_cluster_connection *conn, int mode, 886 + int flags) 887 + { 888 + struct ocfs2_live_connection *lc = conn->cc_private; 889 + return sync_lock(conn, mode, flags, 890 + 
&lc->oc_version_lksb, VERSION_LOCK); 891 + } 892 + 893 + static int version_unlock(struct ocfs2_cluster_connection *conn) 894 + { 895 + struct ocfs2_live_connection *lc = conn->cc_private; 896 + return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); 897 + } 898 + 899 + /* get_protocol_version() 900 + * 901 + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. 902 + * The algorithm is: 903 + * 1. Attempt to take the lock in EX mode (non-blocking). 904 + * 2. If successful (which means it is the first mount), write the 905 + * version number and downconvert to PR lock. 906 + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after 907 + * taking the PR lock. 908 + */ 909 + 910 + static int get_protocol_version(struct ocfs2_cluster_connection *conn) 911 + { 912 + int ret; 913 + struct ocfs2_live_connection *lc = conn->cc_private; 914 + struct ocfs2_protocol_version pv; 915 + 916 + running_proto.pv_major = 917 + ocfs2_user_plugin.sp_max_proto.pv_major; 918 + running_proto.pv_minor = 919 + ocfs2_user_plugin.sp_max_proto.pv_minor; 920 + 921 + lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; 922 + ret = version_lock(conn, DLM_LOCK_EX, 923 + DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); 924 + if (!ret) { 925 + conn->cc_version.pv_major = running_proto.pv_major; 926 + conn->cc_version.pv_minor = running_proto.pv_minor; 927 + version_to_lvb(&running_proto, lc->oc_lvb); 928 + version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); 929 + } else if (ret == -EAGAIN) { 930 + ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); 931 + if (ret) 932 + goto out; 933 + lvb_to_version(lc->oc_lvb, &pv); 934 + 935 + if ((pv.pv_major != running_proto.pv_major) || 936 + (pv.pv_minor > running_proto.pv_minor)) { 937 + ret = -EINVAL; 938 + goto out; 939 + } 940 + 941 + conn->cc_version.pv_major = pv.pv_major; 942 + conn->cc_version.pv_minor = pv.pv_minor; 943 + } 944 + out: 945 + return ret; 946 + } 947 + 948 + static void user_recover_prep(void *arg) 
949 + { 950 + } 951 + 952 + static void user_recover_slot(void *arg, struct dlm_slot *slot) 953 + { 954 + struct ocfs2_cluster_connection *conn = arg; 955 + printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", 956 + slot->nodeid, slot->slot); 957 + conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); 958 + 959 + } 960 + 961 + static void user_recover_done(void *arg, struct dlm_slot *slots, 962 + int num_slots, int our_slot, 963 + uint32_t generation) 964 + { 965 + struct ocfs2_cluster_connection *conn = arg; 966 + struct ocfs2_live_connection *lc = conn->cc_private; 967 + int i; 968 + 969 + for (i = 0; i < num_slots; i++) 970 + if (slots[i].slot == our_slot) { 971 + atomic_set(&lc->oc_this_node, slots[i].nodeid); 972 + break; 973 + } 974 + 975 + lc->oc_our_slot = our_slot; 976 + wake_up(&lc->oc_wait); 977 + } 978 + 979 + static const struct dlm_lockspace_ops ocfs2_ls_ops = { 980 + .recover_prep = user_recover_prep, 981 + .recover_slot = user_recover_slot, 982 + .recover_done = user_recover_done, 983 + }; 984 + 985 + static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) 986 + { 987 + version_unlock(conn); 988 + dlm_release_lockspace(conn->cc_lockspace, 2); 989 + conn->cc_lockspace = NULL; 990 + ocfs2_live_connection_drop(conn->cc_private); 991 + conn->cc_private = NULL; 992 + return 0; 993 + } 994 + 805 995 static int user_cluster_connect(struct ocfs2_cluster_connection *conn) 806 996 { 807 997 dlm_lockspace_t *fsdlm; 808 - struct ocfs2_live_connection *uninitialized_var(control); 809 - int rc = 0; 998 + struct ocfs2_live_connection *lc; 999 + int rc, ops_rv; 810 1000 811 1001 BUG_ON(conn == NULL); 812 1002 813 - rc = ocfs2_live_connection_new(conn, &control); 1003 + lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); 1004 + if (!lc) { 1005 + rc = -ENOMEM; 1006 + goto out; 1007 + } 1008 + 1009 + init_waitqueue_head(&lc->oc_wait); 1010 + init_completion(&lc->oc_sync_wait); 1011 + atomic_set(&lc->oc_this_node, 
0); 1012 + conn->cc_private = lc; 1013 + lc->oc_type = NO_CONTROLD; 1014 + 1015 + rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, 1016 + DLM_LSFL_FS, DLM_LVB_LEN, 1017 + &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); 814 1018 if (rc) 815 1019 goto out; 1020 + 1021 + if (ops_rv == -EOPNOTSUPP) { 1022 + lc->oc_type = WITH_CONTROLD; 1023 + printk(KERN_NOTICE "ocfs2: You seem to be using an older " 1024 + "version of dlm_controld and/or ocfs2-tools." 1025 + " Please consider upgrading.\n"); 1026 + } else if (ops_rv) { 1027 + rc = ops_rv; 1028 + goto out; 1029 + } 1030 + conn->cc_lockspace = fsdlm; 1031 + 1032 + rc = ocfs2_live_connection_attach(conn, lc); 1033 + if (rc) 1034 + goto out; 1035 + 1036 + if (lc->oc_type == NO_CONTROLD) { 1037 + rc = get_protocol_version(conn); 1038 + if (rc) { 1039 + printk(KERN_ERR "ocfs2: Could not determine" 1040 + " locking version\n"); 1041 + user_cluster_disconnect(conn); 1042 + goto out; 1043 + } 1044 + wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); 1045 + } 816 1046 817 1047 /* 818 1048 * running_proto must have been set before we allowed any mounts ··· 1054 818 if (fs_protocol_compare(&running_proto, &conn->cc_version)) { 1055 819 printk(KERN_ERR 1056 820 "Unable to mount with fs locking protocol version " 1057 - "%u.%u because the userspace control daemon has " 1058 - "negotiated %u.%u\n", 821 + "%u.%u because negotiated protocol is %u.%u\n", 1059 822 conn->cc_version.pv_major, conn->cc_version.pv_minor, 1060 823 running_proto.pv_major, running_proto.pv_minor); 1061 824 rc = -EPROTO; 1062 - ocfs2_live_connection_drop(control); 1063 - goto out; 825 + ocfs2_live_connection_drop(lc); 826 + lc = NULL; 1064 827 } 1065 828 1066 - rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, 1067 - NULL, NULL, NULL, &fsdlm); 1068 - if (rc) { 1069 - ocfs2_live_connection_drop(control); 1070 - goto out; 1071 - } 1072 - 1073 - conn->cc_private = control; 1074 - conn->cc_lockspace = fsdlm; 1075 829 out: 830 + if 
(rc && lc) 831 + kfree(lc); 1076 832 return rc; 1077 833 } 1078 834 1079 - static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) 1080 - { 1081 - dlm_release_lockspace(conn->cc_lockspace, 2); 1082 - conn->cc_lockspace = NULL; 1083 - ocfs2_live_connection_drop(conn->cc_private); 1084 - conn->cc_private = NULL; 1085 - return 0; 1086 - } 1087 835 1088 - static int user_cluster_this_node(unsigned int *this_node) 836 + static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, 837 + unsigned int *this_node) 1089 838 { 1090 839 int rc; 840 + struct ocfs2_live_connection *lc = conn->cc_private; 1091 841 1092 - rc = ocfs2_control_get_this_node(); 842 + if (lc->oc_type == WITH_CONTROLD) 843 + rc = ocfs2_control_get_this_node(); 844 + else if (lc->oc_type == NO_CONTROLD) 845 + rc = atomic_read(&lc->oc_this_node); 846 + else 847 + rc = -EINVAL; 848 + 1093 849 if (rc < 0) 1094 850 return rc; 1095 851

+11 -5

fs/ocfs2/stackglue.c

··· 309 309 EXPORT_SYMBOL_GPL(ocfs2_plock); 310 310 311 311 int ocfs2_cluster_connect(const char *stack_name, 312 + const char *cluster_name, 313 + int cluster_name_len, 312 314 const char *group, 313 315 int grouplen, 314 316 struct ocfs2_locking_protocol *lproto, ··· 344 342 goto out; 345 343 } 346 344 347 - memcpy(new_conn->cc_name, group, grouplen); 345 + strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); 348 346 new_conn->cc_namelen = grouplen; 347 + strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); 348 + new_conn->cc_cluster_name_len = cluster_name_len; 349 349 new_conn->cc_recovery_handler = recovery_handler; 350 350 new_conn->cc_recovery_data = recovery_data; 351 351 ··· 390 386 391 387 if (cluster_stack_name[0]) 392 388 stack_name = cluster_stack_name; 393 - return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, 394 - recovery_handler, recovery_data, conn); 389 + return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, 390 + lproto, recovery_handler, recovery_data, 391 + conn); 395 392 } 396 393 EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); 397 394 ··· 465 460 } 466 461 EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); 467 462 468 - int ocfs2_cluster_this_node(unsigned int *node) 463 + int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, 464 + unsigned int *node) 469 465 { 470 - return active_stack->sp_ops->this_node(node); 466 + return active_stack->sp_ops->this_node(conn, node); 471 467 } 472 468 EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); 473 469

+12 -3

fs/ocfs2/stackglue.h

··· 45 45 */ 46 46 #define GROUP_NAME_MAX 64 47 47 48 + /* This shadows OCFS2_CLUSTER_NAME_LEN */ 49 + #define CLUSTER_NAME_MAX 16 50 + 48 51 49 52 /* 50 53 * ocfs2_protocol_version changes when ocfs2 does something different in ··· 100 97 * locking compatibility. 101 98 */ 102 99 struct ocfs2_cluster_connection { 103 - char cc_name[GROUP_NAME_MAX]; 100 + char cc_name[GROUP_NAME_MAX + 1]; 104 101 int cc_namelen; 102 + char cc_cluster_name[CLUSTER_NAME_MAX + 1]; 103 + int cc_cluster_name_len; 105 104 struct ocfs2_protocol_version cc_version; 106 105 struct ocfs2_locking_protocol *cc_proto; 107 106 void (*cc_recovery_handler)(int node_num, void *recovery_data); ··· 157 152 * ->this_node() returns the cluster's unique identifier for the 158 153 * local node. 159 154 */ 160 - int (*this_node)(unsigned int *node); 155 + int (*this_node)(struct ocfs2_cluster_connection *conn, 156 + unsigned int *node); 161 157 162 158 /* 163 159 * Call the underlying dlm lock function. The ->dlm_lock() ··· 245 239 246 240 /* Used by the filesystem */ 247 241 int ocfs2_cluster_connect(const char *stack_name, 242 + const char *cluster_name, 243 + int cluster_name_len, 248 244 const char *group, 249 245 int grouplen, 250 246 struct ocfs2_locking_protocol *lproto, ··· 268 260 int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 269 261 int hangup_pending); 270 262 void ocfs2_cluster_hangup(const char *group, int grouplen); 271 - int ocfs2_cluster_this_node(unsigned int *node); 263 + int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, 264 + unsigned int *node); 272 265 273 266 struct ocfs2_lock_res; 274 267 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,

+2 -10

fs/ocfs2/suballoc.c

··· 113 113 struct ocfs2_suballoc_result *res); 114 114 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 115 115 int nr); 116 - static inline int ocfs2_block_group_set_bits(handle_t *handle, 117 - struct inode *alloc_inode, 118 - struct ocfs2_group_desc *bg, 119 - struct buffer_head *group_bh, 120 - unsigned int bit_off, 121 - unsigned int num_bits); 122 116 static int ocfs2_relink_block_group(handle_t *handle, 123 117 struct inode *alloc_inode, 124 118 struct buffer_head *fe_bh, ··· 1337 1343 return status; 1338 1344 } 1339 1345 1340 - static inline int ocfs2_block_group_set_bits(handle_t *handle, 1346 + int ocfs2_block_group_set_bits(handle_t *handle, 1341 1347 struct inode *alloc_inode, 1342 1348 struct ocfs2_group_desc *bg, 1343 1349 struct buffer_head *group_bh, ··· 1382 1388 ocfs2_journal_dirty(handle, group_bh); 1383 1389 1384 1390 bail: 1385 - if (status) 1386 - mlog_errno(status); 1387 1391 return status; 1388 1392 } 1389 1393 ··· 1580 1588 return ret; 1581 1589 } 1582 1590 1583 - static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1591 + int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1584 1592 handle_t *handle, 1585 1593 struct buffer_head *di_bh, 1586 1594 u32 num_bits,

+12

fs/ocfs2/suballoc.h

··· 86 86 u32 bits_wanted, 87 87 struct ocfs2_alloc_context **ac); 88 88 89 + int ocfs2_alloc_dinode_update_counts(struct inode *inode, 90 + handle_t *handle, 91 + struct buffer_head *di_bh, 92 + u32 num_bits, 93 + u16 chain); 94 + int ocfs2_block_group_set_bits(handle_t *handle, 95 + struct inode *alloc_inode, 96 + struct ocfs2_group_desc *bg, 97 + struct buffer_head *group_bh, 98 + unsigned int bit_off, 99 + unsigned int num_bits); 100 + 89 101 int ocfs2_claim_metadata(handle_t *handle, 90 102 struct ocfs2_alloc_context *ac, 91 103 u32 bits_wanted,

+12 -8

fs/ocfs2/super.c

··· 68 68 #include "super.h" 69 69 #include "sysfile.h" 70 70 #include "uptodate.h" 71 - #include "ver.h" 72 71 #include "xattr.h" 73 72 #include "quota.h" 74 73 #include "refcounttree.h" ··· 89 90 90 91 MODULE_AUTHOR("Oracle"); 91 92 MODULE_LICENSE("GPL"); 93 + MODULE_DESCRIPTION("OCFS2 cluster file system"); 92 94 93 95 struct mount_options 94 96 { ··· 1618 1618 { 1619 1619 int status, i; 1620 1620 1621 - ocfs2_print_version(); 1622 - 1623 1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) 1624 1622 init_waitqueue_head(&ocfs2__ioend_wq[i]); 1625 1623 ··· 1945 1947 1946 1948 ocfs2_shutdown_local_alloc(osb); 1947 1949 1948 - ocfs2_truncate_log_shutdown(osb); 1949 - 1950 1950 /* This will disable recovery and flush any recovery work. */ 1951 1951 ocfs2_recovery_exit(osb); 1952 + 1953 + /* 1954 + * During dismount, when it recovers another node it will call 1955 + * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. 1956 + */ 1957 + ocfs2_truncate_log_shutdown(osb); 1952 1958 1953 1959 ocfs2_journal_shutdown(osb); 1954 1960 ··· 2227 2225 if (ocfs2_clusterinfo_valid(osb)) { 2228 2226 osb->osb_stackflags = 2229 2227 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; 2230 - memcpy(osb->osb_cluster_stack, 2228 + strlcpy(osb->osb_cluster_stack, 2231 2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2232 - OCFS2_STACK_LABEL_LEN); 2233 - osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 2230 + OCFS2_STACK_LABEL_LEN + 1); 2234 2231 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { 2235 2232 mlog(ML_ERROR, 2236 2233 "couldn't mount because of an invalid " ··· 2238 2237 status = -EINVAL; 2239 2238 goto bail; 2240 2239 } 2240 + strlcpy(osb->osb_cluster_name, 2241 + OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, 2242 + OCFS2_CLUSTER_NAME_LEN + 1); 2241 2243 } else { 2242 2244 /* The empty string is identical with classic tools that 2243 2245 * don't know about s_cluster_info. */

-43

fs/ocfs2/ver.c

··· 1 - /* -*- mode: c; c-basic-offset: 8; -*- 2 - * vim: noexpandtab sw=8 ts=8 sts=0: 3 - * 4 - * ver.c 5 - * 6 - * version string 7 - * 8 - * Copyright (C) 2002, 2005 Oracle. All rights reserved. 9 - * 10 - * This program is free software; you can redistribute it and/or 11 - * modify it under the terms of the GNU General Public 12 - * License as published by the Free Software Foundation; either 13 - * version 2 of the License, or (at your option) any later version. 14 - * 15 - * This program is distributed in the hope that it will be useful, 16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 - * General Public License for more details. 19 - * 20 - * You should have received a copy of the GNU General Public 21 - * License along with this program; if not, write to the 22 - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 - * Boston, MA 021110-1307, USA. 24 - */ 25 - 26 - #include <linux/module.h> 27 - #include <linux/string.h> 28 - #include <linux/kernel.h> 29 - 30 - #include "ver.h" 31 - 32 - #define OCFS2_BUILD_VERSION "1.5.0" 33 - 34 - #define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION 35 - 36 - void ocfs2_print_version(void) 37 - { 38 - printk(KERN_INFO "%s\n", VERSION_STR); 39 - } 40 - 41 - MODULE_DESCRIPTION(VERSION_STR); 42 - 43 - MODULE_VERSION(OCFS2_BUILD_VERSION);

-31

fs/ocfs2/ver.h

··· 1 - /* -*- mode: c; c-basic-offset: 8; -*- 2 - * vim: noexpandtab sw=8 ts=8 sts=0: 3 - * 4 - * ver.h 5 - * 6 - * Function prototypes 7 - * 8 - * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 - * 10 - * This program is free software; you can redistribute it and/or 11 - * modify it under the terms of the GNU General Public 12 - * License as published by the Free Software Foundation; either 13 - * version 2 of the License, or (at your option) any later version. 14 - * 15 - * This program is distributed in the hope that it will be useful, 16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 - * General Public License for more details. 19 - * 20 - * You should have received a copy of the GNU General Public 21 - * License along with this program; if not, write to the 22 - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 - * Boston, MA 021110-1307, USA. 24 - */ 25 - 26 - #ifndef OCFS2_VER_H 27 - #define OCFS2_VER_H 28 - 29 - void ocfs2_print_version(void); 30 - 31 - #endif /* OCFS2_VER_H */

+79 -5

fs/posix_acl.c

··· 22 22 23 23 #include <linux/errno.h> 24 24 25 - EXPORT_SYMBOL(posix_acl_init); 26 - EXPORT_SYMBOL(posix_acl_alloc); 27 - EXPORT_SYMBOL(posix_acl_valid); 28 - EXPORT_SYMBOL(posix_acl_equiv_mode); 29 - EXPORT_SYMBOL(posix_acl_from_mode); 25 + struct posix_acl **acl_by_type(struct inode *inode, int type) 26 + { 27 + switch (type) { 28 + case ACL_TYPE_ACCESS: 29 + return &inode->i_acl; 30 + case ACL_TYPE_DEFAULT: 31 + return &inode->i_default_acl; 32 + default: 33 + BUG(); 34 + } 35 + } 36 + EXPORT_SYMBOL(acl_by_type); 37 + 38 + struct posix_acl *get_cached_acl(struct inode *inode, int type) 39 + { 40 + struct posix_acl **p = acl_by_type(inode, type); 41 + struct posix_acl *acl = ACCESS_ONCE(*p); 42 + if (acl) { 43 + spin_lock(&inode->i_lock); 44 + acl = *p; 45 + if (acl != ACL_NOT_CACHED) 46 + acl = posix_acl_dup(acl); 47 + spin_unlock(&inode->i_lock); 48 + } 49 + return acl; 50 + } 51 + EXPORT_SYMBOL(get_cached_acl); 52 + 53 + struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) 54 + { 55 + return rcu_dereference(*acl_by_type(inode, type)); 56 + } 57 + EXPORT_SYMBOL(get_cached_acl_rcu); 58 + 59 + void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) 60 + { 61 + struct posix_acl **p = acl_by_type(inode, type); 62 + struct posix_acl *old; 63 + spin_lock(&inode->i_lock); 64 + old = *p; 65 + rcu_assign_pointer(*p, posix_acl_dup(acl)); 66 + spin_unlock(&inode->i_lock); 67 + if (old != ACL_NOT_CACHED) 68 + posix_acl_release(old); 69 + } 70 + EXPORT_SYMBOL(set_cached_acl); 71 + 72 + void forget_cached_acl(struct inode *inode, int type) 73 + { 74 + struct posix_acl **p = acl_by_type(inode, type); 75 + struct posix_acl *old; 76 + spin_lock(&inode->i_lock); 77 + old = *p; 78 + *p = ACL_NOT_CACHED; 79 + spin_unlock(&inode->i_lock); 80 + if (old != ACL_NOT_CACHED) 81 + posix_acl_release(old); 82 + } 83 + EXPORT_SYMBOL(forget_cached_acl); 84 + 85 + void forget_all_cached_acls(struct inode *inode) 86 + { 87 + struct posix_acl *old_access, 
*old_default; 88 + spin_lock(&inode->i_lock); 89 + old_access = inode->i_acl; 90 + old_default = inode->i_default_acl; 91 + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; 92 + spin_unlock(&inode->i_lock); 93 + if (old_access != ACL_NOT_CACHED) 94 + posix_acl_release(old_access); 95 + if (old_default != ACL_NOT_CACHED) 96 + posix_acl_release(old_default); 97 + } 98 + EXPORT_SYMBOL(forget_all_cached_acls); 30 99 31 100 /* 32 101 * Init a fresh posix_acl ··· 106 37 atomic_set(&acl->a_refcount, 1); 107 38 acl->a_count = count; 108 39 } 40 + EXPORT_SYMBOL(posix_acl_init); 109 41 110 42 /* 111 43 * Allocate a new ACL with the specified number of entries. ··· 121 51 posix_acl_init(acl, count); 122 52 return acl; 123 53 } 54 + EXPORT_SYMBOL(posix_acl_alloc); 124 55 125 56 /* 126 57 * Clone an ACL. ··· 217 146 return 0; 218 147 return -EINVAL; 219 148 } 149 + EXPORT_SYMBOL(posix_acl_valid); 220 150 221 151 /* 222 152 * Returns 0 if the acl can be exactly represented in the traditional ··· 258 186 *mode_p = (*mode_p & ~S_IRWXUGO) | mode; 259 187 return not_equiv; 260 188 } 189 + EXPORT_SYMBOL(posix_acl_equiv_mode); 261 190 262 191 /* 263 192 * Create an ACL representing the file mode permission bits of an inode. ··· 280 207 acl->a_entries[2].e_perm = (mode & S_IRWXO); 281 208 return acl; 282 209 } 210 + EXPORT_SYMBOL(posix_acl_from_mode); 283 211 284 212 /* 285 213 * Return 0 if current is granted want access to the inode

+37

fs/proc/meminfo.c

··· 26 26 unsigned long committed; 27 27 struct vmalloc_info vmi; 28 28 long cached; 29 + long available; 30 + unsigned long pagecache; 31 + unsigned long wmark_low = 0; 29 32 unsigned long pages[NR_LRU_LISTS]; 33 + struct zone *zone; 30 34 int lru; 31 35 32 36 /* ··· 51 47 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 52 48 pages[lru] = global_page_state(NR_LRU_BASE + lru); 53 49 50 + for_each_zone(zone) 51 + wmark_low += zone->watermark[WMARK_LOW]; 52 + 53 + /* 54 + * Estimate the amount of memory available for userspace allocations, 55 + * without causing swapping. 56 + * 57 + * Free memory cannot be taken below the low watermark, before the 58 + * system starts swapping. 59 + */ 60 + available = i.freeram - wmark_low; 61 + 62 + /* 63 + * Not all the page cache can be freed, otherwise the system will 64 + * start swapping. Assume at least half of the page cache, or the 65 + * low watermark worth of cache, needs to stay. 66 + */ 67 + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; 68 + pagecache -= min(pagecache / 2, wmark_low); 69 + available += pagecache; 70 + 71 + /* 72 + * Part of the reclaimable slab consists of items that are in use, 73 + * and cannot be freed. Cap this estimate at the low watermark. 74 + */ 75 + available += global_page_state(NR_SLAB_RECLAIMABLE) - 76 + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); 77 + 78 + if (available < 0) 79 + available = 0; 80 + 54 81 /* 55 82 * Tagged format, for easy grepping and expansion. 56 83 */ 57 84 seq_printf(m, 58 85 "MemTotal: %8lu kB\n" 59 86 "MemFree: %8lu kB\n" 87 + "MemAvailable: %8lu kB\n" 60 88 "Buffers: %8lu kB\n" 61 89 "Cached: %8lu kB\n" 62 90 "SwapCached: %8lu kB\n" ··· 141 105 , 142 106 K(i.totalram), 143 107 K(i.freeram), 108 + K(available), 144 109 K(i.bufferram), 145 110 K(cached), 146 111 K(total_swapcache_pages()),

+1 -1

fs/ramfs/inode.c

··· 275 275 276 276 return err; 277 277 } 278 - module_init(init_ramfs_fs) 278 + fs_initcall(init_ramfs_fs);

-4

fs/read_write.c

··· 901 901 io_fn_t fn; 902 902 iov_fn_t fnv; 903 903 904 - ret = -EFAULT; 905 - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 906 - goto out; 907 - 908 904 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, 909 905 UIO_FASTIOV, iovstack, &iov); 910 906 if (ret <= 0)

+2 -1

fs/super.c

··· 166 166 if (!s) 167 167 return NULL; 168 168 169 + INIT_LIST_HEAD(&s->s_mounts); 170 + 169 171 if (security_sb_alloc(s)) 170 172 goto fail; 171 173 ··· 190 188 if (list_lru_init(&s->s_inode_lru)) 191 189 goto fail; 192 190 193 - INIT_LIST_HEAD(&s->s_mounts); 194 191 init_rwsem(&s->s_umount); 195 192 lockdep_set_class(&s->s_umount, &type->s_umount_key); 196 193 /*

+152 -1

include/linux/bootmem.h

··· 5 5 #define _LINUX_BOOTMEM_H 6 6 7 7 #include <linux/mmzone.h> 8 + #include <linux/mm_types.h> 8 9 #include <asm/dma.h> 9 10 10 11 /* ··· 53 52 unsigned long size); 54 53 extern void free_bootmem(unsigned long physaddr, unsigned long size); 55 54 extern void free_bootmem_late(unsigned long physaddr, unsigned long size); 56 - extern void __free_pages_bootmem(struct page *page, unsigned int order); 57 55 58 56 /* 59 57 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, ··· 141 141 __alloc_bootmem_low(x, PAGE_SIZE, 0) 142 142 #define alloc_bootmem_low_pages_node(pgdat, x) \ 143 143 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) 144 + 145 + 146 + #if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) 147 + 148 + /* FIXME: use MEMBLOCK_ALLOC_* variants here */ 149 + #define BOOTMEM_ALLOC_ACCESSIBLE 0 150 + #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) 151 + 152 + /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ 153 + void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, 154 + phys_addr_t align, phys_addr_t min_addr, 155 + phys_addr_t max_addr, int nid); 156 + void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align, 157 + phys_addr_t min_addr, phys_addr_t max_addr, int nid); 158 + void __memblock_free_early(phys_addr_t base, phys_addr_t size); 159 + void __memblock_free_late(phys_addr_t base, phys_addr_t size); 160 + 161 + static inline void * __init memblock_virt_alloc( 162 + phys_addr_t size, phys_addr_t align) 163 + { 164 + return memblock_virt_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT, 165 + BOOTMEM_ALLOC_ACCESSIBLE, 166 + NUMA_NO_NODE); 167 + } 168 + 169 + static inline void * __init memblock_virt_alloc_nopanic( 170 + phys_addr_t size, phys_addr_t align) 171 + { 172 + return memblock_virt_alloc_try_nid_nopanic(size, align, 173 + BOOTMEM_LOW_LIMIT, 174 + BOOTMEM_ALLOC_ACCESSIBLE, 175 + NUMA_NO_NODE); 176 + } 177 + 178 + static inline void * __init memblock_virt_alloc_from_nopanic( 179 + 
phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) 180 + { 181 + return memblock_virt_alloc_try_nid_nopanic(size, align, min_addr, 182 + BOOTMEM_ALLOC_ACCESSIBLE, 183 + NUMA_NO_NODE); 184 + } 185 + 186 + static inline void * __init memblock_virt_alloc_node( 187 + phys_addr_t size, int nid) 188 + { 189 + return memblock_virt_alloc_try_nid(size, 0, BOOTMEM_LOW_LIMIT, 190 + BOOTMEM_ALLOC_ACCESSIBLE, nid); 191 + } 192 + 193 + static inline void * __init memblock_virt_alloc_node_nopanic( 194 + phys_addr_t size, int nid) 195 + { 196 + return memblock_virt_alloc_try_nid_nopanic(size, 0, BOOTMEM_LOW_LIMIT, 197 + BOOTMEM_ALLOC_ACCESSIBLE, 198 + nid); 199 + } 200 + 201 + static inline void __init memblock_free_early( 202 + phys_addr_t base, phys_addr_t size) 203 + { 204 + __memblock_free_early(base, size); 205 + } 206 + 207 + static inline void __init memblock_free_early_nid( 208 + phys_addr_t base, phys_addr_t size, int nid) 209 + { 210 + __memblock_free_early(base, size); 211 + } 212 + 213 + static inline void __init memblock_free_late( 214 + phys_addr_t base, phys_addr_t size) 215 + { 216 + __memblock_free_late(base, size); 217 + } 218 + 219 + #else 220 + 221 + #define BOOTMEM_ALLOC_ACCESSIBLE 0 222 + 223 + 224 + /* Fall back to all the existing bootmem APIs */ 225 + static inline void * __init memblock_virt_alloc( 226 + phys_addr_t size, phys_addr_t align) 227 + { 228 + if (!align) 229 + align = SMP_CACHE_BYTES; 230 + return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT); 231 + } 232 + 233 + static inline void * __init memblock_virt_alloc_nopanic( 234 + phys_addr_t size, phys_addr_t align) 235 + { 236 + if (!align) 237 + align = SMP_CACHE_BYTES; 238 + return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT); 239 + } 240 + 241 + static inline void * __init memblock_virt_alloc_from_nopanic( 242 + phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) 243 + { 244 + return __alloc_bootmem_nopanic(size, align, min_addr); 245 + } 246 + 247 + static inline 
void * __init memblock_virt_alloc_node( 248 + phys_addr_t size, int nid) 249 + { 250 + return __alloc_bootmem_node(NODE_DATA(nid), size, SMP_CACHE_BYTES, 251 + BOOTMEM_LOW_LIMIT); 252 + } 253 + 254 + static inline void * __init memblock_virt_alloc_node_nopanic( 255 + phys_addr_t size, int nid) 256 + { 257 + return __alloc_bootmem_node_nopanic(NODE_DATA(nid), size, 258 + SMP_CACHE_BYTES, 259 + BOOTMEM_LOW_LIMIT); 260 + } 261 + 262 + static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size, 263 + phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) 264 + { 265 + return __alloc_bootmem_node_high(NODE_DATA(nid), size, align, 266 + min_addr); 267 + } 268 + 269 + static inline void * __init memblock_virt_alloc_try_nid_nopanic( 270 + phys_addr_t size, phys_addr_t align, 271 + phys_addr_t min_addr, phys_addr_t max_addr, int nid) 272 + { 273 + return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, 274 + min_addr, max_addr); 275 + } 276 + 277 + static inline void __init memblock_free_early( 278 + phys_addr_t base, phys_addr_t size) 279 + { 280 + free_bootmem(base, size); 281 + } 282 + 283 + static inline void __init memblock_free_early_nid( 284 + phys_addr_t base, phys_addr_t size, int nid) 285 + { 286 + free_bootmem_node(NODE_DATA(nid), base, size); 287 + } 288 + 289 + static inline void __init memblock_free_late( 290 + phys_addr_t base, phys_addr_t size) 291 + { 292 + free_bootmem_late(base, size); 293 + } 294 + #endif /* defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) */ 144 295 145 296 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP 146 297 extern void *alloc_remap(int nid, unsigned long size);

+16

include/linux/compaction.h

··· 62 62 return zone->compact_considered < defer_limit; 63 63 } 64 64 65 + /* 66 + * Update defer tracking counters after successful compaction of given order, 67 + * which means an allocation either succeeded (alloc_success == true) or is 68 + * expected to succeed. 69 + */ 70 + static inline void compaction_defer_reset(struct zone *zone, int order, 71 + bool alloc_success) 72 + { 73 + if (alloc_success) { 74 + zone->compact_considered = 0; 75 + zone->compact_defer_shift = 0; 76 + } 77 + if (order >= zone->compact_order_failed) 78 + zone->compact_order_failed = order + 1; 79 + } 80 + 65 81 /* Returns true if restarting compaction after many failures */ 66 82 static inline bool compaction_restarting(struct zone *zone, int order) 67 83 {

+6

include/linux/dma-debug.h

··· 85 85 86 86 extern void debug_dma_dump_mappings(struct device *dev); 87 87 88 + extern void debug_dma_assert_idle(struct page *page); 89 + 88 90 #else /* CONFIG_DMA_API_DEBUG */ 89 91 90 92 static inline void dma_debug_add_bus(struct bus_type *bus) ··· 182 180 } 183 181 184 182 static inline void debug_dma_dump_mappings(struct device *dev) 183 + { 184 + } 185 + 186 + static inline void debug_dma_assert_idle(struct page *page) 185 187 { 186 188 } 187 189

+27 -91

include/linux/fsnotify_backend.h

··· 15 15 #include <linux/path.h> /* struct path */ 16 16 #include <linux/spinlock.h> 17 17 #include <linux/types.h> 18 - 19 18 #include <linux/atomic.h> 20 19 21 20 /* ··· 78 79 struct fsnotify_event; 79 80 struct fsnotify_mark; 80 81 struct fsnotify_event_private_data; 82 + struct fsnotify_fname; 81 83 82 84 /* 83 85 * Each group must define these ops. The fsnotify infrastructure will call ··· 94 94 * userspace messages that marks have been removed. 95 95 */ 96 96 struct fsnotify_ops { 97 - bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, 98 - struct fsnotify_mark *inode_mark, 99 - struct fsnotify_mark *vfsmount_mark, 100 - __u32 mask, void *data, int data_type); 101 97 int (*handle_event)(struct fsnotify_group *group, 98 + struct inode *inode, 102 99 struct fsnotify_mark *inode_mark, 103 100 struct fsnotify_mark *vfsmount_mark, 104 - struct fsnotify_event *event); 101 + u32 mask, void *data, int data_type, 102 + const unsigned char *file_name); 105 103 void (*free_group_priv)(struct fsnotify_group *group); 106 104 void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); 107 - void (*free_event_priv)(struct fsnotify_event_private_data *priv); 105 + void (*free_event)(struct fsnotify_event *event); 106 + }; 107 + 108 + /* 109 + * all of the information about the original object we want to now send to 110 + * a group. If you want to carry more info from the accessing task to the 111 + * listener this structure is where you need to be adding fields. 112 + */ 113 + struct fsnotify_event { 114 + struct list_head list; 115 + /* inode may ONLY be dereferenced during handle_event(). 
*/ 116 + struct inode *inode; /* either the inode the event happened to or its parent */ 117 + u32 mask; /* the type of access, bitwise OR for FS_* event types */ 108 118 }; 109 119 110 120 /* ··· 158 148 * a group */ 159 149 struct list_head marks_list; /* all inode marks for this group */ 160 150 161 - struct fasync_struct *fsn_fa; /* async notification */ 151 + struct fasync_struct *fsn_fa; /* async notification */ 152 + 153 + struct fsnotify_event overflow_event; /* Event we queue when the 154 + * notification list is too 155 + * full */ 162 156 163 157 /* groups can define private fields here or use the void *private */ 164 158 union { ··· 191 177 }; 192 178 }; 193 179 194 - /* 195 - * A single event can be queued in multiple group->notification_lists. 196 - * 197 - * each group->notification_list will point to an event_holder which in turns points 198 - * to the actual event that needs to be sent to userspace. 199 - * 200 - * Seemed cheaper to create a refcnt'd event and a small holder for every group 201 - * than create a different event for every group 202 - * 203 - */ 204 - struct fsnotify_event_holder { 205 - struct fsnotify_event *event; 206 - struct list_head event_list; 207 - }; 208 - 209 - /* 210 - * Inotify needs to tack data onto an event. This struct lets us later find the 211 - * correct private data of the correct group. 212 - */ 213 - struct fsnotify_event_private_data { 214 - struct fsnotify_group *group; 215 - struct list_head event_list; 216 - }; 217 - 218 - /* 219 - * all of the information about the original object we want to now send to 220 - * a group. If you want to carry more info from the accessing task to the 221 - * listener this structure is where you need to be adding fields. 222 - */ 223 - struct fsnotify_event { 224 - /* 225 - * If we create an event we are also likely going to need a holder 226 - * to link to a group. So embed one holder in the event. 
Means only 227 - * one allocation for the common case where we only have one group 228 - */ 229 - struct fsnotify_event_holder holder; 230 - spinlock_t lock; /* protection for the associated event_holder and private_list */ 231 - /* to_tell may ONLY be dereferenced during handle_event(). */ 232 - struct inode *to_tell; /* either the inode the event happened to or its parent */ 233 - /* 234 - * depending on the event type we should have either a path or inode 235 - * We hold a reference on path, but NOT on inode. Since we have the ref on 236 - * the path, it may be dereferenced at any point during this object's 237 - * lifetime. That reference is dropped when this object's refcnt hits 238 - * 0. If this event contains an inode instead of a path, the inode may 239 - * ONLY be used during handle_event(). 240 - */ 241 - union { 242 - struct path path; 243 - struct inode *inode; 244 - }; 245 180 /* when calling fsnotify tell it if the data is a path or inode */ 246 181 #define FSNOTIFY_EVENT_NONE 0 247 182 #define FSNOTIFY_EVENT_PATH 1 248 183 #define FSNOTIFY_EVENT_INODE 2 249 - int data_type; /* which of the above union we have */ 250 - atomic_t refcnt; /* how many groups still are using/need to send this event */ 251 - __u32 mask; /* the type of access, bitwise OR for FS_* event types */ 252 - 253 - u32 sync_cookie; /* used to corrolate events, namely inotify mv events */ 254 - const unsigned char *file_name; 255 - size_t name_len; 256 - struct pid *tgid; 257 - 258 - #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 259 - __u32 response; /* userspace answer to question */ 260 - #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ 261 - 262 - struct list_head private_data_list; /* groups can store private data here */ 263 - }; 264 184 265 185 /* 266 186 * Inode specific fields in an fsnotify_mark ··· 318 370 extern void fsnotify_destroy_group(struct fsnotify_group *group); 319 371 /* fasync handler function */ 320 372 extern int fsnotify_fasync(int fd, struct file *file, int on); 
321 - /* take a reference to an event */ 322 - extern void fsnotify_get_event(struct fsnotify_event *event); 323 - extern void fsnotify_put_event(struct fsnotify_event *event); 324 - /* find private data previously attached to an event and unlink it */ 325 - extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, 326 - struct fsnotify_event *event); 327 - 373 + /* Free event from memory */ 374 + extern void fsnotify_destroy_event(struct fsnotify_group *group, 375 + struct fsnotify_event *event); 328 376 /* attach the event to the group notification queue */ 329 377 extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, 330 378 struct fsnotify_event *event, 331 - struct fsnotify_event_private_data *priv, 332 379 struct fsnotify_event *(*merge)(struct list_head *, 333 380 struct fsnotify_event *)); 334 381 /* true if the group notification queue is empty */ ··· 373 430 extern void fsnotify_unmount_inodes(struct list_head *list); 374 431 375 432 /* put here because inotify does some weird stuff when destroying watches */ 376 - extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, 377 - void *data, int data_is, 378 - const unsigned char *name, 379 - u32 cookie, gfp_t gfp); 380 - 381 - /* fanotify likes to change events after they are on lists... */ 382 - extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event); 383 - extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, 384 - struct fsnotify_event *new_event); 433 + extern void fsnotify_init_event(struct fsnotify_event *event, 434 + struct inode *to_tell, u32 mask); 385 435 386 436 #else 387 437

+23

include/linux/huge_mm.h

··· 157 157 return HPAGE_PMD_NR; 158 158 return 1; 159 159 } 160 + /* 161 + * compound_trans_head() should be used instead of compound_head(), 162 + * whenever the "page" passed as parameter could be the tail of a 163 + * transparent hugepage that could be undergoing a 164 + * __split_huge_page_refcount(). The page structure layout often 165 + * changes across releases and it makes extensive use of unions. So if 166 + * the page structure layout will change in a way that 167 + * page->first_page gets clobbered by __split_huge_page_refcount, the 168 + * implementation making use of smp_rmb() will be required. 169 + * 170 + * Currently we define compound_trans_head as compound_head, because 171 + * page->private is in the same union with page->first_page, and 172 + * page->private isn't clobbered. However this also means we're 173 + * currently leaving dirt into the page->private field of anonymous 174 + * pages resulting from a THP split, instead of setting page->private 175 + * to zero like for every other page that has PG_private not set. But 176 + * anonymous pages don't use page->private so this is not a problem. 177 + */ 178 + #if 0 179 + /* This will be needed if page->private will be clobbered in split_huge_page */ 160 180 static inline struct page *compound_trans_head(struct page *page) 161 181 { 162 182 if (PageTail(page)) { ··· 194 174 } 195 175 return page; 196 176 } 177 + #else 178 + #define compound_trans_head(page) compound_head(page) 179 + #endif 197 180 198 181 extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 199 182 unsigned long addr, pmd_t pmd, pmd_t *pmdp);

+1 -6

include/linux/hugetlb.h

··· 31 31 void hugepage_put_subpool(struct hugepage_subpool *spool); 32 32 33 33 int PageHuge(struct page *page); 34 - int PageHeadHuge(struct page *page_head); 35 34 36 35 void reset_vma_resv_huge_pages(struct vm_area_struct *vma); 37 36 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); ··· 99 100 #else /* !CONFIG_HUGETLB_PAGE */ 100 101 101 102 static inline int PageHuge(struct page *page) 102 - { 103 - return 0; 104 - } 105 - 106 - static inline int PageHeadHuge(struct page *page_head) 107 103 { 108 104 return 0; 109 105 } ··· 354 360 355 361 static inline struct hstate *page_hstate(struct page *page) 356 362 { 363 + VM_BUG_ON(!PageHuge(page)); 357 364 return size_to_hstate(PAGE_SIZE << compound_order(page)); 358 365 } 359 366

+2

include/linux/init_task.h

··· 41 41 42 42 #define INIT_SIGNALS(sig) { \ 43 43 .nr_threads = 1, \ 44 + .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ 44 45 .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ 45 46 .shared_pending = { \ 46 47 .list = LIST_HEAD_INIT(sig.shared_pending.list), \ ··· 223 222 [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ 224 223 }, \ 225 224 .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ 225 + .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ 226 226 INIT_IDS \ 227 227 INIT_PERF_EVENTS(tsk) \ 228 228 INIT_TRACE_IRQFLAGS \

+3 -12

include/linux/ksm.h

··· 73 73 struct page *ksm_might_need_to_copy(struct page *page, 74 74 struct vm_area_struct *vma, unsigned long address); 75 75 76 - int page_referenced_ksm(struct page *page, 77 - struct mem_cgroup *memcg, unsigned long *vm_flags); 78 - int try_to_unmap_ksm(struct page *page, enum ttu_flags flags); 79 - int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, 80 - struct vm_area_struct *, unsigned long, void *), void *arg); 76 + int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); 81 77 void ksm_migrate_page(struct page *newpage, struct page *oldpage); 82 78 83 79 #else /* !CONFIG_KSM */ ··· 111 115 return 0; 112 116 } 113 117 114 - static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) 115 - { 116 - return 0; 117 - } 118 - 119 - static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*, 120 - struct vm_area_struct *, unsigned long, void *), void *arg) 118 + static inline int rmap_walk_ksm(struct page *page, 119 + struct rmap_walk_control *rwc) 121 120 { 122 121 return 0; 123 122 }

+49 -5

include/linux/memblock.h

··· 19 19 20 20 #define INIT_MEMBLOCK_REGIONS 128 21 21 22 + /* Definition of memblock flags. */ 23 + #define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ 24 + 22 25 struct memblock_region { 23 26 phys_addr_t base; 24 27 phys_addr_t size; 28 + unsigned long flags; 25 29 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 26 30 int nid; 27 31 #endif ··· 47 43 48 44 extern struct memblock memblock; 49 45 extern int memblock_debug; 46 + #ifdef CONFIG_MOVABLE_NODE 47 + /* If movable_node boot option specified */ 48 + extern bool movable_node_enabled; 49 + #endif /* CONFIG_MOVABLE_NODE */ 50 50 51 51 #define memblock_dbg(fmt, ...) \ 52 52 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) 53 53 54 - phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, 55 - phys_addr_t size, phys_addr_t align, int nid); 54 + phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, 55 + phys_addr_t start, phys_addr_t end, 56 + int nid); 56 57 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, 57 58 phys_addr_t size, phys_addr_t align); 58 59 phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); ··· 68 59 int memblock_free(phys_addr_t base, phys_addr_t size); 69 60 int memblock_reserve(phys_addr_t base, phys_addr_t size); 70 61 void memblock_trim_memory(phys_addr_t align); 62 + int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); 63 + int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); 64 + #ifdef CONFIG_MOVABLE_NODE 65 + static inline bool memblock_is_hotpluggable(struct memblock_region *m) 66 + { 67 + return m->flags & MEMBLOCK_HOTPLUG; 68 + } 69 + 70 + static inline bool movable_node_is_enabled(void) 71 + { 72 + return movable_node_enabled; 73 + } 74 + #else 75 + static inline bool memblock_is_hotpluggable(struct memblock_region *m) 76 + { 77 + return false; 78 + } 79 + static inline bool movable_node_is_enabled(void) 80 + { 81 + return false; 82 + } 83 + #endif 71 84 72 85 
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 73 86 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, ··· 118 87 /** 119 88 * for_each_free_mem_range - iterate through free memblock areas 120 89 * @i: u64 used as loop variable 121 - * @nid: node selector, %MAX_NUMNODES for all nodes 90 + * @nid: node selector, %NUMA_NO_NODE for all nodes 122 91 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 123 92 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 124 93 * @p_nid: ptr to int for nid of the range, can be %NULL ··· 138 107 /** 139 108 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas 140 109 * @i: u64 used as loop variable 141 - * @nid: node selector, %MAX_NUMNODES for all nodes 110 + * @nid: node selector, %NUMA_NO_NODE for all nodes 142 111 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 143 112 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 144 113 * @p_nid: ptr to int for nid of the range, can be %NULL ··· 152 121 i != (u64)ULLONG_MAX; \ 153 122 __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) 154 123 124 + static inline void memblock_set_region_flags(struct memblock_region *r, 125 + unsigned long flags) 126 + { 127 + r->flags |= flags; 128 + } 129 + 130 + static inline void memblock_clear_region_flags(struct memblock_region *r, 131 + unsigned long flags) 132 + { 133 + r->flags &= ~flags; 134 + } 135 + 155 136 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 156 - int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); 137 + int memblock_set_node(phys_addr_t base, phys_addr_t size, 138 + struct memblock_type *type, int nid); 157 139 158 140 static inline void memblock_set_region_node(struct memblock_region *r, int nid) 159 141 {

-32

include/linux/mempolicy.h

··· 211 211 { 212 212 } 213 213 214 - static inline struct mempolicy *mpol_dup(struct mempolicy *old) 215 - { 216 - return NULL; 217 - } 218 - 219 214 struct shared_policy {}; 220 - 221 - static inline int mpol_set_shared_policy(struct shared_policy *info, 222 - struct vm_area_struct *vma, 223 - struct mempolicy *new) 224 - { 225 - return -EINVAL; 226 - } 227 215 228 216 static inline void mpol_shared_policy_init(struct shared_policy *sp, 229 217 struct mempolicy *mpol) ··· 220 232 221 233 static inline void mpol_free_shared_policy(struct shared_policy *p) 222 234 { 223 - } 224 - 225 - static inline struct mempolicy * 226 - mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) 227 - { 228 - return NULL; 229 235 } 230 236 231 237 #define vma_policy(vma) NULL ··· 248 266 { 249 267 } 250 268 251 - static inline void mpol_fix_fork_child_flag(struct task_struct *p) 252 - { 253 - } 254 - 255 269 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, 256 270 unsigned long addr, gfp_t gfp_flags, 257 271 struct mempolicy **mpol, nodemask_t **nodemask) ··· 258 280 } 259 281 260 282 static inline bool init_nodemask_of_mempolicy(nodemask_t *m) 261 - { 262 - return false; 263 - } 264 - 265 - static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk, 266 - const nodemask_t *mask) 267 283 { 268 284 return false; 269 285 } ··· 278 306 return 1; /* error */ 279 307 } 280 308 #endif 281 - 282 - static inline void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 283 - { 284 - } 285 309 286 310 static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, 287 311 unsigned long address)

-6

include/linux/migrate.h

··· 35 35 36 36 #ifdef CONFIG_MIGRATION 37 37 38 - extern void putback_lru_pages(struct list_head *l); 39 38 extern void putback_movable_pages(struct list_head *l); 40 39 extern int migrate_page(struct address_space *, 41 40 struct page *, struct page *, enum migrate_mode); 42 41 extern int migrate_pages(struct list_head *l, new_page_t x, 43 42 unsigned long private, enum migrate_mode mode, int reason); 44 - 45 - extern int fail_migrate_page(struct address_space *, 46 - struct page *, struct page *); 47 43 48 44 extern int migrate_prep(void); 49 45 extern int migrate_prep_local(void); ··· 55 59 int extra_count); 56 60 #else 57 61 58 - static inline void putback_lru_pages(struct list_head *l) {} 59 62 static inline void putback_movable_pages(struct list_head *l) {} 60 63 static inline int migrate_pages(struct list_head *l, new_page_t x, 61 64 unsigned long private, enum migrate_mode mode, int reason) ··· 81 86 82 87 /* Possible settings for the migrate_page() method in address_operations */ 83 88 #define migrate_page NULL 84 - #define fail_migrate_page NULL 85 89 86 90 #endif /* CONFIG_MIGRATION */ 87 91

+61 -9

include/linux/mm.h

··· 57 57 extern unsigned long sysctl_user_reserve_kbytes; 58 58 extern unsigned long sysctl_admin_reserve_kbytes; 59 59 60 + extern int sysctl_overcommit_memory; 61 + extern int sysctl_overcommit_ratio; 62 + extern unsigned long sysctl_overcommit_kbytes; 63 + 64 + extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, 65 + size_t *, loff_t *); 66 + extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, 67 + size_t *, loff_t *); 68 + 60 69 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) 61 70 62 71 /* to align the pointer to the (next) page boundary */ ··· 423 414 return atomic_read(&compound_head(page)->_count); 424 415 } 425 416 417 + #ifdef CONFIG_HUGETLB_PAGE 418 + extern int PageHeadHuge(struct page *page_head); 419 + #else /* CONFIG_HUGETLB_PAGE */ 420 + static inline int PageHeadHuge(struct page *page_head) 421 + { 422 + return 0; 423 + } 424 + #endif /* CONFIG_HUGETLB_PAGE */ 425 + 426 + static inline bool __compound_tail_refcounted(struct page *page) 427 + { 428 + return !PageSlab(page) && !PageHeadHuge(page); 429 + } 430 + 431 + /* 432 + * This takes a head page as parameter and tells if the 433 + * tail page reference counting can be skipped. 434 + * 435 + * For this to be safe, PageSlab and PageHeadHuge must remain true on 436 + * any given page where they return true here, until all tail pins 437 + * have been released. 438 + */ 439 + static inline bool compound_tail_refcounted(struct page *page) 440 + { 441 + VM_BUG_ON(!PageHead(page)); 442 + return __compound_tail_refcounted(page); 443 + } 444 + 426 445 static inline void get_huge_page_tail(struct page *page) 427 446 { 428 447 /* 429 - * __split_huge_page_refcount() cannot run 430 - * from under us. 448 + * __split_huge_page_refcount() cannot run from under us. 
431 449 */ 450 + VM_BUG_ON(!PageTail(page)); 432 451 VM_BUG_ON(page_mapcount(page) < 0); 433 452 VM_BUG_ON(atomic_read(&page->_count) != 0); 434 - atomic_inc(&page->_mapcount); 453 + if (compound_tail_refcounted(page->first_page)) 454 + atomic_inc(&page->_mapcount); 435 455 } 436 456 437 457 extern bool __get_page_tail(struct page *page); ··· 884 846 #endif 885 847 886 848 #if defined(WANT_PAGE_VIRTUAL) 887 - #define page_address(page) ((page)->virtual) 888 - #define set_page_address(page, address) \ 889 - do { \ 890 - (page)->virtual = (address); \ 891 - } while(0) 849 + static inline void *page_address(const struct page *page) 850 + { 851 + return page->virtual; 852 + } 853 + static inline void set_page_address(struct page *page, void *address) 854 + { 855 + page->virtual = address; 856 + } 892 857 #define page_address_init() do { } while(0) 893 858 #endif 894 859 ··· 1025 984 * various contexts. 1026 985 */ 1027 986 #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ 1028 - #define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ 1029 987 1030 988 extern void show_free_areas(unsigned int flags); 1031 989 extern bool skip_free_areas_node(unsigned int flags, int nid); ··· 1358 1318 1359 1319 #if USE_SPLIT_PTE_PTLOCKS 1360 1320 #if ALLOC_SPLIT_PTLOCKS 1321 + void __init ptlock_cache_init(void); 1361 1322 extern bool ptlock_alloc(struct page *page); 1362 1323 extern void ptlock_free(struct page *page); 1363 1324 ··· 1367 1326 return page->ptl; 1368 1327 } 1369 1328 #else /* ALLOC_SPLIT_PTLOCKS */ 1329 + static inline void ptlock_cache_init(void) 1330 + { 1331 + } 1332 + 1370 1333 static inline bool ptlock_alloc(struct page *page) 1371 1334 { 1372 1335 return true; ··· 1423 1378 { 1424 1379 return &mm->page_table_lock; 1425 1380 } 1381 + static inline void ptlock_cache_init(void) {} 1426 1382 static inline bool ptlock_init(struct page *page) { return true; } 1427 1383 static inline void pte_lock_deinit(struct page *page) {} 1428 1384 #endif /* 
USE_SPLIT_PTE_PTLOCKS */ 1385 + 1386 + static inline void pgtable_init(void) 1387 + { 1388 + ptlock_cache_init(); 1389 + pgtable_cache_init(); 1390 + } 1429 1391 1430 1392 static inline bool pgtable_page_ctor(struct page *page) 1431 1393 {

+1

include/linux/mman.h

··· 9 9 10 10 extern int sysctl_overcommit_memory; 11 11 extern int sysctl_overcommit_ratio; 12 + extern unsigned long sysctl_overcommit_kbytes; 12 13 extern struct percpu_counter vm_committed_as; 13 14 14 15 #ifdef CONFIG_SMP

+7 -4

include/linux/mmzone.h

··· 490 490 unsigned long managed_pages; 491 491 492 492 /* 493 + * Number of MIGRATE_RESERVE page block. To maintain for just 494 + * optimization. Protected by zone->lock. 495 + */ 496 + int nr_migrate_reserve_block; 497 + 498 + /* 493 499 * rarely used fields: 494 500 */ 495 501 const char *name; ··· 764 758 int kswapd_max_order; 765 759 enum zone_type classzone_idx; 766 760 #ifdef CONFIG_NUMA_BALANCING 767 - /* 768 - * Lock serializing the per destination node AutoNUMA memory 769 - * migration rate limiting data. 770 - */ 761 + /* Lock serializing the migrate rate limiting window */ 762 spinlock_t numabalancing_migrate_lock; 763 764 /* Rate limiting time interval */

+6 -72

include/linux/posix_acl.h

··· 94 94 extern struct posix_acl *get_posix_acl(struct inode *, int); 95 95 extern int set_posix_acl(struct inode *, int, struct posix_acl *); 96 96 97 - #ifdef CONFIG_FS_POSIX_ACL 98 - static inline struct posix_acl **acl_by_type(struct inode *inode, int type) 99 - { 100 - switch (type) { 101 - case ACL_TYPE_ACCESS: 102 - return &inode->i_acl; 103 - case ACL_TYPE_DEFAULT: 104 - return &inode->i_default_acl; 105 - default: 106 - BUG(); 107 - } 108 - } 109 - 110 - static inline struct posix_acl *get_cached_acl(struct inode *inode, int type) 111 - { 112 - struct posix_acl **p = acl_by_type(inode, type); 113 - struct posix_acl *acl = ACCESS_ONCE(*p); 114 - if (acl) { 115 - spin_lock(&inode->i_lock); 116 - acl = *p; 117 - if (acl != ACL_NOT_CACHED) 118 - acl = posix_acl_dup(acl); 119 - spin_unlock(&inode->i_lock); 120 - } 121 - return acl; 122 - } 123 - 124 - static inline struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) 125 - { 126 - return rcu_dereference(*acl_by_type(inode, type)); 127 - } 128 - 129 - static inline void set_cached_acl(struct inode *inode, 130 - int type, 131 - struct posix_acl *acl) 132 - { 133 - struct posix_acl **p = acl_by_type(inode, type); 134 - struct posix_acl *old; 135 - spin_lock(&inode->i_lock); 136 - old = *p; 137 - rcu_assign_pointer(*p, posix_acl_dup(acl)); 138 - spin_unlock(&inode->i_lock); 139 - if (old != ACL_NOT_CACHED) 140 - posix_acl_release(old); 141 - } 142 - 143 - static inline void forget_cached_acl(struct inode *inode, int type) 144 - { 145 - struct posix_acl **p = acl_by_type(inode, type); 146 - struct posix_acl *old; 147 - spin_lock(&inode->i_lock); 148 - old = *p; 149 - *p = ACL_NOT_CACHED; 150 - spin_unlock(&inode->i_lock); 151 - if (old != ACL_NOT_CACHED) 152 - posix_acl_release(old); 153 - } 154 - 155 - static inline void forget_all_cached_acls(struct inode *inode) 156 - { 157 - struct posix_acl *old_access, *old_default; 158 - spin_lock(&inode->i_lock); 159 - old_access = inode->i_acl; 160 - 
old_default = inode->i_default_acl; 161 - inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; 162 - spin_unlock(&inode->i_lock); 163 - if (old_access != ACL_NOT_CACHED) 164 - posix_acl_release(old_access); 165 - if (old_default != ACL_NOT_CACHED) 166 - posix_acl_release(old_default); 167 - } 168 - #endif 97 + struct posix_acl **acl_by_type(struct inode *inode, int type); 98 + struct posix_acl *get_cached_acl(struct inode *inode, int type); 99 + struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); 100 + void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); 101 + void forget_cached_acl(struct inode *inode, int type); 102 + void forget_all_cached_acls(struct inode *inode); 169 103 170 104 static inline void cache_no_acl(struct inode *inode) 171 105 {

+22 -5

include/linux/rmap.h

··· 184 184 int page_referenced(struct page *, int is_locked, 185 185 struct mem_cgroup *memcg, unsigned long *vm_flags); 186 186 int page_referenced_one(struct page *, struct vm_area_struct *, 187 - unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); 187 + unsigned long address, void *arg); 188 188 189 189 #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) 190 190 191 191 int try_to_unmap(struct page *, enum ttu_flags flags); 192 192 int try_to_unmap_one(struct page *, struct vm_area_struct *, 193 - unsigned long address, enum ttu_flags flags); 193 + unsigned long address, void *arg); 194 194 195 195 /* 196 196 * Called from mm/filemap_xip.c to unmap empty zero page ··· 236 236 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); 237 237 238 238 /* 239 - * Called by migrate.c to remove migration ptes, but might be used more later. 239 + * rmap_walk_control: To control rmap traversing for specific needs 240 + * 241 + * arg: passed to rmap_one() and invalid_vma() 242 + * rmap_one: executed on each vma where page is mapped 243 + * done: for checking traversing termination condition 244 + * file_nonlinear: for handling file nonlinear mapping 245 + * anon_lock: for getting anon_lock by optimized way rather than default 246 + * invalid_vma: for skipping uninterested vma 240 247 */ 241 - int rmap_walk(struct page *page, int (*rmap_one)(struct page *, 242 - struct vm_area_struct *, unsigned long, void *), void *arg); 248 + struct rmap_walk_control { 249 + void *arg; 250 + int (*rmap_one)(struct page *page, struct vm_area_struct *vma, 251 + unsigned long addr, void *arg); 252 + int (*done)(struct page *page); 253 + int (*file_nonlinear)(struct page *, struct address_space *, 254 + struct vm_area_struct *vma); 255 + struct anon_vma *(*anon_lock)(struct page *page); 256 + bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); 257 + }; 258 + 259 + int rmap_walk(struct page *page, struct rmap_walk_control *rwc); 243 260 244 261 #else /* 
!CONFIG_MMU */ 245 262

+12

include/linux/sched.h

··· 549 549 atomic_t sigcnt; 550 550 atomic_t live; 551 551 int nr_threads; 552 + struct list_head thread_head; 552 553 553 554 wait_queue_head_t wait_chldexit; /* for wait4() */ 554 555 ··· 1272 1271 /* PID/PID hash table linkage. */ 1273 1272 struct pid_link pids[PIDTYPE_MAX]; 1274 1273 struct list_head thread_group; 1274 + struct list_head thread_node; 1275 1275 1276 1276 struct completion *vfork_done; /* for vfork() */ 1277 1277 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ ··· 2342 2340 2343 2341 #define while_each_thread(g, t) \ 2344 2342 while ((t = next_thread(t)) != g) 2343 + 2344 + #define __for_each_thread(signal, t) \ 2345 + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) 2346 + 2347 + #define for_each_thread(p, t) \ 2348 + __for_each_thread((p)->signal, t) 2349 + 2350 + /* Careful: this is a double loop, 'break' won't work as expected. */ 2351 + #define for_each_process_thread(p, t) \ 2352 + for_each_process(p) for_each_thread(p, t) 2345 2353 2346 2354 static inline int get_nr_threads(struct task_struct *tsk) 2347 2355 {

+42

include/trace/events/compaction.h

··· 67 67 __entry->nr_failed) 68 68 ); 69 69 70 + TRACE_EVENT(mm_compaction_begin, 71 + TP_PROTO(unsigned long zone_start, unsigned long migrate_start, 72 + unsigned long free_start, unsigned long zone_end), 73 + 74 + TP_ARGS(zone_start, migrate_start, free_start, zone_end), 75 + 76 + TP_STRUCT__entry( 77 + __field(unsigned long, zone_start) 78 + __field(unsigned long, migrate_start) 79 + __field(unsigned long, free_start) 80 + __field(unsigned long, zone_end) 81 + ), 82 + 83 + TP_fast_assign( 84 + __entry->zone_start = zone_start; 85 + __entry->migrate_start = migrate_start; 86 + __entry->free_start = free_start; 87 + __entry->zone_end = zone_end; 88 + ), 89 + 90 + TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu", 91 + __entry->zone_start, 92 + __entry->migrate_start, 93 + __entry->free_start, 94 + __entry->zone_end) 95 + ); 96 + 97 + TRACE_EVENT(mm_compaction_end, 98 + TP_PROTO(int status), 99 + 100 + TP_ARGS(status), 101 + 102 + TP_STRUCT__entry( 103 + __field(int, status) 104 + ), 105 + 106 + TP_fast_assign( 107 + __entry->status = status; 108 + ), 109 + 110 + TP_printk("status=%d", __entry->status) 111 + ); 70 112 71 113 #endif /* _TRACE_COMPACTION_H */ 72 114

+26

include/trace/events/migrate.h

··· 45 45 __print_symbolic(__entry->reason, MIGRATE_REASON)) 46 46 ); 47 47 48 + TRACE_EVENT(mm_numa_migrate_ratelimit, 49 + 50 + TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages), 51 + 52 + TP_ARGS(p, dst_nid, nr_pages), 53 + 54 + TP_STRUCT__entry( 55 + __array( char, comm, TASK_COMM_LEN) 56 + __field( pid_t, pid) 57 + __field( int, dst_nid) 58 + __field( unsigned long, nr_pages) 59 + ), 60 + 61 + TP_fast_assign( 62 + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); 63 + __entry->pid = p->pid; 64 + __entry->dst_nid = dst_nid; 65 + __entry->nr_pages = nr_pages; 66 + ), 67 + 68 + TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu", 69 + __entry->comm, 70 + __entry->pid, 71 + __entry->dst_nid, 72 + __entry->nr_pages) 73 + ); 48 74 #endif /* _TRACE_MIGRATE_H */ 49 75 50 76 /* This part must be outside protection */

+87

include/trace/events/sched.h

··· 443 443 ); 444 444 #endif /* CONFIG_DETECT_HUNG_TASK */ 445 445 446 + DECLARE_EVENT_CLASS(sched_move_task_template, 447 + 448 + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), 449 + 450 + TP_ARGS(tsk, src_cpu, dst_cpu), 451 + 452 + TP_STRUCT__entry( 453 + __field( pid_t, pid ) 454 + __field( pid_t, tgid ) 455 + __field( pid_t, ngid ) 456 + __field( int, src_cpu ) 457 + __field( int, src_nid ) 458 + __field( int, dst_cpu ) 459 + __field( int, dst_nid ) 460 + ), 461 + 462 + TP_fast_assign( 463 + __entry->pid = task_pid_nr(tsk); 464 + __entry->tgid = task_tgid_nr(tsk); 465 + __entry->ngid = task_numa_group_id(tsk); 466 + __entry->src_cpu = src_cpu; 467 + __entry->src_nid = cpu_to_node(src_cpu); 468 + __entry->dst_cpu = dst_cpu; 469 + __entry->dst_nid = cpu_to_node(dst_cpu); 470 + ), 471 + 472 + TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", 473 + __entry->pid, __entry->tgid, __entry->ngid, 474 + __entry->src_cpu, __entry->src_nid, 475 + __entry->dst_cpu, __entry->dst_nid) 476 + ); 477 + 478 + /* 479 + * Tracks migration of tasks from one runqueue to another. 
Can be used to 480 + * detect if automatic NUMA balancing is bouncing between nodes 481 + */ 482 + DEFINE_EVENT(sched_move_task_template, sched_move_numa, 483 + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), 484 + 485 + TP_ARGS(tsk, src_cpu, dst_cpu) 486 + ); 487 + 488 + DEFINE_EVENT(sched_move_task_template, sched_stick_numa, 489 + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), 490 + 491 + TP_ARGS(tsk, src_cpu, dst_cpu) 492 + ); 493 + 494 + TRACE_EVENT(sched_swap_numa, 495 + 496 + TP_PROTO(struct task_struct *src_tsk, int src_cpu, 497 + struct task_struct *dst_tsk, int dst_cpu), 498 + 499 + TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), 500 + 501 + TP_STRUCT__entry( 502 + __field( pid_t, src_pid ) 503 + __field( pid_t, src_tgid ) 504 + __field( pid_t, src_ngid ) 505 + __field( int, src_cpu ) 506 + __field( int, src_nid ) 507 + __field( pid_t, dst_pid ) 508 + __field( pid_t, dst_tgid ) 509 + __field( pid_t, dst_ngid ) 510 + __field( int, dst_cpu ) 511 + __field( int, dst_nid ) 512 + ), 513 + 514 + TP_fast_assign( 515 + __entry->src_pid = task_pid_nr(src_tsk); 516 + __entry->src_tgid = task_tgid_nr(src_tsk); 517 + __entry->src_ngid = task_numa_group_id(src_tsk); 518 + __entry->src_cpu = src_cpu; 519 + __entry->src_nid = cpu_to_node(src_cpu); 520 + __entry->dst_pid = task_pid_nr(dst_tsk); 521 + __entry->dst_tgid = task_tgid_nr(dst_tsk); 522 + __entry->dst_ngid = task_numa_group_id(dst_tsk); 523 + __entry->dst_cpu = dst_cpu; 524 + __entry->dst_nid = cpu_to_node(dst_cpu); 525 + ), 526 + 527 + TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", 528 + __entry->src_pid, __entry->src_tgid, __entry->src_ngid, 529 + __entry->src_cpu, __entry->src_nid, 530 + __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, 531 + __entry->dst_cpu, __entry->dst_nid) 532 + ); 446 533 #endif /* _TRACE_SCHED_H */ 447 534 448 535 /* This part must be outside protection */

+6 -4

init/main.c

··· 355 355 */ 356 356 static void __init setup_command_line(char *command_line) 357 357 { 358 - saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); 359 - initcall_command_line = alloc_bootmem(strlen (boot_command_line)+1); 360 - static_command_line = alloc_bootmem(strlen (command_line)+1); 358 + saved_command_line = 359 + memblock_virt_alloc(strlen(boot_command_line) + 1, 0); 360 + initcall_command_line = 361 + memblock_virt_alloc(strlen(boot_command_line) + 1, 0); 362 + static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); 361 363 strcpy (saved_command_line, boot_command_line); 362 364 strcpy (static_command_line, command_line); 363 365 } ··· 478 476 mem_init(); 479 477 kmem_cache_init(); 480 478 percpu_init_late(); 481 - pgtable_cache_init(); 479 + pgtable_init(); 482 480 vmalloc_init(); 483 481 } 484 482

+5 -15

kernel/audit_tree.c

··· 912 912 } 913 913 914 914 static int audit_tree_handle_event(struct fsnotify_group *group, 915 + struct inode *to_tell, 915 916 struct fsnotify_mark *inode_mark, 916 - struct fsnotify_mark *vfsmonut_mark, 917 - struct fsnotify_event *event) 917 + struct fsnotify_mark *vfsmount_mark, 918 + u32 mask, void *data, int data_type, 919 + const unsigned char *file_name) 918 920 { 919 - BUG(); 920 - return -EOPNOTSUPP; 921 + return 0; 921 922 } 922 923 923 924 static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) ··· 934 933 BUG_ON(atomic_read(&entry->refcnt) < 1); 935 934 } 936 935 937 - static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, 938 - struct fsnotify_mark *inode_mark, 939 - struct fsnotify_mark *vfsmount_mark, 940 - __u32 mask, void *data, int data_type) 941 - { 942 - return false; 943 - } 944 - 945 936 static const struct fsnotify_ops audit_tree_ops = { 946 937 .handle_event = audit_tree_handle_event, 947 - .should_send_event = audit_tree_send_event, 948 - .free_group_priv = NULL, 949 - .free_event_priv = NULL, 950 938 .freeing_mark = audit_tree_freeing_mark, 951 939 }; 952 940

+6 -18

kernel/audit_watch.c

··· 465 465 } 466 466 } 467 467 468 - static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, 469 - struct fsnotify_mark *inode_mark, 470 - struct fsnotify_mark *vfsmount_mark, 471 - __u32 mask, void *data, int data_type) 472 - { 473 - return true; 474 - } 475 - 476 468 /* Update watch data in audit rules based on fsnotify events. */ 477 469 static int audit_watch_handle_event(struct fsnotify_group *group, 470 + struct inode *to_tell, 478 471 struct fsnotify_mark *inode_mark, 479 472 struct fsnotify_mark *vfsmount_mark, 480 - struct fsnotify_event *event) 473 + u32 mask, void *data, int data_type, 474 + const unsigned char *dname) 481 475 { 482 476 struct inode *inode; 483 - __u32 mask = event->mask; 484 - const char *dname = event->file_name; 485 477 struct audit_parent *parent; 486 478 487 479 parent = container_of(inode_mark, struct audit_parent, mark); 488 480 489 481 BUG_ON(group != audit_watch_group); 490 482 491 - switch (event->data_type) { 483 + switch (data_type) { 492 484 case (FSNOTIFY_EVENT_PATH): 493 - inode = event->path.dentry->d_inode; 485 + inode = ((struct path *)data)->dentry->d_inode; 494 486 break; 495 487 case (FSNOTIFY_EVENT_INODE): 496 - inode = event->inode; 488 + inode = (struct inode *)data; 497 489 break; 498 490 default: 499 491 BUG(); ··· 504 512 } 505 513 506 514 static const struct fsnotify_ops audit_watch_fsnotify_ops = { 507 - .should_send_event = audit_watch_should_send_event, 508 515 .handle_event = audit_watch_handle_event, 509 - .free_group_priv = NULL, 510 - .freeing_mark = NULL, 511 - .free_event_priv = NULL, 512 516 }; 513 517 514 518 static int __init audit_watch_init(void)

+1

kernel/exit.c

··· 74 74 __this_cpu_dec(process_counts); 75 75 } 76 76 list_del_rcu(&p->thread_group); 77 + list_del_rcu(&p->thread_node); 77 78 } 78 79 79 80 /*

+7

kernel/fork.c

··· 1035 1035 sig->nr_threads = 1; 1036 1036 atomic_set(&sig->live, 1); 1037 1037 atomic_set(&sig->sigcnt, 1); 1038 + 1039 + /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ 1040 + sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); 1041 + tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); 1042 + 1038 1043 init_waitqueue_head(&sig->wait_chldexit); 1039 1044 sig->curr_target = tsk; 1040 1045 init_sigpending(&sig->shared_pending); ··· 1479 1474 atomic_inc(&current->signal->sigcnt); 1480 1475 list_add_tail_rcu(&p->thread_group, 1481 1476 &p->group_leader->thread_group); 1477 + list_add_tail_rcu(&p->thread_node, 1478 + &p->signal->thread_head); 1482 1479 } 1483 1480 attach_pid(p, PIDTYPE_PID); 1484 1481 nr_threads++;

+1 -1

kernel/power/snapshot.c

··· 637 637 BUG_ON(!region); 638 638 } else 639 639 /* This allocation cannot fail */ 640 - region = alloc_bootmem(sizeof(struct nosave_region)); 640 + region = memblock_virt_alloc(sizeof(struct nosave_region), 0); 641 641 region->start_pfn = start_pfn; 642 642 region->end_pfn = end_pfn; 643 643 list_add_tail(&region->list, &nosave_regions);

+3 -7

kernel/printk/printk.c

··· 757 757 return; 758 758 759 759 if (early) { 760 - unsigned long mem; 761 - 762 - mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 763 - if (!mem) 764 - return; 765 - new_log_buf = __va(mem); 760 + new_log_buf = 761 + memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); 766 762 } else { 767 - new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); 763 + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); 768 764 } 769 765 770 766 if (unlikely(!new_log_buf)) {

+2

kernel/sched/core.c

··· 1108 1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1109 1109 goto out; 1110 1110 1111 + trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1111 1112 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1112 1113 1113 1114 out: ··· 4604 4603 4605 4604 /* TODO: This is not properly updating schedstats */ 4606 4605 4606 + trace_sched_move_numa(p, curr_cpu, target_cpu); 4607 4607 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4608 4608 } 4609 4609

+5 -1

kernel/sched/fair.c

··· 1250 1250 p->numa_scan_period = task_scan_min(p); 1251 1251 1252 1252 if (env.best_task == NULL) { 1253 - int ret = migrate_task_to(p, env.best_cpu); 1253 + ret = migrate_task_to(p, env.best_cpu); 1254 + if (ret != 0) 1255 + trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); 1254 1256 return ret; 1255 1257 } 1256 1258 1257 1259 ret = migrate_swap(p, env.best_task); 1260 + if (ret != 0) 1261 + trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); 1258 1262 put_task_struct(env.best_task); 1259 1263 return ret; 1260 1264 }

+8 -3

kernel/sysctl.c

··· 95 95 #if defined(CONFIG_SYSCTL) 96 96 97 97 /* External variables not in a header file. */ 98 - extern int sysctl_overcommit_memory; 99 - extern int sysctl_overcommit_ratio; 100 98 extern int max_threads; 101 99 extern int suid_dumpable; 102 100 #ifdef CONFIG_COREDUMP ··· 1119 1121 .data = &sysctl_overcommit_ratio, 1120 1122 .maxlen = sizeof(sysctl_overcommit_ratio), 1121 1123 .mode = 0644, 1122 - .proc_handler = proc_dointvec, 1124 + .proc_handler = overcommit_ratio_handler, 1125 + }, 1126 + { 1127 + .procname = "overcommit_kbytes", 1128 + .data = &sysctl_overcommit_kbytes, 1129 + .maxlen = sizeof(sysctl_overcommit_kbytes), 1130 + .mode = 0644, 1131 + .proc_handler = overcommit_kbytes_handler, 1123 1132 }, 1124 1133 { 1125 1134 .procname = "page-cluster",

+10 -2

lib/Kconfig.debug

··· 1584 1584 With this option you will be able to detect common bugs in device 1585 1585 drivers like double-freeing of DMA mappings or freeing mappings that 1586 1586 were never allocated. 1587 - This option causes a performance degredation. Use only if you want 1588 - to debug device drivers. If unsure, say N. 1587 + 1588 + This also attempts to catch cases where a page owned by DMA is 1589 + accessed by the cpu in a way that could cause data corruption. For 1590 + example, this enables cow_user_page() to check that the source page is 1591 + not undergoing DMA. 1592 + 1593 + This option causes a performance degradation. Use only if you want to 1594 + debug device drivers and dma interactions. 1595 + 1596 + If unsure, say N. 1589 1597 1590 1598 source "samples/Kconfig" 1591 1599

+2 -2

lib/cpumask.c

··· 140 140 */ 141 141 void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) 142 142 { 143 - *mask = alloc_bootmem(cpumask_size()); 143 + *mask = memblock_virt_alloc(cpumask_size(), 0); 144 144 } 145 145 146 146 /** ··· 161 161 */ 162 162 void __init free_bootmem_cpumask_var(cpumask_var_t mask) 163 163 { 164 - free_bootmem(__pa(mask), cpumask_size()); 164 + memblock_free_early(__pa(mask), cpumask_size()); 165 165 } 166 166 #endif

+180 -13

lib/dma-debug.c

··· 53 53 54 54 #define DMA_DEBUG_STACKTRACE_ENTRIES 5 55 55 56 + /** 57 + * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping 58 + * @list: node on pre-allocated free_entries list 59 + * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent 60 + * @type: single, page, sg, coherent 61 + * @pfn: page frame of the start address 62 + * @offset: offset of mapping relative to pfn 63 + * @size: length of the mapping 64 + * @direction: enum dma_data_direction 65 + * @sg_call_ents: 'nents' from dma_map_sg 66 + * @sg_mapped_ents: 'mapped_ents' from dma_map_sg 67 + * @map_err_type: track whether dma_mapping_error() was checked 68 + * @stacktrace: support backtraces when a violation is detected 69 + */ 56 70 struct dma_debug_entry { 57 71 struct list_head list; 58 72 struct device *dev; 59 73 int type; 60 - phys_addr_t paddr; 74 + unsigned long pfn; 75 + size_t offset; 61 76 u64 dev_addr; 62 77 u64 size; 63 78 int direction; ··· 387 372 list_del(&entry->list); 388 373 } 389 374 375 + static unsigned long long phys_addr(struct dma_debug_entry *entry) 376 + { 377 + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; 378 + } 379 + 390 380 /* 391 381 * Dump mapping entries for debugging purposes 392 382 */ ··· 409 389 list_for_each_entry(entry, &bucket->list, list) { 410 390 if (!dev || dev == entry->dev) { 411 391 dev_info(entry->dev, 412 - "%s idx %d P=%Lx D=%Lx L=%Lx %s %s\n", 392 + "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", 413 393 type2name[entry->type], idx, 414 - (unsigned long long)entry->paddr, 394 + phys_addr(entry), entry->pfn, 415 395 entry->dev_addr, entry->size, 416 396 dir2name[entry->direction], 417 397 maperr2str[entry->map_err_type]); ··· 424 404 EXPORT_SYMBOL(debug_dma_dump_mappings); 425 405 426 406 /* 407 + * For each page mapped (initial page in the case of 408 + * dma_alloc_coherent/dma_map_{single|page}, or each page in a 409 + * scatterlist) insert into this tree using the pfn as the key. 
At 410 + * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If 411 + * the pfn already exists at insertion time add a tag as a reference 412 + * count for the overlapping mappings. For now, the overlap tracking 413 + * just ensures that 'unmaps' balance 'maps' before marking the pfn 414 + * idle, but we should also be flagging overlaps as an API violation. 415 + * 416 + * Memory usage is mostly constrained by the maximum number of available 417 + * dma-debug entries in that we need a free dma_debug_entry before 418 + * inserting into the tree. In the case of dma_map_{single|page} and 419 + * dma_alloc_coherent there is only one dma_debug_entry and one pfn to 420 + * track per event. dma_map_sg(), on the other hand, 421 + * consumes a single dma_debug_entry, but inserts 'nents' entries into 422 + * the tree. 423 + * 424 + * At any time debug_dma_assert_idle() can be called to trigger a 425 + * warning if the given page is in the active set. 426 + */ 427 + static RADIX_TREE(dma_active_pfn, GFP_NOWAIT); 428 + static DEFINE_SPINLOCK(radix_lock); 429 + #define ACTIVE_PFN_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) 430 + 431 + static int active_pfn_read_overlap(unsigned long pfn) 432 + { 433 + int overlap = 0, i; 434 + 435 + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) 436 + if (radix_tree_tag_get(&dma_active_pfn, pfn, i)) 437 + overlap |= 1 << i; 438 + return overlap; 439 + } 440 + 441 + static int active_pfn_set_overlap(unsigned long pfn, int overlap) 442 + { 443 + int i; 444 + 445 + if (overlap > ACTIVE_PFN_MAX_OVERLAP || overlap < 0) 446 + return 0; 447 + 448 + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) 449 + if (overlap & 1 << i) 450 + radix_tree_tag_set(&dma_active_pfn, pfn, i); 451 + else 452 + radix_tree_tag_clear(&dma_active_pfn, pfn, i); 453 + 454 + return overlap; 455 + } 456 + 457 + static void active_pfn_inc_overlap(unsigned long pfn) 458 + { 459 + int overlap = active_pfn_read_overlap(pfn); 460 + 461 + overlap = 
active_pfn_set_overlap(pfn, ++overlap); 462 + 463 + /* If we overflowed the overlap counter then we're potentially 464 + * leaking dma-mappings. Otherwise, if maps and unmaps are 465 + * balanced then this overflow may cause false negatives in 466 + * debug_dma_assert_idle() as the pfn may be marked idle 467 + * prematurely. 468 + */ 469 + WARN_ONCE(overlap == 0, 470 + "DMA-API: exceeded %d overlapping mappings of pfn %lx\n", 471 + ACTIVE_PFN_MAX_OVERLAP, pfn); 472 + } 473 + 474 + static int active_pfn_dec_overlap(unsigned long pfn) 475 + { 476 + int overlap = active_pfn_read_overlap(pfn); 477 + 478 + return active_pfn_set_overlap(pfn, --overlap); 479 + } 480 + 481 + static int active_pfn_insert(struct dma_debug_entry *entry) 482 + { 483 + unsigned long flags; 484 + int rc; 485 + 486 + spin_lock_irqsave(&radix_lock, flags); 487 + rc = radix_tree_insert(&dma_active_pfn, entry->pfn, entry); 488 + if (rc == -EEXIST) 489 + active_pfn_inc_overlap(entry->pfn); 490 + spin_unlock_irqrestore(&radix_lock, flags); 491 + 492 + return rc; 493 + } 494 + 495 + static void active_pfn_remove(struct dma_debug_entry *entry) 496 + { 497 + unsigned long flags; 498 + 499 + spin_lock_irqsave(&radix_lock, flags); 500 + if (active_pfn_dec_overlap(entry->pfn) == 0) 501 + radix_tree_delete(&dma_active_pfn, entry->pfn); 502 + spin_unlock_irqrestore(&radix_lock, flags); 503 + } 504 + 505 + /** 506 + * debug_dma_assert_idle() - assert that a page is not undergoing dma 507 + * @page: page to lookup in the dma_active_pfn tree 508 + * 509 + * Place a call to this routine in cases where the cpu touching the page 510 + * before the dma completes (page is dma_unmapped) will lead to data 511 + * corruption. 
512 + */ 513 + void debug_dma_assert_idle(struct page *page) 514 + { 515 + unsigned long flags; 516 + struct dma_debug_entry *entry; 517 + 518 + if (!page) 519 + return; 520 + 521 + spin_lock_irqsave(&radix_lock, flags); 522 + entry = radix_tree_lookup(&dma_active_pfn, page_to_pfn(page)); 523 + spin_unlock_irqrestore(&radix_lock, flags); 524 + 525 + if (!entry) 526 + return; 527 + 528 + err_printk(entry->dev, entry, 529 + "DMA-API: cpu touching an active dma mapped page " 530 + "[pfn=0x%lx]\n", entry->pfn); 531 + } 532 + 533 + /* 427 534 * Wrapper function for adding an entry to the hash. 428 535 * This function takes care of locking itself. 429 536 */ ··· 558 411 { 559 412 struct hash_bucket *bucket; 560 413 unsigned long flags; 414 + int rc; 561 415 562 416 bucket = get_hash_bucket(entry, &flags); 563 417 hash_bucket_add(bucket, entry); 564 418 put_hash_bucket(bucket, &flags); 419 + 420 + rc = active_pfn_insert(entry); 421 + if (rc == -ENOMEM) { 422 + pr_err("DMA-API: pfn tracking ENOMEM, dma-debug disabled\n"); 423 + global_disable = true; 424 + } 425 + 426 + /* TODO: report -EEXIST errors here as overlapping mappings are 427 + * not supported by the DMA API 428 + */ 565 429 } 566 430 567 431 static struct dma_debug_entry *__dma_entry_alloc(void) ··· 626 468 static void dma_entry_free(struct dma_debug_entry *entry) 627 469 { 628 470 unsigned long flags; 471 + 472 + active_pfn_remove(entry); 629 473 630 474 /* 631 475 * add to beginning of the list - this way the entries are ··· 1055 895 ref->dev_addr, ref->size, 1056 896 type2name[entry->type], type2name[ref->type]); 1057 897 } else if ((entry->type == dma_debug_coherent) && 1058 - (ref->paddr != entry->paddr)) { 898 + (phys_addr(ref) != phys_addr(entry))) { 1059 899 err_printk(ref->dev, entry, "DMA-API: device driver frees " 1060 900 "DMA memory with different CPU address " 1061 901 "[device address=0x%016llx] [size=%llu bytes] " 1062 902 "[cpu alloc address=0x%016llx] " 1063 903 "[cpu free address=0x%016llx]", 
1064 904 ref->dev_addr, ref->size, 1065 - (unsigned long long)entry->paddr, 1066 - (unsigned long long)ref->paddr); 905 + phys_addr(entry), 906 + phys_addr(ref)); 1067 907 } 1068 908 1069 909 if (ref->sg_call_ents && ref->type == dma_debug_sg && ··· 1212 1052 1213 1053 entry->dev = dev; 1214 1054 entry->type = dma_debug_page; 1215 - entry->paddr = page_to_phys(page) + offset; 1055 + entry->pfn = page_to_pfn(page); 1056 + entry->offset = offset; 1216 1057 entry->dev_addr = dma_addr; 1217 1058 entry->size = size; 1218 1059 entry->direction = direction; ··· 1309 1148 1310 1149 entry->type = dma_debug_sg; 1311 1150 entry->dev = dev; 1312 - entry->paddr = sg_phys(s); 1151 + entry->pfn = page_to_pfn(sg_page(s)); 1152 + entry->offset = s->offset; 1313 1153 entry->size = sg_dma_len(s); 1314 1154 entry->dev_addr = sg_dma_address(s); 1315 1155 entry->direction = direction; ··· 1360 1198 struct dma_debug_entry ref = { 1361 1199 .type = dma_debug_sg, 1362 1200 .dev = dev, 1363 - .paddr = sg_phys(s), 1201 + .pfn = page_to_pfn(sg_page(s)), 1202 + .offset = s->offset, 1364 1203 .dev_addr = sg_dma_address(s), 1365 1204 .size = sg_dma_len(s), 1366 1205 .direction = dir, ··· 1396 1233 1397 1234 entry->type = dma_debug_coherent; 1398 1235 entry->dev = dev; 1399 - entry->paddr = virt_to_phys(virt); 1236 + entry->pfn = page_to_pfn(virt_to_page(virt)); 1237 + entry->offset = (size_t) virt & ~PAGE_MASK; /* byte offset inside the page; phys_addr() adds it to the page base */ 1400 1238 entry->size = size; 1401 1239 entry->dev_addr = dma_addr; 1402 1240 entry->direction = DMA_BIDIRECTIONAL; ··· 1412 1248 struct dma_debug_entry ref = { 1413 1249 .type = dma_debug_coherent, 1414 1250 .dev = dev, 1415 - .paddr = virt_to_phys(virt), 1251 + .pfn = page_to_pfn(virt_to_page(virt)), 1252 + .offset = (size_t) virt & ~PAGE_MASK, /* must mirror the offset computed when the allocation was recorded */ 1416 1253 .dev_addr = addr, 1417 1254 .size = size, 1418 1255 .direction = DMA_BIDIRECTIONAL, ··· 1521 1356 struct dma_debug_entry ref = { 1522 1357 .type = dma_debug_sg, 1523 1358 .dev = dev, 1524 - .paddr = sg_phys(s), 1359 + .pfn = 
page_to_pfn(sg_page(s)), 1360 + .offset = s->offset, 1525 1361 .dev_addr = sg_dma_address(s), 1526 1362 .size = sg_dma_len(s), 1527 1363 .direction = direction, ··· 1554 1388 struct dma_debug_entry ref = { 1555 1389 .type = dma_debug_sg, 1556 1390 .dev = dev, 1557 - .paddr = sg_phys(s), 1391 + .pfn = page_to_pfn(sg_page(s)), 1392 + .offset = s->offset, 1558 1393 .dev_addr = sg_dma_address(s), 1559 1394 .size = sg_dma_len(s), 1560 1395 .direction = direction,

+3 -3

lib/show_mem.c

··· 17 17 printk("Mem-Info:\n"); 18 18 show_free_areas(filter); 19 19 20 - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) 21 - return; 22 - 23 20 for_each_online_pgdat(pgdat) { 24 21 unsigned long flags; 25 22 int zoneid; ··· 42 45 #ifdef CONFIG_QUICKLIST 43 46 printk("%lu pages in pagetable cache\n", 44 47 quicklist_total_size()); 48 + #endif 49 + #ifdef CONFIG_MEMORY_FAILURE 50 + printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); 45 51 #endif 46 52 }

+20 -15

lib/swiotlb.c

··· 172 172 /* 173 173 * Get the overflow emergency buffer 174 174 */ 175 - v_overflow_buffer = alloc_bootmem_low_pages_nopanic( 176 - PAGE_ALIGN(io_tlb_overflow)); 175 + v_overflow_buffer = memblock_virt_alloc_nopanic( 176 + PAGE_ALIGN(io_tlb_overflow), 177 + PAGE_SIZE); 177 178 if (!v_overflow_buffer) 178 179 return -ENOMEM; 179 180 ··· 185 184 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE 186 185 * between io_tlb_start and io_tlb_end. 187 186 */ 188 - io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); 187 + io_tlb_list = memblock_virt_alloc( 188 + PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), 189 + PAGE_SIZE); 189 190 for (i = 0; i < io_tlb_nslabs; i++) 190 191 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); 191 192 io_tlb_index = 0; 192 - io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); 193 + io_tlb_orig_addr = memblock_virt_alloc( 194 + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), 195 + PAGE_SIZE); 193 196 194 197 if (verbose) 195 198 swiotlb_print_info(); ··· 220 215 bytes = io_tlb_nslabs << IO_TLB_SHIFT; 221 216 222 217 /* Get IO TLB memory from the low pages */ 223 - vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes)); 218 + vstart = memblock_virt_alloc_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); 224 219 if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) 225 220 return; 226 221 227 222 if (io_tlb_start) 228 - free_bootmem(io_tlb_start, 229 - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); 223 + memblock_free_early(io_tlb_start, 224 + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); 230 225 pr_warn("Cannot allocate SWIOTLB buffer"); 231 226 no_iotlb_memory = true; 232 227 } ··· 362 357 free_pages((unsigned long)phys_to_virt(io_tlb_start), 363 358 get_order(io_tlb_nslabs << IO_TLB_SHIFT)); 364 359 } else { 365 - free_bootmem_late(io_tlb_overflow_buffer, 366 - PAGE_ALIGN(io_tlb_overflow)); 367 - free_bootmem_late(__pa(io_tlb_orig_addr), 368 - 
PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); 369 - free_bootmem_late(__pa(io_tlb_list), 370 - PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); 371 - free_bootmem_late(io_tlb_start, 372 - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); 360 + memblock_free_late(io_tlb_overflow_buffer, 361 + PAGE_ALIGN(io_tlb_overflow)); 362 + memblock_free_late(__pa(io_tlb_orig_addr), 363 + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); 364 + memblock_free_late(__pa(io_tlb_list), 365 + PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); 366 + memblock_free_late(io_tlb_start, 367 + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); 373 368 } 374 369 io_tlb_nslabs = 0; 375 370 }

+43 -18

mm/compaction.c

··· 459 459 unsigned long flags; 460 460 bool locked = false; 461 461 struct page *page = NULL, *valid_page = NULL; 462 + bool skipped_async_unsuitable = false; 462 463 463 464 /* 464 465 * Ensure that there are not too many pages isolated from the LRU ··· 535 534 if (!cc->sync && last_pageblock_nr != pageblock_nr && 536 535 !migrate_async_suitable(get_pageblock_migratetype(page))) { 537 536 cc->finished_update_migrate = true; 537 + skipped_async_unsuitable = true; 538 538 goto next_pageblock; 539 539 } 540 540 ··· 629 627 if (locked) 630 628 spin_unlock_irqrestore(&zone->lru_lock, flags); 631 629 632 - /* Update the pageblock-skip if the whole pageblock was scanned */ 633 - if (low_pfn == end_pfn) 630 + /* 631 + * Update the pageblock-skip information and cached scanner pfn, 632 + * if the whole pageblock was scanned without isolating any page. 633 + * This is not done when pageblock was skipped due to being unsuitable 634 + * for async compaction, so that eventual sync compaction can try. 635 + */ 636 + if (low_pfn == end_pfn && !skipped_async_unsuitable) 634 637 update_pageblock_skip(cc, valid_page, nr_isolated, true); 635 638 636 639 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); ··· 667 660 * is the end of the pageblock the migration scanner is using. 668 661 */ 669 662 pfn = cc->free_pfn; 670 - low_pfn = cc->migrate_pfn + pageblock_nr_pages; 663 + low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); 671 664 672 665 /* 673 666 * Take care that if the migration scanner is at the end of the zone ··· 683 676 * pages on cc->migratepages. We stop searching if the migrate 684 677 * and free page scanners meet or enough free pages are isolated. 
685 678 */ 686 - for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 679 + for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 687 680 pfn -= pageblock_nr_pages) { 688 681 unsigned long isolated; 689 682 ··· 745 738 /* split_free_page does not map the pages */ 746 739 map_pages(freelist); 747 740 748 - cc->free_pfn = high_pfn; 741 + /* 742 + * If we crossed the migrate scanner, we want to keep it that way 743 + * so that compact_finished() may detect this 744 + */ 745 + if (pfn < low_pfn) 746 + cc->free_pfn = max(pfn, zone->zone_start_pfn); 747 + else 748 + cc->free_pfn = high_pfn; 749 749 cc->nr_freepages = nr_freepages; 750 750 } 751 751 ··· 851 837 852 838 /* Compaction run completes if the migrate and free scanner meet */ 853 839 if (cc->free_pfn <= cc->migrate_pfn) { 840 + /* Let the next compaction start anew. */ 841 + zone->compact_cached_migrate_pfn = zone->zone_start_pfn; 842 + zone->compact_cached_free_pfn = zone_end_pfn(zone); 843 + 854 844 /* 855 845 * Mark that the PG_migrate_skip information should be cleared 856 846 * by kswapd when it goes to sleep. kswapd does not set the ··· 965 947 } 966 948 967 949 /* 950 + * Clear pageblock skip if there were failures recently and compaction 951 + * is about to be retried after being deferred. kswapd does not do 952 + * this reset as it'll reset the cached information when going to sleep. 953 + */ 954 + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) 955 + __reset_isolation_suitable(zone); 956 + 957 + /* 968 958 * Setup to move all movable pages to the end of the zone. Used cached 969 959 * information on where the scanners should start but check that it 970 960 * is initialised by ensuring the values are within zone boundaries. ··· 988 962 zone->compact_cached_migrate_pfn = cc->migrate_pfn; 989 963 } 990 964 991 - /* 992 - * Clear pageblock skip if there were failures recently and compaction 993 - * is about to be retried after being deferred. 
kswapd does not do 994 - * this reset as it'll reset the cached information when going to sleep. 995 - */ 996 - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) 997 - __reset_isolation_suitable(zone); 965 + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); 998 966 999 967 migrate_prep_local(); 1000 968 ··· 1023 1003 if (err) { 1024 1004 putback_movable_pages(&cc->migratepages); 1025 1005 cc->nr_migratepages = 0; 1026 - if (err == -ENOMEM) { 1006 + /* 1007 + * migrate_pages() may return -ENOMEM when scanners meet 1008 + * and we want compact_finished() to detect it 1009 + */ 1010 + if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { 1027 1011 ret = COMPACT_PARTIAL; 1028 1012 goto out; 1029 1013 } ··· 1038 1014 /* Release free pages and check accounting */ 1039 1015 cc->nr_freepages -= release_freepages(&cc->freepages); 1040 1016 VM_BUG_ON(cc->nr_freepages != 0); 1017 + 1018 + trace_mm_compaction_end(ret); 1041 1019 1042 1020 return ret; 1043 1021 } ··· 1146 1120 compact_zone(zone, cc); 1147 1121 1148 1122 if (cc->order > 0) { 1149 - int ok = zone_watermark_ok(zone, cc->order, 1150 - low_wmark_pages(zone), 0, 0); 1151 - if (ok && cc->order >= zone->compact_order_failed) 1152 - zone->compact_order_failed = cc->order + 1; 1123 + if (zone_watermark_ok(zone, cc->order, 1124 + low_wmark_pages(zone), 0, 0)) 1125 + compaction_defer_reset(zone, cc->order, false); 1153 1126 /* Currently async compaction is never deferred. */ 1154 - else if (!ok && cc->sync) 1127 + else if (cc->sync) 1155 1128 defer_compaction(zone, cc->order); 1156 1129 } 1157 1130

+24 -22

mm/hugetlb.c

··· 690 690 */ 691 691 int PageHuge(struct page *page) 692 692 { 693 - compound_page_dtor *dtor; 694 - 695 693 if (!PageCompound(page)) 696 694 return 0; 697 695 698 696 page = compound_head(page); 699 - dtor = get_compound_page_dtor(page); 700 - 701 - return dtor == free_huge_page; 697 + return get_compound_page_dtor(page) == free_huge_page; 702 698 } 703 699 EXPORT_SYMBOL_GPL(PageHuge); 704 700 ··· 704 708 */ 705 709 int PageHeadHuge(struct page *page_head) 706 710 { 707 - compound_page_dtor *dtor; 708 - 709 711 if (!PageHead(page_head)) 710 712 return 0; 711 713 712 - dtor = get_compound_page_dtor(page_head); 713 - 714 - return dtor == free_huge_page; 714 + return get_compound_page_dtor(page_head) == free_huge_page; 715 715 } 716 - EXPORT_SYMBOL_GPL(PageHeadHuge); 717 716 718 717 pgoff_t __basepage_index(struct page *page) 719 718 { ··· 1271 1280 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 1272 1281 void *addr; 1273 1282 1274 - addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 1275 - huge_page_size(h), huge_page_size(h), 0); 1276 - 1283 + addr = memblock_virt_alloc_try_nid_nopanic( 1284 + huge_page_size(h), huge_page_size(h), 1285 + 0, BOOTMEM_ALLOC_ACCESSIBLE, node); 1277 1286 if (addr) { 1278 1287 /* 1279 1288 * Use the beginning of the huge page to store the ··· 1313 1322 1314 1323 #ifdef CONFIG_HIGHMEM 1315 1324 page = pfn_to_page(m->phys >> PAGE_SHIFT); 1316 - free_bootmem_late((unsigned long)m, 1317 - sizeof(struct huge_bootmem_page)); 1325 + memblock_free_late(__pa(m), 1326 + sizeof(struct huge_bootmem_page)); 1318 1327 #else 1319 1328 page = virt_to_page(m); 1320 1329 #endif ··· 2346 2355 int cow; 2347 2356 struct hstate *h = hstate_vma(vma); 2348 2357 unsigned long sz = huge_page_size(h); 2358 + unsigned long mmun_start; /* For mmu_notifiers */ 2359 + unsigned long mmun_end; /* For mmu_notifiers */ 2360 + int ret = 0; 2349 2361 2350 2362 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2363 + 2364 + 
mmun_start = vma->vm_start; 2365 + mmun_end = vma->vm_end; 2366 + if (cow) 2367 + mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); 2351 2368 2352 2369 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2353 2370 spinlock_t *src_ptl, *dst_ptl; ··· 2363 2364 if (!src_pte) 2364 2365 continue; 2365 2366 dst_pte = huge_pte_alloc(dst, addr, sz); 2366 - if (!dst_pte) 2367 - goto nomem; 2367 + if (!dst_pte) { 2368 + ret = -ENOMEM; 2369 + break; 2370 + } 2368 2371 2369 2372 /* If the pagetables are shared don't copy or take references */ 2370 2373 if (dst_pte == src_pte) ··· 2387 2386 spin_unlock(src_ptl); 2388 2387 spin_unlock(dst_ptl); 2389 2388 } 2390 - return 0; 2391 2389 2392 - nomem: 2393 - return -ENOMEM; 2390 + if (cow) 2391 + mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); 2392 + 2393 + return ret; 2394 2394 } 2395 2395 2396 2396 static int is_hugetlb_entry_migration(pte_t pte) ··· 3081 3079 same_page: 3082 3080 if (pages) { 3083 3081 pages[i] = mem_map_offset(page, pfn_offset); 3084 - get_page(pages[i]); 3082 + get_page_foll(pages[i]); 3085 3083 } 3086 3084 3087 3085 if (vmas)

+1 -1

mm/hwpoison-inject.c

··· 55 55 return 0; 56 56 57 57 inject: 58 - printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 58 + pr_info("Injecting memory failure at pfn %#lx\n", pfn); 59 59 return memory_failure(pfn, 18, MF_COUNT_INCREASED); 60 60 } 61 61

+1 -3

mm/internal.h

··· 47 47 * page_cache_get_speculative()) on tail pages. 48 48 */ 49 49 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); 50 - VM_BUG_ON(atomic_read(&page->_count) != 0); 51 - VM_BUG_ON(page_mapcount(page) < 0); 52 50 if (get_page_head) 53 51 atomic_inc(&page->first_page->_count); 54 - atomic_inc(&page->_mapcount); 52 + get_huge_page_tail(page); 55 53 } 56 54 57 55 /*

+15 -108

mm/ksm.c

··· 1891 1891 return new_page; 1892 1892 } 1893 1893 1894 - int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, 1895 - unsigned long *vm_flags) 1896 - { 1897 - struct stable_node *stable_node; 1898 - struct rmap_item *rmap_item; 1899 - unsigned int mapcount = page_mapcount(page); 1900 - int referenced = 0; 1901 - int search_new_forks = 0; 1902 - 1903 - VM_BUG_ON(!PageKsm(page)); 1904 - VM_BUG_ON(!PageLocked(page)); 1905 - 1906 - stable_node = page_stable_node(page); 1907 - if (!stable_node) 1908 - return 0; 1909 - again: 1910 - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { 1911 - struct anon_vma *anon_vma = rmap_item->anon_vma; 1912 - struct anon_vma_chain *vmac; 1913 - struct vm_area_struct *vma; 1914 - 1915 - anon_vma_lock_read(anon_vma); 1916 - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1917 - 0, ULONG_MAX) { 1918 - vma = vmac->vma; 1919 - if (rmap_item->address < vma->vm_start || 1920 - rmap_item->address >= vma->vm_end) 1921 - continue; 1922 - /* 1923 - * Initially we examine only the vma which covers this 1924 - * rmap_item; but later, if there is still work to do, 1925 - * we examine covering vmas in other mms: in case they 1926 - * were forked from the original since ksmd passed. 
1927 - */ 1928 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1929 - continue; 1930 - 1931 - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) 1932 - continue; 1933 - 1934 - referenced += page_referenced_one(page, vma, 1935 - rmap_item->address, &mapcount, vm_flags); 1936 - if (!search_new_forks || !mapcount) 1937 - break; 1938 - } 1939 - anon_vma_unlock_read(anon_vma); 1940 - if (!mapcount) 1941 - goto out; 1942 - } 1943 - if (!search_new_forks++) 1944 - goto again; 1945 - out: 1946 - return referenced; 1947 - } 1948 - 1949 - int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) 1894 + int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) 1950 1895 { 1951 1896 struct stable_node *stable_node; 1952 1897 struct rmap_item *rmap_item; ··· 1899 1954 int search_new_forks = 0; 1900 1955 1901 1956 VM_BUG_ON(!PageKsm(page)); 1902 - VM_BUG_ON(!PageLocked(page)); 1903 1957 1904 - stable_node = page_stable_node(page); 1905 - if (!stable_node) 1906 - return SWAP_FAIL; 1907 - again: 1908 - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { 1909 - struct anon_vma *anon_vma = rmap_item->anon_vma; 1910 - struct anon_vma_chain *vmac; 1911 - struct vm_area_struct *vma; 1912 - 1913 - anon_vma_lock_read(anon_vma); 1914 - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1915 - 0, ULONG_MAX) { 1916 - vma = vmac->vma; 1917 - if (rmap_item->address < vma->vm_start || 1918 - rmap_item->address >= vma->vm_end) 1919 - continue; 1920 - /* 1921 - * Initially we examine only the vma which covers this 1922 - * rmap_item; but later, if there is still work to do, 1923 - * we examine covering vmas in other mms: in case they 1924 - * were forked from the original since ksmd passed. 
1925 - */ 1926 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1927 - continue; 1928 - 1929 - ret = try_to_unmap_one(page, vma, 1930 - rmap_item->address, flags); 1931 - if (ret != SWAP_AGAIN || !page_mapped(page)) { 1932 - anon_vma_unlock_read(anon_vma); 1933 - goto out; 1934 - } 1935 - } 1936 - anon_vma_unlock_read(anon_vma); 1937 - } 1938 - if (!search_new_forks++) 1939 - goto again; 1940 - out: 1941 - return ret; 1942 - } 1943 - 1944 - #ifdef CONFIG_MIGRATION 1945 - int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, 1946 - struct vm_area_struct *, unsigned long, void *), void *arg) 1947 - { 1948 - struct stable_node *stable_node; 1949 - struct rmap_item *rmap_item; 1950 - int ret = SWAP_AGAIN; 1951 - int search_new_forks = 0; 1952 - 1953 - VM_BUG_ON(!PageKsm(page)); 1958 + /* 1959 + * Rely on the page lock to protect against concurrent modifications 1960 + * to that page's node of the stable tree. 1961 + */ 1954 1962 VM_BUG_ON(!PageLocked(page)); 1955 1963 1956 1964 stable_node = page_stable_node(page); ··· 1931 2033 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1932 2034 continue; 1933 2035 1934 - ret = rmap_one(page, vma, rmap_item->address, arg); 2036 + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 2037 + continue; 2038 + 2039 + ret = rwc->rmap_one(page, vma, 2040 + rmap_item->address, rwc->arg); 1935 2041 if (ret != SWAP_AGAIN) { 2042 + anon_vma_unlock_read(anon_vma); 2043 + goto out; 2044 + } 2045 + if (rwc->done && rwc->done(page)) { 1936 2046 anon_vma_unlock_read(anon_vma); 1937 2047 goto out; 1938 2048 } ··· 1953 2047 return ret; 1954 2048 } 1955 2049 2050 + #ifdef CONFIG_MIGRATION 1956 2051 void ksm_migrate_page(struct page *newpage, struct page *oldpage) 1957 2052 { 1958 2053 struct stable_node *stable_node;

+349 -38

mm/memblock.c

··· 21 21 #include <linux/memblock.h> 22 22 23 23 #include <asm-generic/sections.h> 24 + #include <linux/io.h> 25 + 26 + #include "internal.h" 24 27 25 28 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 26 29 static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; ··· 42 39 }; 43 40 44 41 int memblock_debug __initdata_memblock; 42 + #ifdef CONFIG_MOVABLE_NODE 43 + bool movable_node_enabled __initdata_memblock = false; 44 + #endif 45 45 static int memblock_can_resize __initdata_memblock; 46 46 static int memblock_memory_in_slab __initdata_memblock = 0; 47 47 static int memblock_reserved_in_slab __initdata_memblock = 0; ··· 97 91 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 98 92 * @size: size of free area to find 99 93 * @align: alignment of free area to find 100 - * @nid: nid of the free area to find, %MAX_NUMNODES for any node 94 + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 101 95 * 102 96 * Utility called from memblock_find_in_range_node(), find free area bottom-up. 103 97 * ··· 129 123 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 130 124 * @size: size of free area to find 131 125 * @align: alignment of free area to find 132 - * @nid: nid of the free area to find, %MAX_NUMNODES for any node 126 + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 133 127 * 134 128 * Utility called from memblock_find_in_range_node(), find free area top-down. 
135 129 * ··· 160 154 161 155 /** 162 156 * memblock_find_in_range_node - find free area in given range and node 163 - * @start: start of candidate range 164 - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 165 157 * @size: size of free area to find 166 158 * @align: alignment of free area to find 167 - * @nid: nid of the free area to find, %MAX_NUMNODES for any node 159 + * @start: start of candidate range 160 + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 161 + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 168 162 * 169 163 * Find @size free area aligned to @align in the specified range and node. 170 164 * ··· 179 173 * RETURNS: 180 174 * Found address on success, 0 on failure. 181 175 */ 182 - phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 183 - phys_addr_t end, phys_addr_t size, 184 - phys_addr_t align, int nid) 176 + phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, 177 + phys_addr_t align, phys_addr_t start, 178 + phys_addr_t end, int nid) 185 179 { 186 180 int ret; 187 181 phys_addr_t kernel_end; ··· 244 238 phys_addr_t end, phys_addr_t size, 245 239 phys_addr_t align) 246 240 { 247 - return memblock_find_in_range_node(start, end, size, align, 248 - MAX_NUMNODES); 241 + return memblock_find_in_range_node(size, align, start, end, 242 + NUMA_NO_NODE); 249 243 } 250 244 251 245 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) ··· 261 255 type->cnt = 1; 262 256 type->regions[0].base = 0; 263 257 type->regions[0].size = 0; 258 + type->regions[0].flags = 0; 264 259 memblock_set_region_node(&type->regions[0], MAX_NUMNODES); 265 260 } 266 261 } ··· 270 263 phys_addr_t *addr) 271 264 { 272 265 if (memblock.reserved.regions == memblock_reserved_init_regions) 266 + return 0; 267 + 268 + /* 269 + * Don't allow nobootmem allocator to free reserved memory regions 270 + * array if 271 + * - 
CONFIG_DEBUG_FS is enabled; 272 + * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled; 273 + * - reserved memory regions array have been resized during boot. 274 + * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved" 275 + * will show garbage instead of state of memory reservations. 276 + */ 277 + if (IS_ENABLED(CONFIG_DEBUG_FS) && 278 + !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) 273 279 return 0; 274 280 275 281 *addr = __pa(memblock.reserved.regions); ··· 425 405 426 406 if (this->base + this->size != next->base || 427 407 memblock_get_region_node(this) != 428 - memblock_get_region_node(next)) { 408 + memblock_get_region_node(next) || 409 + this->flags != next->flags) { 429 410 BUG_ON(this->base + this->size > next->base); 430 411 i++; 431 412 continue; ··· 446 425 * @base: base address of the new region 447 426 * @size: size of the new region 448 427 * @nid: node id of the new region 428 + * @flags: flags of the new region 449 429 * 450 430 * Insert new memblock region [@base,@base+@size) into @type at @idx. 451 431 * @type must already have extra room to accomodate the new region. 452 432 */ 453 433 static void __init_memblock memblock_insert_region(struct memblock_type *type, 454 434 int idx, phys_addr_t base, 455 - phys_addr_t size, int nid) 435 + phys_addr_t size, 436 + int nid, unsigned long flags) 456 437 { 457 438 struct memblock_region *rgn = &type->regions[idx]; 458 439 ··· 462 439 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); 463 440 rgn->base = base; 464 441 rgn->size = size; 442 + rgn->flags = flags; 465 443 memblock_set_region_node(rgn, nid); 466 444 type->cnt++; 467 445 type->total_size += size; ··· 474 450 * @base: base address of the new region 475 451 * @size: size of the new region 476 452 * @nid: nid of the new region 453 + * @flags: flags of the new region 477 454 * 478 455 * Add new memblock region [@base,@base+@size) into @type. 
The new region 479 456 * is allowed to overlap with existing ones - overlaps don't affect already ··· 485 460 * 0 on success, -errno on failure. 486 461 */ 487 462 static int __init_memblock memblock_add_region(struct memblock_type *type, 488 - phys_addr_t base, phys_addr_t size, int nid) 463 + phys_addr_t base, phys_addr_t size, 464 + int nid, unsigned long flags) 489 465 { 490 466 bool insert = false; 491 467 phys_addr_t obase = base; ··· 501 475 WARN_ON(type->cnt != 1 || type->total_size); 502 476 type->regions[0].base = base; 503 477 type->regions[0].size = size; 478 + type->regions[0].flags = flags; 504 479 memblock_set_region_node(&type->regions[0], nid); 505 480 type->total_size = size; 506 481 return 0; ··· 532 505 nr_new++; 533 506 if (insert) 534 507 memblock_insert_region(type, i++, base, 535 - rbase - base, nid); 508 + rbase - base, nid, 509 + flags); 536 510 } 537 511 /* area below @rend is dealt with, forget about it */ 538 512 base = min(rend, end); ··· 543 515 if (base < end) { 544 516 nr_new++; 545 517 if (insert) 546 - memblock_insert_region(type, i, base, end - base, nid); 518 + memblock_insert_region(type, i, base, end - base, 519 + nid, flags); 547 520 } 548 521 549 522 /* ··· 566 537 int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, 567 538 int nid) 568 539 { 569 - return memblock_add_region(&memblock.memory, base, size, nid); 540 + return memblock_add_region(&memblock.memory, base, size, nid, 0); 570 541 } 571 542 572 543 int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 573 544 { 574 - return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); 545 + return memblock_add_region(&memblock.memory, base, size, 546 + MAX_NUMNODES, 0); 575 547 } 576 548 577 549 /** ··· 627 597 rgn->size -= base - rbase; 628 598 type->total_size -= base - rbase; 629 599 memblock_insert_region(type, i, rbase, base - rbase, 630 - memblock_get_region_node(rgn)); 600 + memblock_get_region_node(rgn), 601 + 
rgn->flags); 631 602 } else if (rend > end) { 632 603 /* 633 604 * @rgn intersects from above. Split and redo the ··· 638 607 rgn->size -= end - rbase; 639 608 type->total_size -= end - rbase; 640 609 memblock_insert_region(type, i--, rbase, end - rbase, 641 - memblock_get_region_node(rgn)); 610 + memblock_get_region_node(rgn), 611 + rgn->flags); 642 612 } else { 643 613 /* @rgn is fully contained, record it */ 644 614 if (!*end_rgn) ··· 675 643 { 676 644 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", 677 645 (unsigned long long)base, 678 - (unsigned long long)base + size, 646 + (unsigned long long)base + size - 1, 679 647 (void *)_RET_IP_); 680 648 681 649 return __memblock_remove(&memblock.reserved, base, size); 682 650 } 683 651 684 - int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 652 + static int __init_memblock memblock_reserve_region(phys_addr_t base, 653 + phys_addr_t size, 654 + int nid, 655 + unsigned long flags) 685 656 { 686 657 struct memblock_type *_rgn = &memblock.reserved; 687 658 688 - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", 659 + memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", 689 660 (unsigned long long)base, 690 - (unsigned long long)base + size, 691 - (void *)_RET_IP_); 661 + (unsigned long long)base + size - 1, 662 + flags, (void *)_RET_IP_); 692 663 693 - return memblock_add_region(_rgn, base, size, MAX_NUMNODES); 664 + return memblock_add_region(_rgn, base, size, nid, flags); 665 + } 666 + 667 + int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 668 + { 669 + return memblock_reserve_region(base, size, MAX_NUMNODES, 0); 670 + } 671 + 672 + /** 673 + * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. 674 + * @base: the base phys addr of the region 675 + * @size: the size of the region 676 + * 677 + * This function isolates region [@base, @base + @size), and mark it with flag 678 + * MEMBLOCK_HOTPLUG. 
679 + * 680 + * Return 0 on success, -errno on failure. 681 + */ 682 + int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) 683 + { 684 + struct memblock_type *type = &memblock.memory; 685 + int i, ret, start_rgn, end_rgn; 686 + 687 + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); 688 + if (ret) 689 + return ret; 690 + 691 + for (i = start_rgn; i < end_rgn; i++) 692 + memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); 693 + 694 + memblock_merge_regions(type); 695 + return 0; 696 + } 697 + 698 + /** 699 + * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. 700 + * @base: the base phys addr of the region 701 + * @size: the size of the region 702 + * 703 + * This function isolates region [@base, @base + @size), and clear flag 704 + * MEMBLOCK_HOTPLUG for the isolated regions. 705 + * 706 + * Return 0 on success, -errno on failure. 707 + */ 708 + int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) 709 + { 710 + struct memblock_type *type = &memblock.memory; 711 + int i, ret, start_rgn, end_rgn; 712 + 713 + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); 714 + if (ret) 715 + return ret; 716 + 717 + for (i = start_rgn; i < end_rgn; i++) 718 + memblock_clear_region_flags(&type->regions[i], 719 + MEMBLOCK_HOTPLUG); 720 + 721 + memblock_merge_regions(type); 722 + return 0; 694 723 } 695 724 696 725 /** 697 726 * __next_free_mem_range - next function for for_each_free_mem_range() 698 727 * @idx: pointer to u64 loop variable 699 - * @nid: node selector, %MAX_NUMNODES for all nodes 728 + * @nid: node selector, %NUMA_NO_NODE for all nodes 700 729 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 701 730 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 702 731 * @out_nid: ptr to int for nid of the range, can be %NULL ··· 786 693 int mi = *idx & 0xffffffff; 787 694 int ri = *idx >> 32; 788 695
696 + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) 697 + nid = NUMA_NO_NODE; 698 + 789 699 for ( ; mi < mem->cnt; mi++) { 790 700 struct memblock_region *m = &mem->regions[mi]; 791 701 phys_addr_t m_start = m->base; 792 702 phys_addr_t m_end = m->base + m->size; 793 703 794 704 /* only memory regions are associated with nodes, check it */ 795 - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 705 + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) 796 706 continue; 797 707 798 708 /* scan areas before each reservation for intersection */ ··· 836 740 /** 837 741 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 838 742 * @idx: pointer to u64 loop variable 839 - * @nid: nid: node selector, %MAX_NUMNODES for all nodes 743 + * @nid: nid: node selector, %NUMA_NO_NODE for all nodes 840 744 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 841 745 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 842 746 * @out_nid: ptr to int for nid of the range, can be %NULL 843 747 * 844 748 * Reverse of __next_free_mem_range(). 749 + * 750 + * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't 751 + * be able to hot-remove hotpluggable memory used by the kernel. So this 752 + * function skip hotpluggable regions if needed when allocating memory for the 753 + * kernel. 845 754 */ 846 755 void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, 847 756 phys_addr_t *out_start, ··· 856 755 struct memblock_type *rsv = &memblock.reserved; 857 756 int mi = *idx & 0xffffffff; 858 757 int ri = *idx >> 32; 758 + 759 + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. 
Use NUMA_NO_NODE instead\n")) 760 + nid = NUMA_NO_NODE; 859 761 860 762 if (*idx == (u64)ULLONG_MAX) { 861 763 mi = mem->cnt - 1; ··· 871 767 phys_addr_t m_end = m->base + m->size; 872 768 873 769 /* only memory regions are associated with nodes, check it */ 874 - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 770 + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) 771 + continue; 772 + 773 + /* skip hotpluggable memory regions if needed */ 774 + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) 875 775 continue; 876 776 877 777 /* scan areas before each reservation for intersection */ ··· 945 837 * memblock_set_node - set node ID on memblock regions 946 838 * @base: base of area to set node ID for 947 839 * @size: size of area to set node ID for 840 + * @type: memblock type to set node ID for 948 841 * @nid: node ID to set 949 842 * 950 - * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. 843 + * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. 951 844 * Regions which cross the area boundaries are split as necessary. 952 845 * 953 846 * RETURNS: 954 847 * 0 on success, -errno on failure. 
955 848 */ 956 849 int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, 957 - int nid) 850 + struct memblock_type *type, int nid) 958 851 { 959 - struct memblock_type *type = &memblock.memory; 960 852 int start_rgn, end_rgn; 961 853 int i, ret; 962 854 ··· 978 870 { 979 871 phys_addr_t found; 980 872 981 - if (WARN_ON(!align)) 982 - align = __alignof__(long long); 873 + if (!align) 874 + align = SMP_CACHE_BYTES; 983 875 984 876 /* align @size to avoid excessive fragmentation on reserved array */ 985 877 size = round_up(size, align); 986 878 987 - found = memblock_find_in_range_node(0, max_addr, size, align, nid); 879 + found = memblock_find_in_range_node(size, align, 0, max_addr, nid); 988 880 if (found && !memblock_reserve(found, size)) 989 881 return found; 990 882 ··· 998 890 999 891 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 1000 892 { 1001 - return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); 893 + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); 1002 894 } 1003 895 1004 896 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) ··· 1028 920 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 1029 921 } 1030 922 923 + /** 924 + * memblock_virt_alloc_internal - allocate boot memory block 925 + * @size: size of memory block to be allocated in bytes 926 + * @align: alignment of the region and block's size 927 + * @min_addr: the lower bound of the memory region to allocate (phys address) 928 + * @max_addr: the upper bound of the memory region to allocate (phys address) 929 + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 930 + * 931 + * The @min_addr limit is dropped if it can not be satisfied and the allocation 932 + * will fall back to memory below @min_addr. 
Also, allocation may fall back 933 + * to any node in the system if the specified node can not 934 + * hold the requested memory. 935 + * 936 + * The allocation is performed from memory region limited by 937 + * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. 938 + * 939 + * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. 940 + * 941 + * The phys address of allocated boot memory block is converted to virtual and 942 + * allocated memory is reset to 0. 943 + * 944 + * In addition, function sets the min_count to 0 using kmemleak_alloc for 945 + * allocated boot memory block, so that it is never reported as leaks. 946 + * 947 + * RETURNS: 948 + * Virtual address of allocated memory block on success, NULL on failure. 949 + */ 950 + static void * __init memblock_virt_alloc_internal( 951 + phys_addr_t size, phys_addr_t align, 952 + phys_addr_t min_addr, phys_addr_t max_addr, 953 + int nid) 954 + { 955 + phys_addr_t alloc; 956 + void *ptr; 957 + 958 + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. 
Use NUMA_NO_NODE instead\n")) 959 + nid = NUMA_NO_NODE; 960 + 961 + /* 962 + * Detect any accidental use of these APIs after slab is ready, as at 963 + * this moment memblock may be deinitialized already and its 964 + * internal data may be destroyed (after execution of free_all_bootmem) 965 + */ 966 + if (WARN_ON_ONCE(slab_is_available())) 967 + return kzalloc_node(size, GFP_NOWAIT, nid); 968 + 969 + if (!align) 970 + align = SMP_CACHE_BYTES; 971 + 972 + /* align @size to avoid excessive fragmentation on reserved array */ 973 + size = round_up(size, align); 974 + 975 + again: 976 + alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, 977 + nid); 978 + if (alloc) 979 + goto done; 980 + 981 + if (nid != NUMA_NO_NODE) { 982 + alloc = memblock_find_in_range_node(size, align, min_addr, 983 + max_addr, NUMA_NO_NODE); 984 + if (alloc) 985 + goto done; 986 + } 987 + 988 + if (min_addr) { 989 + min_addr = 0; 990 + goto again; 991 + } else { 992 + goto error; 993 + } 994 + 995 + done: 996 + memblock_reserve(alloc, size); 997 + ptr = phys_to_virt(alloc); 998 + memset(ptr, 0, size); 999 + 1000 + /* 1001 + * The min_count is set to 0 so that bootmem allocated blocks 1002 + * are never reported as leaks. This is because many of these blocks 1003 + * are only referred via the physical address which is not 1004 + * looked up by kmemleak. 
1005 + */ 1006 + kmemleak_alloc(ptr, size, 0, 0); 1007 + 1008 + return ptr; 1009 + 1010 + error: 1011 + return NULL; 1012 + } 1013 + 1014 + /** 1015 + * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block 1016 + * @size: size of memory block to be allocated in bytes 1017 + * @align: alignment of the region and block's size 1018 + * @min_addr: the lower bound of the memory region from where the allocation 1019 + * is preferred (phys address) 1020 + * @max_addr: the upper bound of the memory region from where the allocation 1021 + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to 1022 + * allocate only from memory limited by memblock.current_limit value 1023 + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 1024 + * 1025 + * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides 1026 + * additional debug information (including caller info), if enabled. 1027 + * 1028 + * RETURNS: 1029 + * Virtual address of allocated memory block on success, NULL on failure. 
1030 + */ 1031 + void * __init memblock_virt_alloc_try_nid_nopanic( 1032 + phys_addr_t size, phys_addr_t align, 1033 + phys_addr_t min_addr, phys_addr_t max_addr, 1034 + int nid) 1035 + { 1036 + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", 1037 + __func__, (u64)size, (u64)align, nid, (u64)min_addr, 1038 + (u64)max_addr, (void *)_RET_IP_); 1039 + return memblock_virt_alloc_internal(size, align, min_addr, 1040 + max_addr, nid); 1041 + } 1042 + 1043 + /** 1044 + * memblock_virt_alloc_try_nid - allocate boot memory block with panicking 1045 + * @size: size of memory block to be allocated in bytes 1046 + * @align: alignment of the region and block's size 1047 + * @min_addr: the lower bound of the memory region from where the allocation 1048 + * is preferred (phys address) 1049 + * @max_addr: the upper bound of the memory region from where the allocation 1050 + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to 1051 + * allocate only from memory limited by memblock.current_limit value 1052 + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node 1053 + * 1054 + * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() 1055 + * which provides debug information (including caller info), if enabled, 1056 + * and panics if the request can not be satisfied. 1057 + * 1058 + * RETURNS: 1059 + * Virtual address of allocated memory block on success, NULL on failure. 
1060 + */ 1061 + void * __init memblock_virt_alloc_try_nid( 1062 + phys_addr_t size, phys_addr_t align, 1063 + phys_addr_t min_addr, phys_addr_t max_addr, 1064 + int nid) 1065 + { 1066 + void *ptr; 1067 + 1068 + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", 1069 + __func__, (u64)size, (u64)align, nid, (u64)min_addr, 1070 + (u64)max_addr, (void *)_RET_IP_); 1071 + ptr = memblock_virt_alloc_internal(size, align, 1072 + min_addr, max_addr, nid); 1073 + if (ptr) 1074 + return ptr; 1075 + 1076 + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", 1077 + __func__, (u64)size, (u64)align, nid, (u64)min_addr, 1078 + (u64)max_addr); 1079 + return NULL; 1080 + } 1081 + 1082 + /** 1083 + * __memblock_free_early - free boot memory block 1084 + * @base: phys starting address of the boot memory block 1085 + * @size: size of the boot memory block in bytes 1086 + * 1087 + * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. 1088 + * The freeing memory will not be released to the buddy allocator. 1089 + */ 1090 + void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) 1091 + { 1092 + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", 1093 + __func__, (u64)base, (u64)base + size - 1, 1094 + (void *)_RET_IP_); 1095 + kmemleak_free_part(__va(base), size); 1096 + __memblock_remove(&memblock.reserved, base, size); 1097 + } 1098 + 1099 + /* 1100 + * __memblock_free_late - free bootmem block pages directly to buddy allocator 1101 + * @addr: phys starting address of the boot memory block 1102 + * @size: size of the boot memory block in bytes 1103 + * 1104 + * This is only useful when the bootmem allocator has already been torn 1105 + * down, but we are still initializing the system. Pages are released directly 1106 + * to the buddy allocator, no bootmem metadata is updated because it is gone. 
1107 + */ 1108 + void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) 1109 + { 1110 + u64 cursor, end; 1111 + 1112 + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", 1113 + __func__, (u64)base, (u64)base + size - 1, 1114 + (void *)_RET_IP_); 1115 + kmemleak_free_part(__va(base), size); 1116 + cursor = PFN_UP(base); 1117 + end = PFN_DOWN(base + size); 1118 + 1119 + for (; cursor < end; cursor++) { 1120 + __free_pages_bootmem(pfn_to_page(cursor), 0); 1121 + totalram_pages++; 1122 + } 1123 + } 1031 1124 1032 1125 /* 1033 1126 * Remaining API functions ··· 1410 1101 static void __init_memblock memblock_dump(struct memblock_type *type, char *name) 1411 1102 { 1412 1103 unsigned long long base, size; 1104 + unsigned long flags; 1413 1105 int i; 1414 1106 1415 1107 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); ··· 1421 1111 1422 1112 base = rgn->base; 1423 1113 size = rgn->size; 1114 + flags = rgn->flags; 1424 1115 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 1425 1116 if (memblock_get_region_node(rgn) != MAX_NUMNODES) 1426 1117 snprintf(nid_buf, sizeof(nid_buf), " on node %d", 1427 1118 memblock_get_region_node(rgn)); 1428 1119 #endif 1429 - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", 1430 - name, i, base, base + size - 1, size, nid_buf); 1120 + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", 1121 + name, i, base, base + size - 1, size, nid_buf, flags); 1431 1122 } 1432 1123 } 1433 1124

+10 -7

mm/memcontrol.c

··· 1688 1688 */ 1689 1689 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1690 1690 { 1691 + /* 1692 + * protects memcg_name and makes sure that parallel ooms do not 1693 + * interleave 1694 + */ 1695 + static DEFINE_SPINLOCK(oom_info_lock); 1691 1696 struct cgroup *task_cgrp; 1692 1697 struct cgroup *mem_cgrp; 1693 - /* 1694 - * Need a buffer in BSS, can't rely on allocations. The code relies 1695 - * on the assumption that OOM is serialized for memory controller. 1696 - * If this assumption is broken, revisit this code. 1697 - */ 1698 1698 static char memcg_name[PATH_MAX]; 1699 1699 int ret; 1700 1700 struct mem_cgroup *iter; ··· 1703 1703 if (!p) 1704 1704 return; 1705 1705 1706 + spin_lock(&oom_info_lock); 1706 1707 rcu_read_lock(); 1707 1708 1708 1709 mem_cgrp = memcg->css.cgroup; ··· 1772 1771 1773 1772 pr_cont("\n"); 1774 1773 } 1774 + spin_unlock(&oom_info_lock); 1775 1775 } 1776 1776 1777 1777 /* ··· 3002 3000 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 3003 3001 { 3004 3002 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 3005 - (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 3003 + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) == 3004 + KMEM_ACCOUNTED_MASK; 3006 3005 } 3007 3006 3008 3007 /* ··· 3129 3126 * But when we create a new cache, we can call this as well if its parent 3130 3127 * is kmem-limited. That will have to hold set_limit_mutex as well. 3131 3128 */ 3132 - int memcg_update_cache_sizes(struct mem_cgroup *memcg) 3129 + static int memcg_update_cache_sizes(struct mem_cgroup *memcg) 3133 3130 { 3134 3131 int num, ret; 3135 3132

+8 -2

mm/memory-failure.c

··· 611 611 } 612 612 613 613 /* 614 - * Dirty cache page page 614 + * Dirty pagecache page 615 615 * Issues: when the error hit a hole page the error is not properly 616 616 * propagated. 617 617 */ ··· 1585 1585 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1586 1586 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1587 1587 if (ret) { 1588 - putback_lru_pages(&pagelist); 1588 + if (!list_empty(&pagelist)) { 1589 + list_del(&page->lru); 1590 + dec_zone_page_state(page, NR_ISOLATED_ANON + 1591 + page_is_file_cache(page)); 1592 + putback_lru_page(page); 1593 + } 1594 + 1589 1595 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1590 1596 pfn, ret, page->flags); 1591 1597 if (ret > 0)

+14 -2

mm/memory.c

··· 59 59 #include <linux/gfp.h> 60 60 #include <linux/migrate.h> 61 61 #include <linux/string.h> 62 + #include <linux/dma-debug.h> 62 63 63 64 #include <asm/io.h> 64 65 #include <asm/pgalloc.h> ··· 2560 2559 2561 2560 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2562 2561 { 2562 + debug_dma_assert_idle(src); 2563 + 2563 2564 /* 2564 2565 * If the source page was a PFN mapping, we don't have 2565 2566 * a "struct page" for it. We do a best-effort copy by ··· 4275 4272 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 4276 4273 4277 4274 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS 4275 + 4276 + static struct kmem_cache *page_ptl_cachep; 4277 + 4278 + void __init ptlock_cache_init(void) 4279 + { 4280 + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, 4281 + SLAB_PANIC, NULL); 4282 + } 4283 + 4278 4284 bool ptlock_alloc(struct page *page) 4279 4285 { 4280 4286 spinlock_t *ptl; 4281 4287 4282 - ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); 4288 + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); 4283 4289 if (!ptl) 4284 4290 return false; 4285 4291 page->ptl = ptl; ··· 4297 4285 4298 4286 void ptlock_free(struct page *page) 4299 4287 { 4300 - kfree(page->ptl); 4288 + kmem_cache_free(page_ptl_cachep, page->ptl); 4301 4289 } 4302 4290 #endif

+2 -2

mm/memory_hotplug.c

··· 9 9 #include <linux/swap.h> 10 10 #include <linux/interrupt.h> 11 11 #include <linux/pagemap.h> 12 - #include <linux/bootmem.h> 13 12 #include <linux/compiler.h> 14 13 #include <linux/export.h> 15 14 #include <linux/pagevec.h> ··· 268 269 } 269 270 270 271 /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 271 - * alloc_bootmem_node_nopanic() */ 272 + * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ 272 273 static int __ref ensure_zone_is_initialized(struct zone *zone, 273 274 unsigned long start_pfn, unsigned long num_pages) 274 275 { ··· 1445 1446 * the kernel away from hotpluggable memory. 1446 1447 */ 1447 1448 memblock_set_bottom_up(true); 1449 + movable_node_enabled = true; 1448 1450 #else 1449 1451 pr_warn("movable_node option not supported\n"); 1450 1452 #endif

+45 -44

mm/migrate.c

··· 72 72 } 73 73 74 74 /* 75 - * Add isolated pages on the list back to the LRU under page lock 76 - * to avoid leaking evictable pages back onto unevictable list. 77 - */ 78 - void putback_lru_pages(struct list_head *l) 79 - { 80 - struct page *page; 81 - struct page *page2; 82 - 83 - list_for_each_entry_safe(page, page2, l, lru) { 84 - list_del(&page->lru); 85 - dec_zone_page_state(page, NR_ISOLATED_ANON + 86 - page_is_file_cache(page)); 87 - putback_lru_page(page); 88 - } 89 - } 90 - 91 - /* 92 75 * Put previously isolated pages back onto the appropriate lists 93 76 * from where they were once taken off for compaction/migration. 94 77 * 95 - * This function shall be used instead of putback_lru_pages(), 96 - * whenever the isolated pageset has been built by isolate_migratepages_range() 78 + * This function shall be used whenever the isolated pageset has been 79 + * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() 80 + * and isolate_huge_page(). 97 81 */ 98 82 void putback_movable_pages(struct list_head *l) 99 83 { ··· 183 199 */ 184 200 static void remove_migration_ptes(struct page *old, struct page *new) 185 201 { 186 - rmap_walk(new, remove_migration_pte, old); 202 + struct rmap_walk_control rwc = { 203 + .rmap_one = remove_migration_pte, 204 + .arg = old, 205 + }; 206 + 207 + rmap_walk(new, &rwc); 187 208 } 188 209 189 210 /* ··· 551 562 /************************************************************ 552 563 * Migration functions 553 564 ***********************************************************/ 554 - 555 - /* Always fail migration. 
Used for mappings that are not movable */ 556 - int fail_migrate_page(struct address_space *mapping, 557 - struct page *newpage, struct page *page) 558 - { 559 - return -EIO; 560 - } 561 - EXPORT_SYMBOL(fail_migrate_page); 562 565 563 566 /* 564 567 * Common logic to directly migrate a single page suitable for ··· 989 1008 { 990 1009 int rc = 0; 991 1010 int *result = NULL; 992 - struct page *new_hpage = get_new_page(hpage, private, &result); 1011 + struct page *new_hpage; 993 1012 struct anon_vma *anon_vma = NULL; 994 1013 995 1014 /* ··· 999 1018 * tables or check whether the hugepage is pmd-based or not before 1000 1019 * kicking migration. 1001 1020 */ 1002 - if (!hugepage_migration_support(page_hstate(hpage))) 1021 + if (!hugepage_migration_support(page_hstate(hpage))) { 1022 + putback_active_hugepage(hpage); 1003 1023 return -ENOSYS; 1024 + } 1004 1025 1026 + new_hpage = get_new_page(hpage, private, &result); 1005 1027 if (!new_hpage) 1006 1028 return -ENOMEM; 1007 1029 ··· 1104 1120 nr_succeeded++; 1105 1121 break; 1106 1122 default: 1107 - /* Permanent failure */ 1123 + /* 1124 + * Permanent failure (-EBUSY, -ENOSYS, etc.): 1125 + * unlike -EAGAIN case, the failed page is 1126 + * removed from migration page list and not 1127 + * retried in the next outer loop. 1128 + */ 1108 1129 nr_failed++; 1109 1130 break; 1110 1131 } ··· 1583 1594 } 1584 1595 1585 1596 /* Returns true if the node is migrate rate-limited after the update */ 1586 - bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) 1597 + static bool numamigrate_update_ratelimit(pg_data_t *pgdat, 1598 + unsigned long nr_pages) 1587 1599 { 1588 - bool rate_limited = false; 1589 - 1590 1600 /* 1591 1601 * Rate-limit the amount of data that is being migrated to a node. 1592 1602 * Optimal placement is no good if the memory bus is saturated and 1593 1603 * all the time is being spent migrating! 
1594 1604 */ 1595 - spin_lock(&pgdat->numabalancing_migrate_lock); 1596 1605 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { 1606 + spin_lock(&pgdat->numabalancing_migrate_lock); 1597 1607 pgdat->numabalancing_migrate_nr_pages = 0; 1598 1608 pgdat->numabalancing_migrate_next_window = jiffies + 1599 1609 msecs_to_jiffies(migrate_interval_millisecs); 1610 + spin_unlock(&pgdat->numabalancing_migrate_lock); 1600 1611 } 1601 - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) 1602 - rate_limited = true; 1603 - else 1604 - pgdat->numabalancing_migrate_nr_pages += nr_pages; 1605 - spin_unlock(&pgdat->numabalancing_migrate_lock); 1606 - 1607 - return rate_limited; 1612 + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { 1613 + trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, 1614 + nr_pages); 1615 + return true; 1616 + } 1617 + 1618 + /* 1619 + * This is an unlocked non-atomic update so errors are possible. 1620 + * The consequences are failing to migrate when we potentially should 1621 + * have which is not severe enough to warrant locking. If it is ever 1622 + * a problem, it can be converted to a per-cpu counter. 1623 + */ 1624 + pgdat->numabalancing_migrate_nr_pages += nr_pages; 1625 + return false; 1608 1626 } 1609 1627 1610 - int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1628 + static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1611 1629 { 1612 1630 int page_lru; 1613 1631 ··· 1701 1705 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1702 1706 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); 1703 1707 if (nr_remaining) { 1704 - putback_lru_pages(&migratepages); 1708 + if (!list_empty(&migratepages)) { 1709 + list_del(&page->lru); 1710 + dec_zone_page_state(page, NR_ISOLATED_ANON + 1711 + page_is_file_cache(page)); 1712 + putback_lru_page(page); 1713 + } 1705 1714 isolated = 0; 1706 1715 } else 1707 1716 count_vm_numa_event(NUMA_PAGE_MIGRATE);

+11 -7

mm/mlock.c

··· 709 709 710 710 lru_add_drain_all(); /* flush pagevec */ 711 711 712 - down_write(&current->mm->mmap_sem); 713 712 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 714 713 start &= PAGE_MASK; 715 714 716 - locked = len >> PAGE_SHIFT; 717 - locked += current->mm->locked_vm; 718 - 719 715 lock_limit = rlimit(RLIMIT_MEMLOCK); 720 716 lock_limit >>= PAGE_SHIFT; 717 + locked = len >> PAGE_SHIFT; 718 + 719 + down_write(&current->mm->mmap_sem); 720 + 721 + locked += current->mm->locked_vm; 721 722 722 723 /* check against resource limits */ 723 724 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 724 725 error = do_mlock(start, len, 1); 726 + 725 727 up_write(&current->mm->mmap_sem); 726 728 if (!error) 727 729 error = __mm_populate(start, len, 0); ··· 734 732 { 735 733 int ret; 736 734 737 - down_write(&current->mm->mmap_sem); 738 735 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 739 736 start &= PAGE_MASK; 737 + 738 + down_write(&current->mm->mmap_sem); 740 739 ret = do_mlock(start, len, 0); 741 740 up_write(&current->mm->mmap_sem); 741 + 742 742 return ret; 743 743 } 744 744 ··· 785 781 if (flags & MCL_CURRENT) 786 782 lru_add_drain_all(); /* flush pagevec */ 787 783 788 - down_write(&current->mm->mmap_sem); 789 - 790 784 lock_limit = rlimit(RLIMIT_MEMLOCK); 791 785 lock_limit >>= PAGE_SHIFT; 792 786 793 787 ret = -ENOMEM; 788 + down_write(&current->mm->mmap_sem); 789 + 794 790 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || 795 791 capable(CAP_IPC_LOCK)) 796 792 ret = do_mlockall(flags);

+24 -22

mm/mmap.c

··· 86 86 87 87 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 88 88 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 89 + unsigned long sysctl_overcommit_kbytes __read_mostly; 89 90 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 90 91 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 91 92 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ ··· 1191 1190 return hint; 1192 1191 } 1193 1192 1193 + static inline int mlock_future_check(struct mm_struct *mm, 1194 + unsigned long flags, 1195 + unsigned long len) 1196 + { 1197 + unsigned long locked, lock_limit; 1198 + 1199 + /* mlock MCL_FUTURE? */ 1200 + if (flags & VM_LOCKED) { 1201 + locked = len >> PAGE_SHIFT; 1202 + locked += mm->locked_vm; 1203 + lock_limit = rlimit(RLIMIT_MEMLOCK); 1204 + lock_limit >>= PAGE_SHIFT; 1205 + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1206 + return -EAGAIN; 1207 + } 1208 + return 0; 1209 + } 1210 + 1194 1211 /* 1195 1212 * The caller must hold down_write(&current->mm->mmap_sem). 1196 1213 */ ··· 1270 1251 if (!can_do_mlock()) 1271 1252 return -EPERM; 1272 1253 1273 - /* mlock MCL_FUTURE? */ 1274 - if (vm_flags & VM_LOCKED) { 1275 - unsigned long locked, lock_limit; 1276 - locked = len >> PAGE_SHIFT; 1277 - locked += mm->locked_vm; 1278 - lock_limit = rlimit(RLIMIT_MEMLOCK); 1279 - lock_limit >>= PAGE_SHIFT; 1280 - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1281 - return -EAGAIN; 1282 - } 1254 + if (mlock_future_check(mm, vm_flags, len)) 1255 + return -EAGAIN; 1283 1256 1284 1257 if (file) { 1285 1258 struct inode *inode = file_inode(file); ··· 2602 2591 if (error & ~PAGE_MASK) 2603 2592 return error; 2604 2593 2605 - /* 2606 - * mlock MCL_FUTURE? 
2607 - */ 2608 - if (mm->def_flags & VM_LOCKED) { 2609 - unsigned long locked, lock_limit; 2610 - locked = len >> PAGE_SHIFT; 2611 - locked += mm->locked_vm; 2612 - lock_limit = rlimit(RLIMIT_MEMLOCK); 2613 - lock_limit >>= PAGE_SHIFT; 2614 - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 2615 - return -EAGAIN; 2616 - } 2594 + error = mlock_future_check(mm, mm->def_flags, len); 2595 + if (error) 2596 + return error; 2617 2597 2618 2598 /* 2619 2599 * mm->mmap_sem is required to protect against another thread

+2 -1

mm/mprotect.c

··· 23 23 #include <linux/mmu_notifier.h> 24 24 #include <linux/migrate.h> 25 25 #include <linux/perf_event.h> 26 + #include <linux/ksm.h> 26 27 #include <asm/uaccess.h> 27 28 #include <asm/pgtable.h> 28 29 #include <asm/cacheflush.h> ··· 64 63 65 64 ptent = *pte; 66 65 page = vm_normal_page(vma, addr, oldpte); 67 - if (page) { 66 + if (page && !PageKsm(page)) { 68 67 if (!pte_numa(oldpte)) { 69 68 ptent = pte_mknuma(ptent); 70 69 set_pte_at(mm, addr, pte, ptent);

+5 -5

mm/nobootmem.c

··· 41 41 if (limit > memblock.current_limit) 42 42 limit = memblock.current_limit; 43 43 44 - addr = memblock_find_in_range_node(goal, limit, size, align, nid); 44 + addr = memblock_find_in_range_node(size, align, goal, limit, nid); 45 45 if (!addr) 46 46 return NULL; 47 47 ··· 117 117 phys_addr_t start, end, size; 118 118 u64 i; 119 119 120 - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 120 + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) 121 121 count += __free_memory_core(start, end); 122 122 123 123 /* free range that is used for reserved array if we allocate it */ ··· 161 161 reset_all_zones_managed_pages(); 162 162 163 163 /* 164 - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 164 + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id 165 165 * because in some case like Node0 doesn't have RAM installed 166 166 * low ram will be on Node1 167 167 */ ··· 215 215 216 216 restart: 217 217 218 - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); 218 + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); 219 219 220 220 if (ptr) 221 221 return ptr; ··· 299 299 if (ptr) 300 300 return ptr; 301 301 302 - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, 302 + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, 303 303 goal, limit); 304 304 if (ptr) 305 305 return ptr;

+1

mm/nommu.c

··· 60 60 struct percpu_counter vm_committed_as; 61 61 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 62 62 int sysctl_overcommit_ratio = 50; /* default is 50% */ 63 + unsigned long sysctl_overcommit_kbytes __read_mostly; 63 64 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 64 65 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 65 66 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */

+29 -22

mm/oom_kill.c

··· 47 47 #ifdef CONFIG_NUMA 48 48 /** 49 49 * has_intersects_mems_allowed() - check task eligibility for kill 50 - * @tsk: task struct of which task to consider 50 + * @start: task struct of which task to consider 51 51 * @mask: nodemask passed to page allocator for mempolicy ooms 52 52 * 53 53 * Task eligibility is determined by whether or not a candidate task, @tsk, 54 54 * shares the same mempolicy nodes as current if it is bound by such a policy 55 55 * and whether or not it has the same set of allowed cpuset nodes. 56 56 */ 57 - static bool has_intersects_mems_allowed(struct task_struct *tsk, 57 + static bool has_intersects_mems_allowed(struct task_struct *start, 58 58 const nodemask_t *mask) 59 59 { 60 - struct task_struct *start = tsk; 60 + struct task_struct *tsk; 61 + bool ret = false; 61 62 62 - do { 63 + rcu_read_lock(); 64 + for_each_thread(start, tsk) { 63 65 if (mask) { 64 66 /* 65 67 * If this is a mempolicy constrained oom, tsk's ··· 69 67 * mempolicy intersects current, otherwise it may be 70 68 * needlessly killed. 71 69 */ 72 - if (mempolicy_nodemask_intersects(tsk, mask)) 73 - return true; 70 + ret = mempolicy_nodemask_intersects(tsk, mask); 74 71 } else { 75 72 /* 76 73 * This is not a mempolicy constrained oom, so only 77 74 * check the mems of tsk's cpuset. 
78 75 */ 79 - if (cpuset_mems_allowed_intersects(current, tsk)) 80 - return true; 76 + ret = cpuset_mems_allowed_intersects(current, tsk); 81 77 } 82 - } while_each_thread(start, tsk); 78 + if (ret) 79 + break; 80 + } 81 + rcu_read_unlock(); 83 82 84 - return false; 83 + return ret; 85 84 } 86 85 #else 87 86 static bool has_intersects_mems_allowed(struct task_struct *tsk, ··· 100 97 */ 101 98 struct task_struct *find_lock_task_mm(struct task_struct *p) 102 99 { 103 - struct task_struct *t = p; 100 + struct task_struct *t; 104 101 105 - do { 102 + rcu_read_lock(); 103 + 104 + for_each_thread(p, t) { 106 105 task_lock(t); 107 106 if (likely(t->mm)) 108 - return t; 107 + goto found; 109 108 task_unlock(t); 110 - } while_each_thread(p, t); 109 + } 110 + t = NULL; 111 + found: 112 + rcu_read_unlock(); 111 113 112 - return NULL; 114 + return t; 113 115 } 114 116 115 117 /* return true if the task is not adequate as candidate victim task. */ ··· 309 301 unsigned long chosen_points = 0; 310 302 311 303 rcu_read_lock(); 312 - do_each_thread(g, p) { 304 + for_each_process_thread(g, p) { 313 305 unsigned int points; 314 306 315 307 switch (oom_scan_process_thread(p, totalpages, nodemask, ··· 331 323 chosen = p; 332 324 chosen_points = points; 333 325 } 334 - } while_each_thread(g, p); 326 + } 335 327 if (chosen) 336 328 get_task_struct(chosen); 337 329 rcu_read_unlock(); ··· 414 406 { 415 407 struct task_struct *victim = p; 416 408 struct task_struct *child; 417 - struct task_struct *t = p; 409 + struct task_struct *t; 418 410 struct mm_struct *mm; 419 411 unsigned int victim_points = 0; 420 412 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, ··· 445 437 * still freeing memory. 
446 438 */ 447 439 read_lock(&tasklist_lock); 448 - do { 440 + for_each_thread(p, t) { 449 441 list_for_each_entry(child, &t->children, sibling) { 450 442 unsigned int child_points; 451 443 ··· 463 455 get_task_struct(victim); 464 456 } 465 457 } 466 - } while_each_thread(p, t); 458 + } 467 459 read_unlock(&tasklist_lock); 468 460 469 - rcu_read_lock(); 470 461 p = find_lock_task_mm(victim); 471 462 if (!p) { 472 - rcu_read_unlock(); 473 463 put_task_struct(victim); 474 464 return; 475 465 } else if (victim != p) { ··· 493 487 * That thread will now get access to memory reserves since it has a 494 488 * pending fatal signal. 495 489 */ 490 + rcu_read_lock(); 496 491 for_each_process(p) 497 492 if (p->mm == mm && !same_thread_group(p, victim) && 498 493 !(p->flags & PF_KTHREAD)) {

+63 -26

mm/page_alloc.c

··· 2072 2072 return; 2073 2073 2074 2074 /* 2075 - * Walking all memory to count page types is very expensive and should 2076 - * be inhibited in non-blockable contexts. 2077 - */ 2078 - if (!(gfp_mask & __GFP_WAIT)) 2079 - filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2080 - 2081 - /* 2082 2075 * This documents exceptions given to allocations in certain 2083 2076 * contexts that are allowed to allocate outside current's set 2084 2077 * of allowed nodes. ··· 2235 2242 preferred_zone, migratetype); 2236 2243 if (page) { 2237 2244 preferred_zone->compact_blockskip_flush = false; 2238 - preferred_zone->compact_considered = 0; 2239 - preferred_zone->compact_defer_shift = 0; 2240 - if (order >= preferred_zone->compact_order_failed) 2241 - preferred_zone->compact_order_failed = order + 1; 2245 + compaction_defer_reset(preferred_zone, order, true); 2242 2246 count_vm_event(COMPACTSUCCESS); 2243 2247 return page; 2244 2248 } ··· 2525 2535 } 2526 2536 2527 2537 /* Atomic allocations - we can't balance anything */ 2528 - if (!wait) 2538 + if (!wait) { 2539 + /* 2540 + * All existing users of the deprecated __GFP_NOFAIL are 2541 + * blockable, so warn of any new users that actually allow this 2542 + * type of allocation to fail. 2543 + */ 2544 + WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); 2529 2545 goto nopage; 2546 + } 2530 2547 2531 2548 /* Avoid recursion of direct reclaim */ 2532 2549 if (current->flags & PF_MEMALLOC) ··· 3898 3901 struct page *page; 3899 3902 unsigned long block_migratetype; 3900 3903 int reserve; 3904 + int old_reserve; 3901 3905 3902 3906 /* 3903 3907 * Get the start pfn, end pfn and the number of blocks to reserve ··· 3920 3922 * future allocation of hugepages at runtime. 
3921 3923 */ 3922 3924 reserve = min(2, reserve); 3925 + old_reserve = zone->nr_migrate_reserve_block; 3926 + 3927 + /* When memory hot-add, we almost always need to do nothing */ 3928 + if (reserve == old_reserve) 3929 + return; 3930 + zone->nr_migrate_reserve_block = reserve; 3923 3931 3924 3932 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3925 3933 if (!pfn_valid(pfn)) ··· 3963 3959 reserve--; 3964 3960 continue; 3965 3961 } 3962 + } else if (!old_reserve) { 3963 + /* 3964 + * At boot time we don't need to scan the whole zone 3965 + * for turning off MIGRATE_RESERVE. 3966 + */ 3967 + break; 3966 3968 } 3967 3969 3968 3970 /* ··· 4219 4209 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4220 4210 { 4221 4211 int i; 4222 - struct pglist_data *pgdat = zone->zone_pgdat; 4223 4212 size_t alloc_size; 4224 4213 4225 4214 /* ··· 4234 4225 4235 4226 if (!slab_is_available()) { 4236 4227 zone->wait_table = (wait_queue_head_t *) 4237 - alloc_bootmem_node_nopanic(pgdat, alloc_size); 4228 + memblock_virt_alloc_node_nopanic( 4229 + alloc_size, zone->zone_pgdat->node_id); 4238 4230 } else { 4239 4231 /* 4240 4232 * This case means that a zone whose size was 0 gets new memory ··· 4355 4345 #endif 4356 4346 4357 4347 /** 4358 - * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4348 + * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range 4359 4349 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4360 - * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4350 + * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid 4361 4351 * 4362 4352 * If an architecture guarantees that all ranges registered with 4363 4353 * add_active_ranges() contain no holes and may be freed, this 4364 - * this function may be used instead of calling free_bootmem() manually. 
4354 + * this function may be used instead of calling memblock_free_early_nid() 4355 + * manually. 4365 4356 */ 4366 4357 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4367 4358 { ··· 4374 4363 end_pfn = min(end_pfn, max_low_pfn); 4375 4364 4376 4365 if (start_pfn < end_pfn) 4377 - free_bootmem_node(NODE_DATA(this_nid), 4378 - PFN_PHYS(start_pfn), 4379 - (end_pfn - start_pfn) << PAGE_SHIFT); 4366 + memblock_free_early_nid(PFN_PHYS(start_pfn), 4367 + (end_pfn - start_pfn) << PAGE_SHIFT, 4368 + this_nid); 4380 4369 } 4381 4370 } 4382 4371 ··· 4647 4636 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4648 4637 zone->pageblock_flags = NULL; 4649 4638 if (usemapsize) 4650 - zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4651 - usemapsize); 4639 + zone->pageblock_flags = 4640 + memblock_virt_alloc_node_nopanic(usemapsize, 4641 + pgdat->node_id); 4652 4642 } 4653 4643 #else 4654 4644 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, ··· 4843 4831 size = (end - start) * sizeof(struct page); 4844 4832 map = alloc_remap(pgdat->node_id, size); 4845 4833 if (!map) 4846 - map = alloc_bootmem_node_nopanic(pgdat, size); 4834 + map = memblock_virt_alloc_node_nopanic(size, 4835 + pgdat->node_id); 4847 4836 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4848 4837 } 4849 4838 #ifndef CONFIG_NEED_MULTIPLE_NODES ··· 5025 5012 nodemask_t saved_node_state = node_states[N_MEMORY]; 5026 5013 unsigned long totalpages = early_calculate_totalpages(); 5027 5014 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5015 + struct memblock_type *type = &memblock.memory; 5016 + 5017 + /* Need to find movable_zone earlier when movable_node is specified. */ 5018 + find_usable_zone_for_movable(); 5028 5019 5029 5020 /* 5030 - * If movablecore was specified, calculate what size of 5021 + * If movable_node is specified, ignore kernelcore and movablecore 5022 + * options. 
5023 + */ 5024 + if (movable_node_is_enabled()) { 5025 + for (i = 0; i < type->cnt; i++) { 5026 + if (!memblock_is_hotpluggable(&type->regions[i])) 5027 + continue; 5028 + 5029 + nid = type->regions[i].nid; 5030 + 5031 + usable_startpfn = PFN_DOWN(type->regions[i].base); 5032 + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 5033 + min(usable_startpfn, zone_movable_pfn[nid]) : 5034 + usable_startpfn; 5035 + } 5036 + 5037 + goto out2; 5038 + } 5039 + 5040 + /* 5041 + * If movablecore=nn[KMG] was specified, calculate what size of 5031 5042 * kernelcore that corresponds so that memory usable for 5032 5043 * any allocation type is evenly spread. If both kernelcore 5033 5044 * and movablecore are specified, then the value of kernelcore ··· 5077 5040 goto out; 5078 5041 5079 5042 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5080 - find_usable_zone_for_movable(); 5081 5043 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5082 5044 5083 5045 restart: ··· 5167 5131 if (usable_nodes && required_kernelcore > usable_nodes) 5168 5132 goto restart; 5169 5133 5134 + out2: 5170 5135 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5171 5136 for (nid = 0; nid < MAX_NUMNODES; nid++) 5172 5137 zone_movable_pfn[nid] = ··· 5894 5857 do { 5895 5858 size = bucketsize << log2qty; 5896 5859 if (flags & HASH_EARLY) 5897 - table = alloc_bootmem_nopanic(size); 5860 + table = memblock_virt_alloc_nopanic(size, 0); 5898 5861 else if (hashdist) 5899 5862 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5900 5863 else {

+3 -2

mm/page_cgroup.c

··· 54 54 55 55 table_size = sizeof(struct page_cgroup) * nr_pages; 56 56 57 - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), 58 - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 57 + base = memblock_virt_alloc_try_nid_nopanic( 58 + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 59 + BOOTMEM_ALLOC_ACCESSIBLE, nid); 59 60 if (!base) 60 61 return -ENOMEM; 61 62 NODE_DATA(nid)->node_page_cgroup = base;

+22 -16

mm/percpu.c

··· 1063 1063 __alignof__(ai->groups[0].cpu_map[0])); 1064 1064 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); 1065 1065 1066 - ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); 1066 + ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); 1067 1067 if (!ptr) 1068 1068 return NULL; 1069 1069 ai = ptr; ··· 1088 1088 */ 1089 1089 void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) 1090 1090 { 1091 - free_bootmem(__pa(ai), ai->__ai_size); 1091 + memblock_free_early(__pa(ai), ai->__ai_size); 1092 1092 } 1093 1093 1094 1094 /** ··· 1246 1246 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); 1247 1247 1248 1248 /* process group information and build config tables accordingly */ 1249 - group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); 1250 - group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); 1251 - unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); 1252 - unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); 1249 + group_offsets = memblock_virt_alloc(ai->nr_groups * 1250 + sizeof(group_offsets[0]), 0); 1251 + group_sizes = memblock_virt_alloc(ai->nr_groups * 1252 + sizeof(group_sizes[0]), 0); 1253 + unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); 1254 + unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); 1253 1255 1254 1256 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1255 1257 unit_map[cpu] = UINT_MAX; ··· 1313 1311 * empty chunks. 1314 1312 */ 1315 1313 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; 1316 - pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); 1314 + pcpu_slot = memblock_virt_alloc( 1315 + pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); 1317 1316 for (i = 0; i < pcpu_nr_slots; i++) 1318 1317 INIT_LIST_HEAD(&pcpu_slot[i]); 1319 1318 ··· 1325 1322 * covers static area + reserved area (mostly used for module 1326 1323 * static percpu allocation). 
1327 1324 */ 1328 - schunk = alloc_bootmem(pcpu_chunk_struct_size); 1325 + schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1329 1326 INIT_LIST_HEAD(&schunk->list); 1330 1327 schunk->base_addr = base_addr; 1331 1328 schunk->map = smap; ··· 1349 1346 1350 1347 /* init dynamic chunk if necessary */ 1351 1348 if (dyn_size) { 1352 - dchunk = alloc_bootmem(pcpu_chunk_struct_size); 1349 + dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1353 1350 INIT_LIST_HEAD(&dchunk->list); 1354 1351 dchunk->base_addr = base_addr; 1355 1352 dchunk->map = dmap; ··· 1629 1626 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; 1630 1627 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); 1631 1628 1632 - areas = alloc_bootmem_nopanic(areas_size); 1629 + areas = memblock_virt_alloc_nopanic(areas_size, 0); 1633 1630 if (!areas) { 1634 1631 rc = -ENOMEM; 1635 1632 goto out_free; ··· 1715 1712 out_free: 1716 1713 pcpu_free_alloc_info(ai); 1717 1714 if (areas) 1718 - free_bootmem(__pa(areas), areas_size); 1715 + memblock_free_early(__pa(areas), areas_size); 1719 1716 return rc; 1720 1717 } 1721 1718 #endif /* BUILD_EMBED_FIRST_CHUNK */ ··· 1763 1760 /* unaligned allocations can't be freed, round up to page size */ 1764 1761 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * 1765 1762 sizeof(pages[0])); 1766 - pages = alloc_bootmem(pages_size); 1763 + pages = memblock_virt_alloc(pages_size, 0); 1767 1764 1768 1765 /* allocate pages */ 1769 1766 j = 0; ··· 1826 1823 free_fn(page_address(pages[j]), PAGE_SIZE); 1827 1824 rc = -ENOMEM; 1828 1825 out_free_ar: 1829 - free_bootmem(__pa(pages), pages_size); 1826 + memblock_free_early(__pa(pages), pages_size); 1830 1827 pcpu_free_alloc_info(ai); 1831 1828 return rc; 1832 1829 } ··· 1851 1848 static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, 1852 1849 size_t align) 1853 1850 { 1854 - return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 1851 + return 
memblock_virt_alloc_from_nopanic( 1852 + size, align, __pa(MAX_DMA_ADDRESS)); 1855 1853 } 1856 1854 1857 1855 static void __init pcpu_dfl_fc_free(void *ptr, size_t size) 1858 1856 { 1859 - free_bootmem(__pa(ptr), size); 1857 + memblock_free_early(__pa(ptr), size); 1860 1858 } 1861 1859 1862 1860 void __init setup_per_cpu_areas(void) ··· 1900 1896 void *fc; 1901 1897 1902 1898 ai = pcpu_alloc_alloc_info(1, 1); 1903 - fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1899 + fc = memblock_virt_alloc_from_nopanic(unit_size, 1900 + PAGE_SIZE, 1901 + __pa(MAX_DMA_ADDRESS)); 1904 1902 if (!ai || !fc) 1905 1903 panic("Failed to allocate memory for percpu areas."); 1906 1904 /* kmemleak tracks the percpu allocations separately */

+256 -324

mm/rmap.c

··· 660 660 return 1; 661 661 } 662 662 663 + struct page_referenced_arg { 664 + int mapcount; 665 + int referenced; 666 + unsigned long vm_flags; 667 + struct mem_cgroup *memcg; 668 + }; 663 669 /* 664 - * Subfunctions of page_referenced: page_referenced_one called 665 - * repeatedly from either page_referenced_anon or page_referenced_file. 670 + * arg: page_referenced_arg will be passed 666 671 */ 667 672 int page_referenced_one(struct page *page, struct vm_area_struct *vma, 668 - unsigned long address, unsigned int *mapcount, 669 - unsigned long *vm_flags) 673 + unsigned long address, void *arg) 670 674 { 671 675 struct mm_struct *mm = vma->vm_mm; 672 676 spinlock_t *ptl; 673 677 int referenced = 0; 678 + struct page_referenced_arg *pra = arg; 674 679 675 680 if (unlikely(PageTransHuge(page))) { 676 681 pmd_t *pmd; ··· 687 682 pmd = page_check_address_pmd(page, mm, address, 688 683 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); 689 684 if (!pmd) 690 - goto out; 685 + return SWAP_AGAIN; 691 686 692 687 if (vma->vm_flags & VM_LOCKED) { 693 688 spin_unlock(ptl); 694 - *mapcount = 0; /* break early from loop */ 695 - *vm_flags |= VM_LOCKED; 696 - goto out; 689 + pra->vm_flags |= VM_LOCKED; 690 + return SWAP_FAIL; /* To break the loop */ 697 691 } 698 692 699 693 /* go ahead even if the pmd is pmd_trans_splitting() */ ··· 708 704 */ 709 705 pte = page_check_address(page, mm, address, &ptl, 0); 710 706 if (!pte) 711 - goto out; 707 + return SWAP_AGAIN; 712 708 713 709 if (vma->vm_flags & VM_LOCKED) { 714 710 pte_unmap_unlock(pte, ptl); 715 - *mapcount = 0; /* break early from loop */ 716 - *vm_flags |= VM_LOCKED; 717 - goto out; 711 + pra->vm_flags |= VM_LOCKED; 712 + return SWAP_FAIL; /* To break the loop */ 718 713 } 719 714 720 715 if (ptep_clear_flush_young_notify(vma, address, pte)) { ··· 730 727 pte_unmap_unlock(pte, ptl); 731 728 } 732 729 733 - (*mapcount)--; 734 - 735 - if (referenced) 736 - *vm_flags |= vma->vm_flags; 737 - out: 738 - return referenced; 739 - } 740 - 
741 - static int page_referenced_anon(struct page *page, 742 - struct mem_cgroup *memcg, 743 - unsigned long *vm_flags) 744 - { 745 - unsigned int mapcount; 746 - struct anon_vma *anon_vma; 747 - pgoff_t pgoff; 748 - struct anon_vma_chain *avc; 749 - int referenced = 0; 750 - 751 - anon_vma = page_lock_anon_vma_read(page); 752 - if (!anon_vma) 753 - return referenced; 754 - 755 - mapcount = page_mapcount(page); 756 - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 757 - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 758 - struct vm_area_struct *vma = avc->vma; 759 - unsigned long address = vma_address(page, vma); 760 - /* 761 - * If we are reclaiming on behalf of a cgroup, skip 762 - * counting on behalf of references from different 763 - * cgroups 764 - */ 765 - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) 766 - continue; 767 - referenced += page_referenced_one(page, vma, address, 768 - &mapcount, vm_flags); 769 - if (!mapcount) 770 - break; 730 + if (referenced) { 731 + pra->referenced++; 732 + pra->vm_flags |= vma->vm_flags; 771 733 } 772 734 773 - page_unlock_anon_vma_read(anon_vma); 774 - return referenced; 735 + pra->mapcount--; 736 + if (!pra->mapcount) 737 + return SWAP_SUCCESS; /* To break the loop */ 738 + 739 + return SWAP_AGAIN; 775 740 } 776 741 777 - /** 778 - * page_referenced_file - referenced check for object-based rmap 779 - * @page: the page we're checking references on. 780 - * @memcg: target memory control group 781 - * @vm_flags: collect encountered vma->vm_flags who actually referenced the page 782 - * 783 - * For an object-based mapped page, find all the places it is mapped and 784 - * check/clear the referenced flag. This is done by following the page->mapping 785 - * pointer, then walking the chain of vmas it holds. It returns the number 786 - * of references it found. 787 - * 788 - * This function is only called from page_referenced for object-based pages. 
789 - */ 790 - static int page_referenced_file(struct page *page, 791 - struct mem_cgroup *memcg, 792 - unsigned long *vm_flags) 742 + static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) 793 743 { 794 - unsigned int mapcount; 795 - struct address_space *mapping = page->mapping; 796 - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 797 - struct vm_area_struct *vma; 798 - int referenced = 0; 744 + struct page_referenced_arg *pra = arg; 745 + struct mem_cgroup *memcg = pra->memcg; 799 746 800 - /* 801 - * The caller's checks on page->mapping and !PageAnon have made 802 - * sure that this is a file page: the check for page->mapping 803 - * excludes the case just before it gets set on an anon page. 804 - */ 805 - BUG_ON(PageAnon(page)); 747 + if (!mm_match_cgroup(vma->vm_mm, memcg)) 748 + return true; 806 749 807 - /* 808 - * The page lock not only makes sure that page->mapping cannot 809 - * suddenly be NULLified by truncation, it makes sure that the 810 - * structure at mapping cannot be freed and reused yet, 811 - * so we can safely take mapping->i_mmap_mutex. 812 - */ 813 - BUG_ON(!PageLocked(page)); 814 - 815 - mutex_lock(&mapping->i_mmap_mutex); 816 - 817 - /* 818 - * i_mmap_mutex does not stabilize mapcount at all, but mapcount 819 - * is more likely to be accurate if we note it after spinning. 
820 - */ 821 - mapcount = page_mapcount(page); 822 - 823 - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 824 - unsigned long address = vma_address(page, vma); 825 - /* 826 - * If we are reclaiming on behalf of a cgroup, skip 827 - * counting on behalf of references from different 828 - * cgroups 829 - */ 830 - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) 831 - continue; 832 - referenced += page_referenced_one(page, vma, address, 833 - &mapcount, vm_flags); 834 - if (!mapcount) 835 - break; 836 - } 837 - 838 - mutex_unlock(&mapping->i_mmap_mutex); 839 - return referenced; 750 + return false; 840 751 } 841 752 842 753 /** ··· 768 851 struct mem_cgroup *memcg, 769 852 unsigned long *vm_flags) 770 853 { 771 - int referenced = 0; 854 + int ret; 772 855 int we_locked = 0; 856 + struct page_referenced_arg pra = { 857 + .mapcount = page_mapcount(page), 858 + .memcg = memcg, 859 + }; 860 + struct rmap_walk_control rwc = { 861 + .rmap_one = page_referenced_one, 862 + .arg = (void *)&pra, 863 + .anon_lock = page_lock_anon_vma_read, 864 + }; 773 865 774 866 *vm_flags = 0; 775 - if (page_mapped(page) && page_rmapping(page)) { 776 - if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 777 - we_locked = trylock_page(page); 778 - if (!we_locked) { 779 - referenced++; 780 - goto out; 781 - } 782 - } 783 - if (unlikely(PageKsm(page))) 784 - referenced += page_referenced_ksm(page, memcg, 785 - vm_flags); 786 - else if (PageAnon(page)) 787 - referenced += page_referenced_anon(page, memcg, 788 - vm_flags); 789 - else if (page->mapping) 790 - referenced += page_referenced_file(page, memcg, 791 - vm_flags); 792 - if (we_locked) 793 - unlock_page(page); 867 + if (!page_mapped(page)) 868 + return 0; 869 + 870 + if (!page_rmapping(page)) 871 + return 0; 872 + 873 + if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 874 + we_locked = trylock_page(page); 875 + if (!we_locked) 876 + return 1; 794 877 } 795 - out: 796 - return referenced; 878 + 879 + /* 880 + * 
If we are reclaiming on behalf of a cgroup, skip 881 + * counting on behalf of references from different 882 + * cgroups 883 + */ 884 + if (memcg) { 885 + rwc.invalid_vma = invalid_page_referenced_vma; 886 + } 887 + 888 + ret = rmap_walk(page, &rwc); 889 + *vm_flags = pra.vm_flags; 890 + 891 + if (we_locked) 892 + unlock_page(page); 893 + 894 + return pra.referenced; 797 895 } 798 896 799 897 static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, 800 - unsigned long address) 898 + unsigned long address, void *arg) 801 899 { 802 900 struct mm_struct *mm = vma->vm_mm; 803 901 pte_t *pte; 804 902 spinlock_t *ptl; 805 903 int ret = 0; 904 + int *cleaned = arg; 806 905 807 906 pte = page_check_address(page, mm, address, &ptl, 1); 808 907 if (!pte) ··· 837 904 838 905 pte_unmap_unlock(pte, ptl); 839 906 840 - if (ret) 907 + if (ret) { 841 908 mmu_notifier_invalidate_page(mm, address); 909 + (*cleaned)++; 910 + } 842 911 out: 843 - return ret; 912 + return SWAP_AGAIN; 844 913 } 845 914 846 - static int page_mkclean_file(struct address_space *mapping, struct page *page) 915 + static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) 847 916 { 848 - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 849 - struct vm_area_struct *vma; 850 - int ret = 0; 917 + if (vma->vm_flags & VM_SHARED) 918 + return 0; 851 919 852 - BUG_ON(PageAnon(page)); 853 - 854 - mutex_lock(&mapping->i_mmap_mutex); 855 - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 856 - if (vma->vm_flags & VM_SHARED) { 857 - unsigned long address = vma_address(page, vma); 858 - ret += page_mkclean_one(page, vma, address); 859 - } 860 - } 861 - mutex_unlock(&mapping->i_mmap_mutex); 862 - return ret; 920 + return 1; 863 921 } 864 922 865 923 int page_mkclean(struct page *page) 866 924 { 867 - int ret = 0; 925 + int cleaned = 0; 926 + struct address_space *mapping; 927 + struct rmap_walk_control rwc = { 928 + .arg = (void *)&cleaned, 929 + .rmap_one = 
page_mkclean_one, 930 + .invalid_vma = invalid_mkclean_vma, 931 + }; 868 932 869 933 BUG_ON(!PageLocked(page)); 870 934 871 - if (page_mapped(page)) { 872 - struct address_space *mapping = page_mapping(page); 873 - if (mapping) 874 - ret = page_mkclean_file(mapping, page); 875 - } 935 + if (!page_mapped(page)) 936 + return 0; 876 937 877 - return ret; 938 + mapping = page_mapping(page); 939 + if (!mapping) 940 + return 0; 941 + 942 + rmap_walk(page, &rwc); 943 + 944 + return cleaned; 878 945 } 879 946 EXPORT_SYMBOL_GPL(page_mkclean); 880 947 ··· 1110 1177 } 1111 1178 1112 1179 /* 1113 - * Subfunctions of try_to_unmap: try_to_unmap_one called 1114 - * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. 1180 + * @arg: enum ttu_flags will be passed to this argument 1115 1181 */ 1116 1182 int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1117 - unsigned long address, enum ttu_flags flags) 1183 + unsigned long address, void *arg) 1118 1184 { 1119 1185 struct mm_struct *mm = vma->vm_mm; 1120 1186 pte_t *pte; 1121 1187 pte_t pteval; 1122 1188 spinlock_t *ptl; 1123 1189 int ret = SWAP_AGAIN; 1190 + enum ttu_flags flags = (enum ttu_flags)arg; 1124 1191 1125 1192 pte = page_check_address(page, mm, address, &ptl, 0); 1126 1193 if (!pte) ··· 1359 1426 return ret; 1360 1427 } 1361 1428 1362 - bool is_vma_temporary_stack(struct vm_area_struct *vma) 1429 + static int try_to_unmap_nonlinear(struct page *page, 1430 + struct address_space *mapping, struct vm_area_struct *vma) 1363 1431 { 1364 - int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1365 - 1366 - if (!maybe_stack) 1367 - return false; 1368 - 1369 - if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == 1370 - VM_STACK_INCOMPLETE_SETUP) 1371 - return true; 1372 - 1373 - return false; 1374 - } 1375 - 1376 - /** 1377 - * try_to_unmap_anon - unmap or unlock anonymous page using the object-based 1378 - * rmap method 1379 - * @page: the page to unmap/unlock 1380 - * @flags: 
action and flags 1381 - * 1382 - * Find all the mappings of a page using the mapping pointer and the vma chains 1383 - * contained in the anon_vma struct it points to. 1384 - * 1385 - * This function is only called from try_to_unmap/try_to_munlock for 1386 - * anonymous pages. 1387 - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1388 - * where the page was found will be held for write. So, we won't recheck 1389 - * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1390 - * 'LOCKED. 1391 - */ 1392 - static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1393 - { 1394 - struct anon_vma *anon_vma; 1395 - pgoff_t pgoff; 1396 - struct anon_vma_chain *avc; 1397 - int ret = SWAP_AGAIN; 1398 - 1399 - anon_vma = page_lock_anon_vma_read(page); 1400 - if (!anon_vma) 1401 - return ret; 1402 - 1403 - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1404 - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1405 - struct vm_area_struct *vma = avc->vma; 1406 - unsigned long address; 1407 - 1408 - /* 1409 - * During exec, a temporary VMA is setup and later moved. 1410 - * The VMA is moved under the anon_vma lock but not the 1411 - * page tables leading to a race where migration cannot 1412 - * find the migration ptes. Rather than increasing the 1413 - * locking requirements of exec(), migration skips 1414 - * temporary VMAs until after exec() completes. 
1415 - */ 1416 - if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && 1417 - is_vma_temporary_stack(vma)) 1418 - continue; 1419 - 1420 - address = vma_address(page, vma); 1421 - ret = try_to_unmap_one(page, vma, address, flags); 1422 - if (ret != SWAP_AGAIN || !page_mapped(page)) 1423 - break; 1424 - } 1425 - 1426 - page_unlock_anon_vma_read(anon_vma); 1427 - return ret; 1428 - } 1429 - 1430 - /** 1431 - * try_to_unmap_file - unmap/unlock file page using the object-based rmap method 1432 - * @page: the page to unmap/unlock 1433 - * @flags: action and flags 1434 - * 1435 - * Find all the mappings of a page using the mapping pointer and the vma chains 1436 - * contained in the address_space struct it points to. 1437 - * 1438 - * This function is only called from try_to_unmap/try_to_munlock for 1439 - * object-based pages. 1440 - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1441 - * where the page was found will be held for write. So, we won't recheck 1442 - * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1443 - * 'LOCKED. 
1444 - */ 1445 - static int try_to_unmap_file(struct page *page, enum ttu_flags flags) 1446 - { 1447 - struct address_space *mapping = page->mapping; 1448 - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1449 - struct vm_area_struct *vma; 1450 1432 int ret = SWAP_AGAIN; 1451 1433 unsigned long cursor; 1452 1434 unsigned long max_nl_cursor = 0; 1453 1435 unsigned long max_nl_size = 0; 1454 1436 unsigned int mapcount; 1455 1437 1456 - if (PageHuge(page)) 1457 - pgoff = page->index << compound_order(page); 1438 + list_for_each_entry(vma, 1439 + &mapping->i_mmap_nonlinear, shared.nonlinear) { 1458 1440 1459 - mutex_lock(&mapping->i_mmap_mutex); 1460 - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1461 - unsigned long address = vma_address(page, vma); 1462 - ret = try_to_unmap_one(page, vma, address, flags); 1463 - if (ret != SWAP_AGAIN || !page_mapped(page)) 1464 - goto out; 1465 - } 1466 - 1467 - if (list_empty(&mapping->i_mmap_nonlinear)) 1468 - goto out; 1469 - 1470 - /* 1471 - * We don't bother to try to find the munlocked page in nonlinears. 1472 - * It's costly. Instead, later, page reclaim logic may call 1473 - * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. 1474 - */ 1475 - if (TTU_ACTION(flags) == TTU_MUNLOCK) 1476 - goto out; 1477 - 1478 - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1479 - shared.nonlinear) { 1480 1441 cursor = (unsigned long) vma->vm_private_data; 1481 1442 if (cursor > max_nl_cursor) 1482 1443 max_nl_cursor = cursor; ··· 1380 1553 } 1381 1554 1382 1555 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? 
*/ 1383 - ret = SWAP_FAIL; 1384 - goto out; 1556 + return SWAP_FAIL; 1385 1557 } 1386 1558 1387 1559 /* ··· 1392 1566 */ 1393 1567 mapcount = page_mapcount(page); 1394 1568 if (!mapcount) 1395 - goto out; 1569 + return ret; 1570 + 1396 1571 cond_resched(); 1397 1572 1398 1573 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; ··· 1401 1574 max_nl_cursor = CLUSTER_SIZE; 1402 1575 1403 1576 do { 1404 - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1405 - shared.nonlinear) { 1577 + list_for_each_entry(vma, 1578 + &mapping->i_mmap_nonlinear, shared.nonlinear) { 1579 + 1406 1580 cursor = (unsigned long) vma->vm_private_data; 1407 - while ( cursor < max_nl_cursor && 1581 + while (cursor < max_nl_cursor && 1408 1582 cursor < vma->vm_end - vma->vm_start) { 1409 1583 if (try_to_unmap_cluster(cursor, &mapcount, 1410 1584 vma, page) == SWAP_MLOCK) ··· 1413 1585 cursor += CLUSTER_SIZE; 1414 1586 vma->vm_private_data = (void *) cursor; 1415 1587 if ((int)mapcount <= 0) 1416 - goto out; 1588 + return ret; 1417 1589 } 1418 1590 vma->vm_private_data = (void *) max_nl_cursor; 1419 1591 } ··· 1428 1600 */ 1429 1601 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) 1430 1602 vma->vm_private_data = NULL; 1431 - out: 1432 - mutex_unlock(&mapping->i_mmap_mutex); 1603 + 1433 1604 return ret; 1434 1605 } 1606 + 1607 + bool is_vma_temporary_stack(struct vm_area_struct *vma) 1608 + { 1609 + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1610 + 1611 + if (!maybe_stack) 1612 + return false; 1613 + 1614 + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == 1615 + VM_STACK_INCOMPLETE_SETUP) 1616 + return true; 1617 + 1618 + return false; 1619 + } 1620 + 1621 + static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) 1622 + { 1623 + return is_vma_temporary_stack(vma); 1624 + } 1625 + 1626 + static int page_not_mapped(struct page *page) 1627 + { 1628 + return !page_mapped(page); 1629 + }; 1435 1630 1436 1631 /** 1437 1632 * 
try_to_unmap - try to remove all page table mappings to a page ··· 1473 1622 int try_to_unmap(struct page *page, enum ttu_flags flags) 1474 1623 { 1475 1624 int ret; 1625 + struct rmap_walk_control rwc = { 1626 + .rmap_one = try_to_unmap_one, 1627 + .arg = (void *)flags, 1628 + .done = page_not_mapped, 1629 + .file_nonlinear = try_to_unmap_nonlinear, 1630 + .anon_lock = page_lock_anon_vma_read, 1631 + }; 1476 1632 1477 - BUG_ON(!PageLocked(page)); 1478 1633 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); 1479 1634 1480 - if (unlikely(PageKsm(page))) 1481 - ret = try_to_unmap_ksm(page, flags); 1482 - else if (PageAnon(page)) 1483 - ret = try_to_unmap_anon(page, flags); 1484 - else 1485 - ret = try_to_unmap_file(page, flags); 1635 + /* 1636 + * During exec, a temporary VMA is setup and later moved. 1637 + * The VMA is moved under the anon_vma lock but not the 1638 + * page tables leading to a race where migration cannot 1639 + * find the migration ptes. Rather than increasing the 1640 + * locking requirements of exec(), migration skips 1641 + * temporary VMAs until after exec() completes. 1642 + */ 1643 + if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) 1644 + rwc.invalid_vma = invalid_migration_vma; 1645 + 1646 + ret = rmap_walk(page, &rwc); 1647 + 1486 1648 if (ret != SWAP_MLOCK && !page_mapped(page)) 1487 1649 ret = SWAP_SUCCESS; 1488 1650 return ret; ··· 1518 1654 */ 1519 1655 int try_to_munlock(struct page *page) 1520 1656 { 1657 + int ret; 1658 + struct rmap_walk_control rwc = { 1659 + .rmap_one = try_to_unmap_one, 1660 + .arg = (void *)TTU_MUNLOCK, 1661 + .done = page_not_mapped, 1662 + /* 1663 + * We don't bother to try to find the munlocked page in 1664 + * nonlinears. It's costly. Instead, later, page reclaim logic 1665 + * may call try_to_unmap() and recover PG_mlocked lazily. 
1666 + */ 1667 + .file_nonlinear = NULL, 1668 + .anon_lock = page_lock_anon_vma_read, 1669 + 1670 + }; 1671 + 1521 1672 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1522 1673 1523 - if (unlikely(PageKsm(page))) 1524 - return try_to_unmap_ksm(page, TTU_MUNLOCK); 1525 - else if (PageAnon(page)) 1526 - return try_to_unmap_anon(page, TTU_MUNLOCK); 1527 - else 1528 - return try_to_unmap_file(page, TTU_MUNLOCK); 1674 + ret = rmap_walk(page, &rwc); 1675 + return ret; 1529 1676 } 1530 1677 1531 1678 void __put_anon_vma(struct anon_vma *anon_vma) ··· 1549 1674 anon_vma_free(anon_vma); 1550 1675 } 1551 1676 1552 - #ifdef CONFIG_MIGRATION 1553 - /* 1554 - * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): 1555 - * Called by migrate.c to remove migration ptes, but might be used more later. 1556 - */ 1557 - static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, 1558 - struct vm_area_struct *, unsigned long, void *), void *arg) 1677 + static struct anon_vma *rmap_walk_anon_lock(struct page *page, 1678 + struct rmap_walk_control *rwc) 1559 1679 { 1560 1680 struct anon_vma *anon_vma; 1561 - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1562 - struct anon_vma_chain *avc; 1563 - int ret = SWAP_AGAIN; 1681 + 1682 + if (rwc->anon_lock) 1683 + return rwc->anon_lock(page); 1564 1684 1565 1685 /* 1566 1686 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() ··· 1565 1695 */ 1566 1696 anon_vma = page_anon_vma(page); 1567 1697 if (!anon_vma) 1568 - return ret; 1698 + return NULL; 1699 + 1569 1700 anon_vma_lock_read(anon_vma); 1701 + return anon_vma; 1702 + } 1703 + 1704 + /* 1705 + * rmap_walk_anon - do something to anonymous page using the object-based 1706 + * rmap method 1707 + * @page: the page to be handled 1708 + * @rwc: control variable according to each walk type 1709 + * 1710 + * Find all the mappings of a page using the mapping pointer and the vma chains 1711 + * contained in the anon_vma struct it points 
to. 1712 + * 1713 + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1714 + * where the page was found will be held for write. So, we won't recheck 1715 + * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1716 + * LOCKED. 1717 + */ 1718 + static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) 1719 + { 1720 + struct anon_vma *anon_vma; 1721 + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1722 + struct anon_vma_chain *avc; 1723 + int ret = SWAP_AGAIN; 1724 + 1725 + anon_vma = rmap_walk_anon_lock(page, rwc); 1726 + if (!anon_vma) 1727 + return ret; 1728 + 1570 1729 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1571 1730 struct vm_area_struct *vma = avc->vma; 1572 1731 unsigned long address = vma_address(page, vma); 1573 - ret = rmap_one(page, vma, address, arg); 1732 + 1733 + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 1734 + continue; 1735 + 1736 + ret = rwc->rmap_one(page, vma, address, rwc->arg); 1574 1737 if (ret != SWAP_AGAIN) 1738 + break; 1739 + if (rwc->done && rwc->done(page)) 1575 1740 break; 1576 1741 } 1577 1742 anon_vma_unlock_read(anon_vma); 1578 1743 return ret; 1579 1744 } 1580 1745 1581 - static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, 1582 - struct vm_area_struct *, unsigned long, void *), void *arg) 1746 + /* 1747 + * rmap_walk_file - do something to file page using the object-based rmap method 1748 + * @page: the page to be handled 1749 + * @rwc: control variable according to each walk type 1750 + * 1751 + * Find all the mappings of a page using the mapping pointer and the vma chains 1752 + * contained in the address_space struct it points to. 1753 + * 1754 + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1755 + * where the page was found will be held for write. So, we won't recheck 1756 + * vm_flags for that VMA. 
That should be OK, because that vma shouldn't be 1757 + * LOCKED. 1758 + */ 1759 + static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) 1583 1760 { 1584 1761 struct address_space *mapping = page->mapping; 1585 - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1762 + pgoff_t pgoff = page->index << compound_order(page); 1586 1763 struct vm_area_struct *vma; 1587 1764 int ret = SWAP_AGAIN; 1765 + 1766 + /* 1767 + * The page lock not only makes sure that page->mapping cannot 1768 + * suddenly be NULLified by truncation, it makes sure that the 1769 + * structure at mapping cannot be freed and reused yet, 1770 + * so we can safely take mapping->i_mmap_mutex. 1771 + */ 1772 + VM_BUG_ON(!PageLocked(page)); 1588 1773 1589 1774 if (!mapping) 1590 1775 return ret; 1591 1776 mutex_lock(&mapping->i_mmap_mutex); 1592 1777 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1593 1778 unsigned long address = vma_address(page, vma); 1594 - ret = rmap_one(page, vma, address, arg); 1779 + 1780 + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 1781 + continue; 1782 + 1783 + ret = rwc->rmap_one(page, vma, address, rwc->arg); 1595 1784 if (ret != SWAP_AGAIN) 1596 - break; 1785 + goto done; 1786 + if (rwc->done && rwc->done(page)) 1787 + goto done; 1597 1788 } 1598 - /* 1599 - * No nonlinear handling: being always shared, nonlinear vmas 1600 - * never contain migration ptes. Decide what to do about this 1601 - * limitation to linear when we need rmap_walk() on nonlinear. 
1602 - */ 1789 + 1790 + if (!rwc->file_nonlinear) 1791 + goto done; 1792 + 1793 + if (list_empty(&mapping->i_mmap_nonlinear)) 1794 + goto done; 1795 + 1796 + ret = rwc->file_nonlinear(page, mapping, vma); 1797 + 1798 + done: 1603 1799 mutex_unlock(&mapping->i_mmap_mutex); 1604 1800 return ret; 1605 1801 } 1606 1802 1607 - int rmap_walk(struct page *page, int (*rmap_one)(struct page *, 1608 - struct vm_area_struct *, unsigned long, void *), void *arg) 1803 + int rmap_walk(struct page *page, struct rmap_walk_control *rwc) 1609 1804 { 1610 - VM_BUG_ON(!PageLocked(page)); 1611 - 1612 1805 if (unlikely(PageKsm(page))) 1613 - return rmap_walk_ksm(page, rmap_one, arg); 1806 + return rmap_walk_ksm(page, rwc); 1614 1807 else if (PageAnon(page)) 1615 - return rmap_walk_anon(page, rmap_one, arg); 1808 + return rmap_walk_anon(page, rwc); 1616 1809 else 1617 - return rmap_walk_file(page, rmap_one, arg); 1810 + return rmap_walk_file(page, rwc); 1618 1811 } 1619 - #endif /* CONFIG_MIGRATION */ 1620 1812 1621 1813 #ifdef CONFIG_HUGETLB_PAGE 1622 1814 /*

+4 -2

mm/sparse-vmemmap.c

··· 40 40 unsigned long align, 41 41 unsigned long goal) 42 42 { 43 - return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); 43 + return memblock_virt_alloc_try_nid(size, align, goal, 44 + BOOTMEM_ALLOC_ACCESSIBLE, node); 44 45 } 45 46 46 47 static void *vmemmap_buf; ··· 227 226 228 227 if (vmemmap_buf_start) { 229 228 /* need to free left buf */ 230 - free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 229 + memblock_free_early(__pa(vmemmap_buf), 230 + vmemmap_buf_end - vmemmap_buf); 231 231 vmemmap_buf = NULL; 232 232 vmemmap_buf_end = NULL; 233 233 }

+15 -12

mm/sparse.c

··· 69 69 else 70 70 section = kzalloc(array_size, GFP_KERNEL); 71 71 } else { 72 - section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 + section = memblock_virt_alloc_node(array_size, nid); 73 73 } 74 74 75 75 return section; ··· 279 279 limit = goal + (1UL << PA_SECTION_SHIFT); 280 280 nid = early_pfn_to_nid(goal >> PAGE_SHIFT); 281 281 again: 282 - p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, 283 - SMP_CACHE_BYTES, goal, limit); 282 + p = memblock_virt_alloc_try_nid_nopanic(size, 283 + SMP_CACHE_BYTES, goal, limit, 284 + nid); 284 285 if (!p && limit) { 285 286 limit = 0; 286 287 goto again; ··· 332 331 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 333 332 unsigned long size) 334 333 { 335 - return alloc_bootmem_node_nopanic(pgdat, size); 334 + return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); 336 335 } 337 336 338 337 static void __init check_usemap_section_nr(int nid, unsigned long *usemap) ··· 377 376 return map; 378 377 379 378 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); 380 - map = __alloc_bootmem_node_high(NODE_DATA(nid), size, 381 - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 379 + map = memblock_virt_alloc_try_nid(size, 380 + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 381 + BOOTMEM_ALLOC_ACCESSIBLE, nid); 382 382 return map; 383 383 } 384 384 void __init sparse_mem_maps_populate_node(struct page **map_map, ··· 403 401 } 404 402 405 403 size = PAGE_ALIGN(size); 406 - map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, 407 - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 404 + map = memblock_virt_alloc_try_nid(size * map_count, 405 + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 406 + BOOTMEM_ALLOC_ACCESSIBLE, nodeid); 408 407 if (map) { 409 408 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 410 409 if (!present_section_nr(pnum)) ··· 548 545 * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
549 546 */ 550 547 size = sizeof(unsigned long *) * NR_MEM_SECTIONS; 551 - usemap_map = alloc_bootmem(size); 548 + usemap_map = memblock_virt_alloc(size, 0); 552 549 if (!usemap_map) 553 550 panic("can not allocate usemap_map\n"); 554 551 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, ··· 556 553 557 554 #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 558 555 size2 = sizeof(struct page *) * NR_MEM_SECTIONS; 559 - map_map = alloc_bootmem(size2); 556 + map_map = memblock_virt_alloc(size2, 0); 560 557 if (!map_map) 561 558 panic("can not allocate map_map\n"); 562 559 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, ··· 586 583 vmemmap_populate_print_last(); 587 584 588 585 #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 589 - free_bootmem(__pa(map_map), size2); 586 + memblock_free_early(__pa(map_map), size2); 590 587 #endif 591 - free_bootmem(__pa(usemap_map), size); 588 + memblock_free_early(__pa(usemap_map), size); 592 589 } 593 590 594 591 #ifdef CONFIG_MEMORY_HOTPLUG

+161 -129

mm/swap.c

··· 31 31 #include <linux/memcontrol.h> 32 32 #include <linux/gfp.h> 33 33 #include <linux/uio.h> 34 - #include <linux/hugetlb.h> 35 34 36 35 #include "internal.h" 37 36 ··· 81 82 82 83 static void put_compound_page(struct page *page) 83 84 { 84 - if (unlikely(PageTail(page))) { 85 - /* __split_huge_page_refcount can run under us */ 86 - struct page *page_head = compound_trans_head(page); 85 + struct page *page_head; 87 86 88 - if (likely(page != page_head && 89 - get_page_unless_zero(page_head))) { 90 - unsigned long flags; 87 + if (likely(!PageTail(page))) { 88 + if (put_page_testzero(page)) { 89 + /* 90 + * By the time all refcounts have been released 91 + * split_huge_page cannot run anymore from under us. 92 + */ 93 + if (PageHead(page)) 94 + __put_compound_page(page); 95 + else 96 + __put_single_page(page); 97 + } 98 + return; 99 + } 91 100 92 - /* 93 - * THP can not break up slab pages so avoid taking 94 - * compound_lock(). Slab performs non-atomic bit ops 95 - * on page->flags for better performance. In particular 96 - * slab_unlock() in slub used to be a hot path. It is 97 - * still hot on arches that do not support 98 - * this_cpu_cmpxchg_double(). 99 - */ 100 - if (PageSlab(page_head) || PageHeadHuge(page_head)) { 101 - if (likely(PageTail(page))) { 102 - /* 103 - * __split_huge_page_refcount 104 - * cannot race here. 105 - */ 106 - VM_BUG_ON(!PageHead(page_head)); 107 - atomic_dec(&page->_mapcount); 108 - if (put_page_testzero(page_head)) 109 - VM_BUG_ON(1); 110 - if (put_page_testzero(page_head)) 111 - __put_compound_page(page_head); 112 - return; 113 - } else 114 - /* 115 - * __split_huge_page_refcount 116 - * run before us, "page" was a 117 - * THP tail. The split 118 - * page_head has been freed 119 - * and reallocated as slab or 120 - * hugetlbfs page of smaller 121 - * order (only possible if 122 - * reallocated as slab on 123 - * x86). 
124 - */ 125 - goto skip_lock; 126 - } 127 - /* 128 - * page_head wasn't a dangling pointer but it 129 - * may not be a head page anymore by the time 130 - * we obtain the lock. That is ok as long as it 131 - * can't be freed from under us. 132 - */ 133 - flags = compound_lock_irqsave(page_head); 134 - if (unlikely(!PageTail(page))) { 135 - /* __split_huge_page_refcount run before us */ 136 - compound_unlock_irqrestore(page_head, flags); 137 - skip_lock: 138 - if (put_page_testzero(page_head)) { 139 - /* 140 - * The head page may have been 141 - * freed and reallocated as a 142 - * compound page of smaller 143 - * order and then freed again. 144 - * All we know is that it 145 - * cannot have become: a THP 146 - * page, a compound page of 147 - * higher order, a tail page. 148 - * That is because we still 149 - * hold the refcount of the 150 - * split THP tail and 151 - * page_head was the THP head 152 - * before the split. 153 - */ 154 - if (PageHead(page_head)) 155 - __put_compound_page(page_head); 156 - else 157 - __put_single_page(page_head); 158 - } 159 - out_put_single: 160 - if (put_page_testzero(page)) 161 - __put_single_page(page); 162 - return; 163 - } 164 - VM_BUG_ON(page_head != page->first_page); 165 - /* 166 - * We can release the refcount taken by 167 - * get_page_unless_zero() now that 168 - * __split_huge_page_refcount() is blocked on 169 - * the compound_lock. 
170 - */ 171 - if (put_page_testzero(page_head)) 172 - VM_BUG_ON(1); 173 - /* __split_huge_page_refcount will wait now */ 174 - VM_BUG_ON(page_mapcount(page) <= 0); 175 - atomic_dec(&page->_mapcount); 176 - VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 177 - VM_BUG_ON(atomic_read(&page->_count) != 0); 178 - compound_unlock_irqrestore(page_head, flags); 101 + /* __split_huge_page_refcount can run under us */ 102 + page_head = compound_trans_head(page); 179 103 104 + /* 105 + * THP can not break up slab pages so avoid taking 106 + * compound_lock() and skip the tail page refcounting (in 107 + * _mapcount) too. Slab performs non-atomic bit ops on 108 + * page->flags for better performance. In particular 109 + * slab_unlock() in slub used to be a hot path. It is still 110 + * hot on arches that do not support 111 + * this_cpu_cmpxchg_double(). 112 + * 113 + * If "page" is part of a slab or hugetlbfs page it cannot be 114 + * splitted and the head page cannot change from under us. And 115 + * if "page" is part of a THP page under splitting, if the 116 + * head page pointed by the THP tail isn't a THP head anymore, 117 + * we'll find PageTail clear after smp_rmb() and we'll treat 118 + * it as a single page. 119 + */ 120 + if (!__compound_tail_refcounted(page_head)) { 121 + /* 122 + * If "page" is a THP tail, we must read the tail page 123 + * flags after the head page flags. The 124 + * split_huge_page side enforces write memory barriers 125 + * between clearing PageTail and before the head page 126 + * can be freed and reallocated. 127 + */ 128 + smp_rmb(); 129 + if (likely(PageTail(page))) { 130 + /* 131 + * __split_huge_page_refcount cannot race 132 + * here. 
133 + */ 134 + VM_BUG_ON(!PageHead(page_head)); 135 + VM_BUG_ON(page_mapcount(page) != 0); 180 136 if (put_page_testzero(page_head)) { 137 + /* 138 + * If this is the tail of a slab 139 + * compound page, the tail pin must 140 + * not be the last reference held on 141 + * the page, because the PG_slab 142 + * cannot be cleared before all tail 143 + * pins (which skips the _mapcount 144 + * tail refcounting) have been 145 + * released. For hugetlbfs the tail 146 + * pin may be the last reference on 147 + * the page instead, because 148 + * PageHeadHuge will not go away until 149 + * the compound page enters the buddy 150 + * allocator. 151 + */ 152 + VM_BUG_ON(PageSlab(page_head)); 153 + __put_compound_page(page_head); 154 + } 155 + return; 156 + } else 157 + /* 158 + * __split_huge_page_refcount run before us, 159 + * "page" was a THP tail. The split page_head 160 + * has been freed and reallocated as slab or 161 + * hugetlbfs page of smaller order (only 162 + * possible if reallocated as slab on x86). 163 + */ 164 + goto out_put_single; 165 + } 166 + 167 + if (likely(page != page_head && get_page_unless_zero(page_head))) { 168 + unsigned long flags; 169 + 170 + /* 171 + * page_head wasn't a dangling pointer but it may not 172 + * be a head page anymore by the time we obtain the 173 + * lock. That is ok as long as it can't be freed from 174 + * under us. 175 + */ 176 + flags = compound_lock_irqsave(page_head); 177 + if (unlikely(!PageTail(page))) { 178 + /* __split_huge_page_refcount run before us */ 179 + compound_unlock_irqrestore(page_head, flags); 180 + if (put_page_testzero(page_head)) { 181 + /* 182 + * The head page may have been freed 183 + * and reallocated as a compound page 184 + * of smaller order and then freed 185 + * again. All we know is that it 186 + * cannot have become: a THP page, a 187 + * compound page of higher order, a 188 + * tail page. 
That is because we 189 + * still hold the refcount of the 190 + * split THP tail and page_head was 191 + * the THP head before the split. 192 + */ 181 193 if (PageHead(page_head)) 182 194 __put_compound_page(page_head); 183 195 else 184 196 __put_single_page(page_head); 185 197 } 186 - } else { 187 - /* page_head is a dangling pointer */ 188 - VM_BUG_ON(PageTail(page)); 189 - goto out_put_single; 198 + out_put_single: 199 + if (put_page_testzero(page)) 200 + __put_single_page(page); 201 + return; 190 202 } 191 - } else if (put_page_testzero(page)) { 192 - if (PageHead(page)) 193 - __put_compound_page(page); 194 - else 195 - __put_single_page(page); 203 + VM_BUG_ON(page_head != page->first_page); 204 + /* 205 + * We can release the refcount taken by 206 + * get_page_unless_zero() now that 207 + * __split_huge_page_refcount() is blocked on the 208 + * compound_lock. 209 + */ 210 + if (put_page_testzero(page_head)) 211 + VM_BUG_ON(1); 212 + /* __split_huge_page_refcount will wait now */ 213 + VM_BUG_ON(page_mapcount(page) <= 0); 214 + atomic_dec(&page->_mapcount); 215 + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 216 + VM_BUG_ON(atomic_read(&page->_count) != 0); 217 + compound_unlock_irqrestore(page_head, flags); 218 + 219 + if (put_page_testzero(page_head)) { 220 + if (PageHead(page_head)) 221 + __put_compound_page(page_head); 222 + else 223 + __put_single_page(page_head); 224 + } 225 + } else { 226 + /* page_head is a dangling pointer */ 227 + VM_BUG_ON(PageTail(page)); 228 + goto out_put_single; 196 229 } 197 230 } 198 231 ··· 252 221 * split_huge_page(). 253 222 */ 254 223 unsigned long flags; 255 - bool got = false; 224 + bool got; 256 225 struct page *page_head = compound_trans_head(page); 257 226 258 - if (likely(page != page_head && get_page_unless_zero(page_head))) { 259 - /* Ref to put_compound_page() comment. 
*/ 260 - if (PageSlab(page_head) || PageHeadHuge(page_head)) { 261 - if (likely(PageTail(page))) { 262 - /* 263 - * This is a hugetlbfs page or a slab 264 - * page. __split_huge_page_refcount 265 - * cannot race here. 266 - */ 267 - VM_BUG_ON(!PageHead(page_head)); 268 - __get_page_tail_foll(page, false); 269 - return true; 270 - } else { 271 - /* 272 - * __split_huge_page_refcount run 273 - * before us, "page" was a THP 274 - * tail. The split page_head has been 275 - * freed and reallocated as slab or 276 - * hugetlbfs page of smaller order 277 - * (only possible if reallocated as 278 - * slab on x86). 279 - */ 280 - put_page(page_head); 281 - return false; 282 - } 227 + /* Ref to put_compound_page() comment. */ 228 + if (!__compound_tail_refcounted(page_head)) { 229 + smp_rmb(); 230 + if (likely(PageTail(page))) { 231 + /* 232 + * This is a hugetlbfs page or a slab 233 + * page. __split_huge_page_refcount 234 + * cannot race here. 235 + */ 236 + VM_BUG_ON(!PageHead(page_head)); 237 + __get_page_tail_foll(page, true); 238 + return true; 239 + } else { 240 + /* 241 + * __split_huge_page_refcount run 242 + * before us, "page" was a THP 243 + * tail. The split page_head has been 244 + * freed and reallocated as slab or 245 + * hugetlbfs page of smaller order 246 + * (only possible if reallocated as 247 + * slab on x86). 248 + */ 249 + return false; 283 250 } 251 + } 284 252 253 + got = false; 254 + if (likely(page != page_head && get_page_unless_zero(page_head))) { 285 255 /* 286 256 * page_head wasn't a dangling pointer but it 287 257 * may not be a head page anymore by the time

+34 -2

mm/util.c

··· 404 404 return mapping; 405 405 } 406 406 407 + int overcommit_ratio_handler(struct ctl_table *table, int write, 408 + void __user *buffer, size_t *lenp, 409 + loff_t *ppos) 410 + { 411 + int ret; 412 + 413 + ret = proc_dointvec(table, write, buffer, lenp, ppos); 414 + if (ret == 0 && write) 415 + sysctl_overcommit_kbytes = 0; 416 + return ret; 417 + } 418 + 419 + int overcommit_kbytes_handler(struct ctl_table *table, int write, 420 + void __user *buffer, size_t *lenp, 421 + loff_t *ppos) 422 + { 423 + int ret; 424 + 425 + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 426 + if (ret == 0 && write) 427 + sysctl_overcommit_ratio = 0; 428 + return ret; 429 + } 430 + 407 431 /* 408 432 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used 409 433 */ 410 434 unsigned long vm_commit_limit(void) 411 435 { 412 - return ((totalram_pages - hugetlb_total_pages()) 413 - * sysctl_overcommit_ratio / 100) + total_swap_pages; 436 + unsigned long allowed; 437 + 438 + if (sysctl_overcommit_kbytes) 439 + allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); 440 + else 441 + allowed = ((totalram_pages - hugetlb_total_pages()) 442 + * sysctl_overcommit_ratio / 100); 443 + allowed += total_swap_pages; 444 + 445 + return allowed; 414 446 } 415 447 416 448

+14 -14

mm/vmalloc.c

··· 220 220 } 221 221 222 222 /* 223 - * Walk a vmap address to the struct page it maps. 223 + * Walk a vmap address to the physical pfn it maps to. 224 224 */ 225 - struct page *vmalloc_to_page(const void *vmalloc_addr) 225 + unsigned long vmalloc_to_pfn(const void *vmalloc_addr) 226 226 { 227 227 unsigned long addr = (unsigned long) vmalloc_addr; 228 - struct page *page = NULL; 228 + unsigned long pfn = 0; 229 229 pgd_t *pgd = pgd_offset_k(addr); 230 230 231 231 /* ··· 244 244 ptep = pte_offset_map(pmd, addr); 245 245 pte = *ptep; 246 246 if (pte_present(pte)) 247 - page = pte_page(pte); 247 + pfn = pte_pfn(pte); 248 248 pte_unmap(ptep); 249 249 } 250 250 } 251 251 } 252 - return page; 253 - } 254 - EXPORT_SYMBOL(vmalloc_to_page); 255 - 256 - /* 257 - * Map a vmalloc()-space virtual address to the physical page frame number. 258 - */ 259 - unsigned long vmalloc_to_pfn(const void *vmalloc_addr) 260 - { 261 - return page_to_pfn(vmalloc_to_page(vmalloc_addr)); 252 + return pfn; 262 253 } 263 254 EXPORT_SYMBOL(vmalloc_to_pfn); 255 + 256 + /* 257 + * Map a vmalloc()-space virtual address to the struct page. 258 + */ 259 + struct page *vmalloc_to_page(const void *vmalloc_addr) 260 + { 261 + return pfn_to_page(vmalloc_to_pfn(vmalloc_addr)); 262 + } 263 + EXPORT_SYMBOL(vmalloc_to_page); 264 264 265 265 266 266 /*** Global kva allocator ***/