powerpc: Fix boot freeze on machine with empty memory node

I got a bug report about a distro kernel not booting on a particular
machine. It would freeze during boot:

> ...
> Could not find start_pfn for node 1
> [boot]0015 Setup Done
> Built 2 zonelists in Node order, mobility grouping on. Total pages: 123783
> Policy zone: DMA
> Kernel command line:
> [boot]0020 XICS Init
> [boot]0021 XICS Done
> PID hash table entries: 4096 (order: 12, 32768 bytes)
> clocksource: timebase mult[7d0000] shift[22] registered
> Console: colour dummy device 80x25
> console handover: boot [udbg0] -> real [hvc0]
> Dentry cache hash table entries: 1048576 (order: 7, 8388608 bytes)
> Inode-cache hash table entries: 524288 (order: 6, 4194304 bytes)
> freeing bootmem node 0

I've reproduced this on 2.6.27.7. It is caused by commit
8f64e1f2d1e09267ac926e15090fd505c1c0cbcb ("powerpc: Reserve in bootmem
lmb reserved regions that cross NUMA nodes").

The problem is that Jon took a loop which was (in pseudocode):

	for_each_node(nid)
		NODE_DATA(nid) = careful_alloc(nid);
		setup_bootmem(nid);
		reserve_node_bootmem(nid);

and broke it up into:

	for_each_node(nid)
		NODE_DATA(nid) = careful_alloc(nid);
		setup_bootmem(nid);
	for_each_node(nid)
		reserve_node_bootmem(nid);

The issue comes in when careful_alloc() is called on a node with no
memory. It falls back to using bootmem from a previously-initialized
node. But with Jon's patch applied, that earlier node's reserved
regions have not yet been marked in its bootmem at the point the
fallback happens. It hands back bogus memory (0xc000000000000000) and
the kernel pukes later in boot.
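
For reference, here is a stripped-down sketch of the fallback path in
careful_allocation() as it looked around 2.6.27 (abbreviated from
arch/powerpc/mm/numa.c, with error handling and debug output trimmed,
so treat it as illustrative rather than exact):

	static void __init *careful_allocation(int nid, unsigned long size,
					       unsigned long align,
					       unsigned long end_pfn)
	{
		unsigned long ret;
		int new_nid;

		/* try node-local first, then fall back to anywhere in RAM */
		ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
		if (!ret)
			ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());
		if (!ret)
			panic("numa.c: cannot allocate %lu bytes on node %d",
			      size, nid);

		/*
		 * If the allocation landed on an earlier node, redo it
		 * through that node's bootmem allocator so lmb and bootmem
		 * don't hand out the same memory twice.  This is only safe
		 * if the earlier node's reserved regions have already been
		 * marked in its bootmem -- which the split-up loop no
		 * longer guaranteed.
		 */
		new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
		if (new_nid < nid)
			ret = __pa(__alloc_bootmem_node(NODE_DATA(new_nid),
							size, align, 0));

		return (void *)ret;
	}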

The following patch collapses the loop back together. It also breaks
the reservation code out into a mark_reserved_regions_for_nid()
function and adds some comments. I suspect a big part of how this bug
crept in is that the for loop was too long and hard to read.

The actual bug fix here is:

+		if (end_pfn <= node->node_start_pfn ||
+		    start_pfn >= node_end_pfn)
+			continue;
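
That is, skip any reserved region that lies entirely outside the node,
using the usual half-open interval overlap test. Checking which node
the region's endpoints fall on would miss a region that covers the
whole node. A tiny standalone illustration (the pfn values are made
up):

	#include <stdio.h>

	/* does [a_start, a_end) overlap [b_start, b_end)? */
	static int overlaps(unsigned long a_start, unsigned long a_end,
			    unsigned long b_start, unsigned long b_end)
	{
		return !(a_end <= b_start || a_start >= b_end);
	}

	int main(void)
	{
		/* node 1 spans pfns [0x1000, 0x2000) */
		unsigned long node_start = 0x1000, node_end = 0x2000;

		/*
		 * A reserved region [0x800, 0x2800) starts on node 0 and
		 * ends on node 2, so neither endpoint is on node 1 --
		 * yet it covers all of node 1.  Prints 1 (overlap).
		 */
		printf("%d\n", overlaps(0x800, 0x2800, node_start, node_end));
		return 0;
	}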

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>

---
 arch/powerpc/mm/numa.c | 148 +++++++++++++++++++++++++++++---------------
 1 file changed, 88 insertions(+), 60 deletions(-)
···
 	.priority = 1	/* Must run before sched domains notifier. */
 };
 
-void __init do_init_bootmem(void)
+static void mark_reserved_regions_for_nid(int nid)
 {
-	int nid;
-	unsigned int i;
+	struct pglist_data *node = NODE_DATA(nid);
+	int i;
 
-	min_low_pfn = 0;
-	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
-	max_pfn = max_low_pfn;
-
-	if (parse_numa_properties())
-		setup_nonnuma();
-	else
-		dump_numa_memory_topology();
-
-	register_cpu_notifier(&ppc64_numa_nb);
-	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
-			  (void *)(unsigned long)boot_cpuid);
-
-	for_each_online_node(nid) {
-		unsigned long start_pfn, end_pfn;
-		unsigned long bootmem_paddr;
-		unsigned long bootmap_pages;
-
-		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-
-		/* Allocate the node structure node local if possible */
-		NODE_DATA(nid) = careful_allocation(nid,
-					sizeof(struct pglist_data),
-					SMP_CACHE_BYTES, end_pfn);
-		NODE_DATA(nid) = __va(NODE_DATA(nid));
-		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
-
-		dbg("node %d\n", nid);
-		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
-
-		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
-		NODE_DATA(nid)->node_start_pfn = start_pfn;
-		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
-
-		if (NODE_DATA(nid)->node_spanned_pages == 0)
-			continue;
-
-		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
-		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
-
-		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
-		bootmem_paddr = (unsigned long)careful_allocation(nid,
-					bootmap_pages << PAGE_SHIFT,
-					PAGE_SIZE, end_pfn);
-		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
-
-		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
-
-		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
-				  start_pfn, end_pfn);
-
-		free_bootmem_with_active_regions(nid, end_pfn);
-	}
-
-	/* Mark reserved regions */
 	for (i = 0; i < lmb.reserved.cnt; i++) {
 		unsigned long physbase = lmb.reserved.region[i].base;
 		unsigned long size = lmb.reserved.region[i].size;
 		unsigned long start_pfn = physbase >> PAGE_SHIFT;
 		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
 		struct node_active_region node_ar;
+		unsigned long node_end_pfn = node->node_start_pfn +
+					     node->node_spanned_pages;
+
+		/*
+		 * Check to make sure that this lmb.reserved area is
+		 * within the bounds of the node that we care about.
+		 * Checking the nid of the start and end points is not
+		 * sufficient because the reserved area could span the
+		 * entire node.
+		 */
+		if (end_pfn <= node->node_start_pfn ||
+		    start_pfn >= node_end_pfn)
+			continue;
 
 		get_node_active_region(start_pfn, &node_ar);
 		while (start_pfn < end_pfn &&
···
 			size = size - reserve_size;
 			get_node_active_region(start_pfn, &node_ar);
 		}
-
 	}
+}
 
-	for_each_online_node(nid)
+
+void __init do_init_bootmem(void)
+{
+	int nid;
+	unsigned int i;
+
+	min_low_pfn = 0;
+	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_pfn = max_low_pfn;
+
+	if (parse_numa_properties())
+		setup_nonnuma();
+	else
+		dump_numa_memory_topology();
+
+	register_cpu_notifier(&ppc64_numa_nb);
+	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
+			  (void *)(unsigned long)boot_cpuid);
+
+	for_each_online_node(nid) {
+		unsigned long start_pfn, end_pfn;
+		unsigned long bootmem_paddr;
+		unsigned long bootmap_pages;
+
+		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
+		/*
+		 * Allocate the node structure node local if possible
+		 *
+		 * Be careful moving this around, as it relies on all
+		 * previous nodes' bootmem to be initialized and have
+		 * all reserved areas marked.
+		 */
+		NODE_DATA(nid) = careful_allocation(nid,
+					sizeof(struct pglist_data),
+					SMP_CACHE_BYTES, end_pfn);
+		NODE_DATA(nid) = __va(NODE_DATA(nid));
+		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+
+		dbg("node %d\n", nid);
+		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
+
+		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+		NODE_DATA(nid)->node_start_pfn = start_pfn;
+		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
+
+		if (NODE_DATA(nid)->node_spanned_pages == 0)
+			continue;
+
+		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
+		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
+
+		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
+		bootmem_paddr = (unsigned long)careful_allocation(nid,
+					bootmap_pages << PAGE_SHIFT,
+					PAGE_SIZE, end_pfn);
+		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
+
+		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
+
+		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
+				  start_pfn, end_pfn);
+
+		free_bootmem_with_active_regions(nid, end_pfn);
+		/*
+		 * Be very careful about moving this around.  Future
+		 * calls to careful_allocation() depend on this getting
+		 * done correctly.
+		 */
+		mark_reserved_regions_for_nid(nid);
 		sparse_memory_present_with_active_regions(nid);
+	}
 }
 
 void __init paging_init(void)