[PATCH] x86_64: Calgary IOMMU - Multi-Node NULL pointer dereference fix

Calgary hits a NULL pointer dereference when booting in a multi-chassis
NUMA system. See Redhat bugzilla number 198498, found by Konrad
Rzeszutek (konradr@redhat.com).

There are many issues that had to be resolved to fix this problem.
Firstly when I originally wrote the code to handle NUMA systems, I
had a large misunderstanding that was not corrected until now. That was
that I thought the "number of nodes online" referred to number of
physical systems connected. So that if NUMA was disabled, there
would only be 1 node and it would only show that node's PCI bus.
In reality if NUMA is disabled, the system displays all of the
connected chassis as one node but is only ignorant of the delays
in accessing main memory. Therefore, references to num_online_nodes()
and MAX_NUMNODES are incorrect and need to be set to the maximum
number of nodes that can be accessed (which are 8). I created a
variable, MAX_NUM_CHASSIS, and set it to 8 to fix this.

Secondly, when walking the PCI in detect_calgary, the code only
checked the first "slot" when looking to see if a device is present.
This will work for most cases, but unfortunately it isn't always the
case. In the NUMA MXE drawers, there are USB devices present on the
3rd slot (with slot 1 being empty). So, to work around this, all
slots (up to 8) are scanned to see if there are any devices present.

Lastly, the bus is being enumerated on large systems in a different
way the we originally thought. This throws the ugly logic we had
out the window. To more elegantly handle this, I reorganized the
kva array to be sparse (which removed the need to have any bus number
to kva slot logic in tce.c) and created a secondary space array to
contain the bus number to phb mapping.

With these changes Calgary boots on an x460 with 4 nodes with and
without NUMA enabled.

Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by Jon Mason and committed by Linus Torvalds d2105b10 089bbbcb

+45 -40
+44 -32
arch/x86_64/kernel/pci-calgary.c
··· 85 85 #define CSR_AGENT_MASK 0xffe0ffff 86 86 87 87 #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ 88 - #define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */ 88 + #define MAX_NUM_CHASSIS 8 /* max number of chassis */ 89 + #define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ 89 90 #define PHBS_PER_CALGARY 4 90 91 91 92 /* register offsets in Calgary's internal register space */ ··· 111 110 0xB000 /* PHB3 */ 112 111 }; 113 112 114 - void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES]; 113 + static char bus_to_phb[MAX_PHB_BUS_NUM]; 114 + void* tce_table_kva[MAX_PHB_BUS_NUM]; 115 115 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; 116 116 static int translate_empty_slots __read_mostly = 0; 117 117 static int calgary_detected __read_mostly = 0; ··· 121 119 * the bitmap of PHBs the user requested that we disable 122 120 * translation on. 123 121 */ 124 - static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM); 122 + static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); 125 123 126 124 static void tce_cache_blast(struct iommu_table *tbl); 127 125 ··· 454 452 455 453 static inline int busno_to_phbid(unsigned char num) 456 454 { 457 - return bus_to_phb(num) % PHBS_PER_CALGARY; 455 + return bus_to_phb[num]; 458 456 } 459 457 460 458 static inline unsigned long split_queue_offset(unsigned char num) ··· 814 812 int i, ret = -ENODEV; 815 813 struct pci_dev *dev = NULL; 816 814 817 - for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) { 815 + for (i = 0; i < MAX_PHB_BUS_NUM; i++) { 818 816 dev = pci_get_device(PCI_VENDOR_ID_IBM, 819 817 PCI_DEVICE_ID_IBM_CALGARY, 820 818 dev); ··· 824 822 calgary_init_one_nontraslated(dev); 825 823 continue; 826 824 } 827 - if (!tce_table_kva[i] && !translate_empty_slots) { 825 + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { 828 826 pci_dev_put(dev); 829 827 continue; 830 828 } ··· 844 842 pci_dev_put(dev); 845 843 continue; 846 844 } 847 - if (!tce_table_kva[i] && !translate_empty_slots) 845 + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) 848 846 continue; 849 847 calgary_disable_translation(dev); 850 848 calgary_free_tar(dev); ··· 878 876 void __init detect_calgary(void) 879 877 { 880 878 u32 val; 881 - int bus, table_idx; 879 + int bus; 882 880 void *tbl; 883 - int detected = 0; 881 + int calgary_found = 0; 882 + int phb = -1; 884 883 885 884 /* 886 885 * if the user specified iommu=off or iommu=soft or we found ··· 892 889 893 890 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); 894 891 895 - for (bus = 0, table_idx = 0; 896 - bus < num_online_nodes() * MAX_PHB_BUS_NUM; 897 - bus++) { 892 + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 893 + int dev; 894 + 895 + tce_table_kva[bus] = NULL; 896 + bus_to_phb[bus] = -1; 897 + 898 898 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) 899 899 continue; 900 + 901 + /* 902 + * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3) 903 + * it is connected to releative to the clagary chip. 904 + */ 905 + phb = (phb + 1) % PHBS_PER_CALGARY; 906 + 900 907 if (test_bit(bus, translation_disabled)) { 901 908 printk(KERN_INFO "Calgary: translation is disabled for " 902 909 "PHB 0x%x\n", bus); 903 910 /* skip this phb, don't allocate a tbl for it */ 904 - tce_table_kva[table_idx] = NULL; 905 - table_idx++; 906 911 continue; 907 912 } 908 913 /* 909 - * scan the first slot of the PCI bus to see if there 910 - * are any devices present 914 + * Scan the slots of the PCI bus to see if there is a device present. 915 + * The parent bus will be the zero-ith device, so start at 1. 911 916 */ 912 - val = read_pci_config(bus, 1, 0, 0); 913 - if (val != 0xffffffff || translate_empty_slots) { 914 - tbl = alloc_tce_table(); 915 - if (!tbl) 916 - goto cleanup; 917 - detected = 1; 918 - } else 919 - tbl = NULL; 920 - 921 - tce_table_kva[table_idx] = tbl; 922 - table_idx++; 917 + for (dev = 1; dev < 8; dev++) { 918 + val = read_pci_config(bus, dev, 0, 0); 919 + if (val != 0xffffffff || translate_empty_slots) { 920 + tbl = alloc_tce_table(); 921 + if (!tbl) 922 + goto cleanup; 923 + tce_table_kva[bus] = tbl; 924 + bus_to_phb[bus] = phb; 925 + calgary_found = 1; 926 + break; 927 + } 928 + } 923 929 } 924 930 925 - if (detected) { 931 + if (calgary_found) { 926 932 iommu_detected = 1; 927 933 calgary_detected = 1; 928 934 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " ··· 940 928 return; 941 929 942 930 cleanup: 943 - for (--table_idx; table_idx >= 0; --table_idx) 944 - if (tce_table_kva[table_idx]) 945 - free_tce_table(tce_table_kva[table_idx]); 931 + for (--bus; bus >= 0; --bus) 932 + if (tce_table_kva[bus]) 933 + free_tce_table(tce_table_kva[bus]); 946 934 } 947 935 948 936 int __init calgary_iommu_init(void) ··· 1013 1001 if (p == endp) 1014 1002 break; 1015 1003 1016 - if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) { 1004 + if (bridge < MAX_PHB_BUS_NUM) { 1017 1005 printk(KERN_INFO "Calgary: disabling " 1018 1006 "translation for PHB 0x%x\n", bridge); 1019 1007 set_bit(bridge, translation_disabled);
+1 -3
arch/x86_64/kernel/tce.c
··· 96 96 static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) 97 97 { 98 98 unsigned int bitmapsz; 99 - unsigned int tce_table_index; 100 99 unsigned long bmppages; 101 100 int ret; 102 101 ··· 104 105 /* set the tce table size - measured in entries */ 105 106 tbl->it_size = table_size_to_number_of_entries(specified_table_size); 106 107 107 - tce_table_index = bus_to_phb(tbl->it_busno); 108 - tbl->it_base = (unsigned long)tce_table_kva[tce_table_index]; 108 + tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number]; 109 109 if (!tbl->it_base) { 110 110 printk(KERN_ERR "Calgary: iommu_table_setparms: " 111 111 "no table allocated?!\n");
-5
include/asm-x86_64/calgary.h
··· 60 60 static inline void detect_calgary(void) { return; } 61 61 #endif 62 62 63 - static inline unsigned int bus_to_phb(unsigned char busno) 64 - { 65 - return ((busno % 15 == 0) ? 0 : busno / 2 + 1); 66 - } 67 - 68 63 #endif /* _ASM_X86_64_CALGARY_H */