[PATCH] x86_64: Calgary IOMMU - Multi-Node NULL pointer dereference fix

Calgary hits a NULL pointer dereference when booting in a multi-chassis
NUMA system. See Redhat bugzilla number 198498, found by Konrad
Rzeszutek (konradr@redhat.com).

There are many issues that had to be resolved to fix this problem.
Firstly when I originally wrote the code to handle NUMA systems, I
had a large misunderstanding that was not corrected until now. That was
that I thought the "number of nodes online" referred to number of
physical systems connected. So that if NUMA was disabled, there
would only be 1 node and it would only show that node's PCI bus.
In reality if NUMA is disabled, the system displays all of the
connected chassis as one node but is only ignorant of the delays
in accessing main memory. Therefore, references to num_online_nodes()
and MAX_NUMNODES are incorrect and need to be set to the maximum
number of nodes that can be accessed (which are 8). I created a
variable, MAX_NUM_CHASSIS, and set it to 8 to fix this.

Secondly, when walking the PCI in detect_calgary, the code only
checked the first "slot" when looking to see if a device is present.
This will work for most cases, but unfortunately it isn't always the
case. In the NUMA MXE drawers, there are USB devices present on the
3rd slot (with slot 1 being empty). So, to work around this, all
slots (up to 8) are scanned to see if there are any devices present.

Lastly, the bus is being enumerated on large systems in a different
way the we originally thought. This throws the ugly logic we had
out the window. To more elegantly handle this, I reorganized the
kva array to be sparse (which removed the need to have any bus number
to kva slot logic in tce.c) and created a secondary space array to
contain the bus number to phb mapping.

With these changes Calgary boots on an x460 with 4 nodes with and
without NUMA enabled.

Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by Jon Mason and committed by Linus Torvalds d2105b10 089bbbcb

+45 -40
+44 -32
arch/x86_64/kernel/pci-calgary.c
··· 85 #define CSR_AGENT_MASK 0xffe0ffff 86 87 #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ 88 - #define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */ 89 #define PHBS_PER_CALGARY 4 90 91 /* register offsets in Calgary's internal register space */ ··· 111 0xB000 /* PHB3 */ 112 }; 113 114 - void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES]; 115 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; 116 static int translate_empty_slots __read_mostly = 0; 117 static int calgary_detected __read_mostly = 0; ··· 121 * the bitmap of PHBs the user requested that we disable 122 * translation on. 123 */ 124 - static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM); 125 126 static void tce_cache_blast(struct iommu_table *tbl); 127 ··· 454 455 static inline int busno_to_phbid(unsigned char num) 456 { 457 - return bus_to_phb(num) % PHBS_PER_CALGARY; 458 } 459 460 static inline unsigned long split_queue_offset(unsigned char num) ··· 814 int i, ret = -ENODEV; 815 struct pci_dev *dev = NULL; 816 817 - for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) { 818 dev = pci_get_device(PCI_VENDOR_ID_IBM, 819 PCI_DEVICE_ID_IBM_CALGARY, 820 dev); ··· 824 calgary_init_one_nontraslated(dev); 825 continue; 826 } 827 - if (!tce_table_kva[i] && !translate_empty_slots) { 828 pci_dev_put(dev); 829 continue; 830 } ··· 844 pci_dev_put(dev); 845 continue; 846 } 847 - if (!tce_table_kva[i] && !translate_empty_slots) 848 continue; 849 calgary_disable_translation(dev); 850 calgary_free_tar(dev); ··· 878 void __init detect_calgary(void) 879 { 880 u32 val; 881 - int bus, table_idx; 882 void *tbl; 883 - int detected = 0; 884 885 /* 886 * if the user specified iommu=off or iommu=soft or we found ··· 892 893 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); 894 895 - for (bus = 0, table_idx = 0; 896 - bus < num_online_nodes() * MAX_PHB_BUS_NUM; 897 - bus++) { 898 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) 899 continue; 900 if (test_bit(bus, translation_disabled)) { 901 printk(KERN_INFO "Calgary: translation is disabled for " 902 "PHB 0x%x\n", bus); 903 /* skip this phb, don't allocate a tbl for it */ 904 - tce_table_kva[table_idx] = NULL; 905 - table_idx++; 906 continue; 907 } 908 /* 909 - * scan the first slot of the PCI bus to see if there 910 - * are any devices present 911 */ 912 - val = read_pci_config(bus, 1, 0, 0); 913 - if (val != 0xffffffff || translate_empty_slots) { 914 - tbl = alloc_tce_table(); 915 - if (!tbl) 916 - goto cleanup; 917 - detected = 1; 918 - } else 919 - tbl = NULL; 920 - 921 - tce_table_kva[table_idx] = tbl; 922 - table_idx++; 923 } 924 925 - if (detected) { 926 iommu_detected = 1; 927 calgary_detected = 1; 928 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " ··· 940 return; 941 942 cleanup: 943 - for (--table_idx; table_idx >= 0; --table_idx) 944 - if (tce_table_kva[table_idx]) 945 - free_tce_table(tce_table_kva[table_idx]); 946 } 947 948 int __init calgary_iommu_init(void) ··· 1013 if (p == endp) 1014 break; 1015 1016 - if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) { 1017 printk(KERN_INFO "Calgary: disabling " 1018 "translation for PHB 0x%x\n", bridge); 1019 set_bit(bridge, translation_disabled);
··· 85 #define CSR_AGENT_MASK 0xffe0ffff 86 87 #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ 88 + #define MAX_NUM_CHASSIS 8 /* max number of chassis */ 89 + #define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ 90 #define PHBS_PER_CALGARY 4 91 92 /* register offsets in Calgary's internal register space */ ··· 110 0xB000 /* PHB3 */ 111 }; 112 113 + static char bus_to_phb[MAX_PHB_BUS_NUM]; 114 + void* tce_table_kva[MAX_PHB_BUS_NUM]; 115 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; 116 static int translate_empty_slots __read_mostly = 0; 117 static int calgary_detected __read_mostly = 0; ··· 119 * the bitmap of PHBs the user requested that we disable 120 * translation on. 121 */ 122 + static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); 123 124 static void tce_cache_blast(struct iommu_table *tbl); 125 ··· 452 453 static inline int busno_to_phbid(unsigned char num) 454 { 455 + return bus_to_phb[num]; 456 } 457 458 static inline unsigned long split_queue_offset(unsigned char num) ··· 812 int i, ret = -ENODEV; 813 struct pci_dev *dev = NULL; 814 815 + for (i = 0; i < MAX_PHB_BUS_NUM; i++) { 816 dev = pci_get_device(PCI_VENDOR_ID_IBM, 817 PCI_DEVICE_ID_IBM_CALGARY, 818 dev); ··· 822 calgary_init_one_nontraslated(dev); 823 continue; 824 } 825 + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { 826 pci_dev_put(dev); 827 continue; 828 } ··· 842 pci_dev_put(dev); 843 continue; 844 } 845 + if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) 846 continue; 847 calgary_disable_translation(dev); 848 calgary_free_tar(dev); ··· 876 void __init detect_calgary(void) 877 { 878 u32 val; 879 + int bus; 880 void *tbl; 881 + int calgary_found = 0; 882 + int phb = -1; 883 884 /* 885 * if the user specified iommu=off or iommu=soft or we found ··· 889 890 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); 891 892 + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 893 + int dev; 894 + 895 + tce_table_kva[bus] = NULL; 896 + bus_to_phb[bus] = -1; 897 + 898 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) 899 continue; 900 + 901 + /* 902 + * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3) 903 + * it is connected to releative to the clagary chip. 904 + */ 905 + phb = (phb + 1) % PHBS_PER_CALGARY; 906 + 907 if (test_bit(bus, translation_disabled)) { 908 printk(KERN_INFO "Calgary: translation is disabled for " 909 "PHB 0x%x\n", bus); 910 /* skip this phb, don't allocate a tbl for it */ 911 continue; 912 } 913 /* 914 + * Scan the slots of the PCI bus to see if there is a device present. 915 + * The parent bus will be the zero-ith device, so start at 1. 916 */ 917 + for (dev = 1; dev < 8; dev++) { 918 + val = read_pci_config(bus, dev, 0, 0); 919 + if (val != 0xffffffff || translate_empty_slots) { 920 + tbl = alloc_tce_table(); 921 + if (!tbl) 922 + goto cleanup; 923 + tce_table_kva[bus] = tbl; 924 + bus_to_phb[bus] = phb; 925 + calgary_found = 1; 926 + break; 927 + } 928 + } 929 } 930 931 + if (calgary_found) { 932 iommu_detected = 1; 933 calgary_detected = 1; 934 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " ··· 928 return; 929 930 cleanup: 931 + for (--bus; bus >= 0; --bus) 932 + if (tce_table_kva[bus]) 933 + free_tce_table(tce_table_kva[bus]); 934 } 935 936 int __init calgary_iommu_init(void) ··· 1001 if (p == endp) 1002 break; 1003 1004 + if (bridge < MAX_PHB_BUS_NUM) { 1005 printk(KERN_INFO "Calgary: disabling " 1006 "translation for PHB 0x%x\n", bridge); 1007 set_bit(bridge, translation_disabled);
+1 -3
arch/x86_64/kernel/tce.c
··· 96 static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) 97 { 98 unsigned int bitmapsz; 99 - unsigned int tce_table_index; 100 unsigned long bmppages; 101 int ret; 102 ··· 104 /* set the tce table size - measured in entries */ 105 tbl->it_size = table_size_to_number_of_entries(specified_table_size); 106 107 - tce_table_index = bus_to_phb(tbl->it_busno); 108 - tbl->it_base = (unsigned long)tce_table_kva[tce_table_index]; 109 if (!tbl->it_base) { 110 printk(KERN_ERR "Calgary: iommu_table_setparms: " 111 "no table allocated?!\n");
··· 96 static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) 97 { 98 unsigned int bitmapsz; 99 unsigned long bmppages; 100 int ret; 101 ··· 105 /* set the tce table size - measured in entries */ 106 tbl->it_size = table_size_to_number_of_entries(specified_table_size); 107 108 + tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number]; 109 if (!tbl->it_base) { 110 printk(KERN_ERR "Calgary: iommu_table_setparms: " 111 "no table allocated?!\n");
-5
include/asm-x86_64/calgary.h
··· 60 static inline void detect_calgary(void) { return; } 61 #endif 62 63 - static inline unsigned int bus_to_phb(unsigned char busno) 64 - { 65 - return ((busno % 15 == 0) ? 0 : busno / 2 + 1); 66 - } 67 - 68 #endif /* _ASM_X86_64_CALGARY_H */
··· 60 static inline void detect_calgary(void) { return; } 61 #endif 62 63 #endif /* _ASM_X86_64_CALGARY_H */