Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

PCI: Spread interrupt vectors in pci_alloc_irq_vectors()

Set the affinity_mask in the PCI device before allocating vectors so that
the affinity can be propagated through the MSI descriptor structures to the
core IRQ code. To facilitate this, new __pci_enable_msi_range() and
__pci_enable_msix_range() helpers are factored out of their unprefixed
variants; they assign the new IRQ affinity mask in the PCI device so
that the low-level interrupt code can perform the interrupt affinity
assignment and do node-local allocations.

A new PCI_IRQ_NOAFFINITY flag is added to pci_alloc_irq_vectors() so that
this function can also be used by drivers that don't wish to use the
automatic affinity assignment.

[bhelgaas: omit "else" after "return" consistently]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Alexander Gordeev <agordeev@redhat.com>

authored by

Christoph Hellwig and committed by
Bjorn Helgaas
4ef33685 aff17164

+96 -46
+4
Documentation/PCI/MSI-HOWTO.txt
··· 99 99 MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in 100 100 case the device does not support legacy interrupt lines. 101 101 102 + By default this function will spread the interrupts around the available 103 + CPUs, but this feature can be disabled by passing the PCI_IRQ_NOAFFINITY 104 + flag. 105 + 102 106 To get the Linux IRQ numbers passed to request_irq() and free_irq() and the 103 107 vectors, use the following function: 104 108
+90 -46
drivers/pci/msi.c
··· 569 569 entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; 570 570 entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); 571 571 entry->nvec_used = nvec; 572 + entry->affinity = dev->irq_affinity; 572 573 573 574 if (control & PCI_MSI_FLAGS_64BIT) 574 575 entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64; ··· 681 680 static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, 682 681 struct msix_entry *entries, int nvec) 683 682 { 683 + const struct cpumask *mask = NULL; 684 684 struct msi_desc *entry; 685 - int i; 685 + int cpu = -1, i; 686 686 687 687 for (i = 0; i < nvec; i++) { 688 + if (dev->irq_affinity) { 689 + cpu = cpumask_next(cpu, dev->irq_affinity); 690 + if (cpu >= nr_cpu_ids) 691 + cpu = cpumask_first(dev->irq_affinity); 692 + mask = cpumask_of(cpu); 693 + } 694 + 688 695 entry = alloc_msi_entry(&dev->dev); 689 696 if (!entry) { 690 697 if (!i) ··· 712 703 entry->msi_attrib.default_irq = dev->irq; 713 704 entry->mask_base = base; 714 705 entry->nvec_used = 1; 706 + entry->affinity = mask; 715 707 716 708 list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); 717 709 } ··· 1038 1028 } 1039 1029 EXPORT_SYMBOL(pci_msi_enabled); 1040 1030 1041 - /** 1042 - * pci_enable_msi_range - configure device's MSI capability structure 1043 - * @dev: device to configure 1044 - * @minvec: minimal number of interrupts to configure 1045 - * @maxvec: maximum number of interrupts to configure 1046 - * 1047 - * This function tries to allocate a maximum possible number of interrupts in a 1048 - * range between @minvec and @maxvec. It returns a negative errno if an error 1049 - * occurs. If it succeeds, it returns the actual number of interrupts allocated 1050 - * and updates the @dev's irq member to the lowest new interrupt number; 1051 - * the other interrupt numbers allocated to this device are consecutive. 
1052 - **/ 1053 - int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) 1031 + static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, 1032 + unsigned int flags) 1054 1033 { 1055 1034 int nvec; 1056 1035 int rc; ··· 1062 1063 nvec = pci_msi_vec_count(dev); 1063 1064 if (nvec < 0) 1064 1065 return nvec; 1065 - else if (nvec < minvec) 1066 + if (nvec < minvec) 1066 1067 return -EINVAL; 1067 - else if (nvec > maxvec) 1068 + 1069 + if (nvec > maxvec) 1068 1070 nvec = maxvec; 1069 1071 1070 - do { 1071 - rc = msi_capability_init(dev, nvec); 1072 - if (rc < 0) { 1073 - return rc; 1074 - } else if (rc > 0) { 1075 - if (rc < minvec) 1072 + for (;;) { 1073 + if (!(flags & PCI_IRQ_NOAFFINITY)) { 1074 + dev->irq_affinity = irq_create_affinity_mask(&nvec); 1075 + if (nvec < minvec) 1076 1076 return -ENOSPC; 1077 - nvec = rc; 1078 1077 } 1079 - } while (rc); 1080 1078 1081 - return nvec; 1079 + rc = msi_capability_init(dev, nvec); 1080 + if (rc == 0) 1081 + return nvec; 1082 + 1083 + kfree(dev->irq_affinity); 1084 + dev->irq_affinity = NULL; 1085 + 1086 + if (rc < 0) 1087 + return rc; 1088 + if (rc < minvec) 1089 + return -ENOSPC; 1090 + 1091 + nvec = rc; 1092 + } 1093 + } 1094 + 1095 + /** 1096 + * pci_enable_msi_range - configure device's MSI capability structure 1097 + * @dev: device to configure 1098 + * @minvec: minimal number of interrupts to configure 1099 + * @maxvec: maximum number of interrupts to configure 1100 + * 1101 + * This function tries to allocate a maximum possible number of interrupts in a 1102 + * range between @minvec and @maxvec. It returns a negative errno if an error 1103 + * occurs. If it succeeds, it returns the actual number of interrupts allocated 1104 + * and updates the @dev's irq member to the lowest new interrupt number; 1105 + * the other interrupt numbers allocated to this device are consecutive. 
1106 + **/ 1107 + int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) 1108 + { 1109 + return __pci_enable_msi_range(dev, minvec, maxvec, PCI_IRQ_NOAFFINITY); 1082 1110 } 1083 1111 EXPORT_SYMBOL(pci_enable_msi_range); 1112 + 1113 + static int __pci_enable_msix_range(struct pci_dev *dev, 1114 + struct msix_entry *entries, int minvec, int maxvec, 1115 + unsigned int flags) 1116 + { 1117 + int nvec = maxvec; 1118 + int rc; 1119 + 1120 + if (maxvec < minvec) 1121 + return -ERANGE; 1122 + 1123 + for (;;) { 1124 + if (!(flags & PCI_IRQ_NOAFFINITY)) { 1125 + dev->irq_affinity = irq_create_affinity_mask(&nvec); 1126 + if (nvec < minvec) 1127 + return -ENOSPC; 1128 + } 1129 + 1130 + rc = pci_enable_msix(dev, entries, nvec); 1131 + if (rc == 0) 1132 + return nvec; 1133 + 1134 + kfree(dev->irq_affinity); 1135 + dev->irq_affinity = NULL; 1136 + 1137 + if (rc < 0) 1138 + return rc; 1139 + if (rc < minvec) 1140 + return -ENOSPC; 1141 + 1142 + nvec = rc; 1143 + } 1144 + } 1084 1145 1085 1146 /** 1086 1147 * pci_enable_msix_range - configure device's MSI-X capability structure ··· 1158 1099 * with new allocated MSI-X interrupts. 
1159 1100 **/ 1160 1101 int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, 1161 - int minvec, int maxvec) 1102 + int minvec, int maxvec) 1162 1103 { 1163 - int nvec = maxvec; 1164 - int rc; 1165 - 1166 - if (maxvec < minvec) 1167 - return -ERANGE; 1168 - 1169 - do { 1170 - rc = pci_enable_msix(dev, entries, nvec); 1171 - if (rc < 0) { 1172 - return rc; 1173 - } else if (rc > 0) { 1174 - if (rc < minvec) 1175 - return -ENOSPC; 1176 - nvec = rc; 1177 - } 1178 - } while (rc); 1179 - 1180 - return nvec; 1104 + return __pci_enable_msix_range(dev, entries, minvec, maxvec, 1105 + PCI_IRQ_NOAFFINITY); 1181 1106 } 1182 1107 EXPORT_SYMBOL(pci_enable_msix_range); 1183 1108 ··· 1188 1145 int vecs = -ENOSPC; 1189 1146 1190 1147 if (!(flags & PCI_IRQ_NOMSIX)) { 1191 - vecs = pci_enable_msix_range(dev, NULL, min_vecs, max_vecs); 1148 + vecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs, 1149 + flags); 1192 1150 if (vecs > 0) 1193 1151 return vecs; 1194 1152 } 1195 1153 1196 1154 if (!(flags & PCI_IRQ_NOMSI)) { 1197 - vecs = pci_enable_msi_range(dev, min_vecs, max_vecs); 1155 + vecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, flags); 1198 1156 if (vecs > 0) 1199 1157 return vecs; 1200 1158 }
+2
include/linux/pci.h
··· 320 320 * directly, use the values stored here. They might be different! 321 321 */ 322 322 unsigned int irq; 323 + struct cpumask *irq_affinity; 323 324 struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ 324 325 325 326 bool match_driver; /* Skip attaching driver */ ··· 1241 1240 #define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */ 1242 1241 #define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */ 1243 1242 #define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */ 1243 + #define PCI_IRQ_NOAFFINITY (1 << 3) /* don't auto-assign affinity */ 1244 1244 1245 1245 /* kmem_cache style wrapper around pci_alloc_consistent() */ 1246 1246