Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] i386/x86-64: Share interrupt vectors when there is a large number of interrupt sources

Here's a patch that builds on Natalie Protasevich's IRQ compression
patch and tries to work for MPS boots as well as ACPI. It is meant for
a 4-node IBM x460 NUMA box, which was dying because it had interrupt
pins with GSI numbers > NR_IRQS and thus overflowed irq_desc.

The problem is that this system has 270 GSIs (which are 1:1 mapped with
I/O APIC RTEs) and an 8-node box would have 540. This is much bigger
than NR_IRQS (224 for both i386 and x86_64). Also, there aren't enough
vectors to go around. There are about 190 usable vectors, not counting
the reserved ones and the unused vectors at 0x20 to 0x2F. So, my patch
attempts to compress the GSI range and share vectors by sharing IRQs.

Cc: "Protasevich, Natalie" <Natalie.Protasevich@unisys.com>

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by James Cleverdon and committed by Linus Torvalds.
6004e1b7 89b831ef

+86 -17
+7 -8
arch/i386/kernel/acpi/boot.c
··· 39 39 40 40 #ifdef CONFIG_X86_64 41 41 42 - static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id) 43 - { 44 - } 45 42 extern void __init clustered_apic_check(void); 46 - static inline int ioapic_setup_disabled(void) 47 - { 48 - return 0; 49 - } 50 43 44 + extern int gsi_irq_sharing(int gsi); 51 45 #include <asm/proto.h> 46 + 47 + static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; } 48 + 52 49 53 50 #else /* X86 */ 54 51 ··· 53 56 #include <mach_apic.h> 54 57 #include <mach_mpparse.h> 55 58 #endif /* CONFIG_X86_LOCAL_APIC */ 59 + 60 + static inline int gsi_irq_sharing(int gsi) { return gsi; } 56 61 57 62 #endif /* X86 */ 58 63 ··· 458 459 *irq = IO_APIC_VECTOR(gsi); 459 460 else 460 461 #endif 461 - *irq = gsi; 462 + *irq = gsi_irq_sharing(gsi); 462 463 return 0; 463 464 } 464 465
+73 -7
arch/x86_64/kernel/io_apic.c
··· 57 57 * Rough estimation of how many shared IRQs there are, can 58 58 * be changed anytime. 59 59 */ 60 - #define MAX_PLUS_SHARED_IRQS NR_IRQS 60 + #define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS 61 61 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) 62 62 63 63 /* ··· 85 85 int pin; \ 86 86 struct irq_pin_list *entry = irq_2_pin + irq; \ 87 87 \ 88 + BUG_ON(irq >= NR_IRQS); \ 88 89 for (;;) { \ 89 90 unsigned int reg; \ 90 91 pin = entry->pin; \ ··· 128 127 } 129 128 #endif 130 129 130 + static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF }; 131 + 131 132 /* 132 133 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are 133 134 * shared ISA-space IRQs, so we have to support them. We are super ··· 140 137 static int first_free_entry = NR_IRQS; 141 138 struct irq_pin_list *entry = irq_2_pin + irq; 142 139 140 + BUG_ON(irq >= NR_IRQS); 143 141 while (entry->next) 144 142 entry = irq_2_pin + entry->next; 145 143 ··· 148 144 entry->next = first_free_entry; 149 145 entry = irq_2_pin + entry->next; 150 146 if (++first_free_entry >= PIN_MAP_SIZE) 151 - panic("io_apic.c: whoops"); 147 + panic("io_apic.c: ran out of irq_2_pin entries!"); 152 148 } 153 149 entry->apic = apic; 154 150 entry->pin = pin; ··· 424 420 best_guess = irq; 425 421 } 426 422 } 423 + BUG_ON(best_guess >= NR_IRQS); 427 424 return best_guess; 428 425 } 429 426 ··· 615 610 return MPBIOS_trigger(idx); 616 611 } 617 612 613 + static int next_irq = 16; 614 + 615 + /* 616 + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ 617 + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number 618 + * from ACPI, which can reach 800 in large boxen. 619 + * 620 + * Compact the sparse GSI space into a sequential IRQ series and reuse 621 + * vectors if possible. 
622 + */ 623 + int gsi_irq_sharing(int gsi) 624 + { 625 + int i, tries, vector; 626 + 627 + BUG_ON(gsi >= NR_IRQ_VECTORS); 628 + 629 + if (platform_legacy_irq(gsi)) 630 + return gsi; 631 + 632 + if (gsi_2_irq[gsi] != 0xFF) 633 + return (int)gsi_2_irq[gsi]; 634 + 635 + tries = NR_IRQS; 636 + try_again: 637 + vector = assign_irq_vector(gsi); 638 + 639 + /* 640 + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous 641 + * use of vector and if found, return that IRQ. However, we never want 642 + * to share legacy IRQs, which usually have a different trigger mode 643 + * than PCI. 644 + */ 645 + for (i = 0; i < NR_IRQS; i++) 646 + if (IO_APIC_VECTOR(i) == vector) 647 + break; 648 + if (platform_legacy_irq(i)) { 649 + if (--tries >= 0) { 650 + IO_APIC_VECTOR(i) = 0; 651 + goto try_again; 652 + } 653 + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); 654 + } 655 + if (i < NR_IRQS) { 656 + gsi_2_irq[gsi] = i; 657 + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", 658 + gsi, vector, i); 659 + return i; 660 + } 661 + 662 + i = next_irq++; 663 + BUG_ON(i >= NR_IRQS); 664 + gsi_2_irq[gsi] = i; 665 + IO_APIC_VECTOR(i) = vector; 666 + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", 667 + gsi, vector, i); 668 + return i; 669 + } 670 + 618 671 static int pin_2_irq(int idx, int apic, int pin) 619 672 { 620 673 int irq, i; ··· 702 639 while (i < apic) 703 640 irq += nr_ioapic_registers[i++]; 704 641 irq += pin; 642 + irq = gsi_irq_sharing(irq); 705 643 break; 706 644 } 707 645 default: ··· 712 648 break; 713 649 } 714 650 } 651 + BUG_ON(irq >= NR_IRQS); 715 652 716 653 /* 717 654 * PCI IRQ command line redirection. Yes, limits are hardcoded. 
··· 728 663 } 729 664 } 730 665 } 666 + BUG_ON(irq >= NR_IRQS); 731 667 return irq; 732 668 } 733 669 ··· 756 690 { 757 691 static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; 758 692 759 - BUG_ON(irq >= NR_IRQ_VECTORS); 760 - if (IO_APIC_VECTOR(irq) > 0) 693 + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); 694 + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) 761 695 return IO_APIC_VECTOR(irq); 762 696 next: 763 697 current_vector += 8; ··· 765 699 goto next; 766 700 767 701 if (current_vector >= FIRST_SYSTEM_VECTOR) { 768 - offset++; 769 - if (!(offset%8)) 770 - return -ENOSPC; 702 + /* If we run out of vectors on large boxen, must share them. */ 703 + offset = (offset + 1) % 8; 771 704 current_vector = FIRST_DEVICE_VECTOR + offset; 772 705 } 773 706 ··· 1982 1917 entry.polarity = active_high_low; 1983 1918 entry.mask = 1; /* Disabled (masked) */ 1984 1919 1920 + irq = gsi_irq_sharing(irq); 1985 1921 /* 1986 1922 * IRQs < 16 are already in the irq_2_pin[] map 1987 1923 */
+1 -1
arch/x86_64/kernel/mpparse.c
··· 218 218 m->mpc_irqtype, m->mpc_irqflag & 3, 219 219 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 220 220 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); 221 - if (++mp_irq_entries == MAX_IRQ_SOURCES) 221 + if (++mp_irq_entries >= MAX_IRQ_SOURCES) 222 222 panic("Max # of irq sources exceeded!!\n"); 223 223 } 224 224
+3
include/asm-x86_64/desc.h
··· 98 98 99 99 static inline void set_intr_gate(int nr, void *func) 100 100 { 101 + BUG_ON((unsigned)nr > 0xFF); 101 102 _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); 102 103 } 103 104 104 105 static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) 105 106 { 107 + BUG_ON((unsigned)nr > 0xFF); 106 108 _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); 107 109 } 108 110 109 111 static inline void set_system_gate(int nr, void *func) 110 112 { 113 + BUG_ON((unsigned)nr > 0xFF); 111 114 _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); 112 115 } 113 116
+2 -1
include/asm-x86_64/mpspec.h
··· 157 157 */ 158 158 159 159 #define MAX_MP_BUSSES 256 160 - #define MAX_IRQ_SOURCES 256 160 + /* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */ 161 + #define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4) 161 162 enum mp_bustype { 162 163 MP_BUS_ISA = 1, 163 164 MP_BUS_EISA,