Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/pci: unmap legacy INTx interrupts when a PHB is removed

When a passthrough IO adapter is removed from a pseries machine using
hash MMU and the XIVE interrupt mode, the POWER hypervisor expects the
guest OS to clear all page table entries related to the adapter. If
some are still present, the RTAS call which isolates the PCI slot
returns error 9001 "valid outstanding translations" and the removal of
the IO adapter fails. This is because when the PHBs are scanned, Linux
automatically maps the INTx interrupts into the Linux interrupt number
space, but these mappings are never removed.

To solve this problem, we introduce a PPC platform-specific
pcibios_remove_bus() routine which clears all interrupt mappings when
the bus is removed. This also clears the associated page table entries
of the ESB pages when using XIVE.

For this purpose, we record the logical interrupt numbers of the
mapped interrupts in the PHB structure and let pcibios_remove_bus()
do the cleanup.

Since some PCI adapters, like GPUs, use the "interrupt-map" property
to describe interrupt mappings other than the legacy INTx interrupts,
we cannot restrict the size of the mapping array to PCI_NUM_INTX. The
number of interrupt mappings is computed from the "interrupt-map"
property and the mapping array is allocated accordingly.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200807101854.844619-1-clg@kaod.org

authored by

Cédric Le Goater and committed by
Michael Ellerman
3a3181e1 ffd2961b

+120
+6
arch/powerpc/include/asm/pci-bridge.h
··· 48 48 49 49 /* 50 50 * Structure of a PCI controller (host bridge) 51 + * 52 + * @irq_count: number of interrupt mappings 53 + * @irq_map: interrupt mappings 51 54 */ 52 55 struct pci_controller { 53 56 struct pci_bus *bus; ··· 130 127 131 128 void *private_data; 132 129 struct npu *npu; 130 + 131 + unsigned int irq_count; 132 + unsigned int *irq_map; 133 133 }; 134 134 135 135 /* These are used for config access before all the PCI probing
+114
arch/powerpc/kernel/pci-common.c
··· 354 354 } 355 355 356 356 /* 357 + * Assumption is made on the interrupt parent. All interrupt-map 358 + * entries are considered to have the same parent. 359 + */ 360 + static int pcibios_irq_map_count(struct pci_controller *phb) 361 + { 362 + const __be32 *imap; 363 + int imaplen; 364 + struct device_node *parent; 365 + u32 intsize, addrsize, parintsize, paraddrsize; 366 + 367 + if (of_property_read_u32(phb->dn, "#interrupt-cells", &intsize)) 368 + return 0; 369 + if (of_property_read_u32(phb->dn, "#address-cells", &addrsize)) 370 + return 0; 371 + 372 + imap = of_get_property(phb->dn, "interrupt-map", &imaplen); 373 + if (!imap) { 374 + pr_debug("%pOF : no interrupt-map\n", phb->dn); 375 + return 0; 376 + } 377 + imaplen /= sizeof(u32); 378 + pr_debug("%pOF : imaplen=%d\n", phb->dn, imaplen); 379 + 380 + if (imaplen < (addrsize + intsize + 1)) 381 + return 0; 382 + 383 + imap += intsize + addrsize; 384 + parent = of_find_node_by_phandle(be32_to_cpup(imap)); 385 + if (!parent) { 386 + pr_debug("%pOF : no imap parent found !\n", phb->dn); 387 + return 0; 388 + } 389 + 390 + if (of_property_read_u32(parent, "#interrupt-cells", &parintsize)) { 391 + pr_debug("%pOF : parent lacks #interrupt-cells!\n", phb->dn); 392 + return 0; 393 + } 394 + 395 + if (of_property_read_u32(parent, "#address-cells", &paraddrsize)) 396 + paraddrsize = 0; 397 + 398 + return imaplen / (addrsize + intsize + 1 + paraddrsize + parintsize); 399 + } 400 + 401 + static void pcibios_irq_map_init(struct pci_controller *phb) 402 + { 403 + phb->irq_count = pcibios_irq_map_count(phb); 404 + if (phb->irq_count < PCI_NUM_INTX) 405 + phb->irq_count = PCI_NUM_INTX; 406 + 407 + pr_debug("%pOF : interrupt map #%d\n", phb->dn, phb->irq_count); 408 + 409 + phb->irq_map = kcalloc(phb->irq_count, sizeof(unsigned int), 410 + GFP_KERNEL); 411 + } 412 + 413 + static void pci_irq_map_register(struct pci_dev *pdev, unsigned int virq) 414 + { 415 + struct pci_controller *phb = pci_bus_to_host(pdev->bus); 416 + 
int i; 417 + 418 + if (!phb->irq_map) 419 + return; 420 + 421 + for (i = 0; i < phb->irq_count; i++) { 422 + /* 423 + * Look for an empty or an equivalent slot, as INTx 424 + * interrupts can be shared between adapters. 425 + */ 426 + if (phb->irq_map[i] == virq || !phb->irq_map[i]) { 427 + phb->irq_map[i] = virq; 428 + break; 429 + } 430 + } 431 + 432 + if (i == phb->irq_count) 433 + pr_err("PCI:%s all platform interrupts mapped\n", 434 + pci_name(pdev)); 435 + } 436 + 437 + /* 438 + * Clearing the mapped interrupts will also clear the underlying 439 + * mappings of the ESB pages of the interrupts when under XIVE. It is 440 + * a requirement of PowerVM to clear all memory mappings before 441 + * removing a PHB. 442 + */ 443 + static void pci_irq_map_dispose(struct pci_bus *bus) 444 + { 445 + struct pci_controller *phb = pci_bus_to_host(bus); 446 + int i; 447 + 448 + if (!phb->irq_map) 449 + return; 450 + 451 + pr_debug("PCI: Clearing interrupt mappings for PHB %04x:%02x...\n", 452 + pci_domain_nr(bus), bus->number); 453 + for (i = 0; i < phb->irq_count; i++) 454 + irq_dispose_mapping(phb->irq_map[i]); 455 + 456 + kfree(phb->irq_map); 457 + } 458 + 459 + void pcibios_remove_bus(struct pci_bus *bus) 460 + { 461 + pci_irq_map_dispose(bus); 462 + } 463 + EXPORT_SYMBOL_GPL(pcibios_remove_bus); 464 + 465 + /* 357 466 * Reads the interrupt pin to determine if interrupt is use by card. 358 467 * If the interrupt is used, then gets the interrupt line from the 359 468 * openfirmware and sets it in the pci_dev and pci_config line. 
··· 510 401 511 402 pci_dev->irq = virq; 512 403 404 + /* Record all interrut mappings for later removal of a PHB */ 405 + pci_irq_map_register(pci_dev, virq); 513 406 return 0; 514 407 } 515 408 ··· 1664 1553 int mode; 1665 1554 1666 1555 pr_debug("PCI: Scanning PHB %pOF\n", node); 1556 + 1557 + /* Allocate interrupt mappings array */ 1558 + pcibios_irq_map_init(hose); 1667 1559 1668 1560 /* Get some IO space for the new PHB */ 1669 1561 pcibios_setup_phb_io_space(hose);