Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v4.10-rc3 2385 lines 67 kB view raw
1/* 2 * Copyright (c) Microsoft Corporation. 3 * 4 * Author: 5 * Jake Oshins <jakeo@microsoft.com> 6 * 7 * This driver acts as a paravirtual front-end for PCI Express root buses. 8 * When a PCI Express function (either an entire device or an SR-IOV 9 * Virtual Function) is being passed through to the VM, this driver exposes 10 * a new bus to the guest VM. This is modeled as a root PCI bus because 11 * no bridges are being exposed to the VM. In fact, with a "Generation 2" 12 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM 13 * until a device as been exposed using this driver. 14 * 15 * Each root PCI bus has its own PCI domain, which is called "Segment" in 16 * the PCI Firmware Specifications. Thus while each device passed through 17 * to the VM using this front-end will appear at "device 0", the domain will 18 * be unique. Typically, each bus will have one PCI function on it, though 19 * this driver does support more than one. 20 * 21 * In order to map the interrupts from the device through to the guest VM, 22 * this driver also implements an IRQ Domain, which handles interrupts (either 23 * MSI or MSI-X) associated with the functions on the bus. As interrupts are 24 * set up, torn down, or reaffined, this driver communicates with the 25 * underlying hypervisor to adjust the mappings in the I/O MMU so that each 26 * interrupt will be delivered to the correct virtual processor at the right 27 * vector. This driver does not support level-triggered (line-based) 28 * interrupts, and will report that the Interrupt Line register in the 29 * function's configuration space is zero. 30 * 31 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V 32 * facilities. For instance, the configuration space of a function exposed 33 * by Hyper-V is mapped into a single page of memory space, and the 34 * read and write handlers for config space must be aware of this mechanism. 35 * Similarly, device setup and teardown involves messages sent to and from 36 * the PCI back-end driver in Hyper-V. 37 * 38 * This program is free software; you can redistribute it and/or modify it 39 * under the terms of the GNU General Public License version 2 as published 40 * by the Free Software Foundation. 41 * 42 * This program is distributed in the hope that it will be useful, but 43 * WITHOUT ANY WARRANTY; without even the implied warranty of 44 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 45 * NON INFRINGEMENT. See the GNU General Public License for more 46 * details. 47 * 48 */ 49 50#include <linux/kernel.h> 51#include <linux/module.h> 52#include <linux/pci.h> 53#include <linux/semaphore.h> 54#include <linux/irqdomain.h> 55#include <asm/irqdomain.h> 56#include <asm/apic.h> 57#include <linux/msi.h> 58#include <linux/hyperv.h> 59#include <asm/mshyperv.h> 60 61/* 62 * Protocol versions. The low word is the minor version, the high word the 63 * major version. 64 */ 65 66#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (major))) 67#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16) 68#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff) 69 70enum { 71 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), 72 PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1 73}; 74 75#define PCI_CONFIG_MMIO_LENGTH 0x2000 76#define CFG_PAGE_OFFSET 0x1000 77#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) 78 79#define MAX_SUPPORTED_MSI_MESSAGES 0x400 80 81/* 82 * Message Types 83 */ 84 85enum pci_message_type { 86 /* 87 * Version 1.1 88 */ 89 PCI_MESSAGE_BASE = 0x42490000, 90 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, 91 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, 92 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, 93 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, 94 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, 95 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, 96 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, 97 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, 98 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, 99 PCI_EJECT = PCI_MESSAGE_BASE + 0xB, 100 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, 101 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, 102 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, 103 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, 104 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, 105 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, 106 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, 107 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, 108 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, 109 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, 110 PCI_MESSAGE_MAXIMUM 111}; 112 113/* 114 * Structures defining the virtual PCI Express protocol. 115 */ 116 117union pci_version { 118 struct { 119 u16 minor_version; 120 u16 major_version; 121 } parts; 122 u32 version; 123} __packed; 124 125/* 126 * Function numbers are 8-bits wide on Express, as interpreted through ARI, 127 * which is all this driver does. This representation is the one used in 128 * Windows, which is what is expected when sending this back and forth with 129 * the Hyper-V parent partition. 130 */ 131union win_slot_encoding { 132 struct { 133 u32 func:8; 134 u32 reserved:24; 135 } bits; 136 u32 slot; 137} __packed; 138 139/* 140 * Pretty much as defined in the PCI Specifications. 141 */ 142struct pci_function_description { 143 u16 v_id; /* vendor ID */ 144 u16 d_id; /* device ID */ 145 u8 rev; 146 u8 prog_intf; 147 u8 subclass; 148 u8 base_class; 149 u32 subsystem_id; 150 union win_slot_encoding win_slot; 151 u32 ser; /* serial number */ 152} __packed; 153 154/** 155 * struct hv_msi_desc 156 * @vector: IDT entry 157 * @delivery_mode: As defined in Intel's Programmer's 158 * Reference Manual, Volume 3, Chapter 8. 159 * @vector_count: Number of contiguous entries in the 160 * Interrupt Descriptor Table that are 161 * occupied by this Message-Signaled 162 * Interrupt. For "MSI", as first defined 163 * in PCI 2.2, this can be between 1 and 164 * 32. For "MSI-X," as first defined in PCI 165 * 3.0, this must be 1, as each MSI-X table 166 * entry would have its own descriptor. 167 * @reserved: Empty space 168 * @cpu_mask: All the target virtual processors. 169 */ 170struct hv_msi_desc { 171 u8 vector; 172 u8 delivery_mode; 173 u16 vector_count; 174 u32 reserved; 175 u64 cpu_mask; 176} __packed; 177 178/** 179 * struct tran_int_desc 180 * @reserved: unused, padding 181 * @vector_count: same as in hv_msi_desc 182 * @data: This is the "data payload" value that is 183 * written by the device when it generates 184 * a message-signaled interrupt, either MSI 185 * or MSI-X. 186 * @address: This is the address to which the data 187 * payload is written on interrupt 188 * generation. 189 */ 190struct tran_int_desc { 191 u16 reserved; 192 u16 vector_count; 193 u32 data; 194 u64 address; 195} __packed; 196 197/* 198 * A generic message format for virtual PCI. 199 * Specific message formats are defined later in the file. 200 */ 201 202struct pci_message { 203 u32 type; 204} __packed; 205 206struct pci_child_message { 207 struct pci_message message_type; 208 union win_slot_encoding wslot; 209} __packed; 210 211struct pci_incoming_message { 212 struct vmpacket_descriptor hdr; 213 struct pci_message message_type; 214} __packed; 215 216struct pci_response { 217 struct vmpacket_descriptor hdr; 218 s32 status; /* negative values are failures */ 219} __packed; 220 221struct pci_packet { 222 void (*completion_func)(void *context, struct pci_response *resp, 223 int resp_packet_size); 224 void *compl_ctxt; 225 226 struct pci_message message[0]; 227}; 228 229/* 230 * Specific message types supporting the PCI protocol. 231 */ 232 233/* 234 * Version negotiation message. Sent from the guest to the host. 235 * The guest is free to try different versions until the host 236 * accepts the version. 237 * 238 * pci_version: The protocol version requested. 239 * is_last_attempt: If TRUE, this is the last version guest will request. 240 * reservedz: Reserved field, set to zero. 241 */ 242 243struct pci_version_request { 244 struct pci_message message_type; 245 enum pci_message_type protocol_version; 246} __packed; 247 248/* 249 * Bus D0 Entry. This is sent from the guest to the host when the virtual 250 * bus (PCI Express port) is ready for action. 251 */ 252 253struct pci_bus_d0_entry { 254 struct pci_message message_type; 255 u32 reserved; 256 u64 mmio_base; 257} __packed; 258 259struct pci_bus_relations { 260 struct pci_incoming_message incoming; 261 u32 device_count; 262 struct pci_function_description func[0]; 263} __packed; 264 265struct pci_q_res_req_response { 266 struct vmpacket_descriptor hdr; 267 s32 status; /* negative values are failures */ 268 u32 probed_bar[6]; 269} __packed; 270 271struct pci_set_power { 272 struct pci_message message_type; 273 union win_slot_encoding wslot; 274 u32 power_state; /* In Windows terms */ 275 u32 reserved; 276} __packed; 277 278struct pci_set_power_response { 279 struct vmpacket_descriptor hdr; 280 s32 status; /* negative values are failures */ 281 union win_slot_encoding wslot; 282 u32 resultant_state; /* In Windows terms */ 283 u32 reserved; 284} __packed; 285 286struct pci_resources_assigned { 287 struct pci_message message_type; 288 union win_slot_encoding wslot; 289 u8 memory_range[0x14][6]; /* not used here */ 290 u32 msi_descriptors; 291 u32 reserved[4]; 292} __packed; 293 294struct pci_create_interrupt { 295 struct pci_message message_type; 296 union win_slot_encoding wslot; 297 struct hv_msi_desc int_desc; 298} __packed; 299 300struct pci_create_int_response { 301 struct pci_response response; 302 u32 reserved; 303 struct tran_int_desc int_desc; 304} __packed; 305 306struct pci_delete_interrupt { 307 struct pci_message message_type; 308 union win_slot_encoding wslot; 309 struct tran_int_desc int_desc; 310} __packed; 311 312struct pci_dev_incoming { 313 struct pci_incoming_message incoming; 314 union win_slot_encoding wslot; 315} __packed; 316 317struct pci_eject_response { 318 struct pci_message message_type; 319 union win_slot_encoding wslot; 320 u32 status; 321} __packed; 322 323static int pci_ring_size = (4 * PAGE_SIZE); 324 325/* 326 * Definitions or interrupt steering hypercall. 327 */ 328#define HV_PARTITION_ID_SELF ((u64)-1) 329#define HVCALL_RETARGET_INTERRUPT 0x7e 330 331struct retarget_msi_interrupt { 332 u64 partition_id; /* use "self" */ 333 u64 device_id; 334 u32 source; /* 1 for MSI(-X) */ 335 u32 reserved1; 336 u32 address; 337 u32 data; 338 u64 reserved2; 339 u32 vector; 340 u32 flags; 341 u64 vp_mask; 342} __packed; 343 344/* 345 * Driver specific state. 346 */ 347 348enum hv_pcibus_state { 349 hv_pcibus_init = 0, 350 hv_pcibus_probed, 351 hv_pcibus_installed, 352 hv_pcibus_maximum 353}; 354 355struct hv_pcibus_device { 356 struct pci_sysdata sysdata; 357 enum hv_pcibus_state state; 358 atomic_t remove_lock; 359 struct hv_device *hdev; 360 resource_size_t low_mmio_space; 361 resource_size_t high_mmio_space; 362 struct resource *mem_config; 363 struct resource *low_mmio_res; 364 struct resource *high_mmio_res; 365 struct completion *survey_event; 366 struct completion remove_event; 367 struct pci_bus *pci_bus; 368 spinlock_t config_lock; /* Avoid two threads writing index page */ 369 spinlock_t device_list_lock; /* Protect lists below */ 370 void __iomem *cfg_addr; 371 372 struct semaphore enum_sem; 373 struct list_head resources_for_children; 374 375 struct list_head children; 376 struct list_head dr_list; 377 378 struct msi_domain_info msi_info; 379 struct msi_controller msi_chip; 380 struct irq_domain *irq_domain; 381 struct retarget_msi_interrupt retarget_msi_interrupt_params; 382 spinlock_t retarget_msi_interrupt_lock; 383}; 384 385/* 386 * Tracks "Device Relations" messages from the host, which must be both 387 * processed in order and deferred so that they don't run in the context 388 * of the incoming packet callback. 389 */ 390struct hv_dr_work { 391 struct work_struct wrk; 392 struct hv_pcibus_device *bus; 393}; 394 395struct hv_dr_state { 396 struct list_head list_entry; 397 u32 device_count; 398 struct pci_function_description func[0]; 399}; 400 401enum hv_pcichild_state { 402 hv_pcichild_init = 0, 403 hv_pcichild_requirements, 404 hv_pcichild_resourced, 405 hv_pcichild_ejecting, 406 hv_pcichild_maximum 407}; 408 409enum hv_pcidev_ref_reason { 410 hv_pcidev_ref_invalid = 0, 411 hv_pcidev_ref_initial, 412 hv_pcidev_ref_by_slot, 413 hv_pcidev_ref_packet, 414 hv_pcidev_ref_pnp, 415 hv_pcidev_ref_childlist, 416 hv_pcidev_irqdata, 417 hv_pcidev_ref_max 418}; 419 420struct hv_pci_dev { 421 /* List protected by pci_rescan_remove_lock */ 422 struct list_head list_entry; 423 atomic_t refs; 424 enum hv_pcichild_state state; 425 struct pci_function_description desc; 426 bool reported_missing; 427 struct hv_pcibus_device *hbus; 428 struct work_struct wrk; 429 430 /* 431 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then 432 * read it back, for each of the BAR offsets within config space. 433 */ 434 u32 probed_bar[6]; 435}; 436 437struct hv_pci_compl { 438 struct completion host_event; 439 s32 completion_status; 440}; 441 442/** 443 * hv_pci_generic_compl() - Invoked for a completion packet 444 * @context: Set up by the sender of the packet. 445 * @resp: The response packet 446 * @resp_packet_size: Size in bytes of the packet 447 * 448 * This function is used to trigger an event and report status 449 * for any message for which the completion packet contains a 450 * status and nothing else. 451 */ 452static void hv_pci_generic_compl(void *context, struct pci_response *resp, 453 int resp_packet_size) 454{ 455 struct hv_pci_compl *comp_pkt = context; 456 457 if (resp_packet_size >= offsetofend(struct pci_response, status)) 458 comp_pkt->completion_status = resp->status; 459 else 460 comp_pkt->completion_status = -1; 461 462 complete(&comp_pkt->host_event); 463} 464 465static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 466 u32 wslot); 467static void get_pcichild(struct hv_pci_dev *hv_pcidev, 468 enum hv_pcidev_ref_reason reason); 469static void put_pcichild(struct hv_pci_dev *hv_pcidev, 470 enum hv_pcidev_ref_reason reason); 471 472static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus); 473static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus); 474 475/** 476 * devfn_to_wslot() - Convert from Linux PCI slot to Windows 477 * @devfn: The Linux representation of PCI slot 478 * 479 * Windows uses a slightly different representation of PCI slot. 480 * 481 * Return: The Windows representation 482 */ 483static u32 devfn_to_wslot(int devfn) 484{ 485 union win_slot_encoding wslot; 486 487 wslot.slot = 0; 488 wslot.bits.func = PCI_SLOT(devfn) | (PCI_FUNC(devfn) << 5); 489 490 return wslot.slot; 491} 492 493/** 494 * wslot_to_devfn() - Convert from Windows PCI slot to Linux 495 * @wslot: The Windows representation of PCI slot 496 * 497 * Windows uses a slightly different representation of PCI slot. 498 * 499 * Return: The Linux representation 500 */ 501static int wslot_to_devfn(u32 wslot) 502{ 503 union win_slot_encoding slot_no; 504 505 slot_no.slot = wslot; 506 return PCI_DEVFN(0, slot_no.bits.func); 507} 508 509/* 510 * PCI Configuration Space for these root PCI buses is implemented as a pair 511 * of pages in memory-mapped I/O space. Writing to the first page chooses 512 * the PCI function being written or read. Once the first page has been 513 * written to, the following page maps in the entire configuration space of 514 * the function. 515 */ 516 517/** 518 * _hv_pcifront_read_config() - Internal PCI config read 519 * @hpdev: The PCI driver's representation of the device 520 * @where: Offset within config space 521 * @size: Size of the transfer 522 * @val: Pointer to the buffer receiving the data 523 */ 524static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, 525 int size, u32 *val) 526{ 527 unsigned long flags; 528 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 529 530 /* 531 * If the attempt is to read the IDs or the ROM BAR, simulate that. 532 */ 533 if (where + size <= PCI_COMMAND) { 534 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size); 535 } else if (where >= PCI_CLASS_REVISION && where + size <= 536 PCI_CACHE_LINE_SIZE) { 537 memcpy(val, ((u8 *)&hpdev->desc.rev) + where - 538 PCI_CLASS_REVISION, size); 539 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <= 540 PCI_ROM_ADDRESS) { 541 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where - 542 PCI_SUBSYSTEM_VENDOR_ID, size); 543 } else if (where >= PCI_ROM_ADDRESS && where + size <= 544 PCI_CAPABILITY_LIST) { 545 /* ROM BARs are unimplemented */ 546 *val = 0; 547 } else if (where >= PCI_INTERRUPT_LINE && where + size <= 548 PCI_INTERRUPT_PIN) { 549 /* 550 * Interrupt Line and Interrupt PIN are hard-wired to zero 551 * because this front-end only supports message-signaled 552 * interrupts. 553 */ 554 *val = 0; 555 } else if (where + size <= CFG_PAGE_SIZE) { 556 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 557 /* Choose the function to be read. (See comment above) */ 558 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 559 /* Make sure the function was chosen before we start reading. */ 560 mb(); 561 /* Read from that function's config space. */ 562 switch (size) { 563 case 1: 564 *val = readb(addr); 565 break; 566 case 2: 567 *val = readw(addr); 568 break; 569 default: 570 *val = readl(addr); 571 break; 572 } 573 /* 574 * Make sure the write was done before we release the spinlock 575 * allowing consecutive reads/writes. 576 */ 577 mb(); 578 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 579 } else { 580 dev_err(&hpdev->hbus->hdev->device, 581 "Attempt to read beyond a function's config space.\n"); 582 } 583} 584 585/** 586 * _hv_pcifront_write_config() - Internal PCI config write 587 * @hpdev: The PCI driver's representation of the device 588 * @where: Offset within config space 589 * @size: Size of the transfer 590 * @val: The data being transferred 591 */ 592static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, 593 int size, u32 val) 594{ 595 unsigned long flags; 596 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 597 598 if (where >= PCI_SUBSYSTEM_VENDOR_ID && 599 where + size <= PCI_CAPABILITY_LIST) { 600 /* SSIDs and ROM BARs are read-only */ 601 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) { 602 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 603 /* Choose the function to be written. (See comment above) */ 604 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 605 /* Make sure the function was chosen before we start writing. */ 606 wmb(); 607 /* Write to that function's config space. */ 608 switch (size) { 609 case 1: 610 writeb(val, addr); 611 break; 612 case 2: 613 writew(val, addr); 614 break; 615 default: 616 writel(val, addr); 617 break; 618 } 619 /* 620 * Make sure the write was done before we release the spinlock 621 * allowing consecutive reads/writes. 622 */ 623 mb(); 624 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 625 } else { 626 dev_err(&hpdev->hbus->hdev->device, 627 "Attempt to write beyond a function's config space.\n"); 628 } 629} 630 631/** 632 * hv_pcifront_read_config() - Read configuration space 633 * @bus: PCI Bus structure 634 * @devfn: Device/function 635 * @where: Offset from base 636 * @size: Byte/word/dword 637 * @val: Value to be read 638 * 639 * Return: PCIBIOS_SUCCESSFUL on success 640 * PCIBIOS_DEVICE_NOT_FOUND on failure 641 */ 642static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn, 643 int where, int size, u32 *val) 644{ 645 struct hv_pcibus_device *hbus = 646 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 647 struct hv_pci_dev *hpdev; 648 649 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 650 if (!hpdev) 651 return PCIBIOS_DEVICE_NOT_FOUND; 652 653 _hv_pcifront_read_config(hpdev, where, size, val); 654 655 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 656 return PCIBIOS_SUCCESSFUL; 657} 658 659/** 660 * hv_pcifront_write_config() - Write configuration space 661 * @bus: PCI Bus structure 662 * @devfn: Device/function 663 * @where: Offset from base 664 * @size: Byte/word/dword 665 * @val: Value to be written to device 666 * 667 * Return: PCIBIOS_SUCCESSFUL on success 668 * PCIBIOS_DEVICE_NOT_FOUND on failure 669 */ 670static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn, 671 int where, int size, u32 val) 672{ 673 struct hv_pcibus_device *hbus = 674 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 675 struct hv_pci_dev *hpdev; 676 677 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 678 if (!hpdev) 679 return PCIBIOS_DEVICE_NOT_FOUND; 680 681 _hv_pcifront_write_config(hpdev, where, size, val); 682 683 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 684 return PCIBIOS_SUCCESSFUL; 685} 686 687/* PCIe operations */ 688static struct pci_ops hv_pcifront_ops = { 689 .read = hv_pcifront_read_config, 690 .write = hv_pcifront_write_config, 691}; 692 693/* Interrupt management hooks */ 694static void hv_int_desc_free(struct hv_pci_dev *hpdev, 695 struct tran_int_desc *int_desc) 696{ 697 struct pci_delete_interrupt *int_pkt; 698 struct { 699 struct pci_packet pkt; 700 u8 buffer[sizeof(struct pci_delete_interrupt)]; 701 } ctxt; 702 703 memset(&ctxt, 0, sizeof(ctxt)); 704 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; 705 int_pkt->message_type.type = 706 PCI_DELETE_INTERRUPT_MESSAGE; 707 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 708 int_pkt->int_desc = *int_desc; 709 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), 710 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0); 711 kfree(int_desc); 712} 713 714/** 715 * hv_msi_free() - Free the MSI. 716 * @domain: The interrupt domain pointer 717 * @info: Extra MSI-related context 718 * @irq: Identifies the IRQ. 719 * 720 * The Hyper-V parent partition and hypervisor are tracking the 721 * messages that are in use, keeping the interrupt redirection 722 * table up to date. This callback sends a message that frees 723 * the IRT entry and related tracking nonsense. 724 */ 725static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, 726 unsigned int irq) 727{ 728 struct hv_pcibus_device *hbus; 729 struct hv_pci_dev *hpdev; 730 struct pci_dev *pdev; 731 struct tran_int_desc *int_desc; 732 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq); 733 struct msi_desc *msi = irq_data_get_msi_desc(irq_data); 734 735 pdev = msi_desc_to_pci_dev(msi); 736 hbus = info->data; 737 int_desc = irq_data_get_irq_chip_data(irq_data); 738 if (!int_desc) 739 return; 740 741 irq_data->chip_data = NULL; 742 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 743 if (!hpdev) { 744 kfree(int_desc); 745 return; 746 } 747 748 hv_int_desc_free(hpdev, int_desc); 749 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 750} 751 752static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest, 753 bool force) 754{ 755 struct irq_data *parent = data->parent_data; 756 757 return parent->chip->irq_set_affinity(parent, dest, force); 758} 759 760static void hv_irq_mask(struct irq_data *data) 761{ 762 pci_msi_mask_irq(data); 763} 764 765/** 766 * hv_irq_unmask() - "Unmask" the IRQ by setting its current 767 * affinity. 768 * @data: Describes the IRQ 769 * 770 * Build new a destination for the MSI and make a hypercall to 771 * update the Interrupt Redirection Table. "Device Logical ID" 772 * is built out of this PCI bus's instance GUID and the function 773 * number of the device. 774 */ 775static void hv_irq_unmask(struct irq_data *data) 776{ 777 struct msi_desc *msi_desc = irq_data_get_msi_desc(data); 778 struct irq_cfg *cfg = irqd_cfg(data); 779 struct retarget_msi_interrupt *params; 780 struct hv_pcibus_device *hbus; 781 struct cpumask *dest; 782 struct pci_bus *pbus; 783 struct pci_dev *pdev; 784 int cpu; 785 unsigned long flags; 786 787 dest = irq_data_get_affinity_mask(data); 788 pdev = msi_desc_to_pci_dev(msi_desc); 789 pbus = pdev->bus; 790 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 791 792 spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); 793 794 params = &hbus->retarget_msi_interrupt_params; 795 memset(params, 0, sizeof(*params)); 796 params->partition_id = HV_PARTITION_ID_SELF; 797 params->source = 1; /* MSI(-X) */ 798 params->address = msi_desc->msg.address_lo; 799 params->data = msi_desc->msg.data; 800 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | 801 (hbus->hdev->dev_instance.b[4] << 16) | 802 (hbus->hdev->dev_instance.b[7] << 8) | 803 (hbus->hdev->dev_instance.b[6] & 0xf8) | 804 PCI_FUNC(pdev->devfn); 805 params->vector = cfg->vector; 806 807 for_each_cpu_and(cpu, dest, cpu_online_mask) 808 params->vp_mask |= (1ULL << vmbus_cpu_number_to_vp_number(cpu)); 809 810 hv_do_hypercall(HVCALL_RETARGET_INTERRUPT, params, NULL); 811 812 spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags); 813 814 pci_msi_unmask_irq(data); 815} 816 817struct compose_comp_ctxt { 818 struct hv_pci_compl comp_pkt; 819 struct tran_int_desc int_desc; 820}; 821 822static void hv_pci_compose_compl(void *context, struct pci_response *resp, 823 int resp_packet_size) 824{ 825 struct compose_comp_ctxt *comp_pkt = context; 826 struct pci_create_int_response *int_resp = 827 (struct pci_create_int_response *)resp; 828 829 comp_pkt->comp_pkt.completion_status = resp->status; 830 comp_pkt->int_desc = int_resp->int_desc; 831 complete(&comp_pkt->comp_pkt.host_event); 832} 833 834/** 835 * hv_compose_msi_msg() - Supplies a valid MSI address/data 836 * @data: Everything about this MSI 837 * @msg: Buffer that is filled in by this function 838 * 839 * This function unpacks the IRQ looking for target CPU set, IDT 840 * vector and mode and sends a message to the parent partition 841 * asking for a mapping for that tuple in this partition. The 842 * response supplies a data value and address to which that data 843 * should be written to trigger that interrupt. 844 */ 845static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 846{ 847 struct irq_cfg *cfg = irqd_cfg(data); 848 struct hv_pcibus_device *hbus; 849 struct hv_pci_dev *hpdev; 850 struct pci_bus *pbus; 851 struct pci_dev *pdev; 852 struct pci_create_interrupt *int_pkt; 853 struct compose_comp_ctxt comp; 854 struct tran_int_desc *int_desc; 855 struct cpumask *affinity; 856 struct { 857 struct pci_packet pkt; 858 u8 buffer[sizeof(struct pci_create_interrupt)]; 859 } ctxt; 860 int cpu; 861 int ret; 862 863 pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); 864 pbus = pdev->bus; 865 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 866 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 867 if (!hpdev) 868 goto return_null_message; 869 870 /* Free any previous message that might have already been composed. */ 871 if (data->chip_data) { 872 int_desc = data->chip_data; 873 data->chip_data = NULL; 874 hv_int_desc_free(hpdev, int_desc); 875 } 876 877 int_desc = kzalloc(sizeof(*int_desc), GFP_KERNEL); 878 if (!int_desc) 879 goto drop_reference; 880 881 memset(&ctxt, 0, sizeof(ctxt)); 882 init_completion(&comp.comp_pkt.host_event); 883 ctxt.pkt.completion_func = hv_pci_compose_compl; 884 ctxt.pkt.compl_ctxt = &comp; 885 int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message; 886 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; 887 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 888 int_pkt->int_desc.vector = cfg->vector; 889 int_pkt->int_desc.vector_count = 1; 890 int_pkt->int_desc.delivery_mode = 891 (apic->irq_delivery_mode == dest_LowestPrio) ? 1 : 0; 892 893 /* 894 * This bit doesn't have to work on machines with more than 64 895 * processors because Hyper-V only supports 64 in a guest. 896 */ 897 affinity = irq_data_get_affinity_mask(data); 898 for_each_cpu_and(cpu, affinity, cpu_online_mask) { 899 int_pkt->int_desc.cpu_mask |= 900 (1ULL << vmbus_cpu_number_to_vp_number(cpu)); 901 } 902 903 ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, 904 sizeof(*int_pkt), (unsigned long)&ctxt.pkt, 905 VM_PKT_DATA_INBAND, 906 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 907 if (ret) 908 goto free_int_desc; 909 910 wait_for_completion(&comp.comp_pkt.host_event); 911 912 if (comp.comp_pkt.completion_status < 0) { 913 dev_err(&hbus->hdev->device, 914 "Request for interrupt failed: 0x%x", 915 comp.comp_pkt.completion_status); 916 goto free_int_desc; 917 } 918 919 /* 920 * Record the assignment so that this can be unwound later. Using 921 * irq_set_chip_data() here would be appropriate, but the lock it takes 922 * is already held. 923 */ 924 *int_desc = comp.int_desc; 925 data->chip_data = int_desc; 926 927 /* Pass up the result. */ 928 msg->address_hi = comp.int_desc.address >> 32; 929 msg->address_lo = comp.int_desc.address & 0xffffffff; 930 msg->data = comp.int_desc.data; 931 932 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 933 return; 934 935free_int_desc: 936 kfree(int_desc); 937drop_reference: 938 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 939return_null_message: 940 msg->address_hi = 0; 941 msg->address_lo = 0; 942 msg->data = 0; 943} 944 945/* HW Interrupt Chip Descriptor */ 946static struct irq_chip hv_msi_irq_chip = { 947 .name = "Hyper-V PCIe MSI", 948 .irq_compose_msi_msg = hv_compose_msi_msg, 949 .irq_set_affinity = hv_set_affinity, 950 .irq_ack = irq_chip_ack_parent, 951 .irq_mask = hv_irq_mask, 952 .irq_unmask = hv_irq_unmask, 953}; 954 955static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info, 956 msi_alloc_info_t *arg) 957{ 958 return arg->msi_hwirq; 959} 960 961static struct msi_domain_ops hv_msi_ops = { 962 .get_hwirq = hv_msi_domain_ops_get_hwirq, 963 .msi_prepare = pci_msi_prepare, 964 .set_desc = pci_msi_set_desc, 965 .msi_free = hv_msi_free, 966}; 967 968/** 969 * hv_pcie_init_irq_domain() - Initialize IRQ domain 970 * @hbus: The root PCI bus 971 * 972 * This function creates an IRQ domain which will be used for 973 * interrupts from devices that have been passed through. These 974 * devices only support MSI and MSI-X, not line-based interrupts 975 * or simulations of line-based interrupts through PCIe's 976 * fabric-layer messages. Because interrupts are remapped, we 977 * can support multi-message MSI here. 978 * 979 * Return: '0' on success and error value on failure 980 */ 981static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) 982{ 983 hbus->msi_info.chip = &hv_msi_irq_chip; 984 hbus->msi_info.ops = &hv_msi_ops; 985 hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS | 986 MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | 987 MSI_FLAG_PCI_MSIX); 988 hbus->msi_info.handler = handle_edge_irq; 989 hbus->msi_info.handler_name = "edge"; 990 hbus->msi_info.data = hbus; 991 hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode, 992 &hbus->msi_info, 993 x86_vector_domain); 994 if (!hbus->irq_domain) { 995 dev_err(&hbus->hdev->device, 996 "Failed to build an MSI IRQ domain\n"); 997 return -ENODEV; 998 } 999 1000 return 0; 1001} 1002 1003/** 1004 * get_bar_size() - Get the address space consumed by a BAR 1005 * @bar_val: Value that a BAR returned after -1 was written 1006 * to it. 1007 * 1008 * This function returns the size of the BAR, rounded up to 1 1009 * page. It has to be rounded up because the hypervisor's page 1010 * table entry that maps the BAR into the VM can't specify an 1011 * offset within a page. The invariant is that the hypervisor 1012 * must place any BARs of smaller than page length at the 1013 * beginning of a page. 1014 * 1015 * Return: Size in bytes of the consumed MMIO space. 1016 */ 1017static u64 get_bar_size(u64 bar_val) 1018{ 1019 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)), 1020 PAGE_SIZE); 1021} 1022 1023/** 1024 * survey_child_resources() - Total all MMIO requirements 1025 * @hbus: Root PCI bus, as understood by this driver 1026 */ 1027static void survey_child_resources(struct hv_pcibus_device *hbus) 1028{ 1029 struct list_head *iter; 1030 struct hv_pci_dev *hpdev; 1031 resource_size_t bar_size = 0; 1032 unsigned long flags; 1033 struct completion *event; 1034 u64 bar_val; 1035 int i; 1036 1037 /* If nobody is waiting on the answer, don't compute it. */ 1038 event = xchg(&hbus->survey_event, NULL); 1039 if (!event) 1040 return; 1041 1042 /* If the answer has already been computed, go with it. */ 1043 if (hbus->low_mmio_space || hbus->high_mmio_space) { 1044 complete(event); 1045 return; 1046 } 1047 1048 spin_lock_irqsave(&hbus->device_list_lock, flags); 1049 1050 /* 1051 * Due to an interesting quirk of the PCI spec, all memory regions 1052 * for a child device are a power of 2 in size and aligned in memory, 1053 * so it's sufficient to just add them up without tracking alignment. 1054 */ 1055 list_for_each(iter, &hbus->children) { 1056 hpdev = container_of(iter, struct hv_pci_dev, list_entry); 1057 for (i = 0; i < 6; i++) { 1058 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) 1059 dev_err(&hbus->hdev->device, 1060 "There's an I/O BAR in this list!\n"); 1061 1062 if (hpdev->probed_bar[i] != 0) { 1063 /* 1064 * A probed BAR has all the upper bits set that 1065 * can be changed. 1066 */ 1067 1068 bar_val = hpdev->probed_bar[i]; 1069 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 1070 bar_val |= 1071 ((u64)hpdev->probed_bar[++i] << 32); 1072 else 1073 bar_val |= 0xffffffff00000000ULL; 1074 1075 bar_size = get_bar_size(bar_val); 1076 1077 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 1078 hbus->high_mmio_space += bar_size; 1079 else 1080 hbus->low_mmio_space += bar_size; 1081 } 1082 } 1083 } 1084 1085 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1086 complete(event); 1087} 1088 1089/** 1090 * prepopulate_bars() - Fill in BARs with defaults 1091 * @hbus: Root PCI bus, as understood by this driver 1092 * 1093 * The core PCI driver code seems much, much happier if the BARs 1094 * for a device have values upon first scan. So fill them in. 1095 * The algorithm below works down from large sizes to small, 1096 * attempting to pack the assignments optimally. The assumption, 1097 * enforced in other parts of the code, is that the beginning of 1098 * the memory-mapped I/O space will be aligned on the largest 1099 * BAR size. 1100 */ 1101static void prepopulate_bars(struct hv_pcibus_device *hbus) 1102{ 1103 resource_size_t high_size = 0; 1104 resource_size_t low_size = 0; 1105 resource_size_t high_base = 0; 1106 resource_size_t low_base = 0; 1107 resource_size_t bar_size; 1108 struct hv_pci_dev *hpdev; 1109 struct list_head *iter; 1110 unsigned long flags; 1111 u64 bar_val; 1112 u32 command; 1113 bool high; 1114 int i; 1115 1116 if (hbus->low_mmio_space) { 1117 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 1118 low_base = hbus->low_mmio_res->start; 1119 } 1120 1121 if (hbus->high_mmio_space) { 1122 high_size = 1ULL << 1123 (63 - __builtin_clzll(hbus->high_mmio_space)); 1124 high_base = hbus->high_mmio_res->start; 1125 } 1126 1127 spin_lock_irqsave(&hbus->device_list_lock, flags); 1128 1129 /* Pick addresses for the BARs. */ 1130 do { 1131 list_for_each(iter, &hbus->children) { 1132 hpdev = container_of(iter, struct hv_pci_dev, 1133 list_entry); 1134 for (i = 0; i < 6; i++) { 1135 bar_val = hpdev->probed_bar[i]; 1136 if (bar_val == 0) 1137 continue; 1138 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64; 1139 if (high) { 1140 bar_val |= 1141 ((u64)hpdev->probed_bar[i + 1] 1142 << 32); 1143 } else { 1144 bar_val |= 0xffffffffULL << 32; 1145 } 1146 bar_size = get_bar_size(bar_val); 1147 if (high) { 1148 if (high_size != bar_size) { 1149 i++; 1150 continue; 1151 } 1152 _hv_pcifront_write_config(hpdev, 1153 PCI_BASE_ADDRESS_0 + (4 * i), 1154 4, 1155 (u32)(high_base & 0xffffff00)); 1156 i++; 1157 _hv_pcifront_write_config(hpdev, 1158 PCI_BASE_ADDRESS_0 + (4 * i), 1159 4, (u32)(high_base >> 32)); 1160 high_base += bar_size; 1161 } else { 1162 if (low_size != bar_size) 1163 continue; 1164 _hv_pcifront_write_config(hpdev, 1165 PCI_BASE_ADDRESS_0 + (4 * i), 1166 4, 1167 (u32)(low_base & 0xffffff00)); 1168 low_base += bar_size; 1169 } 1170 } 1171 if (high_size <= 1 && low_size <= 1) { 1172 /* Set the memory enable bit. */ 1173 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, 1174 &command); 1175 command |= PCI_COMMAND_MEMORY; 1176 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, 1177 command); 1178 break; 1179 } 1180 } 1181 1182 high_size >>= 1; 1183 low_size >>= 1; 1184 } while (high_size || low_size); 1185 1186 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1187} 1188 1189/** 1190 * create_root_hv_pci_bus() - Expose a new root PCI bus 1191 * @hbus: Root PCI bus, as understood by this driver 1192 * 1193 * Return: 0 on success, -errno on failure 1194 */ 1195static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) 1196{ 1197 /* Register the device */ 1198 hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device, 1199 0, /* bus number is always zero */ 1200 &hv_pcifront_ops, 1201 &hbus->sysdata, 1202 &hbus->resources_for_children); 1203 if (!hbus->pci_bus) 1204 return -ENODEV; 1205 1206 hbus->pci_bus->msi = &hbus->msi_chip; 1207 hbus->pci_bus->msi->dev = &hbus->hdev->device; 1208 1209 pci_scan_child_bus(hbus->pci_bus); 1210 pci_bus_assign_resources(hbus->pci_bus); 1211 pci_bus_add_devices(hbus->pci_bus); 1212 hbus->state = hv_pcibus_installed; 1213 return 0; 1214} 1215 1216struct q_res_req_compl { 1217 struct completion host_event; 1218 struct hv_pci_dev *hpdev; 1219}; 1220 1221/** 1222 * q_resource_requirements() - Query Resource Requirements 1223 * @context: The completion context. 1224 * @resp: The response that came from the host. 1225 * @resp_packet_size: The size in bytes of resp. 1226 * 1227 * This function is invoked on completion of a Query Resource 1228 * Requirements packet. 1229 */ 1230static void q_resource_requirements(void *context, struct pci_response *resp, 1231 int resp_packet_size) 1232{ 1233 struct q_res_req_compl *completion = context; 1234 struct pci_q_res_req_response *q_res_req = 1235 (struct pci_q_res_req_response *)resp; 1236 int i; 1237 1238 if (resp->status < 0) { 1239 dev_err(&completion->hpdev->hbus->hdev->device, 1240 "query resource requirements failed: %x\n", 1241 resp->status); 1242 } else { 1243 for (i = 0; i < 6; i++) { 1244 completion->hpdev->probed_bar[i] = 1245 q_res_req->probed_bar[i]; 1246 } 1247 } 1248 1249 complete(&completion->host_event); 1250} 1251 1252static void get_pcichild(struct hv_pci_dev *hpdev, 1253 enum hv_pcidev_ref_reason reason) 1254{ 1255 atomic_inc(&hpdev->refs); 1256} 1257 1258static void put_pcichild(struct hv_pci_dev *hpdev, 1259 enum hv_pcidev_ref_reason reason) 1260{ 1261 if (atomic_dec_and_test(&hpdev->refs)) 1262 kfree(hpdev); 1263} 1264 1265/** 1266 * new_pcichild_device() - Create a new child device 1267 * @hbus: The internal struct tracking this root PCI bus. 1268 * @desc: The information supplied so far from the host 1269 * about the device. 1270 * 1271 * This function creates the tracking structure for a new child 1272 * device and kicks off the process of figuring out what it is. 1273 * 1274 * Return: Pointer to the new tracking struct 1275 */ 1276static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus, 1277 struct pci_function_description *desc) 1278{ 1279 struct hv_pci_dev *hpdev; 1280 struct pci_child_message *res_req; 1281 struct q_res_req_compl comp_pkt; 1282 struct { 1283 struct pci_packet init_packet; 1284 u8 buffer[sizeof(struct pci_child_message)]; 1285 } pkt; 1286 unsigned long flags; 1287 int ret; 1288 1289 hpdev = kzalloc(sizeof(*hpdev), GFP_ATOMIC); 1290 if (!hpdev) 1291 return NULL; 1292 1293 hpdev->hbus = hbus; 1294 1295 memset(&pkt, 0, sizeof(pkt)); 1296 init_completion(&comp_pkt.host_event); 1297 comp_pkt.hpdev = hpdev; 1298 pkt.init_packet.compl_ctxt = &comp_pkt; 1299 pkt.init_packet.completion_func = q_resource_requirements; 1300 res_req = (struct pci_child_message *)&pkt.init_packet.message; 1301 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; 1302 res_req->wslot.slot = desc->win_slot.slot; 1303 1304 ret = vmbus_sendpacket(hbus->hdev->channel, res_req, 1305 sizeof(struct pci_child_message), 1306 (unsigned long)&pkt.init_packet, 1307 VM_PKT_DATA_INBAND, 1308 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1309 if (ret) 1310 goto error; 1311 1312 wait_for_completion(&comp_pkt.host_event); 1313 1314 hpdev->desc = *desc; 1315 get_pcichild(hpdev, hv_pcidev_ref_initial); 1316 get_pcichild(hpdev, hv_pcidev_ref_childlist); 1317 spin_lock_irqsave(&hbus->device_list_lock, flags); 1318 list_add_tail(&hpdev->list_entry, &hbus->children); 1319 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1320 return hpdev; 1321 1322error: 1323 kfree(hpdev); 1324 return NULL; 1325} 1326 1327/** 1328 * get_pcichild_wslot() - Find device from slot 1329 * @hbus: Root PCI bus, as understood by this driver 1330 * @wslot: Location on the bus 1331 * 1332 * This function looks up a PCI device and returns the internal 1333 * representation of it. It acquires a reference on it, so that 1334 * the device won't be deleted while somebody is using it. The 1335 * caller is responsible for calling put_pcichild() to release 1336 * this reference. 1337 * 1338 * Return: Internal representation of a PCI device 1339 */ 1340static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 1341 u32 wslot) 1342{ 1343 unsigned long flags; 1344 struct hv_pci_dev *iter, *hpdev = NULL; 1345 1346 spin_lock_irqsave(&hbus->device_list_lock, flags); 1347 list_for_each_entry(iter, &hbus->children, list_entry) { 1348 if (iter->desc.win_slot.slot == wslot) { 1349 hpdev = iter; 1350 get_pcichild(hpdev, hv_pcidev_ref_by_slot); 1351 break; 1352 } 1353 } 1354 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1355 1356 return hpdev; 1357} 1358 1359/** 1360 * pci_devices_present_work() - Handle new list of child devices 1361 * @work: Work struct embedded in struct hv_dr_work 1362 * 1363 * "Bus Relations" is the Windows term for "children of this 1364 * bus." The terminology is preserved here for people trying to 1365 * debug the interaction between Hyper-V and Linux. This 1366 * function is called when the parent partition reports a list 1367 * of functions that should be observed under this PCI Express 1368 * port (bus). 1369 * 1370 * This function updates the list, and must tolerate being 1371 * called multiple times with the same information. The typical 1372 * number of child devices is one, with very atypical cases 1373 * involving three or four, so the algorithms used here can be 1374 * simple and inefficient. 1375 * 1376 * It must also treat the omission of a previously observed device as 1377 * notification that the device no longer exists. 1378 * 1379 * Note that this function is a work item, and it may not be 1380 * invoked in the order that it was queued. Back to back 1381 * updates of the list of present devices may involve queuing 1382 * multiple work items, and this one may run before ones that 1383 * were sent later. As such, this function only does something 1384 * if is the last one in the queue. 1385 */ 1386static void pci_devices_present_work(struct work_struct *work) 1387{ 1388 u32 child_no; 1389 bool found; 1390 struct list_head *iter; 1391 struct pci_function_description *new_desc; 1392 struct hv_pci_dev *hpdev; 1393 struct hv_pcibus_device *hbus; 1394 struct list_head removed; 1395 struct hv_dr_work *dr_wrk; 1396 struct hv_dr_state *dr = NULL; 1397 unsigned long flags; 1398 1399 dr_wrk = container_of(work, struct hv_dr_work, wrk); 1400 hbus = dr_wrk->bus; 1401 kfree(dr_wrk); 1402 1403 INIT_LIST_HEAD(&removed); 1404 1405 if (down_interruptible(&hbus->enum_sem)) { 1406 put_hvpcibus(hbus); 1407 return; 1408 } 1409 1410 /* Pull this off the queue and process it if it was the last one. */ 1411 spin_lock_irqsave(&hbus->device_list_lock, flags); 1412 while (!list_empty(&hbus->dr_list)) { 1413 dr = list_first_entry(&hbus->dr_list, struct hv_dr_state, 1414 list_entry); 1415 list_del(&dr->list_entry); 1416 1417 /* Throw this away if the list still has stuff in it. */ 1418 if (!list_empty(&hbus->dr_list)) { 1419 kfree(dr); 1420 continue; 1421 } 1422 } 1423 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1424 1425 if (!dr) { 1426 up(&hbus->enum_sem); 1427 put_hvpcibus(hbus); 1428 return; 1429 } 1430 1431 /* First, mark all existing children as reported missing. */ 1432 spin_lock_irqsave(&hbus->device_list_lock, flags); 1433 list_for_each(iter, &hbus->children) { 1434 hpdev = container_of(iter, struct hv_pci_dev, 1435 list_entry); 1436 hpdev->reported_missing = true; 1437 } 1438 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1439 1440 /* Next, add back any reported devices. */ 1441 for (child_no = 0; child_no < dr->device_count; child_no++) { 1442 found = false; 1443 new_desc = &dr->func[child_no]; 1444 1445 spin_lock_irqsave(&hbus->device_list_lock, flags); 1446 list_for_each(iter, &hbus->children) { 1447 hpdev = container_of(iter, struct hv_pci_dev, 1448 list_entry); 1449 if ((hpdev->desc.win_slot.slot == 1450 new_desc->win_slot.slot) && 1451 (hpdev->desc.v_id == new_desc->v_id) && 1452 (hpdev->desc.d_id == new_desc->d_id) && 1453 (hpdev->desc.ser == new_desc->ser)) { 1454 hpdev->reported_missing = false; 1455 found = true; 1456 } 1457 } 1458 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1459 1460 if (!found) { 1461 hpdev = new_pcichild_device(hbus, new_desc); 1462 if (!hpdev) 1463 dev_err(&hbus->hdev->device, 1464 "couldn't record a child device.\n"); 1465 } 1466 } 1467 1468 /* Move missing children to a list on the stack. */ 1469 spin_lock_irqsave(&hbus->device_list_lock, flags); 1470 do { 1471 found = false; 1472 list_for_each(iter, &hbus->children) { 1473 hpdev = container_of(iter, struct hv_pci_dev, 1474 list_entry); 1475 if (hpdev->reported_missing) { 1476 found = true; 1477 put_pcichild(hpdev, hv_pcidev_ref_childlist); 1478 list_move_tail(&hpdev->list_entry, &removed); 1479 break; 1480 } 1481 } 1482 } while (found); 1483 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1484 1485 /* Delete everything that should no longer exist. */ 1486 while (!list_empty(&removed)) { 1487 hpdev = list_first_entry(&removed, struct hv_pci_dev, 1488 list_entry); 1489 list_del(&hpdev->list_entry); 1490 put_pcichild(hpdev, hv_pcidev_ref_initial); 1491 } 1492 1493 /* Tell the core to rescan bus because there may have been changes. */ 1494 if (hbus->state == hv_pcibus_installed) { 1495 pci_lock_rescan_remove(); 1496 pci_scan_child_bus(hbus->pci_bus); 1497 pci_unlock_rescan_remove(); 1498 } else { 1499 survey_child_resources(hbus); 1500 } 1501 1502 up(&hbus->enum_sem); 1503 put_hvpcibus(hbus); 1504 kfree(dr); 1505} 1506 1507/** 1508 * hv_pci_devices_present() - Handles list of new children 1509 * @hbus: Root PCI bus, as understood by this driver 1510 * @relations: Packet from host listing children 1511 * 1512 * This function is invoked whenever a new list of devices for 1513 * this bus appears. 1514 */ 1515static void hv_pci_devices_present(struct hv_pcibus_device *hbus, 1516 struct pci_bus_relations *relations) 1517{ 1518 struct hv_dr_state *dr; 1519 struct hv_dr_work *dr_wrk; 1520 unsigned long flags; 1521 1522 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT); 1523 if (!dr_wrk) 1524 return; 1525 1526 dr = kzalloc(offsetof(struct hv_dr_state, func) + 1527 (sizeof(struct pci_function_description) * 1528 (relations->device_count)), GFP_NOWAIT); 1529 if (!dr) { 1530 kfree(dr_wrk); 1531 return; 1532 } 1533 1534 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work); 1535 dr_wrk->bus = hbus; 1536 dr->device_count = relations->device_count; 1537 if (dr->device_count != 0) { 1538 memcpy(dr->func, relations->func, 1539 sizeof(struct pci_function_description) * 1540 dr->device_count); 1541 } 1542 1543 spin_lock_irqsave(&hbus->device_list_lock, flags); 1544 list_add_tail(&dr->list_entry, &hbus->dr_list); 1545 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1546 1547 get_hvpcibus(hbus); 1548 schedule_work(&dr_wrk->wrk); 1549} 1550 1551/** 1552 * hv_eject_device_work() - Asynchronously handles ejection 1553 * @work: Work struct embedded in internal device struct 1554 * 1555 * This function handles ejecting a device. Windows will 1556 * attempt to gracefully eject a device, waiting 60 seconds to 1557 * hear back from the guest OS that this completed successfully. 1558 * If this timer expires, the device will be forcibly removed. 1559 */ 1560static void hv_eject_device_work(struct work_struct *work) 1561{ 1562 struct pci_eject_response *ejct_pkt; 1563 struct hv_pci_dev *hpdev; 1564 struct pci_dev *pdev; 1565 unsigned long flags; 1566 int wslot; 1567 struct { 1568 struct pci_packet pkt; 1569 u8 buffer[sizeof(struct pci_eject_response)]; 1570 } ctxt; 1571 1572 hpdev = container_of(work, struct hv_pci_dev, wrk); 1573 1574 if (hpdev->state != hv_pcichild_ejecting) { 1575 put_pcichild(hpdev, hv_pcidev_ref_pnp); 1576 return; 1577 } 1578 1579 /* 1580 * Ejection can come before or after the PCI bus has been set up, so 1581 * attempt to find it and tear down the bus state, if it exists. This 1582 * must be done without constructs like pci_domain_nr(hbus->pci_bus) 1583 * because hbus->pci_bus may not exist yet. 1584 */ 1585 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot); 1586 pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0, 1587 wslot); 1588 if (pdev) { 1589 pci_stop_and_remove_bus_device(pdev); 1590 pci_dev_put(pdev); 1591 } 1592 1593 spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags); 1594 list_del(&hpdev->list_entry); 1595 spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags); 1596 1597 memset(&ctxt, 0, sizeof(ctxt)); 1598 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message; 1599 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; 1600 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot; 1601 vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt, 1602 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt, 1603 VM_PKT_DATA_INBAND, 0); 1604 1605 put_pcichild(hpdev, hv_pcidev_ref_childlist); 1606 put_pcichild(hpdev, hv_pcidev_ref_pnp); 1607 put_hvpcibus(hpdev->hbus); 1608} 1609 1610/** 1611 * hv_pci_eject_device() - Handles device ejection 1612 * @hpdev: Internal device tracking struct 1613 * 1614 * This function is invoked when an ejection packet arrives. It 1615 * just schedules work so that we don't re-enter the packet 1616 * delivery code handling the ejection. 1617 */ 1618static void hv_pci_eject_device(struct hv_pci_dev *hpdev) 1619{ 1620 hpdev->state = hv_pcichild_ejecting; 1621 get_pcichild(hpdev, hv_pcidev_ref_pnp); 1622 INIT_WORK(&hpdev->wrk, hv_eject_device_work); 1623 get_hvpcibus(hpdev->hbus); 1624 schedule_work(&hpdev->wrk); 1625} 1626 1627/** 1628 * hv_pci_onchannelcallback() - Handles incoming packets 1629 * @context: Internal bus tracking struct 1630 * 1631 * This function is invoked whenever the host sends a packet to 1632 * this channel (which is private to this root PCI bus). 1633 */ 1634static void hv_pci_onchannelcallback(void *context) 1635{ 1636 const int packet_size = 0x100; 1637 int ret; 1638 struct hv_pcibus_device *hbus = context; 1639 u32 bytes_recvd; 1640 u64 req_id; 1641 struct vmpacket_descriptor *desc; 1642 unsigned char *buffer; 1643 int bufferlen = packet_size; 1644 struct pci_packet *comp_packet; 1645 struct pci_response *response; 1646 struct pci_incoming_message *new_message; 1647 struct pci_bus_relations *bus_rel; 1648 struct pci_dev_incoming *dev_message; 1649 struct hv_pci_dev *hpdev; 1650 1651 buffer = kmalloc(bufferlen, GFP_ATOMIC); 1652 if (!buffer) 1653 return; 1654 1655 while (1) { 1656 ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer, 1657 bufferlen, &bytes_recvd, &req_id); 1658 1659 if (ret == -ENOBUFS) { 1660 kfree(buffer); 1661 /* Handle large packet */ 1662 bufferlen = bytes_recvd; 1663 buffer = kmalloc(bytes_recvd, GFP_ATOMIC); 1664 if (!buffer) 1665 return; 1666 continue; 1667 } 1668 1669 /* Zero length indicates there are no more packets. */ 1670 if (ret || !bytes_recvd) 1671 break; 1672 1673 /* 1674 * All incoming packets must be at least as large as a 1675 * response. 1676 */ 1677 if (bytes_recvd <= sizeof(struct pci_response)) 1678 continue; 1679 desc = (struct vmpacket_descriptor *)buffer; 1680 1681 switch (desc->type) { 1682 case VM_PKT_COMP: 1683 1684 /* 1685 * The host is trusted, and thus it's safe to interpret 1686 * this transaction ID as a pointer. 1687 */ 1688 comp_packet = (struct pci_packet *)req_id; 1689 response = (struct pci_response *)buffer; 1690 comp_packet->completion_func(comp_packet->compl_ctxt, 1691 response, 1692 bytes_recvd); 1693 break; 1694 1695 case VM_PKT_DATA_INBAND: 1696 1697 new_message = (struct pci_incoming_message *)buffer; 1698 switch (new_message->message_type.type) { 1699 case PCI_BUS_RELATIONS: 1700 1701 bus_rel = (struct pci_bus_relations *)buffer; 1702 if (bytes_recvd < 1703 offsetof(struct pci_bus_relations, func) + 1704 (sizeof(struct pci_function_description) * 1705 (bus_rel->device_count))) { 1706 dev_err(&hbus->hdev->device, 1707 "bus relations too small\n"); 1708 break; 1709 } 1710 1711 hv_pci_devices_present(hbus, bus_rel); 1712 break; 1713 1714 case PCI_EJECT: 1715 1716 dev_message = (struct pci_dev_incoming *)buffer; 1717 hpdev = get_pcichild_wslot(hbus, 1718 dev_message->wslot.slot); 1719 if (hpdev) { 1720 hv_pci_eject_device(hpdev); 1721 put_pcichild(hpdev, 1722 hv_pcidev_ref_by_slot); 1723 } 1724 break; 1725 1726 default: 1727 dev_warn(&hbus->hdev->device, 1728 "Unimplemented protocol message %x\n", 1729 new_message->message_type.type); 1730 break; 1731 } 1732 break; 1733 1734 default: 1735 dev_err(&hbus->hdev->device, 1736 "unhandled packet type %d, tid %llx len %d\n", 1737 desc->type, req_id, bytes_recvd); 1738 break; 1739 } 1740 } 1741 1742 kfree(buffer); 1743} 1744 1745/** 1746 * hv_pci_protocol_negotiation() - Set up protocol 1747 * @hdev: VMBus's tracking struct for this root PCI bus 1748 * 1749 * This driver is intended to support running on Windows 10 1750 * (server) and later versions. It will not run on earlier 1751 * versions, as they assume that many of the operations which 1752 * Linux needs accomplished with a spinlock held were done via 1753 * asynchronous messaging via VMBus. Windows 10 increases the 1754 * surface area of PCI emulation so that these actions can take 1755 * place by suspending a virtual processor for their duration. 1756 * 1757 * This function negotiates the channel protocol version, 1758 * failing if the host doesn't support the necessary protocol 1759 * level. 1760 */ 1761static int hv_pci_protocol_negotiation(struct hv_device *hdev) 1762{ 1763 struct pci_version_request *version_req; 1764 struct hv_pci_compl comp_pkt; 1765 struct pci_packet *pkt; 1766 int ret; 1767 1768 /* 1769 * Initiate the handshake with the host and negotiate 1770 * a version that the host can support. We start with the 1771 * highest version number and go down if the host cannot 1772 * support it. 1773 */ 1774 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL); 1775 if (!pkt) 1776 return -ENOMEM; 1777 1778 init_completion(&comp_pkt.host_event); 1779 pkt->completion_func = hv_pci_generic_compl; 1780 pkt->compl_ctxt = &comp_pkt; 1781 version_req = (struct pci_version_request *)&pkt->message; 1782 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; 1783 version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT; 1784 1785 ret = vmbus_sendpacket(hdev->channel, version_req, 1786 sizeof(struct pci_version_request), 1787 (unsigned long)pkt, VM_PKT_DATA_INBAND, 1788 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1789 if (ret) 1790 goto exit; 1791 1792 wait_for_completion(&comp_pkt.host_event); 1793 1794 if (comp_pkt.completion_status < 0) { 1795 dev_err(&hdev->device, 1796 "PCI Pass-through VSP failed version request %x\n", 1797 comp_pkt.completion_status); 1798 ret = -EPROTO; 1799 goto exit; 1800 } 1801 1802 ret = 0; 1803 1804exit: 1805 kfree(pkt); 1806 return ret; 1807} 1808 1809/** 1810 * hv_pci_free_bridge_windows() - Release memory regions for the 1811 * bus 1812 * @hbus: Root PCI bus, as understood by this driver 1813 */ 1814static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus) 1815{ 1816 /* 1817 * Set the resources back to the way they looked when they 1818 * were allocated by setting IORESOURCE_BUSY again. 1819 */ 1820 1821 if (hbus->low_mmio_space && hbus->low_mmio_res) { 1822 hbus->low_mmio_res->flags |= IORESOURCE_BUSY; 1823 vmbus_free_mmio(hbus->low_mmio_res->start, 1824 resource_size(hbus->low_mmio_res)); 1825 } 1826 1827 if (hbus->high_mmio_space && hbus->high_mmio_res) { 1828 hbus->high_mmio_res->flags |= IORESOURCE_BUSY; 1829 vmbus_free_mmio(hbus->high_mmio_res->start, 1830 resource_size(hbus->high_mmio_res)); 1831 } 1832} 1833 1834/** 1835 * hv_pci_allocate_bridge_windows() - Allocate memory regions 1836 * for the bus 1837 * @hbus: Root PCI bus, as understood by this driver 1838 * 1839 * This function calls vmbus_allocate_mmio(), which is itself a 1840 * bit of a compromise. Ideally, we might change the pnp layer 1841 * in the kernel such that it comprehends either PCI devices 1842 * which are "grandchildren of ACPI," with some intermediate bus 1843 * node (in this case, VMBus) or change it such that it 1844 * understands VMBus. The pnp layer, however, has been declared 1845 * deprecated, and not subject to change. 1846 * 1847 * The workaround, implemented here, is to ask VMBus to allocate 1848 * MMIO space for this bus. VMBus itself knows which ranges are 1849 * appropriate by looking at its own ACPI objects. Then, after 1850 * these ranges are claimed, they're modified to look like they 1851 * would have looked if the ACPI and pnp code had allocated 1852 * bridge windows. These descriptors have to exist in this form 1853 * in order to satisfy the code which will get invoked when the 1854 * endpoint PCI function driver calls request_mem_region() or 1855 * request_mem_region_exclusive(). 1856 * 1857 * Return: 0 on success, -errno on failure 1858 */ 1859static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus) 1860{ 1861 resource_size_t align; 1862 int ret; 1863 1864 if (hbus->low_mmio_space) { 1865 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 1866 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0, 1867 (u64)(u32)0xffffffff, 1868 hbus->low_mmio_space, 1869 align, false); 1870 if (ret) { 1871 dev_err(&hbus->hdev->device, 1872 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n", 1873 hbus->low_mmio_space); 1874 return ret; 1875 } 1876 1877 /* Modify this resource to become a bridge window. */ 1878 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW; 1879 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY; 1880 pci_add_resource(&hbus->resources_for_children, 1881 hbus->low_mmio_res); 1882 } 1883 1884 if (hbus->high_mmio_space) { 1885 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space)); 1886 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev, 1887 0x100000000, -1, 1888 hbus->high_mmio_space, align, 1889 false); 1890 if (ret) { 1891 dev_err(&hbus->hdev->device, 1892 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n", 1893 hbus->high_mmio_space); 1894 goto release_low_mmio; 1895 } 1896 1897 /* Modify this resource to become a bridge window. */ 1898 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW; 1899 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY; 1900 pci_add_resource(&hbus->resources_for_children, 1901 hbus->high_mmio_res); 1902 } 1903 1904 return 0; 1905 1906release_low_mmio: 1907 if (hbus->low_mmio_res) { 1908 vmbus_free_mmio(hbus->low_mmio_res->start, 1909 resource_size(hbus->low_mmio_res)); 1910 } 1911 1912 return ret; 1913} 1914 1915/** 1916 * hv_allocate_config_window() - Find MMIO space for PCI Config 1917 * @hbus: Root PCI bus, as understood by this driver 1918 * 1919 * This function claims memory-mapped I/O space for accessing 1920 * configuration space for the functions on this bus. 1921 * 1922 * Return: 0 on success, -errno on failure 1923 */ 1924static int hv_allocate_config_window(struct hv_pcibus_device *hbus) 1925{ 1926 int ret; 1927 1928 /* 1929 * Set up a region of MMIO space to use for accessing configuration 1930 * space. 1931 */ 1932 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1, 1933 PCI_CONFIG_MMIO_LENGTH, 0x1000, false); 1934 if (ret) 1935 return ret; 1936 1937 /* 1938 * vmbus_allocate_mmio() gets used for allocating both device endpoint 1939 * resource claims (those which cannot be overlapped) and the ranges 1940 * which are valid for the children of this bus, which are intended 1941 * to be overlapped by those children. Set the flag on this claim 1942 * meaning that this region can't be overlapped. 1943 */ 1944 1945 hbus->mem_config->flags |= IORESOURCE_BUSY; 1946 1947 return 0; 1948} 1949 1950static void hv_free_config_window(struct hv_pcibus_device *hbus) 1951{ 1952 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); 1953} 1954 1955/** 1956 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state 1957 * @hdev: VMBus's tracking struct for this root PCI bus 1958 * 1959 * Return: 0 on success, -errno on failure 1960 */ 1961static int hv_pci_enter_d0(struct hv_device *hdev) 1962{ 1963 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 1964 struct pci_bus_d0_entry *d0_entry; 1965 struct hv_pci_compl comp_pkt; 1966 struct pci_packet *pkt; 1967 int ret; 1968 1969 /* 1970 * Tell the host that the bus is ready to use, and moved into the 1971 * powered-on state. This includes telling the host which region 1972 * of memory-mapped I/O space has been chosen for configuration space 1973 * access. 1974 */ 1975 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL); 1976 if (!pkt) 1977 return -ENOMEM; 1978 1979 init_completion(&comp_pkt.host_event); 1980 pkt->completion_func = hv_pci_generic_compl; 1981 pkt->compl_ctxt = &comp_pkt; 1982 d0_entry = (struct pci_bus_d0_entry *)&pkt->message; 1983 d0_entry->message_type.type = PCI_BUS_D0ENTRY; 1984 d0_entry->mmio_base = hbus->mem_config->start; 1985 1986 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry), 1987 (unsigned long)pkt, VM_PKT_DATA_INBAND, 1988 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1989 if (ret) 1990 goto exit; 1991 1992 wait_for_completion(&comp_pkt.host_event); 1993 1994 if (comp_pkt.completion_status < 0) { 1995 dev_err(&hdev->device, 1996 "PCI Pass-through VSP failed D0 Entry with status %x\n", 1997 comp_pkt.completion_status); 1998 ret = -EPROTO; 1999 goto exit; 2000 } 2001 2002 ret = 0; 2003 2004exit: 2005 kfree(pkt); 2006 return ret; 2007} 2008 2009/** 2010 * hv_pci_query_relations() - Ask host to send list of child 2011 * devices 2012 * @hdev: VMBus's tracking struct for this root PCI bus 2013 * 2014 * Return: 0 on success, -errno on failure 2015 */ 2016static int hv_pci_query_relations(struct hv_device *hdev) 2017{ 2018 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2019 struct pci_message message; 2020 struct completion comp; 2021 int ret; 2022 2023 /* Ask the host to send along the list of child devices */ 2024 init_completion(&comp); 2025 if (cmpxchg(&hbus->survey_event, NULL, &comp)) 2026 return -ENOTEMPTY; 2027 2028 memset(&message, 0, sizeof(message)); 2029 message.type = PCI_QUERY_BUS_RELATIONS; 2030 2031 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message), 2032 0, VM_PKT_DATA_INBAND, 0); 2033 if (ret) 2034 return ret; 2035 2036 wait_for_completion(&comp); 2037 return 0; 2038} 2039 2040/** 2041 * hv_send_resources_allocated() - Report local resource choices 2042 * @hdev: VMBus's tracking struct for this root PCI bus 2043 * 2044 * The host OS is expecting to be sent a request as a message 2045 * which contains all the resources that the device will use. 2046 * The response contains those same resources, "translated" 2047 * which is to say, the values which should be used by the 2048 * hardware, when it delivers an interrupt. (MMIO resources are 2049 * used in local terms.) This is nice for Windows, and lines up 2050 * with the FDO/PDO split, which doesn't exist in Linux. Linux 2051 * is deeply expecting to scan an emulated PCI configuration 2052 * space. So this message is sent here only to drive the state 2053 * machine on the host forward. 2054 * 2055 * Return: 0 on success, -errno on failure 2056 */ 2057static int hv_send_resources_allocated(struct hv_device *hdev) 2058{ 2059 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2060 struct pci_resources_assigned *res_assigned; 2061 struct hv_pci_compl comp_pkt; 2062 struct hv_pci_dev *hpdev; 2063 struct pci_packet *pkt; 2064 u32 wslot; 2065 int ret; 2066 2067 pkt = kmalloc(sizeof(*pkt) + sizeof(*res_assigned), GFP_KERNEL); 2068 if (!pkt) 2069 return -ENOMEM; 2070 2071 ret = 0; 2072 2073 for (wslot = 0; wslot < 256; wslot++) { 2074 hpdev = get_pcichild_wslot(hbus, wslot); 2075 if (!hpdev) 2076 continue; 2077 2078 memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned)); 2079 init_completion(&comp_pkt.host_event); 2080 pkt->completion_func = hv_pci_generic_compl; 2081 pkt->compl_ctxt = &comp_pkt; 2082 res_assigned = (struct pci_resources_assigned *)&pkt->message; 2083 res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED; 2084 res_assigned->wslot.slot = hpdev->desc.win_slot.slot; 2085 2086 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 2087 2088 ret = vmbus_sendpacket( 2089 hdev->channel, &pkt->message, 2090 sizeof(*res_assigned), 2091 (unsigned long)pkt, 2092 VM_PKT_DATA_INBAND, 2093 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2094 if (ret) 2095 break; 2096 2097 wait_for_completion(&comp_pkt.host_event); 2098 2099 if (comp_pkt.completion_status < 0) { 2100 ret = -EPROTO; 2101 dev_err(&hdev->device, 2102 "resource allocated returned 0x%x", 2103 comp_pkt.completion_status); 2104 break; 2105 } 2106 } 2107 2108 kfree(pkt); 2109 return ret; 2110} 2111 2112/** 2113 * hv_send_resources_released() - Report local resources 2114 * released 2115 * @hdev: VMBus's tracking struct for this root PCI bus 2116 * 2117 * Return: 0 on success, -errno on failure 2118 */ 2119static int hv_send_resources_released(struct hv_device *hdev) 2120{ 2121 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2122 struct pci_child_message pkt; 2123 struct hv_pci_dev *hpdev; 2124 u32 wslot; 2125 int ret; 2126 2127 for (wslot = 0; wslot < 256; wslot++) { 2128 hpdev = get_pcichild_wslot(hbus, wslot); 2129 if (!hpdev) 2130 continue; 2131 2132 memset(&pkt, 0, sizeof(pkt)); 2133 pkt.message_type.type = PCI_RESOURCES_RELEASED; 2134 pkt.wslot.slot = hpdev->desc.win_slot.slot; 2135 2136 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 2137 2138 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0, 2139 VM_PKT_DATA_INBAND, 0); 2140 if (ret) 2141 return ret; 2142 } 2143 2144 return 0; 2145} 2146 2147static void get_hvpcibus(struct hv_pcibus_device *hbus) 2148{ 2149 atomic_inc(&hbus->remove_lock); 2150} 2151 2152static void put_hvpcibus(struct hv_pcibus_device *hbus) 2153{ 2154 if (atomic_dec_and_test(&hbus->remove_lock)) 2155 complete(&hbus->remove_event); 2156} 2157 2158/** 2159 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus 2160 * @hdev: VMBus's tracking struct for this root PCI bus 2161 * @dev_id: Identifies the device itself 2162 * 2163 * Return: 0 on success, -errno on failure 2164 */ 2165static int hv_pci_probe(struct hv_device *hdev, 2166 const struct hv_vmbus_device_id *dev_id) 2167{ 2168 struct hv_pcibus_device *hbus; 2169 int ret; 2170 2171 hbus = kzalloc(sizeof(*hbus), GFP_KERNEL); 2172 if (!hbus) 2173 return -ENOMEM; 2174 2175 /* 2176 * The PCI bus "domain" is what is called "segment" in ACPI and 2177 * other specs. Pull it from the instance ID, to get something 2178 * unique. Bytes 8 and 9 are what is used in Windows guests, so 2179 * do the same thing for consistency. Note that, since this code 2180 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee 2181 * that (1) the only domain in use for something that looks like 2182 * a physical PCI bus (which is actually emulated by the 2183 * hypervisor) is domain 0 and (2) there will be no overlap 2184 * between domains derived from these instance IDs in the same 2185 * VM. 2186 */ 2187 hbus->sysdata.domain = hdev->dev_instance.b[9] | 2188 hdev->dev_instance.b[8] << 8; 2189 2190 hbus->hdev = hdev; 2191 atomic_inc(&hbus->remove_lock); 2192 INIT_LIST_HEAD(&hbus->children); 2193 INIT_LIST_HEAD(&hbus->dr_list); 2194 INIT_LIST_HEAD(&hbus->resources_for_children); 2195 spin_lock_init(&hbus->config_lock); 2196 spin_lock_init(&hbus->device_list_lock); 2197 spin_lock_init(&hbus->retarget_msi_interrupt_lock); 2198 sema_init(&hbus->enum_sem, 1); 2199 init_completion(&hbus->remove_event); 2200 2201 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 2202 hv_pci_onchannelcallback, hbus); 2203 if (ret) 2204 goto free_bus; 2205 2206 hv_set_drvdata(hdev, hbus); 2207 2208 ret = hv_pci_protocol_negotiation(hdev); 2209 if (ret) 2210 goto close; 2211 2212 ret = hv_allocate_config_window(hbus); 2213 if (ret) 2214 goto close; 2215 2216 hbus->cfg_addr = ioremap(hbus->mem_config->start, 2217 PCI_CONFIG_MMIO_LENGTH); 2218 if (!hbus->cfg_addr) { 2219 dev_err(&hdev->device, 2220 "Unable to map a virtual address for config space\n"); 2221 ret = -ENOMEM; 2222 goto free_config; 2223 } 2224 2225 hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus); 2226 if (!hbus->sysdata.fwnode) { 2227 ret = -ENOMEM; 2228 goto unmap; 2229 } 2230 2231 ret = hv_pcie_init_irq_domain(hbus); 2232 if (ret) 2233 goto free_fwnode; 2234 2235 ret = hv_pci_query_relations(hdev); 2236 if (ret) 2237 goto free_irq_domain; 2238 2239 ret = hv_pci_enter_d0(hdev); 2240 if (ret) 2241 goto free_irq_domain; 2242 2243 ret = hv_pci_allocate_bridge_windows(hbus); 2244 if (ret) 2245 goto free_irq_domain; 2246 2247 ret = hv_send_resources_allocated(hdev); 2248 if (ret) 2249 goto free_windows; 2250 2251 prepopulate_bars(hbus); 2252 2253 hbus->state = hv_pcibus_probed; 2254 2255 ret = create_root_hv_pci_bus(hbus); 2256 if (ret) 2257 goto free_windows; 2258 2259 return 0; 2260 2261free_windows: 2262 hv_pci_free_bridge_windows(hbus); 2263free_irq_domain: 2264 irq_domain_remove(hbus->irq_domain); 2265free_fwnode: 2266 irq_domain_free_fwnode(hbus->sysdata.fwnode); 2267unmap: 2268 iounmap(hbus->cfg_addr); 2269free_config: 2270 hv_free_config_window(hbus); 2271close: 2272 vmbus_close(hdev->channel); 2273free_bus: 2274 kfree(hbus); 2275 return ret; 2276} 2277 2278static void hv_pci_bus_exit(struct hv_device *hdev) 2279{ 2280 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2281 struct { 2282 struct pci_packet teardown_packet; 2283 u8 buffer[sizeof(struct pci_message)]; 2284 } pkt; 2285 struct pci_bus_relations relations; 2286 struct hv_pci_compl comp_pkt; 2287 int ret; 2288 2289 /* 2290 * After the host sends the RESCIND_CHANNEL message, it doesn't 2291 * access the per-channel ringbuffer any longer. 2292 */ 2293 if (hdev->channel->rescind) 2294 return; 2295 2296 /* Delete any children which might still exist. */ 2297 memset(&relations, 0, sizeof(relations)); 2298 hv_pci_devices_present(hbus, &relations); 2299 2300 ret = hv_send_resources_released(hdev); 2301 if (ret) 2302 dev_err(&hdev->device, 2303 "Couldn't send resources released packet(s)\n"); 2304 2305 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet)); 2306 init_completion(&comp_pkt.host_event); 2307 pkt.teardown_packet.completion_func = hv_pci_generic_compl; 2308 pkt.teardown_packet.compl_ctxt = &comp_pkt; 2309 pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT; 2310 2311 ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message, 2312 sizeof(struct pci_message), 2313 (unsigned long)&pkt.teardown_packet, 2314 VM_PKT_DATA_INBAND, 2315 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2316 if (!ret) 2317 wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ); 2318} 2319 2320/** 2321 * hv_pci_remove() - Remove routine for this VMBus channel 2322 * @hdev: VMBus's tracking struct for this root PCI bus 2323 * 2324 * Return: 0 on success, -errno on failure 2325 */ 2326static int hv_pci_remove(struct hv_device *hdev) 2327{ 2328 struct hv_pcibus_device *hbus; 2329 2330 hbus = hv_get_drvdata(hdev); 2331 if (hbus->state == hv_pcibus_installed) { 2332 /* Remove the bus from PCI's point of view. */ 2333 pci_lock_rescan_remove(); 2334 pci_stop_root_bus(hbus->pci_bus); 2335 pci_remove_root_bus(hbus->pci_bus); 2336 pci_unlock_rescan_remove(); 2337 } 2338 2339 hv_pci_bus_exit(hdev); 2340 2341 vmbus_close(hdev->channel); 2342 2343 iounmap(hbus->cfg_addr); 2344 hv_free_config_window(hbus); 2345 pci_free_resource_list(&hbus->resources_for_children); 2346 hv_pci_free_bridge_windows(hbus); 2347 irq_domain_remove(hbus->irq_domain); 2348 irq_domain_free_fwnode(hbus->sysdata.fwnode); 2349 put_hvpcibus(hbus); 2350 wait_for_completion(&hbus->remove_event); 2351 kfree(hbus); 2352 return 0; 2353} 2354 2355static const struct hv_vmbus_device_id hv_pci_id_table[] = { 2356 /* PCI Pass-through Class ID */ 2357 /* 44C4F61D-4444-4400-9D52-802E27EDE19F */ 2358 { HV_PCIE_GUID, }, 2359 { }, 2360}; 2361 2362MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table); 2363 2364static struct hv_driver hv_pci_drv = { 2365 .name = "hv_pci", 2366 .id_table = hv_pci_id_table, 2367 .probe = hv_pci_probe, 2368 .remove = hv_pci_remove, 2369}; 2370 2371static void __exit exit_hv_pci_drv(void) 2372{ 2373 vmbus_driver_unregister(&hv_pci_drv); 2374} 2375 2376static int __init init_hv_pci_drv(void) 2377{ 2378 return vmbus_driver_register(&hv_pci_drv); 2379} 2380 2381module_init(init_hv_pci_drv); 2382module_exit(exit_hv_pci_drv); 2383 2384MODULE_DESCRIPTION("Hyper-V PCI"); 2385MODULE_LICENSE("GPL v2");