Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v4.8-rc3 2364 lines 67 kB view raw
1/* 2 * Copyright (c) Microsoft Corporation. 3 * 4 * Author: 5 * Jake Oshins <jakeo@microsoft.com> 6 * 7 * This driver acts as a paravirtual front-end for PCI Express root buses. 8 * When a PCI Express function (either an entire device or an SR-IOV 9 * Virtual Function) is being passed through to the VM, this driver exposes 10 * a new bus to the guest VM. This is modeled as a root PCI bus because 11 * no bridges are being exposed to the VM. In fact, with a "Generation 2" 12 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM 13 * until a device as been exposed using this driver. 14 * 15 * Each root PCI bus has its own PCI domain, which is called "Segment" in 16 * the PCI Firmware Specifications. Thus while each device passed through 17 * to the VM using this front-end will appear at "device 0", the domain will 18 * be unique. Typically, each bus will have one PCI function on it, though 19 * this driver does support more than one. 20 * 21 * In order to map the interrupts from the device through to the guest VM, 22 * this driver also implements an IRQ Domain, which handles interrupts (either 23 * MSI or MSI-X) associated with the functions on the bus. As interrupts are 24 * set up, torn down, or reaffined, this driver communicates with the 25 * underlying hypervisor to adjust the mappings in the I/O MMU so that each 26 * interrupt will be delivered to the correct virtual processor at the right 27 * vector. This driver does not support level-triggered (line-based) 28 * interrupts, and will report that the Interrupt Line register in the 29 * function's configuration space is zero. 30 * 31 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V 32 * facilities. For instance, the configuration space of a function exposed 33 * by Hyper-V is mapped into a single page of memory space, and the 34 * read and write handlers for config space must be aware of this mechanism. 
35 * Similarly, device setup and teardown involves messages sent to and from 36 * the PCI back-end driver in Hyper-V. 37 * 38 * This program is free software; you can redistribute it and/or modify it 39 * under the terms of the GNU General Public License version 2 as published 40 * by the Free Software Foundation. 41 * 42 * This program is distributed in the hope that it will be useful, but 43 * WITHOUT ANY WARRANTY; without even the implied warranty of 44 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 45 * NON INFRINGEMENT. See the GNU General Public License for more 46 * details. 47 * 48 */ 49 50#include <linux/kernel.h> 51#include <linux/module.h> 52#include <linux/pci.h> 53#include <linux/semaphore.h> 54#include <linux/irqdomain.h> 55#include <asm/irqdomain.h> 56#include <asm/apic.h> 57#include <linux/msi.h> 58#include <linux/hyperv.h> 59#include <asm/mshyperv.h> 60 61/* 62 * Protocol versions. The low word is the minor version, the high word the 63 * major version. 
64 */ 65 66#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (major))) 67#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16) 68#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff) 69 70enum { 71 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), 72 PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1 73}; 74 75#define PCI_CONFIG_MMIO_LENGTH 0x2000 76#define CFG_PAGE_OFFSET 0x1000 77#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) 78 79#define MAX_SUPPORTED_MSI_MESSAGES 0x400 80 81/* 82 * Message Types 83 */ 84 85enum pci_message_type { 86 /* 87 * Version 1.1 88 */ 89 PCI_MESSAGE_BASE = 0x42490000, 90 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, 91 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, 92 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, 93 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, 94 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, 95 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, 96 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, 97 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, 98 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, 99 PCI_EJECT = PCI_MESSAGE_BASE + 0xB, 100 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, 101 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, 102 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, 103 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, 104 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, 105 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, 106 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, 107 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, 108 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, 109 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, 110 PCI_MESSAGE_MAXIMUM 111}; 112 113/* 114 * Structures defining the virtual PCI Express protocol. 
115 */ 116 117union pci_version { 118 struct { 119 u16 minor_version; 120 u16 major_version; 121 } parts; 122 u32 version; 123} __packed; 124 125/* 126 * Function numbers are 8-bits wide on Express, as interpreted through ARI, 127 * which is all this driver does. This representation is the one used in 128 * Windows, which is what is expected when sending this back and forth with 129 * the Hyper-V parent partition. 130 */ 131union win_slot_encoding { 132 struct { 133 u32 func:8; 134 u32 reserved:24; 135 } bits; 136 u32 slot; 137} __packed; 138 139/* 140 * Pretty much as defined in the PCI Specifications. 141 */ 142struct pci_function_description { 143 u16 v_id; /* vendor ID */ 144 u16 d_id; /* device ID */ 145 u8 rev; 146 u8 prog_intf; 147 u8 subclass; 148 u8 base_class; 149 u32 subsystem_id; 150 union win_slot_encoding win_slot; 151 u32 ser; /* serial number */ 152} __packed; 153 154/** 155 * struct hv_msi_desc 156 * @vector: IDT entry 157 * @delivery_mode: As defined in Intel's Programmer's 158 * Reference Manual, Volume 3, Chapter 8. 159 * @vector_count: Number of contiguous entries in the 160 * Interrupt Descriptor Table that are 161 * occupied by this Message-Signaled 162 * Interrupt. For "MSI", as first defined 163 * in PCI 2.2, this can be between 1 and 164 * 32. For "MSI-X," as first defined in PCI 165 * 3.0, this must be 1, as each MSI-X table 166 * entry would have its own descriptor. 167 * @reserved: Empty space 168 * @cpu_mask: All the target virtual processors. 169 */ 170struct hv_msi_desc { 171 u8 vector; 172 u8 delivery_mode; 173 u16 vector_count; 174 u32 reserved; 175 u64 cpu_mask; 176} __packed; 177 178/** 179 * struct tran_int_desc 180 * @reserved: unused, padding 181 * @vector_count: same as in hv_msi_desc 182 * @data: This is the "data payload" value that is 183 * written by the device when it generates 184 * a message-signaled interrupt, either MSI 185 * or MSI-X. 
186 * @address: This is the address to which the data 187 * payload is written on interrupt 188 * generation. 189 */ 190struct tran_int_desc { 191 u16 reserved; 192 u16 vector_count; 193 u32 data; 194 u64 address; 195} __packed; 196 197/* 198 * A generic message format for virtual PCI. 199 * Specific message formats are defined later in the file. 200 */ 201 202struct pci_message { 203 u32 message_type; 204} __packed; 205 206struct pci_child_message { 207 u32 message_type; 208 union win_slot_encoding wslot; 209} __packed; 210 211struct pci_incoming_message { 212 struct vmpacket_descriptor hdr; 213 struct pci_message message_type; 214} __packed; 215 216struct pci_response { 217 struct vmpacket_descriptor hdr; 218 s32 status; /* negative values are failures */ 219} __packed; 220 221struct pci_packet { 222 void (*completion_func)(void *context, struct pci_response *resp, 223 int resp_packet_size); 224 void *compl_ctxt; 225 struct pci_message message; 226}; 227 228/* 229 * Specific message types supporting the PCI protocol. 230 */ 231 232/* 233 * Version negotiation message. Sent from the guest to the host. 234 * The guest is free to try different versions until the host 235 * accepts the version. 236 * 237 * pci_version: The protocol version requested. 238 * is_last_attempt: If TRUE, this is the last version guest will request. 239 * reservedz: Reserved field, set to zero. 240 */ 241 242struct pci_version_request { 243 struct pci_message message_type; 244 enum pci_message_type protocol_version; 245} __packed; 246 247/* 248 * Bus D0 Entry. This is sent from the guest to the host when the virtual 249 * bus (PCI Express port) is ready for action. 
250 */ 251 252struct pci_bus_d0_entry { 253 struct pci_message message_type; 254 u32 reserved; 255 u64 mmio_base; 256} __packed; 257 258struct pci_bus_relations { 259 struct pci_incoming_message incoming; 260 u32 device_count; 261 struct pci_function_description func[1]; 262} __packed; 263 264struct pci_q_res_req_response { 265 struct vmpacket_descriptor hdr; 266 s32 status; /* negative values are failures */ 267 u32 probed_bar[6]; 268} __packed; 269 270struct pci_set_power { 271 struct pci_message message_type; 272 union win_slot_encoding wslot; 273 u32 power_state; /* In Windows terms */ 274 u32 reserved; 275} __packed; 276 277struct pci_set_power_response { 278 struct vmpacket_descriptor hdr; 279 s32 status; /* negative values are failures */ 280 union win_slot_encoding wslot; 281 u32 resultant_state; /* In Windows terms */ 282 u32 reserved; 283} __packed; 284 285struct pci_resources_assigned { 286 struct pci_message message_type; 287 union win_slot_encoding wslot; 288 u8 memory_range[0x14][6]; /* not used here */ 289 u32 msi_descriptors; 290 u32 reserved[4]; 291} __packed; 292 293struct pci_create_interrupt { 294 struct pci_message message_type; 295 union win_slot_encoding wslot; 296 struct hv_msi_desc int_desc; 297} __packed; 298 299struct pci_create_int_response { 300 struct pci_response response; 301 u32 reserved; 302 struct tran_int_desc int_desc; 303} __packed; 304 305struct pci_delete_interrupt { 306 struct pci_message message_type; 307 union win_slot_encoding wslot; 308 struct tran_int_desc int_desc; 309} __packed; 310 311struct pci_dev_incoming { 312 struct pci_incoming_message incoming; 313 union win_slot_encoding wslot; 314} __packed; 315 316struct pci_eject_response { 317 u32 message_type; 318 union win_slot_encoding wslot; 319 u32 status; 320} __packed; 321 322static int pci_ring_size = (4 * PAGE_SIZE); 323 324/* 325 * Definitions or interrupt steering hypercall. 
326 */ 327#define HV_PARTITION_ID_SELF ((u64)-1) 328#define HVCALL_RETARGET_INTERRUPT 0x7e 329 330struct retarget_msi_interrupt { 331 u64 partition_id; /* use "self" */ 332 u64 device_id; 333 u32 source; /* 1 for MSI(-X) */ 334 u32 reserved1; 335 u32 address; 336 u32 data; 337 u64 reserved2; 338 u32 vector; 339 u32 flags; 340 u64 vp_mask; 341} __packed; 342 343/* 344 * Driver specific state. 345 */ 346 347enum hv_pcibus_state { 348 hv_pcibus_init = 0, 349 hv_pcibus_probed, 350 hv_pcibus_installed, 351 hv_pcibus_maximum 352}; 353 354struct hv_pcibus_device { 355 struct pci_sysdata sysdata; 356 enum hv_pcibus_state state; 357 atomic_t remove_lock; 358 struct hv_device *hdev; 359 resource_size_t low_mmio_space; 360 resource_size_t high_mmio_space; 361 struct resource *mem_config; 362 struct resource *low_mmio_res; 363 struct resource *high_mmio_res; 364 struct completion *survey_event; 365 struct completion remove_event; 366 struct pci_bus *pci_bus; 367 spinlock_t config_lock; /* Avoid two threads writing index page */ 368 spinlock_t device_list_lock; /* Protect lists below */ 369 void __iomem *cfg_addr; 370 371 struct semaphore enum_sem; 372 struct list_head resources_for_children; 373 374 struct list_head children; 375 struct list_head dr_list; 376 struct work_struct wrk; 377 378 struct msi_domain_info msi_info; 379 struct msi_controller msi_chip; 380 struct irq_domain *irq_domain; 381}; 382 383/* 384 * Tracks "Device Relations" messages from the host, which must be both 385 * processed in order and deferred so that they don't run in the context 386 * of the incoming packet callback. 
387 */ 388struct hv_dr_work { 389 struct work_struct wrk; 390 struct hv_pcibus_device *bus; 391}; 392 393struct hv_dr_state { 394 struct list_head list_entry; 395 u32 device_count; 396 struct pci_function_description func[1]; 397}; 398 399enum hv_pcichild_state { 400 hv_pcichild_init = 0, 401 hv_pcichild_requirements, 402 hv_pcichild_resourced, 403 hv_pcichild_ejecting, 404 hv_pcichild_maximum 405}; 406 407enum hv_pcidev_ref_reason { 408 hv_pcidev_ref_invalid = 0, 409 hv_pcidev_ref_initial, 410 hv_pcidev_ref_by_slot, 411 hv_pcidev_ref_packet, 412 hv_pcidev_ref_pnp, 413 hv_pcidev_ref_childlist, 414 hv_pcidev_irqdata, 415 hv_pcidev_ref_max 416}; 417 418struct hv_pci_dev { 419 /* List protected by pci_rescan_remove_lock */ 420 struct list_head list_entry; 421 atomic_t refs; 422 enum hv_pcichild_state state; 423 struct pci_function_description desc; 424 bool reported_missing; 425 struct hv_pcibus_device *hbus; 426 struct work_struct wrk; 427 428 /* 429 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then 430 * read it back, for each of the BAR offsets within config space. 431 */ 432 u32 probed_bar[6]; 433}; 434 435struct hv_pci_compl { 436 struct completion host_event; 437 s32 completion_status; 438}; 439 440/** 441 * hv_pci_generic_compl() - Invoked for a completion packet 442 * @context: Set up by the sender of the packet. 443 * @resp: The response packet 444 * @resp_packet_size: Size in bytes of the packet 445 * 446 * This function is used to trigger an event and report status 447 * for any message for which the completion packet contains a 448 * status and nothing else. 
449 */ 450static 451void 452hv_pci_generic_compl(void *context, struct pci_response *resp, 453 int resp_packet_size) 454{ 455 struct hv_pci_compl *comp_pkt = context; 456 457 if (resp_packet_size >= offsetofend(struct pci_response, status)) 458 comp_pkt->completion_status = resp->status; 459 complete(&comp_pkt->host_event); 460} 461 462static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 463 u32 wslot); 464static void get_pcichild(struct hv_pci_dev *hv_pcidev, 465 enum hv_pcidev_ref_reason reason); 466static void put_pcichild(struct hv_pci_dev *hv_pcidev, 467 enum hv_pcidev_ref_reason reason); 468 469static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus); 470static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus); 471 472/** 473 * devfn_to_wslot() - Convert from Linux PCI slot to Windows 474 * @devfn: The Linux representation of PCI slot 475 * 476 * Windows uses a slightly different representation of PCI slot. 477 * 478 * Return: The Windows representation 479 */ 480static u32 devfn_to_wslot(int devfn) 481{ 482 union win_slot_encoding wslot; 483 484 wslot.slot = 0; 485 wslot.bits.func = PCI_SLOT(devfn) | (PCI_FUNC(devfn) << 5); 486 487 return wslot.slot; 488} 489 490/** 491 * wslot_to_devfn() - Convert from Windows PCI slot to Linux 492 * @wslot: The Windows representation of PCI slot 493 * 494 * Windows uses a slightly different representation of PCI slot. 495 * 496 * Return: The Linux representation 497 */ 498static int wslot_to_devfn(u32 wslot) 499{ 500 union win_slot_encoding slot_no; 501 502 slot_no.slot = wslot; 503 return PCI_DEVFN(0, slot_no.bits.func); 504} 505 506/* 507 * PCI Configuration Space for these root PCI buses is implemented as a pair 508 * of pages in memory-mapped I/O space. Writing to the first page chooses 509 * the PCI function being written or read. Once the first page has been 510 * written to, the following page maps in the entire configuration space of 511 * the function. 
512 */ 513 514/** 515 * _hv_pcifront_read_config() - Internal PCI config read 516 * @hpdev: The PCI driver's representation of the device 517 * @where: Offset within config space 518 * @size: Size of the transfer 519 * @val: Pointer to the buffer receiving the data 520 */ 521static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, 522 int size, u32 *val) 523{ 524 unsigned long flags; 525 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 526 527 /* 528 * If the attempt is to read the IDs or the ROM BAR, simulate that. 529 */ 530 if (where + size <= PCI_COMMAND) { 531 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size); 532 } else if (where >= PCI_CLASS_REVISION && where + size <= 533 PCI_CACHE_LINE_SIZE) { 534 memcpy(val, ((u8 *)&hpdev->desc.rev) + where - 535 PCI_CLASS_REVISION, size); 536 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <= 537 PCI_ROM_ADDRESS) { 538 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where - 539 PCI_SUBSYSTEM_VENDOR_ID, size); 540 } else if (where >= PCI_ROM_ADDRESS && where + size <= 541 PCI_CAPABILITY_LIST) { 542 /* ROM BARs are unimplemented */ 543 *val = 0; 544 } else if (where >= PCI_INTERRUPT_LINE && where + size <= 545 PCI_INTERRUPT_PIN) { 546 /* 547 * Interrupt Line and Interrupt PIN are hard-wired to zero 548 * because this front-end only supports message-signaled 549 * interrupts. 550 */ 551 *val = 0; 552 } else if (where + size <= CFG_PAGE_SIZE) { 553 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 554 /* Choose the function to be read. (See comment above) */ 555 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 556 /* Make sure the function was chosen before we start reading. */ 557 mb(); 558 /* Read from that function's config space. 
*/ 559 switch (size) { 560 case 1: 561 *val = readb(addr); 562 break; 563 case 2: 564 *val = readw(addr); 565 break; 566 default: 567 *val = readl(addr); 568 break; 569 } 570 /* 571 * Make sure the write was done before we release the spinlock 572 * allowing consecutive reads/writes. 573 */ 574 mb(); 575 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 576 } else { 577 dev_err(&hpdev->hbus->hdev->device, 578 "Attempt to read beyond a function's config space.\n"); 579 } 580} 581 582/** 583 * _hv_pcifront_write_config() - Internal PCI config write 584 * @hpdev: The PCI driver's representation of the device 585 * @where: Offset within config space 586 * @size: Size of the transfer 587 * @val: The data being transferred 588 */ 589static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, 590 int size, u32 val) 591{ 592 unsigned long flags; 593 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 594 595 if (where >= PCI_SUBSYSTEM_VENDOR_ID && 596 where + size <= PCI_CAPABILITY_LIST) { 597 /* SSIDs and ROM BARs are read-only */ 598 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) { 599 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 600 /* Choose the function to be written. (See comment above) */ 601 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 602 /* Make sure the function was chosen before we start writing. */ 603 wmb(); 604 /* Write to that function's config space. */ 605 switch (size) { 606 case 1: 607 writeb(val, addr); 608 break; 609 case 2: 610 writew(val, addr); 611 break; 612 default: 613 writel(val, addr); 614 break; 615 } 616 /* 617 * Make sure the write was done before we release the spinlock 618 * allowing consecutive reads/writes. 
619 */ 620 mb(); 621 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 622 } else { 623 dev_err(&hpdev->hbus->hdev->device, 624 "Attempt to write beyond a function's config space.\n"); 625 } 626} 627 628/** 629 * hv_pcifront_read_config() - Read configuration space 630 * @bus: PCI Bus structure 631 * @devfn: Device/function 632 * @where: Offset from base 633 * @size: Byte/word/dword 634 * @val: Value to be read 635 * 636 * Return: PCIBIOS_SUCCESSFUL on success 637 * PCIBIOS_DEVICE_NOT_FOUND on failure 638 */ 639static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn, 640 int where, int size, u32 *val) 641{ 642 struct hv_pcibus_device *hbus = 643 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 644 struct hv_pci_dev *hpdev; 645 646 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 647 if (!hpdev) 648 return PCIBIOS_DEVICE_NOT_FOUND; 649 650 _hv_pcifront_read_config(hpdev, where, size, val); 651 652 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 653 return PCIBIOS_SUCCESSFUL; 654} 655 656/** 657 * hv_pcifront_write_config() - Write configuration space 658 * @bus: PCI Bus structure 659 * @devfn: Device/function 660 * @where: Offset from base 661 * @size: Byte/word/dword 662 * @val: Value to be written to device 663 * 664 * Return: PCIBIOS_SUCCESSFUL on success 665 * PCIBIOS_DEVICE_NOT_FOUND on failure 666 */ 667static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn, 668 int where, int size, u32 val) 669{ 670 struct hv_pcibus_device *hbus = 671 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 672 struct hv_pci_dev *hpdev; 673 674 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 675 if (!hpdev) 676 return PCIBIOS_DEVICE_NOT_FOUND; 677 678 _hv_pcifront_write_config(hpdev, where, size, val); 679 680 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 681 return PCIBIOS_SUCCESSFUL; 682} 683 684/* PCIe operations */ 685static struct pci_ops hv_pcifront_ops = { 686 .read = hv_pcifront_read_config, 
687 .write = hv_pcifront_write_config, 688}; 689 690/* Interrupt management hooks */ 691static void hv_int_desc_free(struct hv_pci_dev *hpdev, 692 struct tran_int_desc *int_desc) 693{ 694 struct pci_delete_interrupt *int_pkt; 695 struct { 696 struct pci_packet pkt; 697 u8 buffer[sizeof(struct pci_delete_interrupt) - 698 sizeof(struct pci_message)]; 699 } ctxt; 700 701 memset(&ctxt, 0, sizeof(ctxt)); 702 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; 703 int_pkt->message_type.message_type = 704 PCI_DELETE_INTERRUPT_MESSAGE; 705 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 706 int_pkt->int_desc = *int_desc; 707 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), 708 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0); 709 kfree(int_desc); 710} 711 712/** 713 * hv_msi_free() - Free the MSI. 714 * @domain: The interrupt domain pointer 715 * @info: Extra MSI-related context 716 * @irq: Identifies the IRQ. 717 * 718 * The Hyper-V parent partition and hypervisor are tracking the 719 * messages that are in use, keeping the interrupt redirection 720 * table up to date. This callback sends a message that frees 721 * the IRT entry and related tracking nonsense. 
722 */ 723static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, 724 unsigned int irq) 725{ 726 struct hv_pcibus_device *hbus; 727 struct hv_pci_dev *hpdev; 728 struct pci_dev *pdev; 729 struct tran_int_desc *int_desc; 730 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq); 731 struct msi_desc *msi = irq_data_get_msi_desc(irq_data); 732 733 pdev = msi_desc_to_pci_dev(msi); 734 hbus = info->data; 735 int_desc = irq_data_get_irq_chip_data(irq_data); 736 if (!int_desc) 737 return; 738 739 irq_data->chip_data = NULL; 740 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 741 if (!hpdev) { 742 kfree(int_desc); 743 return; 744 } 745 746 hv_int_desc_free(hpdev, int_desc); 747 put_pcichild(hpdev, hv_pcidev_ref_by_slot); 748} 749 750static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest, 751 bool force) 752{ 753 struct irq_data *parent = data->parent_data; 754 755 return parent->chip->irq_set_affinity(parent, dest, force); 756} 757 758void hv_irq_mask(struct irq_data *data) 759{ 760 pci_msi_mask_irq(data); 761} 762 763/** 764 * hv_irq_unmask() - "Unmask" the IRQ by setting its current 765 * affinity. 766 * @data: Describes the IRQ 767 * 768 * Build new a destination for the MSI and make a hypercall to 769 * update the Interrupt Redirection Table. "Device Logical ID" 770 * is built out of this PCI bus's instance GUID and the function 771 * number of the device. 
772 */ 773void hv_irq_unmask(struct irq_data *data) 774{ 775 struct msi_desc *msi_desc = irq_data_get_msi_desc(data); 776 struct irq_cfg *cfg = irqd_cfg(data); 777 struct retarget_msi_interrupt params; 778 struct hv_pcibus_device *hbus; 779 struct cpumask *dest; 780 struct pci_bus *pbus; 781 struct pci_dev *pdev; 782 int cpu; 783 784 dest = irq_data_get_affinity_mask(data); 785 pdev = msi_desc_to_pci_dev(msi_desc); 786 pbus = pdev->bus; 787 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 788 789 memset(&params, 0, sizeof(params)); 790 params.partition_id = HV_PARTITION_ID_SELF; 791 params.source = 1; /* MSI(-X) */ 792 params.address = msi_desc->msg.address_lo; 793 params.data = msi_desc->msg.data; 794 params.device_id = (hbus->hdev->dev_instance.b[5] << 24) | 795 (hbus->hdev->dev_instance.b[4] << 16) | 796 (hbus->hdev->dev_instance.b[7] << 8) | 797 (hbus->hdev->dev_instance.b[6] & 0xf8) | 798 PCI_FUNC(pdev->devfn); 799 params.vector = cfg->vector; 800 801 for_each_cpu_and(cpu, dest, cpu_online_mask) 802 params.vp_mask |= (1ULL << vmbus_cpu_number_to_vp_number(cpu)); 803 804 hv_do_hypercall(HVCALL_RETARGET_INTERRUPT, &params, NULL); 805 806 pci_msi_unmask_irq(data); 807} 808 809struct compose_comp_ctxt { 810 struct hv_pci_compl comp_pkt; 811 struct tran_int_desc int_desc; 812}; 813 814static void hv_pci_compose_compl(void *context, struct pci_response *resp, 815 int resp_packet_size) 816{ 817 struct compose_comp_ctxt *comp_pkt = context; 818 struct pci_create_int_response *int_resp = 819 (struct pci_create_int_response *)resp; 820 821 comp_pkt->comp_pkt.completion_status = resp->status; 822 comp_pkt->int_desc = int_resp->int_desc; 823 complete(&comp_pkt->comp_pkt.host_event); 824} 825 826/** 827 * hv_compose_msi_msg() - Supplies a valid MSI address/data 828 * @data: Everything about this MSI 829 * @msg: Buffer that is filled in by this function 830 * 831 * This function unpacks the IRQ looking for target CPU set, IDT 832 * vector and mode and 
sends a message to the parent partition 833 * asking for a mapping for that tuple in this partition. The 834 * response supplies a data value and address to which that data 835 * should be written to trigger that interrupt. 836 */ 837static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 838{ 839 struct irq_cfg *cfg = irqd_cfg(data); 840 struct hv_pcibus_device *hbus; 841 struct hv_pci_dev *hpdev; 842 struct pci_bus *pbus; 843 struct pci_dev *pdev; 844 struct pci_create_interrupt *int_pkt; 845 struct compose_comp_ctxt comp; 846 struct tran_int_desc *int_desc; 847 struct cpumask *affinity; 848 struct { 849 struct pci_packet pkt; 850 u8 buffer[sizeof(struct pci_create_interrupt) - 851 sizeof(struct pci_message)]; 852 } ctxt; 853 int cpu; 854 int ret; 855 856 pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); 857 pbus = pdev->bus; 858 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 859 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 860 if (!hpdev) 861 goto return_null_message; 862 863 /* Free any previous message that might have already been composed. */ 864 if (data->chip_data) { 865 int_desc = data->chip_data; 866 data->chip_data = NULL; 867 hv_int_desc_free(hpdev, int_desc); 868 } 869 870 int_desc = kzalloc(sizeof(*int_desc), GFP_KERNEL); 871 if (!int_desc) 872 goto drop_reference; 873 874 memset(&ctxt, 0, sizeof(ctxt)); 875 init_completion(&comp.comp_pkt.host_event); 876 ctxt.pkt.completion_func = hv_pci_compose_compl; 877 ctxt.pkt.compl_ctxt = &comp; 878 int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message; 879 int_pkt->message_type.message_type = PCI_CREATE_INTERRUPT_MESSAGE; 880 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 881 int_pkt->int_desc.vector = cfg->vector; 882 int_pkt->int_desc.vector_count = 1; 883 int_pkt->int_desc.delivery_mode = 884 (apic->irq_delivery_mode == dest_LowestPrio) ? 
1 : 0; 885 886 /* 887 * This bit doesn't have to work on machines with more than 64 888 * processors because Hyper-V only supports 64 in a guest. 889 */ 890 affinity = irq_data_get_affinity_mask(data); 891 for_each_cpu_and(cpu, affinity, cpu_online_mask) { 892 int_pkt->int_desc.cpu_mask |= 893 (1ULL << vmbus_cpu_number_to_vp_number(cpu)); 894 } 895 896 ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, 897 sizeof(*int_pkt), (unsigned long)&ctxt.pkt, 898 VM_PKT_DATA_INBAND, 899 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 900 if (!ret) 901 wait_for_completion(&comp.comp_pkt.host_event); 902 903 if (comp.comp_pkt.completion_status < 0) { 904 dev_err(&hbus->hdev->device, 905 "Request for interrupt failed: 0x%x", 906 comp.comp_pkt.completion_status); 907 goto free_int_desc; 908 } 909 910 /* 911 * Record the assignment so that this can be unwound later. Using 912 * irq_set_chip_data() here would be appropriate, but the lock it takes 913 * is already held. 914 */ 915 *int_desc = comp.int_desc; 916 data->chip_data = int_desc; 917 918 /* Pass up the result. 
 */
	/*
	 * Tail of hv_compose_msi_msg() (the function's opening lines are
	 * above this excerpt): hand the interrupt descriptor returned by
	 * the hypervisor back to the generic MSI layer as an address/data
	 * pair.
	 */
	msg->address_hi = comp.int_desc.address >> 32;
	msg->address_lo = comp.int_desc.address & 0xffffffff;
	msg->data = comp.int_desc.data;

	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
	return;

free_int_desc:
	kfree(int_desc);
drop_reference:
	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
return_null_message:
	/* Error path: report an all-zero (null) MSI message. */
	msg->address_hi = 0;
	msg->address_lo = 0;
	msg->data = 0;
}

/* HW Interrupt Chip Descriptor */
static struct irq_chip hv_msi_irq_chip = {
	.name = "Hyper-V PCIe MSI",
	.irq_compose_msi_msg = hv_compose_msi_msg,
	.irq_set_affinity = hv_set_affinity,
	.irq_ack = irq_chip_ack_parent,
	.irq_mask = hv_irq_mask,
	.irq_unmask = hv_irq_unmask,
};

/*
 * Report the hardware IRQ number that was stashed in the allocation
 * info by the MSI core; used as the .get_hwirq callback below.
 */
static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
						   msi_alloc_info_t *arg)
{
	return arg->msi_hwirq;
}

static struct msi_domain_ops hv_msi_ops = {
	.get_hwirq = hv_msi_domain_ops_get_hwirq,
	.msi_prepare = pci_msi_prepare,
	.set_desc = pci_msi_set_desc,
	.msi_free = hv_msi_free,
};

/**
 * hv_pcie_init_irq_domain() - Initialize IRQ domain
 * @hbus:	The root PCI bus
 *
 * This function creates an IRQ domain which will be used for
 * interrupts from devices that have been passed through.  These
 * devices only support MSI and MSI-X, not line-based interrupts
 * or simulations of line-based interrupts through PCIe's
 * fabric-layer messages.  Because interrupts are remapped, we
 * can support multi-message MSI here.
 *
 * Return: '0' on success and error value on failure
 */
static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
{
	hbus->msi_info.chip = &hv_msi_irq_chip;
	hbus->msi_info.ops = &hv_msi_ops;
	hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
		MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
		MSI_FLAG_PCI_MSIX);
	hbus->msi_info.handler = handle_edge_irq;
	hbus->msi_info.handler_name = "edge";
	hbus->msi_info.data = hbus;
	/* Stack this domain on top of the x86 vector domain. */
	hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode,
						     &hbus->msi_info,
						     x86_vector_domain);
	if (!hbus->irq_domain) {
		dev_err(&hbus->hdev->device,
			"Failed to build an MSI IRQ domain\n");
		return -ENODEV;
	}

	return 0;
}

/**
 * get_bar_size() - Get the address space consumed by a BAR
 * @bar_val:	Value that a BAR returned after -1 was written
 *              to it.
 *
 * This function returns the size of the BAR, rounded up to 1
 * page.  It has to be rounded up because the hypervisor's page
 * table entry that maps the BAR into the VM can't specify an
 * offset within a page.  The invariant is that the hypervisor
 * must place any BARs of smaller than page length at the
 * beginning of a page.
 *
 * Return: Size in bytes of the consumed MMIO space.
 */
static u64 get_bar_size(u64 bar_val)
{
	return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
			PAGE_SIZE);
}

/**
 * survey_child_resources() - Total all MMIO requirements
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * Sums the sizes of all memory BARs of all child devices into
 * hbus->low_mmio_space and hbus->high_mmio_space, then completes
 * the survey event that a waiter (hv_pci_query_relations()) is
 * blocked on.
 */
static void survey_child_resources(struct hv_pcibus_device *hbus)
{
	struct list_head *iter;
	struct hv_pci_dev *hpdev;
	resource_size_t bar_size = 0;
	unsigned long flags;
	struct completion *event;
	u64 bar_val;
	int i;

	/*
	 * If nobody is waiting on the answer, don't compute it.  The
	 * xchg() atomically claims the event, so only one caller ever
	 * completes it.
	 */
	event = xchg(&hbus->survey_event, NULL);
	if (!event)
		return;

	/* If the answer has already been computed, go with it. */
	if (hbus->low_mmio_space || hbus->high_mmio_space) {
		complete(event);
		return;
	}

	spin_lock_irqsave(&hbus->device_list_lock, flags);

	/*
	 * Due to an interesting quirk of the PCI spec, all memory regions
	 * for a child device are a power of 2 in size and aligned in memory,
	 * so it's sufficient to just add them up without tracking alignment.
	 */
	list_for_each(iter, &hbus->children) {
		hpdev = container_of(iter, struct hv_pci_dev, list_entry);
		for (i = 0; i < 6; i++) {
			if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
				dev_err(&hbus->hdev->device,
					"There's an I/O BAR in this list!\n");

			if (hpdev->probed_bar[i] != 0) {
				/*
				 * A probed BAR has all the upper bits set that
				 * can be changed.
				 */

				bar_val = hpdev->probed_bar[i];
				/*
				 * A 64-bit BAR occupies two successive slots;
				 * fold the upper half in and skip it (++i).
				 */
				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
					bar_val |=
					((u64)hpdev->probed_bar[++i] << 32);
				else
					bar_val |= 0xffffffff00000000ULL;

				bar_size = get_bar_size(bar_val);

				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
					hbus->high_mmio_space += bar_size;
				else
					hbus->low_mmio_space += bar_size;
			}
		}
	}

	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
	complete(event);
}

/**
 * prepopulate_bars() - Fill in BARs with defaults
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * The core PCI driver code seems much, much happier if the BARs
 * for a device have values upon first scan. So fill them in.
 * The algorithm below works down from large sizes to small,
 * attempting to pack the assignments optimally.
 The assumption,
 * enforced in other parts of the code, is that the beginning of
 * the memory-mapped I/O space will be aligned on the largest
 * BAR size.
 */
static void prepopulate_bars(struct hv_pcibus_device *hbus)
{
	resource_size_t high_size = 0;
	resource_size_t low_size = 0;
	resource_size_t high_base = 0;
	resource_size_t low_base = 0;
	resource_size_t bar_size;
	struct hv_pci_dev *hpdev;
	struct list_head *iter;
	unsigned long flags;
	u64 bar_val;
	u32 command;
	bool high;
	int i;

	/*
	 * Start each pool's pass size at the largest power of two not
	 * exceeding the surveyed total (63 - clzll == floor(log2)).
	 */
	if (hbus->low_mmio_space) {
		low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
		low_base = hbus->low_mmio_res->start;
	}

	if (hbus->high_mmio_space) {
		high_size = 1ULL <<
			(63 - __builtin_clzll(hbus->high_mmio_space));
		high_base = hbus->high_mmio_res->start;
	}

	spin_lock_irqsave(&hbus->device_list_lock, flags);

	/* Pick addresses for the BARs. */
	do {
		list_for_each(iter, &hbus->children) {
			hpdev = container_of(iter, struct hv_pci_dev,
					     list_entry);
			for (i = 0; i < 6; i++) {
				bar_val = hpdev->probed_bar[i];
				if (bar_val == 0)
					continue;
				high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
				if (high) {
					/* 64-bit BAR: fold in upper slot. */
					bar_val |=
						((u64)hpdev->probed_bar[i + 1]
						 << 32);
				} else {
					bar_val |= 0xffffffffULL << 32;
				}
				bar_size = get_bar_size(bar_val);
				if (high) {
					/*
					 * Only assign BARs whose size matches
					 * this pass; skip the second slot of
					 * the 64-bit BAR either way.
					 */
					if (high_size != bar_size) {
						i++;
						continue;
					}
					/*
					 * Mask keeps bits 8-31; the low 8
					 * bits of a BAR are flag bits.
					 */
					_hv_pcifront_write_config(hpdev,
						PCI_BASE_ADDRESS_0 + (4 * i),
						4,
						(u32)(high_base & 0xffffff00));
					i++;
					_hv_pcifront_write_config(hpdev,
						PCI_BASE_ADDRESS_0 + (4 * i),
						4, (u32)(high_base >> 32));
					high_base += bar_size;
				} else {
					if (low_size != bar_size)
						continue;
					_hv_pcifront_write_config(hpdev,
						PCI_BASE_ADDRESS_0 + (4 * i),
						4,
						(u32)(low_base & 0xffffff00));
					low_base += bar_size;
				}
			}
			if (high_size <= 1 && low_size <= 1) {
				/* Set the memory enable bit. */
				_hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
							 &command);
				command |= PCI_COMMAND_MEMORY;
				_hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
							  command);
				/*
				 * NOTE(review): this break exits the child
				 * loop after enabling memory decoding on only
				 * the first device in the list -- confirm
				 * that is intended when more than one child
				 * is present.
				 */
				break;
			}
		}

		high_size >>= 1;
		low_size >>= 1;
	} while (high_size || low_size);

	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
}

/**
 * create_root_hv_pci_bus() - Expose a new root PCI bus
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * Registers the root bus with the PCI core, scans it, assigns
 * resources and adds the discovered devices.
 *
 * Return: 0 on success, -errno on failure
 */
static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
{
	/* Register the device */
	hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device,
					    0, /* bus number is always zero */
					    &hv_pcifront_ops,
					    &hbus->sysdata,
					    &hbus->resources_for_children);
	if (!hbus->pci_bus)
		return -ENODEV;

	hbus->pci_bus->msi = &hbus->msi_chip;
	hbus->pci_bus->msi->dev = &hbus->hdev->device;

	pci_scan_child_bus(hbus->pci_bus);
	pci_bus_assign_resources(hbus->pci_bus);
	pci_bus_add_devices(hbus->pci_bus);
	hbus->state = hv_pcibus_installed;
	return 0;
}

/* Completion context for a Query Resource Requirements exchange. */
struct q_res_req_compl {
	struct completion host_event;	/* signaled when the response lands */
	struct hv_pci_dev *hpdev;	/* child whose BARs are being probed */
};

/**
 * q_resource_requirements() - Query Resource Requirements
 * @context:		The completion context.
 * @resp:		The response that came from the host.
 * @resp_packet_size:	The size in bytes of resp.
 *
 * This function is invoked on completion of a Query Resource
 * Requirements packet.
 */
static void q_resource_requirements(void *context, struct pci_response *resp,
				    int resp_packet_size)
{
	struct q_res_req_compl *completion = context;
	struct pci_q_res_req_response *q_res_req =
		(struct pci_q_res_req_response *)resp;
	int i;

	if (resp->status < 0) {
		dev_err(&completion->hpdev->hbus->hdev->device,
			"query resource requirements failed: %x\n",
			resp->status);
	} else {
		/* Cache the host's probed BAR values for later surveying. */
		for (i = 0; i < 6; i++) {
			completion->hpdev->probed_bar[i] =
				q_res_req->probed_bar[i];
		}
	}

	complete(&completion->host_event);
}

/*
 * Take a reference on a child device.  @reason is bookkeeping only; it
 * does not affect the count.
 */
static void get_pcichild(struct hv_pci_dev *hpdev,
			 enum hv_pcidev_ref_reason reason)
{
	atomic_inc(&hpdev->refs);
}

/*
 * Drop a reference on a child device, freeing it when the last
 * reference goes away.  @reason is bookkeeping only.
 */
static void put_pcichild(struct hv_pci_dev *hpdev,
			 enum hv_pcidev_ref_reason reason)
{
	if (atomic_dec_and_test(&hpdev->refs))
		kfree(hpdev);
}

/**
 * new_pcichild_device() - Create a new child device
 * @hbus:	The internal struct tracking this root PCI bus.
 * @desc:	The information supplied so far from the host
 *              about the device.
 *
 * This function creates the tracking structure for a new child
 * device and kicks off the process of figuring out what it is.
 *
 * Return: Pointer to the new tracking struct
 */
static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
		struct pci_function_description *desc)
{
	struct hv_pci_dev *hpdev;
	struct pci_child_message *res_req;
	struct q_res_req_compl comp_pkt;
	union {
		struct pci_packet init_packet;
		u8 buffer[0x100];
	} pkt;
	unsigned long flags;
	int ret;

	/* GFP_ATOMIC: presumably callers may hold locks/run in atomic
	 * context -- TODO confirm against all call sites.
	 */
	hpdev = kzalloc(sizeof(*hpdev), GFP_ATOMIC);
	if (!hpdev)
		return NULL;

	hpdev->hbus = hbus;

	/* Ask the host for this function's resource requirements. */
	memset(&pkt, 0, sizeof(pkt));
	init_completion(&comp_pkt.host_event);
	comp_pkt.hpdev = hpdev;
	pkt.init_packet.compl_ctxt = &comp_pkt;
	pkt.init_packet.completion_func = q_resource_requirements;
	res_req = (struct pci_child_message *)&pkt.init_packet.message;
	res_req->message_type = PCI_QUERY_RESOURCE_REQUIREMENTS;
	res_req->wslot.slot = desc->win_slot.slot;

	ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
			       sizeof(struct pci_child_message),
			       (unsigned long)&pkt.init_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		goto error;

	/* NOTE(review): no timeout here; relies on the host replying. */
	wait_for_completion(&comp_pkt.host_event);

	hpdev->desc = *desc;
	/* One reference for the caller, one for membership in the list. */
	get_pcichild(hpdev, hv_pcidev_ref_initial);
	get_pcichild(hpdev, hv_pcidev_ref_childlist);
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_add_tail(&hpdev->list_entry, &hbus->children);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
	return hpdev;

error:
	kfree(hpdev);
	return NULL;
}

/**
 * get_pcichild_wslot() - Find device from slot
 * @hbus:	Root PCI bus, as understood by this driver
 * @wslot:	Location on the bus
 *
 * This function looks up a PCI device and returns the internal
 * representation of it.  It acquires a reference on it, so that
 * the device won't be deleted while somebody is using it.  The
 * caller is responsible for calling put_pcichild() to release
 * this reference.
 *
 * Return: Internal representation of a PCI device
 */
static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
					     u32 wslot)
{
	unsigned long flags;
	struct hv_pci_dev *iter, *hpdev = NULL;

	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_for_each_entry(iter, &hbus->children, list_entry) {
		if (iter->desc.win_slot.slot == wslot) {
			hpdev = iter;
			/* Reference taken under the lock; caller must put. */
			get_pcichild(hpdev, hv_pcidev_ref_by_slot);
			break;
		}
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	return hpdev;
}

/**
 * pci_devices_present_work() - Handle new list of child devices
 * @work:	Work struct embedded in struct hv_dr_work
 *
 * "Bus Relations" is the Windows term for "children of this
 * bus."  The terminology is preserved here for people trying to
 * debug the interaction between Hyper-V and Linux.  This
 * function is called when the parent partition reports a list
 * of functions that should be observed under this PCI Express
 * port (bus).
 *
 * This function updates the list, and must tolerate being
 * called multiple times with the same information.  The typical
 * number of child devices is one, with very atypical cases
 * involving three or four, so the algorithms used here can be
 * simple and inefficient.
 *
 * It must also treat the omission of a previously observed device as
 * notification that the device no longer exists.
 *
 * Note that this function is a work item, and it may not be
 * invoked in the order that it was queued.  Back to back
 * updates of the list of present devices may involve queuing
 * multiple work items, and this one may run before ones that
 * were sent later.  As such, this function only does something
 * if it is the last one in the queue.
 */
static void pci_devices_present_work(struct work_struct *work)
{
	u32 child_no;
	bool found;
	struct list_head *iter;
	struct pci_function_description *new_desc;
	struct hv_pci_dev *hpdev;
	struct hv_pcibus_device *hbus;
	struct list_head removed;
	struct hv_dr_work *dr_wrk;
	struct hv_dr_state *dr = NULL;
	unsigned long flags;

	dr_wrk = container_of(work, struct hv_dr_work, wrk);
	hbus = dr_wrk->bus;
	kfree(dr_wrk);

	INIT_LIST_HEAD(&removed);

	/* Serialize enumeration; bail out if interrupted. */
	if (down_interruptible(&hbus->enum_sem)) {
		put_hvpcibus(hbus);
		return;
	}

	/*
	 * Pull this off the queue and process it if it was the last one.
	 * Earlier (stale) relations lists are discarded; only the most
	 * recently queued state survives the loop in 'dr'.
	 */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	while (!list_empty(&hbus->dr_list)) {
		dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
				      list_entry);
		list_del(&dr->list_entry);

		/* Throw this away if the list still has stuff in it. */
		if (!list_empty(&hbus->dr_list)) {
			kfree(dr);
			continue;
		}
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	if (!dr) {
		up(&hbus->enum_sem);
		put_hvpcibus(hbus);
		return;
	}

	/* First, mark all existing children as reported missing. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_for_each(iter, &hbus->children) {
		hpdev = container_of(iter, struct hv_pci_dev,
				     list_entry);
		hpdev->reported_missing = true;
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/*
	 * Next, add back any reported devices; a device already known
	 * (same slot/vendor/device/serial) is just un-marked.
	 */
	for (child_no = 0; child_no < dr->device_count; child_no++) {
		found = false;
		new_desc = &dr->func[child_no];

		spin_lock_irqsave(&hbus->device_list_lock, flags);
		list_for_each(iter, &hbus->children) {
			hpdev = container_of(iter, struct hv_pci_dev,
					     list_entry);
			if ((hpdev->desc.win_slot.slot ==
			     new_desc->win_slot.slot) &&
			    (hpdev->desc.v_id == new_desc->v_id) &&
			    (hpdev->desc.d_id == new_desc->d_id) &&
			    (hpdev->desc.ser == new_desc->ser)) {
				hpdev->reported_missing = false;
				found = true;
			}
		}
		spin_unlock_irqrestore(&hbus->device_list_lock, flags);

		if (!found) {
			hpdev = new_pcichild_device(hbus, new_desc);
			if (!hpdev)
				dev_err(&hbus->hdev->device,
					"couldn't record a child device.\n");
		}
	}

	/* Move missing children to a list on the stack. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	do {
		found = false;
		list_for_each(iter, &hbus->children) {
			hpdev = container_of(iter, struct hv_pci_dev,
					     list_entry);
			if (hpdev->reported_missing) {
				found = true;
				put_pcichild(hpdev, hv_pcidev_ref_childlist);
				list_del(&hpdev->list_entry);
				list_add_tail(&hpdev->list_entry, &removed);
				/* Restart: list_del invalidated 'iter'. */
				break;
			}
		}
	} while (found);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* Delete everything that should no longer exist. */
	while (!list_empty(&removed)) {
		hpdev = list_first_entry(&removed, struct hv_pci_dev,
					 list_entry);
		list_del(&hpdev->list_entry);
		put_pcichild(hpdev, hv_pcidev_ref_initial);
	}

	/* Tell the core to rescan bus because there may have been changes. */
	if (hbus->state == hv_pcibus_installed) {
		pci_lock_rescan_remove();
		pci_scan_child_bus(hbus->pci_bus);
		pci_unlock_rescan_remove();
	} else {
		/* Bus not up yet: just total the MMIO requirements. */
		survey_child_resources(hbus);
	}

	up(&hbus->enum_sem);
	put_hvpcibus(hbus);
	kfree(dr);
}

/**
 * hv_pci_devices_present() - Handles list of new children
 * @hbus:	Root PCI bus, as understood by this driver
 * @relations:	Packet from host listing children
 *
 * This function is invoked whenever a new list of devices for
 * this bus appears.  It queues a work item so the heavy lifting
 * happens outside the VMBus channel callback.
 */
static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
				   struct pci_bus_relations *relations)
{
	struct hv_dr_state *dr;
	struct hv_dr_work *dr_wrk;
	unsigned long flags;

	/* GFP_NOWAIT: called from the channel callback path. */
	dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
	if (!dr_wrk)
		return;

	dr = kzalloc(offsetof(struct hv_dr_state, func) +
		     (sizeof(struct pci_function_description) *
		      (relations->device_count)), GFP_NOWAIT);
	if (!dr) {
		kfree(dr_wrk);
		return;
	}

	INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
	dr_wrk->bus = hbus;
	dr->device_count = relations->device_count;
	if (dr->device_count != 0) {
		memcpy(dr->func, relations->func,
		       sizeof(struct pci_function_description) *
		       dr->device_count);
	}

	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_add_tail(&dr->list_entry, &hbus->dr_list);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* Reference dropped by pci_devices_present_work(). */
	get_hvpcibus(hbus);
	schedule_work(&dr_wrk->wrk);
}

/**
 * hv_eject_device_work() - Asynchronously handles ejection
 * @work:	Work struct embedded in internal device struct
 *
 * This function handles ejecting a device.  Windows will
 * attempt to gracefully eject a device, waiting 60 seconds to
 * hear back from the guest OS that this completed successfully.
 * If this timer expires, the device will be forcibly removed.
 */
static void hv_eject_device_work(struct work_struct *work)
{
	struct pci_eject_response *ejct_pkt;
	struct hv_pci_dev *hpdev;
	struct pci_dev *pdev;
	unsigned long flags;
	int wslot;
	struct {
		struct pci_packet pkt;
		u8 buffer[sizeof(struct pci_eject_response) -
			  sizeof(struct pci_message)];
	} ctxt;

	hpdev = container_of(work, struct hv_pci_dev, wrk);

	if (hpdev->state != hv_pcichild_ejecting) {
		put_pcichild(hpdev, hv_pcidev_ref_pnp);
		return;
	}

	/*
	 * Ejection can come before or after the PCI bus has been set up, so
	 * attempt to find it and tear down the bus state, if it exists.  This
	 * must be done without constructs like pci_domain_nr(hbus->pci_bus)
	 * because hbus->pci_bus may not exist yet.
	 */
	wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
	pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
					   wslot);
	if (pdev) {
		pci_stop_and_remove_bus_device(pdev);
		pci_dev_put(pdev);
	}

	/* Acknowledge the ejection back to the host. */
	memset(&ctxt, 0, sizeof(ctxt));
	ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
	ejct_pkt->message_type = PCI_EJECTION_COMPLETE;
	ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
	vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
			 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
			 VM_PKT_DATA_INBAND, 0);

	spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
	list_del(&hpdev->list_entry);
	spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);

	/* Drop the child-list and initial refs; frees hpdev on the last. */
	put_pcichild(hpdev, hv_pcidev_ref_childlist);
	put_pcichild(hpdev, hv_pcidev_ref_pnp);
	put_hvpcibus(hpdev->hbus);
}

/**
 * hv_pci_eject_device() - Handles device ejection
 * @hpdev:	Internal device tracking struct
 *
 * This function is invoked when an ejection packet arrives.  It
 * just schedules work so that we don't re-enter the packet
 * delivery code handling the ejection.
 */
static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
{
	hpdev->state = hv_pcichild_ejecting;
	/* References dropped by hv_eject_device_work(). */
	get_pcichild(hpdev, hv_pcidev_ref_pnp);
	INIT_WORK(&hpdev->wrk, hv_eject_device_work);
	get_hvpcibus(hpdev->hbus);
	schedule_work(&hpdev->wrk);
}

/**
 * hv_pci_onchannelcallback() - Handles incoming packets
 * @context:	Internal bus tracking struct
 *
 * This function is invoked whenever the host sends a packet to
 * this channel (which is private to this root PCI bus).
 */
static void hv_pci_onchannelcallback(void *context)
{
	const int packet_size = 0x100;
	int ret;
	struct hv_pcibus_device *hbus = context;
	u32 bytes_recvd;
	u64 req_id;
	struct vmpacket_descriptor *desc;
	unsigned char *buffer;
	int bufferlen = packet_size;
	struct pci_packet *comp_packet;
	struct pci_response *response;
	struct pci_incoming_message *new_message;
	struct pci_bus_relations *bus_rel;
	struct pci_dev_incoming *dev_message;
	struct hv_pci_dev *hpdev;

	buffer = kmalloc(bufferlen, GFP_ATOMIC);
	if (!buffer)
		return;

	while (1) {
		ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
					   bufferlen, &bytes_recvd, &req_id);

		if (ret == -ENOBUFS) {
			kfree(buffer);
			/* Handle large packet: grow to the reported size. */
			bufferlen = bytes_recvd;
			buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
			if (!buffer)
				return;
			continue;
		}

		/* Zero length indicates there are no more packets. */
		if (ret || !bytes_recvd)
			break;

		/*
		 * All incoming packets must be at least as large as a
		 * response.
		 */
		if (bytes_recvd <= sizeof(struct pci_response))
			continue;
		desc = (struct vmpacket_descriptor *)buffer;

		switch (desc->type) {
		case VM_PKT_COMP:

			/*
			 * The host is trusted, and thus it's safe to interpret
			 * this transaction ID as a pointer.
			 */
			comp_packet = (struct pci_packet *)req_id;
			response = (struct pci_response *)buffer;
			comp_packet->completion_func(comp_packet->compl_ctxt,
						     response,
						     bytes_recvd);
			break;

		case VM_PKT_DATA_INBAND:

			new_message = (struct pci_incoming_message *)buffer;
			switch (new_message->message_type.message_type) {
			case PCI_BUS_RELATIONS:

				bus_rel = (struct pci_bus_relations *)buffer;
				/* Validate the variable-length payload. */
				if (bytes_recvd <
				    offsetof(struct pci_bus_relations, func) +
				    (sizeof(struct pci_function_description) *
				     (bus_rel->device_count))) {
					dev_err(&hbus->hdev->device,
						"bus relations too small\n");
					break;
				}

				hv_pci_devices_present(hbus, bus_rel);
				break;

			case PCI_EJECT:

				dev_message = (struct pci_dev_incoming *)buffer;
				hpdev = get_pcichild_wslot(hbus,
						      dev_message->wslot.slot);
				if (hpdev) {
					hv_pci_eject_device(hpdev);
					put_pcichild(hpdev,
							hv_pcidev_ref_by_slot);
				}
				break;

			default:
				dev_warn(&hbus->hdev->device,
					"Unimplemented protocol message %x\n",
					new_message->message_type.message_type);
				break;
			}
			break;

		default:
			dev_err(&hbus->hdev->device,
				"unhandled packet type %d, tid %llx len %d\n",
				desc->type, req_id, bytes_recvd);
			break;
		}
	}

	kfree(buffer);
}

/**
 * hv_pci_protocol_negotiation() - Set up protocol
 * @hdev:	VMBus's tracking struct for this root PCI bus
 *
 * This driver is intended to support running on Windows 10
 * (server) and later versions.
 It will not run on earlier
 * versions, as they assume that many of the operations which
 * Linux needs accomplished with a spinlock held were done via
 * asynchronous messaging via VMBus.  Windows 10 increases the
 * surface area of PCI emulation so that these actions can take
 * place by suspending a virtual processor for their duration.
 *
 * This function negotiates the channel protocol version,
 * failing if the host doesn't support the necessary protocol
 * level.
 */
static int hv_pci_protocol_negotiation(struct hv_device *hdev)
{
	struct pci_version_request *version_req;
	struct hv_pci_compl comp_pkt;
	struct pci_packet *pkt;
	int ret;

	/*
	 * Initiate the handshake with the host and negotiate
	 * a version that the host can support. We start with the
	 * highest version number and go down if the host cannot
	 * support it.
	 */
	pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	init_completion(&comp_pkt.host_event);
	pkt->completion_func = hv_pci_generic_compl;
	pkt->compl_ctxt = &comp_pkt;
	version_req = (struct pci_version_request *)&pkt->message;
	version_req->message_type.message_type = PCI_QUERY_PROTOCOL_VERSION;
	version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;

	ret = vmbus_sendpacket(hdev->channel, version_req,
			       sizeof(struct pci_version_request),
			       (unsigned long)pkt, VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		goto exit;

	wait_for_completion(&comp_pkt.host_event);

	if (comp_pkt.completion_status < 0) {
		dev_err(&hdev->device,
			"PCI Pass-through VSP failed version request %x\n",
			comp_pkt.completion_status);
		ret = -EPROTO;
		goto exit;
	}

	ret = 0;

exit:
	kfree(pkt);
	return ret;
}

/**
 * hv_pci_free_bridge_windows() - Release memory regions for the
 * bus
 * @hbus:	Root PCI bus, as understood by this driver
 */
static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
{
	/*
	 * Set the resources back to the way they looked when they
	 * were allocated by setting IORESOURCE_BUSY again.
	 */

	if (hbus->low_mmio_space && hbus->low_mmio_res) {
		hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
		vmbus_free_mmio(hbus->low_mmio_res->start,
				resource_size(hbus->low_mmio_res));
	}

	if (hbus->high_mmio_space && hbus->high_mmio_res) {
		hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
		vmbus_free_mmio(hbus->high_mmio_res->start,
				resource_size(hbus->high_mmio_res));
	}
}

/**
 * hv_pci_allocate_bridge_windows() - Allocate memory regions
 * for the bus
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * This function calls vmbus_allocate_mmio(), which is itself a
 * bit of a compromise.  Ideally, we might change the pnp layer
 * in the kernel such that it comprehends either PCI devices
 * which are "grandchildren of ACPI," with some intermediate bus
 * node (in this case, VMBus) or change it such that it
 * understands VMBus.  The pnp layer, however, has been declared
 * deprecated, and not subject to change.
 *
 * The workaround, implemented here, is to ask VMBus to allocate
 * MMIO space for this bus.  VMBus itself knows which ranges are
 * appropriate by looking at its own ACPI objects.  Then, after
 * these ranges are claimed, they're modified to look like they
 * would have looked if the ACPI and pnp code had allocated
 * bridge windows.  These descriptors have to exist in this form
 * in order to satisfy the code which will get invoked when the
 * endpoint PCI function driver calls request_mem_region() or
 * request_mem_region_exclusive().
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
{
	resource_size_t align;
	int ret;

	if (hbus->low_mmio_space) {
		/* Align on the largest BAR size (largest power of two). */
		align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
		ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
					  (u64)(u32)0xffffffff,
					  hbus->low_mmio_space,
					  align, false);
		if (ret) {
			dev_err(&hbus->hdev->device,
				"Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
				hbus->low_mmio_space);
			return ret;
		}

		/* Modify this resource to become a bridge window. */
		hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
		hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
		pci_add_resource(&hbus->resources_for_children,
				 hbus->low_mmio_res);
	}

	if (hbus->high_mmio_space) {
		align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
		/* High window lives above the 4GiB boundary. */
		ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
					  0x100000000, -1,
					  hbus->high_mmio_space, align,
					  false);
		if (ret) {
			dev_err(&hbus->hdev->device,
				"Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
				hbus->high_mmio_space);
			goto release_low_mmio;
		}

		/* Modify this resource to become a bridge window. */
		hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
		hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
		pci_add_resource(&hbus->resources_for_children,
				 hbus->high_mmio_res);
	}

	return 0;

release_low_mmio:
	/* Unwind the low window claimed above. */
	if (hbus->low_mmio_res) {
		vmbus_free_mmio(hbus->low_mmio_res->start,
				resource_size(hbus->low_mmio_res));
	}

	return ret;
}

/**
 * hv_allocate_config_window() - Find MMIO space for PCI Config
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * This function claims memory-mapped I/O space for accessing
 * configuration space for the functions on this bus.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
{
	int ret;

	/*
	 * Set up a region of MMIO space to use for accessing configuration
	 * space.
	 */
	ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
				  PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
	if (ret)
		return ret;

	/*
	 * vmbus_allocate_mmio() gets used for allocating both device endpoint
	 * resource claims (those which cannot be overlapped) and the ranges
	 * which are valid for the children of this bus, which are intended
	 * to be overlapped by those children.  Set the flag on this claim
	 * meaning that this region can't be overlapped.
	 */

	hbus->mem_config->flags |= IORESOURCE_BUSY;

	return 0;
}

/* Release the MMIO region claimed for config space access. */
static void hv_free_config_window(struct hv_pcibus_device *hbus)
{
	vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
}

/**
 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
 * @hdev:	VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_enter_d0(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_bus_d0_entry *d0_entry;
	struct hv_pci_compl comp_pkt;
	struct pci_packet *pkt;
	int ret;

	/*
	 * Tell the host that the bus is ready to use, and moved into the
	 * powered-on state.  This includes telling the host which region
	 * of memory-mapped I/O space has been chosen for configuration space
	 * access.
	 */
	pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	init_completion(&comp_pkt.host_event);
	pkt->completion_func = hv_pci_generic_compl;
	pkt->compl_ctxt = &comp_pkt;
	d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
	d0_entry->message_type.message_type = PCI_BUS_D0ENTRY;
	d0_entry->mmio_base = hbus->mem_config->start;

	ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
			       (unsigned long)pkt, VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		goto exit;

	wait_for_completion(&comp_pkt.host_event);

	if (comp_pkt.completion_status < 0) {
		dev_err(&hdev->device,
			"PCI Pass-through VSP failed D0 Entry with status %x\n",
			comp_pkt.completion_status);
		ret = -EPROTO;
		goto exit;
	}

	ret = 0;

exit:
	kfree(pkt);
	return ret;
}

/**
 * hv_pci_query_relations() - Ask host to send list of child
 * devices
 * @hdev:	VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_query_relations(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_message message;
	struct completion comp;
	int ret;

	/*
	 * Ask the host to send along the list of child devices.  The
	 * cmpxchg() publishes the completion for survey_child_resources()
	 * to signal; failure means a survey is already outstanding.
	 */
	init_completion(&comp);
	if (cmpxchg(&hbus->survey_event, NULL, &comp))
		return -ENOTEMPTY;

	memset(&message, 0, sizeof(message));
	message.message_type = PCI_QUERY_BUS_RELATIONS;

	ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
			       0, VM_PKT_DATA_INBAND, 0);
	if (ret)
		return ret;

	/* Completed by survey_child_resources() once children are known. */
	wait_for_completion(&comp);
	return 0;
}

/**
 * hv_send_resources_allocated() - Report local resource choices
 * @hdev:	VMBus's tracking struct for this root PCI bus
 *
 * The host OS is expecting to be sent a request as a message
 * which contains all the resources that the device will use.
 * The response contains those same resources, "translated"
 * which is to say, the values which should be used by the
 * hardware, when it delivers an interrupt.  (MMIO resources are
 * used in local terms.)  This is nice for Windows, and lines up
 * with the FDO/PDO split, which doesn't exist in Linux.  Linux
 * is deeply expecting to scan an emulated PCI configuration
 * space.  So this message is sent here only to drive the state
 * machine on the host forward.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_allocated(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_resources_assigned *res_assigned;
	struct hv_pci_compl comp_pkt;
	struct hv_pci_dev *hpdev;
	struct pci_packet *pkt;
	u32 wslot;
	int ret;

	/* One packet buffer is reused for every child device. */
	pkt = kmalloc(sizeof(*pkt) + sizeof(*res_assigned), GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	ret = 0;

	/* Walk every possible Windows-style slot number. */
	for (wslot = 0; wslot < 256; wslot++) {
		hpdev = get_pcichild_wslot(hbus, wslot);
		if (!hpdev)
			continue;

		memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned));
		init_completion(&comp_pkt.host_event);
		pkt->completion_func = hv_pci_generic_compl;
		pkt->compl_ctxt = &comp_pkt;
		pkt->message.message_type = PCI_RESOURCES_ASSIGNED;
		res_assigned = (struct pci_resources_assigned *)&pkt->message;
		res_assigned->wslot.slot = hpdev->desc.win_slot.slot;

		/* hpdev is not used past this point; its slot was copied above. */
		put_pcichild(hpdev, hv_pcidev_ref_by_slot);

		ret = vmbus_sendpacket(
			hdev->channel, &pkt->message,
			sizeof(*res_assigned),
			(unsigned long)pkt,
			VM_PKT_DATA_INBAND,
			VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
		if (ret)
			break;

		wait_for_completion(&comp_pkt.host_event);

		if (comp_pkt.completion_status < 0) {
			ret = -EPROTO;
			dev_err(&hdev->device,
				"resource allocated returned 0x%x",
				comp_pkt.completion_status);
			break;
		}
	}

	kfree(pkt);
	return ret;
}

/**
 * hv_send_resources_released() - Report local resources
 * released
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_released(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_child_message pkt;
	struct hv_pci_dev *hpdev;
	u32 wslot;
	int ret;

	for (wslot = 0; wslot < 256; wslot++) {
		hpdev = get_pcichild_wslot(hbus, wslot);
		if (!hpdev)
			continue;

		memset(&pkt, 0, sizeof(pkt));
		pkt.message_type = PCI_RESOURCES_RELEASED;
		pkt.wslot.slot = hpdev->desc.win_slot.slot;

		/* hpdev is not used past this point; its slot was copied above. */
		put_pcichild(hpdev, hv_pcidev_ref_by_slot);

		/* No completion requested; fire-and-forget per child. */
		ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
				       VM_PKT_DATA_INBAND, 0);
		if (ret)
			return ret;
	}

	return 0;
}

/* Take a reference on the bus, pinning it against teardown. */
static void get_hvpcibus(struct hv_pcibus_device *hbus)
{
	atomic_inc(&hbus->remove_lock);
}

/* Drop a bus reference; the last one wakes the waiter in hv_pci_remove(). */
static void put_hvpcibus(struct hv_pcibus_device *hbus)
{
	if (atomic_dec_and_test(&hbus->remove_lock))
		complete(&hbus->remove_event);
}

/**
 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
 * @hdev: VMBus's tracking struct for this root PCI bus
 * @dev_id: Identifies the device itself
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_probe(struct hv_device *hdev,
			const struct hv_vmbus_device_id *dev_id)
{
	struct hv_pcibus_device *hbus;
	int ret;

	hbus = kzalloc(sizeof(*hbus), GFP_KERNEL);
	if (!hbus)
		return -ENOMEM;

	/*
	 * The PCI bus "domain" is what is called "segment" in ACPI and
	 * other specs. Pull it from the instance ID, to get something
	 * unique. Bytes 8 and 9 are what is used in Windows guests, so
	 * do the same thing for consistency. Note that, since this code
	 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
	 * that (1) the only domain in use for something that looks like
	 * a physical PCI bus (which is actually emulated by the
	 * hypervisor) is domain 0 and (2) there will be no overlap
	 * between domains derived from these instance IDs in the same
	 * VM.
	 */
	hbus->sysdata.domain = hdev->dev_instance.b[9] |
			       hdev->dev_instance.b[8] << 8;

	hbus->hdev = hdev;
	/* Initial bus reference; dropped by put_hvpcibus() in hv_pci_remove(). */
	atomic_inc(&hbus->remove_lock);
	INIT_LIST_HEAD(&hbus->children);
	INIT_LIST_HEAD(&hbus->dr_list);
	INIT_LIST_HEAD(&hbus->resources_for_children);
	spin_lock_init(&hbus->config_lock);
	spin_lock_init(&hbus->device_list_lock);
	/* Serializes bus enumeration: only one survey processed at a time. */
	sema_init(&hbus->enum_sem, 1);
	init_completion(&hbus->remove_event);

	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
			 hv_pci_onchannelcallback, hbus);
	if (ret)
		goto free_bus;

	hv_set_drvdata(hdev, hbus);

	ret = hv_pci_protocol_negotiation(hdev);
	if (ret)
		goto close;

	/* Claim an MMIO range, then map it, for config-space accesses. */
	ret = hv_allocate_config_window(hbus);
	if (ret)
		goto close;

	hbus->cfg_addr = ioremap(hbus->mem_config->start,
				 PCI_CONFIG_MMIO_LENGTH);
	if (!hbus->cfg_addr) {
		dev_err(&hdev->device,
			"Unable to map a virtual address for config space\n");
		ret = -ENOMEM;
		goto free_config;
	}

	hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
	if (!hbus->sysdata.fwnode) {
		ret = -ENOMEM;
		goto unmap;
	}

	ret = hv_pcie_init_irq_domain(hbus);
	if (ret)
		goto free_fwnode;

	/* Learn the child devices, then power the bus on. */
	ret = hv_pci_query_relations(hdev);
	if (ret)
		goto free_irq_domain;

	ret = hv_pci_enter_d0(hdev);
	if (ret)
		goto free_irq_domain;

	ret = hv_pci_allocate_bridge_windows(hbus);
	if (ret)
		goto free_irq_domain;

	ret = hv_send_resources_allocated(hdev);
	if (ret)
		goto free_windows;

	prepopulate_bars(hbus);

	hbus->state = hv_pcibus_probed;

	ret = create_root_hv_pci_bus(hbus);
	if (ret)
		goto free_windows;

	return 0;

	/* Error unwinding: undo the acquisitions above in reverse order. */
free_windows:
	hv_pci_free_bridge_windows(hbus);
free_irq_domain:
	irq_domain_remove(hbus->irq_domain);
free_fwnode:
	irq_domain_free_fwnode(hbus->sysdata.fwnode);
unmap:
	iounmap(hbus->cfg_addr);
free_config:
	hv_free_config_window(hbus);
close:
	vmbus_close(hdev->channel);
free_bus:
	kfree(hbus);
	return ret;
}

/**
 * hv_pci_remove() - Remove routine for this VMBus channel
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_remove(struct hv_device *hdev)
{
	int ret;
	struct hv_pcibus_device *hbus;
	/* Oversized union so the teardown packet fits on the stack. */
	union {
		struct pci_packet teardown_packet;
		u8 buffer[0x100];
	} pkt;
	struct pci_bus_relations relations;
	struct hv_pci_compl comp_pkt;

	hbus = hv_get_drvdata(hdev);

	/* Tell the host the bus is leaving D0 (powering down). */
	memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
	init_completion(&comp_pkt.host_event);
	pkt.teardown_packet.completion_func = hv_pci_generic_compl;
	pkt.teardown_packet.compl_ctxt = &comp_pkt;
	pkt.teardown_packet.message.message_type = PCI_BUS_D0EXIT;

	ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
			       sizeof(struct pci_message),
			       (unsigned long)&pkt.teardown_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (!ret)
		/*
		 * NOTE(review): if this 10s timeout expires, the host's
		 * eventual completion would still reference &pkt on this
		 * stack frame -- confirm the channel is quiesced before
		 * this function returns.
		 */
		wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);

	if (hbus->state == hv_pcibus_installed) {
		/* Remove the bus from PCI's point of view. */
		pci_lock_rescan_remove();
		pci_stop_root_bus(hbus->pci_bus);
		pci_remove_root_bus(hbus->pci_bus);
		pci_unlock_rescan_remove();
	}

	ret = hv_send_resources_released(hdev);
	if (ret)
		dev_err(&hdev->device,
			"Couldn't send resources released packet(s)\n");

	vmbus_close(hdev->channel);

	/* Delete any children which might still exist.
	 */
	/* An empty relations list reports every remaining child as gone. */
	memset(&relations, 0, sizeof(relations));
	hv_pci_devices_present(hbus, &relations);

	iounmap(hbus->cfg_addr);
	hv_free_config_window(hbus);
	pci_free_resource_list(&hbus->resources_for_children);
	hv_pci_free_bridge_windows(hbus);
	irq_domain_remove(hbus->irq_domain);
	irq_domain_free_fwnode(hbus->sysdata.fwnode);
	/* Drop the probe-time reference and wait for all others to drain. */
	put_hvpcibus(hbus);
	wait_for_completion(&hbus->remove_event);
	kfree(hbus);
	return 0;
}

static const struct hv_vmbus_device_id hv_pci_id_table[] = {
	/* PCI Pass-through Class ID */
	/* 44C4F61D-4444-4400-9D52-802E27EDE19F */
	{ HV_PCIE_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);

static struct hv_driver hv_pci_drv = {
	.name		= "hv_pci",
	.id_table	= hv_pci_id_table,
	.probe		= hv_pci_probe,
	.remove		= hv_pci_remove,
};

static void __exit exit_hv_pci_drv(void)
{
	vmbus_driver_unregister(&hv_pci_drv);
}

static int __init init_hv_pci_drv(void)
{
	return vmbus_driver_register(&hv_pci_drv);
}

module_init(init_hv_pci_drv);
module_exit(exit_hv_pci_drv);

MODULE_DESCRIPTION("Hyper-V PCI");
MODULE_LICENSE("GPL v2");