Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

PCI: hv: Add a paravirtual backchannel in software

Windows SR-IOV provides a backchannel mechanism in software for communication
between a VF driver and a PF driver. These "configuration blocks" are
similar in concept to PCI configuration space, but instead of doing reads and
writes in 32-bit chunks through a very slow path, packets of up to 128 bytes
can be sent or received asynchronously.

Nearly every SR-IOV device contains just such a communications channel in
hardware, so using this one in software is usually optional. Using the
software channel, however, allows driver implementers to leverage software
tools that fuzz the communications channel looking for vulnerabilities.

The usage model for these packets puts the responsibility for reading or
writing on the VF driver. The VF driver sends a read or a write packet,
indicating which "block" is being referred to by number.

If the PF driver wishes to initiate communication, it can "invalidate" one or
more of the first 64 blocks. This invalidation is delivered to the VF driver
by this driver, via a callback that the VF driver registers.

No protocol is implied, except that supplied by the PF and VF drivers.

Signed-off-by: Jake Oshins <jakeo@microsoft.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Dexuan Cui and committed by
David S. Miller
e5d2f910 fed07ef3

+317
+302
drivers/pci/controller/pci-hyperv.c
··· 365 365 struct tran_int_desc int_desc; 366 366 } __packed; 367 367 368 + /* 369 + * Note: the VM must pass a valid block id, wslot and bytes_requested. 370 + */ 371 + struct pci_read_block { 372 + struct pci_message message_type; 373 + u32 block_id; 374 + union win_slot_encoding wslot; 375 + u32 bytes_requested; 376 + } __packed; 377 + 378 + struct pci_read_block_response { 379 + struct vmpacket_descriptor hdr; 380 + u32 status; 381 + u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 382 + } __packed; 383 + 384 + /* 385 + * Note: the VM must pass a valid block id, wslot and byte_count. 386 + */ 387 + struct pci_write_block { 388 + struct pci_message message_type; 389 + u32 block_id; 390 + union win_slot_encoding wslot; 391 + u32 byte_count; 392 + u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 393 + } __packed; 394 + 395 + struct pci_dev_inval_block { 396 + struct pci_incoming_message incoming; 397 + union win_slot_encoding wslot; 398 + u64 block_mask; 399 + } __packed; 400 + 368 401 struct pci_dev_incoming { 369 402 struct pci_incoming_message incoming; 370 403 union win_slot_encoding wslot; ··· 531 498 bool reported_missing; 532 499 struct hv_pcibus_device *hbus; 533 500 struct work_struct wrk; 501 + 502 + void (*block_invalidate)(void *context, u64 block_mask); 503 + void *invalidate_context; 534 504 535 505 /* 536 506 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then ··· 852 816 .read = hv_pcifront_read_config, 853 817 .write = hv_pcifront_write_config, 854 818 }; 819 + 820 + /* 821 + * Paravirtual backchannel 822 + * 823 + * Hyper-V SR-IOV provides a backchannel mechanism in software for 824 + * communication between a VF driver and a PF driver. These 825 + * "configuration blocks" are similar in concept to PCI configuration space, 826 + * but instead of doing reads and writes in 32-bit chunks through a very slow 827 + * path, packets of up to 128 bytes can be sent or received asynchronously. 
828 + * 829 + * Nearly every SR-IOV device contains just such a communications channel in 830 + * hardware, so using this one in software is usually optional. Using the 831 + * software channel, however, allows driver implementers to leverage software 832 + * tools that fuzz the communications channel looking for vulnerabilities. 833 + * 834 + * The usage model for these packets puts the responsibility for reading or 835 + * writing on the VF driver. The VF driver sends a read or a write packet, 836 + * indicating which "block" is being referred to by number. 837 + * 838 + * If the PF driver wishes to initiate communication, it can "invalidate" one or 839 + * more of the first 64 blocks. This invalidation is delivered via a callback 840 + * supplied to this driver by the VF driver. 841 + * 842 + * No protocol is implied, except that supplied by the PF and VF drivers. 843 + */ 844 + 845 + struct hv_read_config_compl { 846 + struct hv_pci_compl comp_pkt; 847 + void *buf; 848 + unsigned int len; 849 + unsigned int bytes_returned; 850 + }; 851 + 852 + /** 853 + * hv_pci_read_config_compl() - Invoked when a response packet 854 + * for a read config block operation arrives. 
855 + * @context: Identifies the read config operation 856 + * @resp: The response packet itself 857 + * @resp_packet_size: Size in bytes of the response packet 858 + */ 859 + static void hv_pci_read_config_compl(void *context, struct pci_response *resp, 860 + int resp_packet_size) 861 + { 862 + struct hv_read_config_compl *comp = context; 863 + struct pci_read_block_response *read_resp = 864 + (struct pci_read_block_response *)resp; 865 + unsigned int data_len, hdr_len; 866 + 867 + hdr_len = offsetof(struct pci_read_block_response, bytes); 868 + if (resp_packet_size < hdr_len) { 869 + comp->comp_pkt.completion_status = -1; 870 + goto out; 871 + } 872 + 873 + data_len = resp_packet_size - hdr_len; 874 + if (data_len > 0 && read_resp->status == 0) { 875 + comp->bytes_returned = min(comp->len, data_len); 876 + memcpy(comp->buf, read_resp->bytes, comp->bytes_returned); 877 + } else { 878 + comp->bytes_returned = 0; 879 + } 880 + 881 + comp->comp_pkt.completion_status = read_resp->status; 882 + out: 883 + complete(&comp->comp_pkt.host_event); 884 + } 885 + 886 + /** 887 + * hv_read_config_block() - Sends a read config block request to 888 + * the back-end driver running in the Hyper-V parent partition. 889 + * @pdev: The PCI driver's representation for this device. 890 + * @buf: Buffer into which the config block will be copied. 891 + * @len: Size in bytes of buf. 892 + * @block_id: Identifies the config block which has been requested. 893 + * @bytes_returned: Size which came back from the back-end driver. 
894 + * 895 + * Return: 0 on success, -errno on failure 896 + */ 897 + int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned int len, 898 + unsigned int block_id, unsigned int *bytes_returned) 899 + { 900 + struct hv_pcibus_device *hbus = 901 + container_of(pdev->bus->sysdata, struct hv_pcibus_device, 902 + sysdata); 903 + struct { 904 + struct pci_packet pkt; 905 + char buf[sizeof(struct pci_read_block)]; 906 + } pkt; 907 + struct hv_read_config_compl comp_pkt; 908 + struct pci_read_block *read_blk; 909 + int ret; 910 + 911 + if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 912 + return -EINVAL; 913 + 914 + init_completion(&comp_pkt.comp_pkt.host_event); 915 + comp_pkt.buf = buf; 916 + comp_pkt.len = len; 917 + 918 + memset(&pkt, 0, sizeof(pkt)); 919 + pkt.pkt.completion_func = hv_pci_read_config_compl; 920 + pkt.pkt.compl_ctxt = &comp_pkt; 921 + read_blk = (struct pci_read_block *)&pkt.pkt.message; 922 + read_blk->message_type.type = PCI_READ_BLOCK; 923 + read_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 924 + read_blk->block_id = block_id; 925 + read_blk->bytes_requested = len; 926 + 927 + ret = vmbus_sendpacket(hbus->hdev->channel, read_blk, 928 + sizeof(*read_blk), (unsigned long)&pkt.pkt, 929 + VM_PKT_DATA_INBAND, 930 + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 931 + if (ret) 932 + return ret; 933 + 934 + ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event); 935 + if (ret) 936 + return ret; 937 + 938 + if (comp_pkt.comp_pkt.completion_status != 0 || 939 + comp_pkt.bytes_returned == 0) { 940 + dev_err(&hbus->hdev->device, 941 + "Read Config Block failed: 0x%x, bytes_returned=%d\n", 942 + comp_pkt.comp_pkt.completion_status, 943 + comp_pkt.bytes_returned); 944 + return -EIO; 945 + } 946 + 947 + *bytes_returned = comp_pkt.bytes_returned; 948 + return 0; 949 + } 950 + EXPORT_SYMBOL(hv_read_config_block); 951 + 952 + /** 953 + * hv_pci_write_config_compl() - Invoked when a response packet for a write 954 + * config block operation 
arrives. 955 + * @context: Identifies the write config operation 956 + * @resp: The response packet itself 957 + * @resp_packet_size: Size in bytes of the response packet 958 + */ 959 + static void hv_pci_write_config_compl(void *context, struct pci_response *resp, 960 + int resp_packet_size) 961 + { 962 + struct hv_pci_compl *comp_pkt = context; 963 + 964 + comp_pkt->completion_status = resp->status; 965 + complete(&comp_pkt->host_event); 966 + } 967 + 968 + /** 969 + * hv_write_config_block() - Sends a write config block request to the 970 + * back-end driver running in the Hyper-V parent partition. 971 + * @pdev: The PCI driver's representation for this device. 972 + * @buf: Buffer from which the config block will be copied. 973 + * @len: Size in bytes of buf. 974 + * @block_id: Identifies the config block which is being written. 975 + * 976 + * Return: 0 on success, -errno on failure 977 + */ 978 + int hv_write_config_block(struct pci_dev *pdev, void *buf, unsigned int len, 979 + unsigned int block_id) 980 + { 981 + struct hv_pcibus_device *hbus = 982 + container_of(pdev->bus->sysdata, struct hv_pcibus_device, 983 + sysdata); 984 + struct { 985 + struct pci_packet pkt; 986 + char buf[sizeof(struct pci_write_block)]; 987 + u32 reserved; 988 + } pkt; 989 + struct hv_pci_compl comp_pkt; 990 + struct pci_write_block *write_blk; 991 + u32 pkt_size; 992 + int ret; 993 + 994 + if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 995 + return -EINVAL; 996 + 997 + init_completion(&comp_pkt.host_event); 998 + 999 + memset(&pkt, 0, sizeof(pkt)); 1000 + pkt.pkt.completion_func = hv_pci_write_config_compl; 1001 + pkt.pkt.compl_ctxt = &comp_pkt; 1002 + write_blk = (struct pci_write_block *)&pkt.pkt.message; 1003 + write_blk->message_type.type = PCI_WRITE_BLOCK; 1004 + write_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1005 + write_blk->block_id = block_id; 1006 + write_blk->byte_count = len; 1007 + memcpy(write_blk->bytes, buf, len); 1008 + pkt_size = offsetof(struct 
pci_write_block, bytes) + len; 1009 + /* 1010 + * This quirk is required on some hosts shipped around 2018, because 1011 + * these hosts don't check the pkt_size correctly (new hosts have been 1012 + * fixed since early 2019). The quirk is also safe on very old hosts 1013 + * and new hosts, because, on them, what really matters is the length 1014 + * specified in write_blk->byte_count. 1015 + */ 1016 + pkt_size += sizeof(pkt.reserved); 1017 + 1018 + ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size, 1019 + (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND, 1020 + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1021 + if (ret) 1022 + return ret; 1023 + 1024 + ret = wait_for_response(hbus->hdev, &comp_pkt.host_event); 1025 + if (ret) 1026 + return ret; 1027 + 1028 + if (comp_pkt.completion_status != 0) { 1029 + dev_err(&hbus->hdev->device, 1030 + "Write Config Block failed: 0x%x\n", 1031 + comp_pkt.completion_status); 1032 + return -EIO; 1033 + } 1034 + 1035 + return 0; 1036 + } 1037 + EXPORT_SYMBOL(hv_write_config_block); 1038 + 1039 + /** 1040 + * hv_register_block_invalidate() - Registers a callback to be invoked when a 1041 + * config block invalidation arrives from the back-end driver. 1042 + * @pdev: The PCI driver's representation for this device. 1043 + * @context: Opaque value passed back to the callback. 1044 + * @block_invalidate: Callback invoked with the mask of invalidated blocks. 
1045 + * 1046 + * Return: 0 on success, -errno on failure 1047 + */ 1048 + int hv_register_block_invalidate(struct pci_dev *pdev, void *context, 1049 + void (*block_invalidate)(void *context, 1050 + u64 block_mask)) 1051 + { 1052 + struct hv_pcibus_device *hbus = 1053 + container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1054 + sysdata); 1055 + struct hv_pci_dev *hpdev; 1056 + 1057 + hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1058 + if (!hpdev) 1059 + return -ENODEV; 1060 + 1061 + hpdev->block_invalidate = block_invalidate; 1062 + hpdev->invalidate_context = context; 1063 + 1064 + put_pcichild(hpdev); 1065 + return 0; 1066 + 1067 + } 1068 + EXPORT_SYMBOL(hv_register_block_invalidate); 855 1069 856 1070 /* Interrupt management hooks */ 857 1071 static void hv_int_desc_free(struct hv_pci_dev *hpdev, ··· 2254 1968 struct pci_response *response; 2255 1969 struct pci_incoming_message *new_message; 2256 1970 struct pci_bus_relations *bus_rel; 1971 + struct pci_dev_inval_block *inval; 2257 1972 struct pci_dev_incoming *dev_message; 2258 1973 struct hv_pci_dev *hpdev; 2259 1974 ··· 2328 2041 dev_message->wslot.slot); 2329 2042 if (hpdev) { 2330 2043 hv_pci_eject_device(hpdev); 2044 + put_pcichild(hpdev); 2045 + } 2046 + break; 2047 + 2048 + case PCI_INVALIDATE_BLOCK: 2049 + 2050 + inval = (struct pci_dev_inval_block *)buffer; 2051 + hpdev = get_pcichild_wslot(hbus, 2052 + inval->wslot.slot); 2053 + if (hpdev) { 2054 + if (hpdev->block_invalidate) { 2055 + hpdev->block_invalidate( 2056 + hpdev->invalidate_context, 2057 + inval->block_mask); 2058 + } 2331 2059 put_pcichild(hpdev); 2332 2060 } 2333 2061 break;
+15
include/linux/hyperv.h
··· 1578 1578 for (pkt = hv_pkt_iter_first(channel); pkt; \ 1579 1579 pkt = hv_pkt_iter_next(channel, pkt)) 1580 1580 1581 + /* 1582 + * Functions for passing data between SR-IOV PF and VF drivers. The VF driver 1583 + * sends requests to read and write blocks. Each block must be 128 bytes or 1584 + * smaller. Optionally, the VF driver can register a callback function which 1585 + * will be invoked when the host says that one or more of the first 64 block 1586 + * IDs is "invalid" which means that the VF driver should reread them. 1587 + */ 1588 + #define HV_CONFIG_BLOCK_SIZE_MAX 128 1589 + int hv_read_config_block(struct pci_dev *dev, void *buf, unsigned int buf_len, 1590 + unsigned int block_id, unsigned int *bytes_returned); 1591 + int hv_write_config_block(struct pci_dev *dev, void *buf, unsigned int len, 1592 + unsigned int block_id); 1593 + int hv_register_block_invalidate(struct pci_dev *dev, void *context, 1594 + void (*block_invalidate)(void *context, 1595 + u64 block_mask)); 1581 1596 #endif /* _HYPERV_H */