
tools: hv: Add vmbus_bufring

Add a common userspace interface for reading from and writing to the
VMBus ring buffer. The implementation is open for use by any userspace
driver or application seeking direct control over VMBus ring buffers.
A significant part of this code is borrowed from DPDK.
Link: https://github.com/DPDK/dpdk/

This library is currently not supported on ARM64: its barrier and
atomic primitives are implemented in x86 inline assembly.
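
Below is a minimal sketch of how a consumer of this library might set up
and poll a channel. The /dev/uio0 path, the 16 KiB ring size, and the
TX-ring-then-RX-ring layout of the mapping are illustrative assumptions,
not part of this patch; a real daemon would discover its device and ring
geometry through sysfs.

#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include "vmbus_bufring.h"

int main(void)
{
	struct vmbus_br txbr, rxbr;
	uint8_t pkt[4096];
	uint8_t *ring;
	uint32_t len;
	int fd, ret;

	fd = open("/dev/uio0", O_RDWR);		/* hypothetical device node */
	if (fd < 0)
		return 1;

	/* vmbus_uio_map() maps 2 * size bytes; assume TX ring first, RX ring second */
	ring = vmbus_uio_map(&fd, 16384);
	if (!ring) {
		close(fd);
		return 1;
	}
	vmbus_br_setup(&txbr, ring, 16384);
	vmbus_br_setup(&rxbr, ring + 16384, 16384);

	/* Poll the RX ring and consume whatever the host sends */
	for (;;) {
		len = sizeof(pkt);
		ret = rte_vmbus_chan_recv_raw(&rxbr, pkt, &len);
		if (ret == -EAGAIN) {
			usleep(1000);	/* nothing pending; back off briefly */
			continue;
		}
		if (ret < 0)
			break;
		printf("received channel packet, %u bytes\n", len);
	}

	close(fd);
	return 0;
}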

Signed-off-by: Mary Hardy <maryhardy@microsoft.com>
Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Reviewed-by: Long Li <longli@microsoft.com>
Link: https://lore.kernel.org/r/1711788723-8593-5-git-send-email-ssengar@linux.microsoft.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Authored by Saurabh Sengar, committed by Greg Kroah-Hartman
45bab4d7 547fa4ff

2 files changed, 476 insertions(+)

tools/hv/vmbus_bufring.c (+318 lines):
// SPDX-License-Identifier: BSD-3-Clause
/*
 * Copyright (c) 2009-2012,2016,2023 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * Copyright (c) 2012 Citrix Inc.
 * All rights reserved.
 */

#include <errno.h>
#include <fcntl.h>
#include <emmintrin.h>
#include <linux/limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <unistd.h>
#include "vmbus_bufring.h"

/**
 * Compiler barrier.
 *
 * Guarantees that operation reordering does not occur at compile time
 * for operations directly before and after the barrier.
 */
#define rte_compiler_barrier()	({ asm volatile ("" : : : "memory"); })

#define VMBUS_RQST_ERROR	0xFFFFFFFFFFFFFFFF

/* Round val up to a multiple of align (align must be a power of two) */
#define ALIGN(val, align) \
	((typeof(val))(((val) + ((typeof(val))(align) - 1)) & ~((typeof(val))((align) - 1))))

void *vmbus_uio_map(int *fd, int size)
{
	void *map;

	map = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0);
	if (map == MAP_FAILED)
		return NULL;

	return map;
}

/* Increase bufring index by inc with wraparound */
static inline uint32_t vmbus_br_idxinc(uint32_t idx, uint32_t inc, uint32_t sz)
{
	idx += inc;
	if (idx >= sz)
		idx -= sz;

	return idx;
}

void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen)
{
	br->vbr = buf;
	br->windex = br->vbr->windex;
	br->dsize = blen - sizeof(struct vmbus_bufring);
}

static inline __always_inline void
rte_smp_mb(void)
{
	asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
}

static inline int
rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src)
{
	uint8_t res;

	asm volatile("lock ; "
		     "cmpxchgl %[src], %[dst];"
		     "sete %[res];"
		     : [res] "=a" (res),	/* output */
		       [dst] "=m" (*dst)
		     : [src] "r" (src),		/* input */
		       "a" (exp),
		       "m" (*dst)
		     : "memory");		/* clobber list */
	return res;
}

static inline uint32_t
vmbus_txbr_copyto(const struct vmbus_br *tbr, uint32_t windex,
		  const void *src0, uint32_t cplen)
{
	uint8_t *br_data = tbr->vbr->data;
	uint32_t br_dsize = tbr->dsize;
	const uint8_t *src = src0;

	/* XXX use double mapping like Linux kernel? */
	if (cplen > br_dsize - windex) {
		uint32_t fraglen = br_dsize - windex;

		/* Wrap-around detected */
		memcpy(br_data + windex, src, fraglen);
		memcpy(br_data, src + fraglen, cplen - fraglen);
	} else {
		memcpy(br_data + windex, src, cplen);
	}

	return vmbus_br_idxinc(windex, cplen, br_dsize);
}

/*
 * Write scattered channel packet to TX bufring.
 *
 * The offset of this channel packet is written as a 64bits value
 * immediately after this channel packet.
 *
 * The write goes through three stages:
 * 1. Reserve space in ring buffer for the new data.
 *    Writer atomically moves priv_write_index.
 * 2. Copy the new data into the ring.
 * 3. Update the tail of the ring (visible to host) that indicates
 *    next read location. Writer updates write_index.
 */
static int
vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen)
{
	struct vmbus_bufring *vbr = tbr->vbr;
	uint32_t ring_size = tbr->dsize;
	uint32_t old_windex, next_windex, windex, total;
	uint64_t save_windex;
	int i;

	total = 0;
	for (i = 0; i < iovlen; i++)
		total += iov[i].iov_len;
	total += sizeof(save_windex);

	/* Reserve space in ring */
	do {
		uint32_t avail;

		/* Get current free location */
		old_windex = tbr->windex;

		/* Prevent compiler reordering this with calculation */
		rte_compiler_barrier();

		avail = vmbus_br_availwrite(tbr, old_windex);

		/* If not enough space in ring, then tell caller. */
		if (avail <= total)
			return -EAGAIN;

		next_windex = vmbus_br_idxinc(old_windex, total, ring_size);

		/* Atomic update of next write_index for other threads */
	} while (!rte_atomic32_cmpset(&tbr->windex, old_windex, next_windex));

	/* Space from old..new is now reserved */
	windex = old_windex;
	for (i = 0; i < iovlen; i++)
		windex = vmbus_txbr_copyto(tbr, windex, iov[i].iov_base, iov[i].iov_len);

	/* Set the offset of the current channel packet. */
	save_windex = ((uint64_t)old_windex) << 32;
	windex = vmbus_txbr_copyto(tbr, windex, &save_windex,
				   sizeof(save_windex));

	/* The region reserved should match region used */
	if (windex != next_windex)
		return -EINVAL;

	/* Ensure that data is available before updating host index */
	rte_compiler_barrier();

	/* Check in our reservation; wait for our turn to update the host-visible index */
	while (!rte_atomic32_cmpset(&vbr->windex, old_windex, next_windex))
		_mm_pause();

	return 0;
}

int rte_vmbus_chan_send(struct vmbus_br *txbr, uint16_t type, void *data,
			uint32_t dlen, uint32_t flags)
{
	struct vmbus_chanpkt pkt;
	unsigned int pktlen, pad_pktlen;
	const uint32_t hlen = sizeof(pkt);
	uint64_t pad = 0;
	struct iovec iov[3];
	int error;

	pktlen = hlen + dlen;
	pad_pktlen = ALIGN(pktlen, sizeof(uint64_t));

	pkt.hdr.type = type;
	pkt.hdr.flags = flags;
	pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT;
	pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT;
	pkt.hdr.xactid = VMBUS_RQST_ERROR;

	iov[0].iov_base = &pkt;
	iov[0].iov_len = hlen;
	iov[1].iov_base = data;
	iov[1].iov_len = dlen;
	iov[2].iov_base = &pad;
	iov[2].iov_len = pad_pktlen - pktlen;

	error = vmbus_txbr_write(txbr, iov, 3);

	return error;
}

static inline uint32_t
vmbus_rxbr_copyfrom(const struct vmbus_br *rbr, uint32_t rindex,
		    void *dst0, size_t cplen)
{
	const uint8_t *br_data = rbr->vbr->data;
	uint32_t br_dsize = rbr->dsize;
	uint8_t *dst = dst0;

	if (cplen > br_dsize - rindex) {
		uint32_t fraglen = br_dsize - rindex;

		/* Wrap-around detected */
		memcpy(dst, br_data + rindex, fraglen);
		memcpy(dst + fraglen, br_data, cplen - fraglen);
	} else {
		memcpy(dst, br_data + rindex, cplen);
	}

	return vmbus_br_idxinc(rindex, cplen, br_dsize);
}

/* Copy data from receive ring but don't change index */
static int
vmbus_rxbr_peek(const struct vmbus_br *rbr, void *data, size_t dlen)
{
	uint32_t avail;

	/*
	 * The requested data and the 64bits channel packet
	 * offset should be there at least.
	 */
	avail = vmbus_br_availread(rbr);
	if (avail < dlen + sizeof(uint64_t))
		return -EAGAIN;

	vmbus_rxbr_copyfrom(rbr, rbr->vbr->rindex, data, dlen);
	return 0;
}

/*
 * Copy data from receive ring and change index.
 * NOTE:
 * We assume (dlen + skip) == sizeof(channel packet).
 */
static int
vmbus_rxbr_read(struct vmbus_br *rbr, void *data, size_t dlen, size_t skip)
{
	struct vmbus_bufring *vbr = rbr->vbr;
	uint32_t br_dsize = rbr->dsize;
	uint32_t rindex;

	if (vmbus_br_availread(rbr) < dlen + skip + sizeof(uint64_t))
		return -EAGAIN;

	/* Record where host was when we started read (for debug) */
	rbr->windex = rbr->vbr->windex;

	/*
	 * Copy channel packet from RX bufring.
	 */
	rindex = vmbus_br_idxinc(rbr->vbr->rindex, skip, br_dsize);
	rindex = vmbus_rxbr_copyfrom(rbr, rindex, data, dlen);

	/*
	 * Discard this channel packet's 64bits offset, which is useless to us.
	 */
	rindex = vmbus_br_idxinc(rindex, sizeof(uint64_t), br_dsize);

	/* Update the read index _after_ the channel packet is fetched. */
	rte_compiler_barrier();

	vbr->rindex = rindex;

	return 0;
}

int rte_vmbus_chan_recv_raw(struct vmbus_br *rxbr,
			    void *data, uint32_t *len)
{
	struct vmbus_chanpkt_hdr pkt;
	uint32_t dlen, bufferlen = *len;
	int error;

	error = vmbus_rxbr_peek(rxbr, &pkt, sizeof(pkt));
	if (error)
		return error;

	if (unlikely(pkt.hlen < VMBUS_CHANPKT_HLEN_MIN))
		/* XXX this channel is dead actually. */
		return -EIO;

	if (unlikely(pkt.hlen > pkt.tlen))
		return -EIO;

	/* Lengths are in quadwords */
	dlen = pkt.tlen << VMBUS_CHANPKT_SIZE_SHIFT;
	*len = dlen;

	/* If the caller's buffer is not large enough */
	if (unlikely(dlen > bufferlen))
		return -ENOBUFS;

	/* Read the whole packet, header included; don't skip anything */
	error = vmbus_rxbr_read(rxbr, data, dlen, 0);
	if (error)
		return error;

	/* Return the number of bytes read (packet plus its 64-bit offset) */
	return dlen + sizeof(uint64_t);
}
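
Since nothing in the ring logic touches hardware beyond x86 atomics, the
code above can be exercised against a private in-memory buffer. The
following loopback self-test is a sketch (the file name and the 4 KiB
ring size are arbitrary choices, and it relies on ALIGN() rounding up as
defined above): it sends one in-band packet into a zeroed ring, then
reads it back through the RX path of the same ring.

/* loopback_test.c - build on x86-64 with: gcc loopback_test.c vmbus_bufring.c */
#include <stdio.h>
#include <stdlib.h>
#include "vmbus_bufring.h"

#define RING_BYTES (sizeof(struct vmbus_bufring) + 4096)

int main(void)
{
	struct vmbus_br br;
	char payload[] = "hello, vmbus";	/* 13 bytes: deliberately unaligned to exercise padding */
	uint8_t rxbuf[256];
	uint32_t rxlen = sizeof(rxbuf);
	void *ring;
	int ret;

	/* A zeroed buffer stands in for the host-shared ring: windex = rindex = 0 */
	ring = calloc(1, RING_BYTES);
	if (!ring)
		return 1;
	vmbus_br_setup(&br, ring, RING_BYTES);

	/* Write one in-band packet: header + payload + pad + 64-bit offset */
	ret = rte_vmbus_chan_send(&br, VMBUS_CHANPKT_TYPE_INBAND, payload,
				  sizeof(payload), VMBUS_CHANPKT_FLAG_NONE);
	printf("send: %d\n", ret);

	/* Read it back; the same ring serves as the RX ring here */
	ret = rte_vmbus_chan_recv_raw(&br, rxbuf, &rxlen);
	printf("recv: %d bytes consumed, %u-byte packet\n", ret, rxlen);

	/* The payload sits right after the channel packet header */
	printf("payload: %s\n", (char *)(rxbuf + sizeof(struct vmbus_chanpkt_hdr)));

	free(ring);
	return 0;
}

Note that vmbus_txbr_write() returns -EAGAIN when avail <= total rather
than avail < total: the ring is never filled completely, so
windex == rindex unambiguously means "empty" in vmbus_br_availwrite().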
tools/hv/vmbus_bufring.h (+158 lines):
/* SPDX-License-Identifier: BSD-3-Clause */

#ifndef _VMBUS_BUF_H_
#define _VMBUS_BUF_H_

#include <stdbool.h>
#include <stdint.h>

#define __packed	__attribute__((__packed__))
#define unlikely(x)	__builtin_expect(!!(x), 0)

#define ICMSGHDRFLAG_TRANSACTION	1
#define ICMSGHDRFLAG_REQUEST		2
#define ICMSGHDRFLAG_RESPONSE		4

#define IC_VERSION_NEGOTIATION_MAX_VER_COUNT 100
#define ICMSG_HDR (sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr))
#define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \
	(ICMSG_HDR + sizeof(struct icmsg_negotiate) + \
	 (((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version)))

/*
 * Channel packets
 */

/* Channel packet types */
#define VMBUS_CHANPKT_TYPE_INBAND	0x0006
#define VMBUS_CHANPKT_TYPE_RXBUF	0x0007
#define VMBUS_CHANPKT_TYPE_GPA		0x0009
#define VMBUS_CHANPKT_TYPE_COMP		0x000b

/* Channel packet flags */
#define VMBUS_CHANPKT_FLAG_NONE		0
#define VMBUS_CHANPKT_FLAG_RC		0x0001	/* report completion */

#define VMBUS_CHANPKT_SIZE_SHIFT	3
#define VMBUS_CHANPKT_SIZE_ALIGN	(1 << VMBUS_CHANPKT_SIZE_SHIFT)
#define VMBUS_CHANPKT_HLEN_MIN		\
	(sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT)

/*
 * Buffer ring
 */
struct vmbus_bufring {
	volatile uint32_t windex;
	volatile uint32_t rindex;

	/*
	 * Interrupt mask {0,1}
	 *
	 * For the TX bufring, the host sets this to 1 when it is
	 * processing the TX bufring, so that we can safely skip the
	 * TX event notification to the host.
	 *
	 * For the RX bufring, once this is set to 1 by us, the host
	 * will not further dispatch interrupts to us, even if there
	 * is data pending on the RX bufring. This effectively disables
	 * the interrupt of the channel to which this RX bufring is
	 * attached.
	 */
	volatile uint32_t imask;

	/*
	 * Win8 uses some of the reserved bits to implement
	 * interrupt driven flow management. On the send side
	 * we can request that the receiver interrupt the sender
	 * when the ring transitions from being full to being able
	 * to handle a message of size "pending_send_sz".
	 *
	 * Add necessary state for this enhancement.
	 */
	volatile uint32_t pending_send;
	uint32_t reserved1[12];

	union {
		struct {
			uint32_t feat_pending_send_sz:1;
		};
		uint32_t value;
	} feature_bits;

	/* Pad to a 4096-byte page so that ring data starts on a page boundary */
	uint8_t reserved2[4028];

	/*
	 * Ring data starts here + RingDataStartOffset
	 * !!! DO NOT place any fields below this !!!
	 */
	uint8_t data[];
} __packed;

struct vmbus_br {
	struct vmbus_bufring *vbr;
	uint32_t dsize;
	uint32_t windex;	/* next available location */
};

struct vmbus_chanpkt_hdr {
	uint16_t type;		/* VMBUS_CHANPKT_TYPE_ */
	uint16_t hlen;		/* header len, in 8 bytes */
	uint16_t tlen;		/* total len, in 8 bytes */
	uint16_t flags;		/* VMBUS_CHANPKT_FLAG_ */
	uint64_t xactid;
} __packed;

struct vmbus_chanpkt {
	struct vmbus_chanpkt_hdr hdr;
} __packed;

struct vmbuspipe_hdr {
	unsigned int flags;
	unsigned int msgsize;
} __packed;

struct ic_version {
	unsigned short major;
	unsigned short minor;
} __packed;

struct icmsg_negotiate {
	unsigned short icframe_vercnt;
	unsigned short icmsg_vercnt;
	unsigned int reserved;
	struct ic_version icversion_data[];	/* any size array */
} __packed;

struct icmsg_hdr {
	struct ic_version icverframe;
	unsigned short icmsgtype;
	struct ic_version icvermsg;
	unsigned short icmsgsize;
	unsigned int status;
	unsigned char ictransaction_id;
	unsigned char icflags;
	unsigned char reserved[2];
} __packed;

int rte_vmbus_chan_recv_raw(struct vmbus_br *rxbr, void *data, uint32_t *len);
int rte_vmbus_chan_send(struct vmbus_br *txbr, uint16_t type, void *data,
			uint32_t dlen, uint32_t flags);
void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen);
void *vmbus_uio_map(int *fd, int size);

/* Amount of space available for write */
static inline uint32_t vmbus_br_availwrite(const struct vmbus_br *br, uint32_t windex)
{
	uint32_t rindex = br->vbr->rindex;

	if (windex >= rindex)
		return br->dsize - (windex - rindex);
	else
		return rindex - windex;
}

static inline uint32_t vmbus_br_availread(const struct vmbus_br *br)
{
	return br->dsize - vmbus_br_availwrite(br, br->vbr->windex);
}

#endif	/* !_VMBUS_BUF_H_ */
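
As a worked example of the quadword length encoding above (the 26-byte
payload is an arbitrary illustration), these are the header fields
rte_vmbus_chan_send() derives for a packet:

#include <stdint.h>
#include <stdio.h>
#include "vmbus_bufring.h"

int main(void)
{
	uint32_t hlen = sizeof(struct vmbus_chanpkt_hdr);	/* 16 bytes */
	uint32_t dlen = 26;					/* example payload size */
	uint32_t pktlen = hlen + dlen;				/* 42 bytes */
	/* Round up to the 8-byte granularity all ring packets use */
	uint32_t pad_pktlen = (pktlen + 7) & ~7u;		/* 48 bytes */

	printf("hdr.hlen = %u quadwords\n", hlen >> VMBUS_CHANPKT_SIZE_SHIFT);	/* 2 */
	printf("hdr.tlen = %u quadwords\n", pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT);	/* 6 */
	printf("pad bytes written to the ring = %u\n", pad_pktlen - pktlen);	/* 6 */
	return 0;
}

On the ring the packet therefore occupies tlen * 8 = 48 bytes, followed
by the 8-byte write-offset tag that vmbus_txbr_write() appends.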