Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

vringh: host-side implementation of virtio rings.

Getting the use of virtio rings correct is tricky, and a recent patch
added an implementation of in-kernel rings (separate from the
userspace one).

This abstracts the business of dealing with the virtio ring layout
from the access (userspace or direct); to do this, we use function
pointers, which gcc inlines correctly.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
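
[Editor's sketch] The function-pointer pattern the message describes is easy to see in miniature. The snippet below is purely illustrative and not from the patch (all demo_* names are invented): a generic helper takes its u16 accessor as a function pointer, and each caller passes a fixed static function, so gcc can resolve and inline the indirect call at compile time.

	/* Hypothetical illustration of the accessor-pointer pattern: the
	 * generic core takes a getu16 callback; each wrapper passes a known
	 * static function, which gcc inlines at the call site. */
	#include <stdio.h>
	#include <stdint.h>

	static inline int demo_getu16_direct(uint16_t *val, const uint16_t *p)
	{
		*val = *p;	/* kernel-space flavour: plain dereference */
		return 0;
	}

	static inline int demo_peek(const uint16_t *idx,
				    int (*getu16)(uint16_t *, const uint16_t *))
	{
		uint16_t v;
		int err = getu16(&v, idx);	/* indirect call, inlined when known */
		return err ? err : v;
	}

	int main(void)
	{
		uint16_t idx = 3;
		/* Compiles to a direct load: the callback is a compile-time constant. */
		printf("%d\n", demo_peek(&idx, demo_getu16_direct));
		return 0;
	}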

6 files changed, 1215 insertions(+), 1 deletion(-)
drivers/Makefile | +1 -1
 obj-$(CONFIG_OF)		+= of/
 obj-$(CONFIG_SSB)		+= ssb/
 obj-$(CONFIG_BCMA)		+= bcma/
-obj-$(CONFIG_VHOST_NET)	+= vhost/
+obj-$(CONFIG_VHOST_RING)	+= vhost/
 obj-$(CONFIG_VLYNQ)		+= vlynq/
 obj-$(CONFIG_STAGING)		+= staging/
 obj-y				+= platform/
drivers/vhost/Kconfig | +8
 config VHOST_NET
 	tristate "Host kernel accelerator for virtio net"
 	depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
+	select VHOST_RING
 	---help---
 	  This kernel module can be loaded in host kernel to accelerate
 	  guest networking with virtio_net. Not to be confused with virtio_net
···
 if STAGING
 source "drivers/vhost/Kconfig.tcm"
 endif
+
+config VHOST_RING
+	tristate
+	---help---
+	  This option is selected by any driver which needs to access
+	  the host side of a virtio ring.
+
drivers/vhost/Kconfig.tcm | +1
 config TCM_VHOST
 	tristate "TCM_VHOST fabric module"
 	depends on TARGET_CORE && EVENTFD && m
+	select VHOST_RING
 	default n
 	---help---
 	  Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests
drivers/vhost/Makefile | +2
 vhost_net-y := vhost.o net.o
 
 obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
+
+obj-$(CONFIG_VHOST_RING) += vringh.o
drivers/vhost/vringh.c | +1007 (new file)
/*
 * Helpers for the host side of a virtio ring.
 *
 * Since these may be in userspace, we use (inline) accessors.
 */
#include <linux/vringh.h>
#include <linux/virtio_ring.h>
#include <linux/kernel.h>
#include <linux/ratelimit.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/export.h>

static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
{
	static DEFINE_RATELIMIT_STATE(vringh_rs,
				      DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	if (__ratelimit(&vringh_rs)) {
		va_list ap;
		va_start(ap, fmt);
		printk(KERN_NOTICE "vringh:");
		vprintk(fmt, ap);
		va_end(ap);
	}
}

/* Returns vring->num if empty, -ve on error. */
static inline int __vringh_get_head(const struct vringh *vrh,
				    int (*getu16)(u16 *val, const u16 *p),
				    u16 *last_avail_idx)
{
	u16 avail_idx, i, head;
	int err;

	err = getu16(&avail_idx, &vrh->vring.avail->idx);
	if (err) {
		vringh_bad("Failed to access avail idx at %p",
			   &vrh->vring.avail->idx);
		return err;
	}

	if (*last_avail_idx == avail_idx)
		return vrh->vring.num;

	/* Only get avail ring entries after they have been exposed by guest. */
	virtio_rmb(vrh->weak_barriers);

	i = *last_avail_idx & (vrh->vring.num - 1);

	err = getu16(&head, &vrh->vring.avail->ring[i]);
	if (err) {
		vringh_bad("Failed to read head: idx %d address %p",
			   *last_avail_idx, &vrh->vring.avail->ring[i]);
		return err;
	}

	if (head >= vrh->vring.num) {
		vringh_bad("Guest says index %u > %u is available",
			   head, vrh->vring.num);
		return -EINVAL;
	}

	(*last_avail_idx)++;
	return head;
}

/* Copy some bytes to/from the iovec.  Returns num copied. */
static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
				      void *ptr, size_t len,
				      int (*xfer)(void *addr, void *ptr,
						  size_t len))
{
	int err, done = 0;

	while (len && iov->i < iov->used) {
		size_t partlen;

		partlen = min(iov->iov[iov->i].iov_len, len);
		err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
		if (err)
			return err;
		done += partlen;
		len -= partlen;
		ptr += partlen;
		iov->consumed += partlen;
		iov->iov[iov->i].iov_len -= partlen;
		iov->iov[iov->i].iov_base += partlen;

		if (!iov->iov[iov->i].iov_len) {
			/* Fix up old iov element then increment. */
			iov->iov[iov->i].iov_len = iov->consumed;
			iov->iov[iov->i].iov_base -= iov->consumed;

			iov->consumed = 0;
			iov->i++;
		}
	}
	return done;
}

/* May reduce *len if range is shorter. */
static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len,
			       struct vringh_range *range,
			       bool (*getrange)(struct vringh *,
						u64, struct vringh_range *))
{
	if (addr < range->start || addr > range->end_incl) {
		if (!getrange(vrh, addr, range))
			return false;
	}
	BUG_ON(addr < range->start || addr > range->end_incl);

	/* To end of memory? */
	if (unlikely(addr + *len == 0)) {
		if (range->end_incl == -1ULL)
			return true;
		goto truncate;
	}

	/* Otherwise, don't wrap. */
	if (addr + *len < addr) {
		vringh_bad("Wrapping descriptor %zu@0x%llx",
			   *len, (unsigned long long)addr);
		return false;
	}

	if (unlikely(addr + *len - 1 > range->end_incl))
		goto truncate;
	return true;

truncate:
	*len = range->end_incl + 1 - addr;
	return true;
}

static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len,
				  struct vringh_range *range,
				  bool (*getrange)(struct vringh *,
						   u64, struct vringh_range *))
{
	return true;
}

/* No reason for this code to be inline. */
static int move_to_indirect(int *up_next, u16 *i, void *addr,
			    const struct vring_desc *desc,
			    struct vring_desc **descs, int *desc_max)
{
	/* Indirect tables can't have indirect. */
	if (*up_next != -1) {
		vringh_bad("Multilevel indirect %u->%u", *up_next, *i);
		return -EINVAL;
	}

	if (unlikely(desc->len % sizeof(struct vring_desc))) {
		vringh_bad("Strange indirect len %u", desc->len);
		return -EINVAL;
	}

	/* We will check this when we follow it! */
	if (desc->flags & VRING_DESC_F_NEXT)
		*up_next = desc->next;
	else
		*up_next = -2;
	*descs = addr;
	*desc_max = desc->len / sizeof(struct vring_desc);

	/* Now, start at the first indirect. */
	*i = 0;
	return 0;
}

static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
{
	struct kvec *new;
	unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2;

	if (new_num < 8)
		new_num = 8;

	flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
	if (flag)
		new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp);
	else {
		new = kmalloc(new_num * sizeof(struct iovec), gfp);
		if (new) {
			memcpy(new, iov->iov,
			       iov->max_num * sizeof(struct iovec));
			flag = VRINGH_IOV_ALLOCATED;
		}
	}
	if (!new)
		return -ENOMEM;
	iov->iov = new;
	iov->max_num = (new_num | flag);
	return 0;
}

static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
				       struct vring_desc **descs, int *desc_max)
{
	u16 i = *up_next;

	*up_next = -1;
	*descs = vrh->vring.desc;
	*desc_max = vrh->vring.num;
	return i;
}

static int slow_copy(struct vringh *vrh, void *dst, const void *src,
		     bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
				    struct vringh_range *range,
				    bool (*getrange)(struct vringh *vrh,
						     u64,
						     struct vringh_range *)),
		     bool (*getrange)(struct vringh *vrh,
				      u64 addr,
				      struct vringh_range *r),
		     struct vringh_range *range,
		     int (*copy)(void *dst, const void *src, size_t len))
{
	size_t part, len = sizeof(struct vring_desc);

	do {
		u64 addr;
		int err;

		part = len;
		addr = (u64)(unsigned long)src - range->offset;

		if (!rcheck(vrh, addr, &part, range, getrange))
			return -EINVAL;

		err = copy(dst, src, part);
		if (err)
			return err;

		dst += part;
		src += part;
		len -= part;
	} while (len);
	return 0;
}

static inline int
__vringh_iov(struct vringh *vrh, u16 i,
	     struct vringh_kiov *riov,
	     struct vringh_kiov *wiov,
	     bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
			    struct vringh_range *range,
			    bool (*getrange)(struct vringh *, u64,
					     struct vringh_range *)),
	     bool (*getrange)(struct vringh *, u64, struct vringh_range *),
	     gfp_t gfp,
	     int (*copy)(void *dst, const void *src, size_t len))
{
	int err, count = 0, up_next, desc_max;
	struct vring_desc desc, *descs;
	struct vringh_range range = { -1ULL, 0 }, slowrange;
	bool slow = false;

	/* We start traversing vring's descriptor table. */
	descs = vrh->vring.desc;
	desc_max = vrh->vring.num;
	up_next = -1;

	if (riov)
		riov->i = riov->used = 0;
	else if (wiov)
		wiov->i = wiov->used = 0;
	else
		/* You must want something! */
		BUG();

	for (;;) {
		void *addr;
		struct vringh_kiov *iov;
		size_t len;

		if (unlikely(slow))
			err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
					&slowrange, copy);
		else
			err = copy(&desc, &descs[i], sizeof(desc));
		if (unlikely(err))
			goto fail;

		if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
			/* Make sure it's OK, and get offset. */
			len = desc.len;
			if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
				err = -EINVAL;
				goto fail;
			}

			if (unlikely(len != desc.len)) {
				slow = true;
				/* We need to save this range to use offset */
				slowrange = range;
			}

			addr = (void *)(long)(desc.addr + range.offset);
			err = move_to_indirect(&up_next, &i, addr, &desc,
					       &descs, &desc_max);
			if (err)
				goto fail;
			continue;
		}

		if (count++ == vrh->vring.num) {
			vringh_bad("Descriptor loop in %p", descs);
			err = -ELOOP;
			goto fail;
		}

		if (desc.flags & VRING_DESC_F_WRITE)
			iov = wiov;
		else {
			iov = riov;
			if (unlikely(wiov && wiov->i)) {
				vringh_bad("Readable desc %p after writable",
					   &descs[i]);
				err = -EINVAL;
				goto fail;
			}
		}

		if (!iov) {
			vringh_bad("Unexpected %s desc",
				   !wiov ? "writable" : "readable");
			err = -EPROTO;
			goto fail;
		}

	again:
		/* Make sure it's OK, and get offset. */
		len = desc.len;
		if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
			err = -EINVAL;
			goto fail;
		}
		addr = (void *)(unsigned long)(desc.addr + range.offset);

		if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) {
			err = resize_iovec(iov, gfp);
			if (err)
				goto fail;
		}

		iov->iov[iov->used].iov_base = addr;
		iov->iov[iov->used].iov_len = len;
		iov->used++;

		if (unlikely(len != desc.len)) {
			desc.len -= len;
			desc.addr += len;
			goto again;
		}

		if (desc.flags & VRING_DESC_F_NEXT) {
			i = desc.next;
		} else {
			/* Just in case we need to finish traversing above. */
			if (unlikely(up_next > 0)) {
				i = return_from_indirect(vrh, &up_next,
							 &descs, &desc_max);
				slow = false;
			} else
				break;
		}

		if (i >= desc_max) {
			vringh_bad("Chained index %u > %u", i, desc_max);
			err = -EINVAL;
			goto fail;
		}
	}

	return 0;

fail:
	return err;
}

static inline int __vringh_complete(struct vringh *vrh,
				    const struct vring_used_elem *used,
				    unsigned int num_used,
				    int (*putu16)(u16 *p, u16 val),
				    int (*putused)(struct vring_used_elem *dst,
						   const struct vring_used_elem
						   *src, unsigned num))
{
	struct vring_used *used_ring;
	int err;
	u16 used_idx, off;

	used_ring = vrh->vring.used;
	used_idx = vrh->last_used_idx + vrh->completed;

	off = used_idx % vrh->vring.num;

	/* Compiler knows num_used == 1 sometimes, hence extra check */
	if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
		u16 part = vrh->vring.num - off;
		err = putused(&used_ring->ring[off], used, part);
		if (!err)
			err = putused(&used_ring->ring[0], used + part,
				      num_used - part);
	} else
		err = putused(&used_ring->ring[off], used, num_used);

	if (err) {
		vringh_bad("Failed to write %u used entries %u at %p",
			   num_used, off, &used_ring->ring[off]);
		return err;
	}

	/* Make sure buffer is written before we update index. */
	virtio_wmb(vrh->weak_barriers);

	err = putu16(&vrh->vring.used->idx, used_idx + num_used);
	if (err) {
		vringh_bad("Failed to update used index at %p",
			   &vrh->vring.used->idx);
		return err;
	}

	vrh->completed += num_used;
	return 0;
}

static inline int __vringh_need_notify(struct vringh *vrh,
				       int (*getu16)(u16 *val, const u16 *p))
{
	bool notify;
	u16 used_event;
	int err;

	/* Flush out used index update. This is paired with the
	 * barrier that the Guest executes when enabling
	 * interrupts. */
	virtio_mb(vrh->weak_barriers);

	/* Old-style, without event indices. */
	if (!vrh->event_indices) {
		u16 flags;
		err = getu16(&flags, &vrh->vring.avail->flags);
		if (err) {
			vringh_bad("Failed to get flags at %p",
				   &vrh->vring.avail->flags);
			return err;
		}
		return (!(flags & VRING_AVAIL_F_NO_INTERRUPT));
	}

	/* Modern: we know when other side wants to know. */
	err = getu16(&used_event, &vring_used_event(&vrh->vring));
	if (err) {
		vringh_bad("Failed to get used event idx at %p",
			   &vring_used_event(&vrh->vring));
		return err;
	}

	/* Just in case we added so many that we wrap. */
	if (unlikely(vrh->completed > 0xffff))
		notify = true;
	else
		notify = vring_need_event(used_event,
					  vrh->last_used_idx + vrh->completed,
					  vrh->last_used_idx);

	vrh->last_used_idx += vrh->completed;
	vrh->completed = 0;
	return notify;
}

static inline bool __vringh_notify_enable(struct vringh *vrh,
					  int (*getu16)(u16 *val, const u16 *p),
					  int (*putu16)(u16 *p, u16 val))
{
	u16 avail;

	if (!vrh->event_indices) {
		/* Old-school; update flags. */
		if (putu16(&vrh->vring.used->flags, 0) != 0) {
			vringh_bad("Clearing used flags %p",
				   &vrh->vring.used->flags);
			return true;
		}
	} else {
		if (putu16(&vring_avail_event(&vrh->vring),
			   vrh->last_avail_idx) != 0) {
			vringh_bad("Updating avail event index %p",
				   &vring_avail_event(&vrh->vring));
			return true;
		}
	}

	/* They could have slipped one in as we were doing that: make
	 * sure it's written, then check again. */
	virtio_mb(vrh->weak_barriers);

	if (getu16(&avail, &vrh->vring.avail->idx) != 0) {
		vringh_bad("Failed to check avail idx at %p",
			   &vrh->vring.avail->idx);
		return true;
	}

	/* This is unlikely, so we just leave notifications enabled
	 * (if we're using event_indices, we'll only get one
	 * notification anyway). */
	return avail == vrh->last_avail_idx;
}

static inline void __vringh_notify_disable(struct vringh *vrh,
					   int (*putu16)(u16 *p, u16 val))
{
	if (!vrh->event_indices) {
		/* Old-school; update flags. */
		if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) {
			vringh_bad("Setting used flags %p",
				   &vrh->vring.used->flags);
		}
	}
}

/* Userspace access helpers: in this case, addresses are really userspace. */
static inline int getu16_user(u16 *val, const u16 *p)
{
	return get_user(*val, (__force u16 __user *)p);
}

static inline int putu16_user(u16 *p, u16 val)
{
	return put_user(val, (__force u16 __user *)p);
}

static inline int copydesc_user(void *dst, const void *src, size_t len)
{
	return copy_from_user(dst, (__force void __user *)src, len) ?
		-EFAULT : 0;
}

static inline int putused_user(struct vring_used_elem *dst,
			       const struct vring_used_elem *src,
			       unsigned int num)
{
	return copy_to_user((__force void __user *)dst, src,
			    sizeof(*dst) * num) ? -EFAULT : 0;
}

static inline int xfer_from_user(void *src, void *dst, size_t len)
{
	return copy_from_user(dst, (__force void __user *)src, len) ?
		-EFAULT : 0;
}

static inline int xfer_to_user(void *dst, void *src, size_t len)
{
	return copy_to_user((__force void __user *)dst, src, len) ?
		-EFAULT : 0;
}

/**
 * vringh_init_user - initialize a vringh for a userspace vring.
 * @vrh: the vringh to initialize.
 * @features: the feature bits for this ring.
 * @num: the number of elements.
 * @weak_barriers: true if we only need memory barriers, not I/O.
 * @desc: the userspace descriptor pointer.
 * @avail: the userspace avail pointer.
 * @used: the userspace used pointer.
 *
 * Returns an error if num is invalid: you should check pointers
 * yourself!
 */
int vringh_init_user(struct vringh *vrh, u32 features,
		     unsigned int num, bool weak_barriers,
		     struct vring_desc __user *desc,
		     struct vring_avail __user *avail,
		     struct vring_used __user *used)
{
	/* Sane power of 2 please! */
	if (!num || num > 0xffff || (num & (num - 1))) {
		vringh_bad("Bad ring size %u", num);
		return -EINVAL;
	}

	vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
	vrh->weak_barriers = weak_barriers;
	vrh->completed = 0;
	vrh->last_avail_idx = 0;
	vrh->last_used_idx = 0;
	vrh->vring.num = num;
	/* vring expects kernel addresses, but only used via accessors. */
	vrh->vring.desc = (__force struct vring_desc *)desc;
	vrh->vring.avail = (__force struct vring_avail *)avail;
	vrh->vring.used = (__force struct vring_used *)used;
	return 0;
}
EXPORT_SYMBOL(vringh_init_user);

/**
 * vringh_getdesc_user - get next available descriptor from userspace ring.
 * @vrh: the userspace vring.
 * @riov: where to put the readable descriptors (or NULL)
 * @wiov: where to put the writable descriptors (or NULL)
 * @getrange: function to call to check ranges.
 * @head: head index we received, for passing to vringh_complete_user().
 *
 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
 *
 * Note that on error return, you can tell the difference between an
 * invalid ring and a single invalid descriptor: in the former case,
 * *head will be vrh->vring.num.  You may be able to ignore an invalid
 * descriptor, but there's not much you can do with an invalid ring.
 *
 * Note that you may need to clean up riov and wiov, even on error!
 */
int vringh_getdesc_user(struct vringh *vrh,
			struct vringh_iov *riov,
			struct vringh_iov *wiov,
			bool (*getrange)(struct vringh *vrh,
					 u64 addr, struct vringh_range *r),
			u16 *head)
{
	int err;

	*head = vrh->vring.num;
	err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx);
	if (err < 0)
		return err;

	/* Empty... */
	if (err == vrh->vring.num)
		return 0;

	/* We need the layouts to be identical for this to work */
	BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov));
	BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) !=
		     offsetof(struct vringh_iov, iov));
	BUILD_BUG_ON(offsetof(struct vringh_kiov, i) !=
		     offsetof(struct vringh_iov, i));
	BUILD_BUG_ON(offsetof(struct vringh_kiov, used) !=
		     offsetof(struct vringh_iov, used));
	BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) !=
		     offsetof(struct vringh_iov, max_num));
	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
	BUILD_BUG_ON(offsetof(struct iovec, iov_base) !=
		     offsetof(struct kvec, iov_base));
	BUILD_BUG_ON(offsetof(struct iovec, iov_len) !=
		     offsetof(struct kvec, iov_len));
	BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base)
		     != sizeof(((struct kvec *)NULL)->iov_base));
	BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len)
		     != sizeof(((struct kvec *)NULL)->iov_len));

	*head = err;
	err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
			   (struct vringh_kiov *)wiov,
			   range_check, getrange, GFP_KERNEL, copydesc_user);
	if (err)
		return err;

	return 1;
}
EXPORT_SYMBOL(vringh_getdesc_user);

/**
 * vringh_iov_pull_user - copy bytes from vringh_iov.
 * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume)
 * @dst: the place to copy.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
{
	return vringh_iov_xfer((struct vringh_kiov *)riov,
			       dst, len, xfer_from_user);
}
EXPORT_SYMBOL(vringh_iov_pull_user);

/**
 * vringh_iov_push_user - copy bytes into vringh_iov.
 * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
 * @src: the place to copy from.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
			     const void *src, size_t len)
{
	return vringh_iov_xfer((struct vringh_kiov *)wiov,
			       (void *)src, len, xfer_to_user);
}
EXPORT_SYMBOL(vringh_iov_push_user);

/**
 * vringh_abandon_user - we've decided not to handle the descriptor(s).
 * @vrh: the vring.
 * @num: the number of descriptors to put back (ie. num
 *	 vringh_getdesc_user() calls to undo).
 *
 * The next vringh_getdesc_user() will return the old descriptor(s) again.
 */
void vringh_abandon_user(struct vringh *vrh, unsigned int num)
{
	/* We only update vring_avail_event(vr) when we want to be notified,
	 * so we haven't changed that yet. */
	vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_user);

/**
 * vringh_complete_user - we've finished with descriptor, publish it.
 * @vrh: the vring.
 * @head: the head as filled in by vringh_getdesc_user.
 * @len: the length of data we have written.
 *
 * You should check vringh_need_notify_user() after one or more calls
 * to this function.
 */
int vringh_complete_user(struct vringh *vrh, u16 head, u32 len)
{
	struct vring_used_elem used;

	used.id = head;
	used.len = len;
	return __vringh_complete(vrh, &used, 1, putu16_user, putused_user);
}
EXPORT_SYMBOL(vringh_complete_user);

/**
 * vringh_complete_multi_user - we've finished with many descriptors.
 * @vrh: the vring.
 * @used: the head, length pairs.
 * @num_used: the number of used elements.
 *
 * You should check vringh_need_notify_user() after one or more calls
 * to this function.
 */
int vringh_complete_multi_user(struct vringh *vrh,
			       const struct vring_used_elem used[],
			       unsigned num_used)
{
	return __vringh_complete(vrh, used, num_used,
				 putu16_user, putused_user);
}
EXPORT_SYMBOL(vringh_complete_multi_user);

/**
 * vringh_notify_enable_user - we want to know if something changes.
 * @vrh: the vring.
 *
 * This always enables notifications, but returns false if there are
 * now more buffers available in the vring.
 */
bool vringh_notify_enable_user(struct vringh *vrh)
{
	return __vringh_notify_enable(vrh, getu16_user, putu16_user);
}
EXPORT_SYMBOL(vringh_notify_enable_user);

/**
 * vringh_notify_disable_user - don't tell us if something changes.
 * @vrh: the vring.
 *
 * This is our normal running state: we disable and then only enable when
 * we're going to sleep.
 */
void vringh_notify_disable_user(struct vringh *vrh)
{
	__vringh_notify_disable(vrh, putu16_user);
}
EXPORT_SYMBOL(vringh_notify_disable_user);

/**
 * vringh_need_notify_user - must we tell the other side about used buffers?
 * @vrh: the vring we've called vringh_complete_user() on.
 *
 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
 */
int vringh_need_notify_user(struct vringh *vrh)
{
	return __vringh_need_notify(vrh, getu16_user);
}
EXPORT_SYMBOL(vringh_need_notify_user);

/* Kernelspace access helpers. */
static inline int getu16_kern(u16 *val, const u16 *p)
{
	*val = ACCESS_ONCE(*p);
	return 0;
}

static inline int putu16_kern(u16 *p, u16 val)
{
	ACCESS_ONCE(*p) = val;
	return 0;
}

static inline int copydesc_kern(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
	return 0;
}

static inline int putused_kern(struct vring_used_elem *dst,
			       const struct vring_used_elem *src,
			       unsigned int num)
{
	memcpy(dst, src, num * sizeof(*dst));
	return 0;
}

static inline int xfer_kern(void *src, void *dst, size_t len)
{
	memcpy(dst, src, len);
	return 0;
}

/**
 * vringh_init_kern - initialize a vringh for a kernelspace vring.
 * @vrh: the vringh to initialize.
 * @features: the feature bits for this ring.
 * @num: the number of elements.
 * @weak_barriers: true if we only need memory barriers, not I/O.
 * @desc: the kernelspace descriptor pointer.
 * @avail: the kernelspace avail pointer.
 * @used: the kernelspace used pointer.
 *
 * Returns an error if num is invalid.
 */
int vringh_init_kern(struct vringh *vrh, u32 features,
		     unsigned int num, bool weak_barriers,
		     struct vring_desc *desc,
		     struct vring_avail *avail,
		     struct vring_used *used)
{
	/* Sane power of 2 please! */
	if (!num || num > 0xffff || (num & (num - 1))) {
		vringh_bad("Bad ring size %u", num);
		return -EINVAL;
	}

	vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
	vrh->weak_barriers = weak_barriers;
	vrh->completed = 0;
	vrh->last_avail_idx = 0;
	vrh->last_used_idx = 0;
	vrh->vring.num = num;
	vrh->vring.desc = desc;
	vrh->vring.avail = avail;
	vrh->vring.used = used;
	return 0;
}
EXPORT_SYMBOL(vringh_init_kern);

/**
 * vringh_getdesc_kern - get next available descriptor from kernelspace ring.
 * @vrh: the kernelspace vring.
 * @riov: where to put the readable descriptors (or NULL)
 * @wiov: where to put the writable descriptors (or NULL)
 * @head: head index we received, for passing to vringh_complete_kern().
 * @gfp: flags for allocating larger riov/wiov.
 *
 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
 *
 * Note that on error return, you can tell the difference between an
 * invalid ring and a single invalid descriptor: in the former case,
 * *head will be vrh->vring.num.  You may be able to ignore an invalid
 * descriptor, but there's not much you can do with an invalid ring.
 *
 * Note that you may need to clean up riov and wiov, even on error!
 */
int vringh_getdesc_kern(struct vringh *vrh,
			struct vringh_kiov *riov,
			struct vringh_kiov *wiov,
			u16 *head,
			gfp_t gfp)
{
	int err;

	err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx);
	if (err < 0)
		return err;

	/* Empty... */
	if (err == vrh->vring.num)
		return 0;

	*head = err;
	err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
			   gfp, copydesc_kern);
	if (err)
		return err;

	return 1;
}
EXPORT_SYMBOL(vringh_getdesc_kern);

/**
 * vringh_iov_pull_kern - copy bytes from vringh_iov.
 * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
 * @dst: the place to copy.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
{
	return vringh_iov_xfer(riov, dst, len, xfer_kern);
}
EXPORT_SYMBOL(vringh_iov_pull_kern);

/**
 * vringh_iov_push_kern - copy bytes into vringh_iov.
 * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
 * @src: the place to copy from.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
			     const void *src, size_t len)
{
	return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern);
}
EXPORT_SYMBOL(vringh_iov_push_kern);

/**
 * vringh_abandon_kern - we've decided not to handle the descriptor(s).
 * @vrh: the vring.
 * @num: the number of descriptors to put back (ie. num
 *	 vringh_getdesc_kern() calls to undo).
 *
 * The next vringh_getdesc_kern() will return the old descriptor(s) again.
 */
void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
{
	/* We only update vring_avail_event(vr) when we want to be notified,
	 * so we haven't changed that yet. */
	vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_kern);

/**
 * vringh_complete_kern - we've finished with descriptor, publish it.
 * @vrh: the vring.
 * @head: the head as filled in by vringh_getdesc_kern.
 * @len: the length of data we have written.
 *
 * You should check vringh_need_notify_kern() after one or more calls
 * to this function.
 */
int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len)
{
	struct vring_used_elem used;

	used.id = head;
	used.len = len;

	return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern);
}
EXPORT_SYMBOL(vringh_complete_kern);

/**
 * vringh_notify_enable_kern - we want to know if something changes.
 * @vrh: the vring.
 *
 * This always enables notifications, but returns false if there are
 * now more buffers available in the vring.
 */
bool vringh_notify_enable_kern(struct vringh *vrh)
{
	return __vringh_notify_enable(vrh, getu16_kern, putu16_kern);
}
EXPORT_SYMBOL(vringh_notify_enable_kern);

/**
 * vringh_notify_disable_kern - don't tell us if something changes.
 * @vrh: the vring.
 *
 * This is our normal running state: we disable and then only enable when
 * we're going to sleep.
 */
void vringh_notify_disable_kern(struct vringh *vrh)
{
	__vringh_notify_disable(vrh, putu16_kern);
}
EXPORT_SYMBOL(vringh_notify_disable_kern);

/**
 * vringh_need_notify_kern - must we tell the other side about used buffers?
 * @vrh: the vring we've called vringh_complete_kern() on.
 *
 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
 */
int vringh_need_notify_kern(struct vringh *vrh)
{
	return __vringh_need_notify(vrh, getu16_kern);
}
EXPORT_SYMBOL(vringh_need_notify_kern);
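
[Editor's sketch] For orientation, here is a hedged sketch of how a driver might use the kernel-side half of this API to fill a guest-writable chain. It is not part of the patch: the ring is assumed to be already initialized with vringh_init_kern(), and demo_produce is an invented name.

	/* Hypothetical producer: fill the next chain's writable part and
	 * publish it.  The caller owns the scratch kvec array. */
	#include <linux/vringh.h>
	#include <linux/kernel.h>

	static int demo_produce(struct vringh *vrh, const void *data, size_t len)
	{
		struct kvec kvec[8];
		struct vringh_kiov wiov;
		u16 head;
		ssize_t pushed;
		int err;

		vringh_kiov_init(&wiov, kvec, ARRAY_SIZE(kvec));

		/* 1 = got a chain, 0 = ring empty, <0 = bad ring/descriptor. */
		err = vringh_getdesc_kern(vrh, NULL, &wiov, &head, GFP_KERNEL);
		if (err <= 0)
			goto out;

		pushed = vringh_iov_push_kern(&wiov, data, len);
		if (pushed < 0) {
			err = pushed;
			goto out;
		}

		/* Publish the used entry, then see if the other side needs a kick. */
		err = vringh_complete_kern(vrh, head, pushed);
		if (!err && vringh_need_notify_kern(vrh) > 0) {
			/* notify the other side here, e.g. signal its eventfd */
		}
	out:
		vringh_kiov_cleanup(&wiov);
		return err;
	}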
include/linux/vringh.h | +196 (new file)
/*
 * Linux host-side vring helpers; for when the kernel needs to access
 * someone else's vring.
 *
 * Copyright IBM Corporation, 2013.
 * Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Written by: Rusty Russell <rusty@rustcorp.com.au>
 */
#ifndef _LINUX_VRINGH_H
#define _LINUX_VRINGH_H
#include <uapi/linux/virtio_ring.h>
#include <linux/uio.h>
#include <linux/slab.h>
#include <asm/barrier.h>

/* virtio_ring with information needed for host access. */
struct vringh {
	/* Guest publishes used event idx (note: we always do). */
	bool event_indices;

	/* Can we get away with weak barriers? */
	bool weak_barriers;

	/* Last available index we saw (ie. where we're up to). */
	u16 last_avail_idx;

	/* Last index we used. */
	u16 last_used_idx;

	/* How many descriptors we've completed since last need_notify(). */
	u32 completed;

	/* The vring (note: it may contain user pointers!) */
	struct vring vring;
};

/* The memory the vring can access, and what offset to apply. */
struct vringh_range {
	u64 start, end_incl;
	u64 offset;
};

/**
 * struct vringh_iov - iovec mangler.
 *
 * Mangles iovec in place, and restores it.
 * Remaining data is iov + i, of used - i elements.
 */
struct vringh_iov {
	struct iovec *iov;
	size_t consumed; /* Within iov[i] */
	unsigned i, used, max_num;
};

/**
 * struct vringh_kiov - kvec mangler.
 *
 * Mangles kvec in place, and restores it.
 * Remaining data is iov + i, of used - i elements.
 */
struct vringh_kiov {
	struct kvec *iov;
	size_t consumed; /* Within iov[i] */
	unsigned i, used, max_num;
};

/* Flag on max_num to indicate we're kmalloced. */
#define VRINGH_IOV_ALLOCATED 0x8000000

/* Helpers for userspace vrings. */
int vringh_init_user(struct vringh *vrh, u32 features,
		     unsigned int num, bool weak_barriers,
		     struct vring_desc __user *desc,
		     struct vring_avail __user *avail,
		     struct vring_used __user *used);

static inline void vringh_iov_init(struct vringh_iov *iov,
				   struct iovec *iovec, unsigned num)
{
	iov->used = iov->i = 0;
	iov->consumed = 0;
	iov->max_num = num;
	iov->iov = iovec;
}

static inline void vringh_iov_reset(struct vringh_iov *iov)
{
	iov->iov[iov->i].iov_len += iov->consumed;
	iov->iov[iov->i].iov_base -= iov->consumed;
	iov->consumed = 0;
	iov->i = 0;
}

static inline void vringh_iov_cleanup(struct vringh_iov *iov)
{
	if (iov->max_num & VRINGH_IOV_ALLOCATED)
		kfree(iov->iov);
	iov->max_num = iov->used = iov->i = iov->consumed = 0;
	iov->iov = NULL;
}

/* Convert a descriptor into iovecs. */
int vringh_getdesc_user(struct vringh *vrh,
			struct vringh_iov *riov,
			struct vringh_iov *wiov,
			bool (*getrange)(struct vringh *vrh,
					 u64 addr, struct vringh_range *r),
			u16 *head);

/* Copy bytes from readable vsg, consuming it (and incrementing riov->i). */
ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len);

/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */
ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
			     const void *src, size_t len);

/* Mark a descriptor as used. */
int vringh_complete_user(struct vringh *vrh, u16 head, u32 len);
int vringh_complete_multi_user(struct vringh *vrh,
			       const struct vring_used_elem used[],
			       unsigned num_used);

/* Pretend we've never seen descriptor (for easy error handling). */
void vringh_abandon_user(struct vringh *vrh, unsigned int num);

/* Do we need to fire the eventfd to notify the other side? */
int vringh_need_notify_user(struct vringh *vrh);

bool vringh_notify_enable_user(struct vringh *vrh);
void vringh_notify_disable_user(struct vringh *vrh);

/* Helpers for kernelspace vrings. */
int vringh_init_kern(struct vringh *vrh, u32 features,
		     unsigned int num, bool weak_barriers,
		     struct vring_desc *desc,
		     struct vring_avail *avail,
		     struct vring_used *used);

static inline void vringh_kiov_init(struct vringh_kiov *kiov,
				    struct kvec *kvec, unsigned num)
{
	kiov->used = kiov->i = 0;
	kiov->consumed = 0;
	kiov->max_num = num;
	kiov->iov = kvec;
}

static inline void vringh_kiov_reset(struct vringh_kiov *kiov)
{
	kiov->iov[kiov->i].iov_len += kiov->consumed;
	kiov->iov[kiov->i].iov_base -= kiov->consumed;
	kiov->consumed = 0;
	kiov->i = 0;
}

static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
{
	if (kiov->max_num & VRINGH_IOV_ALLOCATED)
		kfree(kiov->iov);
	kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0;
	kiov->iov = NULL;
}

int vringh_getdesc_kern(struct vringh *vrh,
			struct vringh_kiov *riov,
			struct vringh_kiov *wiov,
			u16 *head,
			gfp_t gfp);

ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
			     const void *src, size_t len);
void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);

bool vringh_notify_enable_kern(struct vringh *vrh);
void vringh_notify_disable_kern(struct vringh *vrh);

int vringh_need_notify_kern(struct vringh *vrh);

#endif /* _LINUX_VRINGH_H */
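
[Editor's sketch] Finally, a matching hypothetical sketch for the userspace-vring variant (demo_* names invented): the caller supplies the getrange callback that vringh_getdesc_user() uses to validate guest addresses and obtain the offset for direct access. The permissive callback below is only for illustration; a real user would check addr against the regions the guest may legally expose.

	#include <linux/vringh.h>
	#include <linux/kernel.h>

	/* Hypothetical getrange: accept the whole address space, no offset. */
	static bool demo_getrange(struct vringh *vrh, u64 addr,
				  struct vringh_range *r)
	{
		r->start = 0;
		r->end_incl = -1ULL;
		r->offset = 0;
		return true;
	}

	/* Hypothetical consumer of one readable chain from a userspace vring. */
	static int demo_consume(struct vringh *vrh)
	{
		struct iovec iovec[8];
		struct vringh_iov riov;
		char buf[64];
		u16 head;
		ssize_t pulled;
		int err;

		vringh_iov_init(&riov, iovec, ARRAY_SIZE(iovec));

		err = vringh_getdesc_user(vrh, &riov, NULL, demo_getrange, &head);
		if (err <= 0)
			goto out;

		pulled = vringh_iov_pull_user(&riov, buf, sizeof(buf));
		if (pulled < 0) {
			err = pulled;
			goto out;
		}

		/* We wrote nothing back, so the used length is 0. */
		err = vringh_complete_user(vrh, head, 0);
		if (!err && vringh_need_notify_user(vrh) > 0) {
			/* fire the guest's interrupt/eventfd here */
		}
	out:
		vringh_iov_cleanup(&riov);
		return err;
	}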