Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

virtio_ring: Support DMA APIs

virtio_ring currently sends the device (usually a hypervisor)
physical addresses of its I/O buffers. This is okay when DMA
addresses and physical addresses are the same thing, but this isn't
always the case. For example, this never works on Xen guests, and
it is likely to fail if a physical "virtio" device ever ends up
behind an IOMMU or swiotlb.

The immediate use case for me is to enable virtio on Xen guests.
For that to work, we need vring to support DMA address translation
as well as a corresponding change to virtio_pci or to another
driver.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Authored by Andy Lutomirski; committed by Michael S. Tsirkin.
780bc790 d26c96c8

+183 -36
+1 -1
drivers/virtio/Kconfig
··· 60 60 61 61 config VIRTIO_MMIO 62 62 tristate "Platform bus driver for memory mapped virtio devices" 63 - depends on HAS_IOMEM 63 + depends on HAS_IOMEM && HAS_DMA 64 64 select VIRTIO 65 65 ---help--- 66 66 This drivers provides support for memory mapped virtio
+165 -35
drivers/virtio/virtio_ring.c
··· 24 24 #include <linux/module.h> 25 25 #include <linux/hrtimer.h> 26 26 #include <linux/kmemleak.h> 27 + #include <linux/dma-mapping.h> 27 28 28 29 #ifdef DEBUG 29 30 /* For development, we want to crash whenever the ring is screwed. */ ··· 54 53 #define START_USE(vq) 55 54 #define END_USE(vq) 56 55 #endif 56 + 57 + struct vring_desc_state { 58 + void *data; /* Data for callback. */ 59 + struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ 60 + }; 57 61 58 62 struct vring_virtqueue { 59 63 struct virtqueue vq; ··· 104 98 ktime_t last_add_time; 105 99 #endif 106 100 107 - /* Tokens for callbacks. */ 108 - void *data[]; 101 + /* Per-descriptor state. */ 102 + struct vring_desc_state desc_state[]; 109 103 }; 110 104 111 105 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) ··· 132 126 static bool vring_use_dma_api(struct virtio_device *vdev) 133 127 { 134 128 return false; 129 + } 130 + 131 + /* 132 + * The DMA ops on various arches are rather gnarly right now, and 133 + * making all of the arch DMA ops work on the vring device itself 134 + * is a mess. For now, we use the parent device for DMA ops. 135 + */ 136 + struct device *vring_dma_dev(const struct vring_virtqueue *vq) 137 + { 138 + return vq->vq.vdev->dev.parent; 139 + } 140 + 141 + /* Map one sg entry. */ 142 + static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, 143 + struct scatterlist *sg, 144 + enum dma_data_direction direction) 145 + { 146 + if (!vring_use_dma_api(vq->vq.vdev)) 147 + return (dma_addr_t)sg_phys(sg); 148 + 149 + /* 150 + * We can't use dma_map_sg, because we don't use scatterlists in 151 + * the way it expects (we don't guarantee that the scatterlist 152 + * will exist for the lifetime of the mapping). 
153 + */ 154 + return dma_map_page(vring_dma_dev(vq), 155 + sg_page(sg), sg->offset, sg->length, 156 + direction); 157 + } 158 + 159 + static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, 160 + void *cpu_addr, size_t size, 161 + enum dma_data_direction direction) 162 + { 163 + if (!vring_use_dma_api(vq->vq.vdev)) 164 + return (dma_addr_t)virt_to_phys(cpu_addr); 165 + 166 + return dma_map_single(vring_dma_dev(vq), 167 + cpu_addr, size, direction); 168 + } 169 + 170 + static void vring_unmap_one(const struct vring_virtqueue *vq, 171 + struct vring_desc *desc) 172 + { 173 + u16 flags; 174 + 175 + if (!vring_use_dma_api(vq->vq.vdev)) 176 + return; 177 + 178 + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); 179 + 180 + if (flags & VRING_DESC_F_INDIRECT) { 181 + dma_unmap_single(vring_dma_dev(vq), 182 + virtio64_to_cpu(vq->vq.vdev, desc->addr), 183 + virtio32_to_cpu(vq->vq.vdev, desc->len), 184 + (flags & VRING_DESC_F_WRITE) ? 185 + DMA_FROM_DEVICE : DMA_TO_DEVICE); 186 + } else { 187 + dma_unmap_page(vring_dma_dev(vq), 188 + virtio64_to_cpu(vq->vq.vdev, desc->addr), 189 + virtio32_to_cpu(vq->vq.vdev, desc->len), 190 + (flags & VRING_DESC_F_WRITE) ? 
191 + DMA_FROM_DEVICE : DMA_TO_DEVICE); 192 + } 193 + } 194 + 195 + static int vring_mapping_error(const struct vring_virtqueue *vq, 196 + dma_addr_t addr) 197 + { 198 + if (!vring_use_dma_api(vq->vq.vdev)) 199 + return 0; 200 + 201 + return dma_mapping_error(vring_dma_dev(vq), addr); 135 202 } 136 203 137 204 static struct vring_desc *alloc_indirect(struct virtqueue *_vq, ··· 240 161 struct vring_virtqueue *vq = to_vvq(_vq); 241 162 struct scatterlist *sg; 242 163 struct vring_desc *desc; 243 - unsigned int i, n, avail, descs_used, uninitialized_var(prev); 164 + unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx; 244 165 int head; 245 166 bool indirect; 246 167 ··· 280 201 281 202 if (desc) { 282 203 /* Use a single buffer which doesn't continue */ 283 - vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT); 284 - vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, virt_to_phys(desc)); 285 - /* avoid kmemleak false positive (hidden by virt_to_phys) */ 286 - kmemleak_ignore(desc); 287 - vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc)); 288 - 204 + indirect = true; 289 205 /* Set up rest to use this indirect table. */ 290 206 i = 0; 291 207 descs_used = 1; 292 - indirect = true; 293 208 } else { 209 + indirect = false; 294 210 desc = vq->vring.desc; 295 211 i = head; 296 212 descs_used = total_sg; 297 - indirect = false; 298 213 } 299 214 300 215 if (vq->vq.num_free < descs_used) { ··· 303 230 return -ENOSPC; 304 231 } 305 232 306 - /* We're about to use some buffers from the free list. 
*/ 307 - vq->vq.num_free -= descs_used; 308 - 309 233 for (n = 0; n < out_sgs; n++) { 310 234 for (sg = sgs[n]; sg; sg = sg_next(sg)) { 235 + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE); 236 + if (vring_mapping_error(vq, addr)) 237 + goto unmap_release; 238 + 311 239 desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); 312 - desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg)); 240 + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); 313 241 desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); 314 242 prev = i; 315 243 i = virtio16_to_cpu(_vq->vdev, desc[i].next); ··· 318 244 } 319 245 for (; n < (out_sgs + in_sgs); n++) { 320 246 for (sg = sgs[n]; sg; sg = sg_next(sg)) { 247 + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE); 248 + if (vring_mapping_error(vq, addr)) 249 + goto unmap_release; 250 + 321 251 desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); 322 - desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg)); 252 + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); 323 253 desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); 324 254 prev = i; 325 255 i = virtio16_to_cpu(_vq->vdev, desc[i].next); ··· 332 254 /* Last one doesn't continue. */ 333 255 desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); 334 256 257 + if (indirect) { 258 + /* Now that the indirect table is filled in, map it. */ 259 + dma_addr_t addr = vring_map_single( 260 + vq, desc, total_sg * sizeof(struct vring_desc), 261 + DMA_TO_DEVICE); 262 + if (vring_mapping_error(vq, addr)) 263 + goto unmap_release; 264 + 265 + vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT); 266 + vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr); 267 + 268 + vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc)); 269 + } 270 + 271 + /* We're using some buffers from the free list. 
*/ 272 + vq->vq.num_free -= descs_used; 273 + 335 274 /* Update free pointer */ 336 275 if (indirect) 337 276 vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next); 338 277 else 339 278 vq->free_head = i; 340 279 341 - /* Set token. */ 342 - vq->data[head] = data; 280 + /* Store token and indirect buffer state. */ 281 + vq->desc_state[head].data = data; 282 + if (indirect) 283 + vq->desc_state[head].indir_desc = desc; 343 284 344 285 /* Put entry in available array (but don't update avail->idx until they 345 286 * do sync). */ ··· 381 284 virtqueue_kick(_vq); 382 285 383 286 return 0; 287 + 288 + unmap_release: 289 + err_idx = i; 290 + i = head; 291 + 292 + for (n = 0; n < total_sg; n++) { 293 + if (i == err_idx) 294 + break; 295 + vring_unmap_one(vq, &desc[i]); 296 + i = vq->vring.desc[i].next; 297 + } 298 + 299 + vq->vq.num_free += total_sg; 300 + 301 + if (indirect) 302 + kfree(desc); 303 + 304 + return -EIO; 384 305 } 385 306 386 307 /** ··· 569 454 570 455 static void detach_buf(struct vring_virtqueue *vq, unsigned int head) 571 456 { 572 - unsigned int i; 457 + unsigned int i, j; 458 + u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); 573 459 574 460 /* Clear data ptr. 
*/ 575 - vq->data[head] = NULL; 461 + vq->desc_state[head].data = NULL; 576 462 577 - /* Put back on free list: find end */ 463 + /* Put back on free list: unmap first-level descriptors and find end */ 578 464 i = head; 579 465 580 - /* Free the indirect table */ 581 - if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)) 582 - kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr))); 583 - 584 - while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) { 466 + while (vq->vring.desc[i].flags & nextflag) { 467 + vring_unmap_one(vq, &vq->vring.desc[i]); 585 468 i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next); 586 469 vq->vq.num_free++; 587 470 } 588 471 472 + vring_unmap_one(vq, &vq->vring.desc[i]); 589 473 vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head); 590 474 vq->free_head = head; 475 + 591 476 /* Plus final descriptor */ 592 477 vq->vq.num_free++; 478 + 479 + /* Free the indirect table, if any, now that it's unmapped. 
*/ 480 + if (vq->desc_state[head].indir_desc) { 481 + struct vring_desc *indir_desc = vq->desc_state[head].indir_desc; 482 + u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); 483 + 484 + BUG_ON(!(vq->vring.desc[head].flags & 485 + cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); 486 + BUG_ON(len == 0 || len % sizeof(struct vring_desc)); 487 + 488 + for (j = 0; j < len / sizeof(struct vring_desc); j++) 489 + vring_unmap_one(vq, &indir_desc[j]); 490 + 491 + kfree(vq->desc_state[head].indir_desc); 492 + vq->desc_state[head].indir_desc = NULL; 493 + } 593 494 } 594 495 595 496 static inline bool more_used(const struct vring_virtqueue *vq) ··· 660 529 BAD_RING(vq, "id %u out of range\n", i); 661 530 return NULL; 662 531 } 663 - if (unlikely(!vq->data[i])) { 532 + if (unlikely(!vq->desc_state[i].data)) { 664 533 BAD_RING(vq, "id %u is not a head!\n", i); 665 534 return NULL; 666 535 } 667 536 668 537 /* detach_buf clears data, so grab it now. */ 669 - ret = vq->data[i]; 538 + ret = vq->desc_state[i].data; 670 539 detach_buf(vq, i); 671 540 vq->last_used_idx++; 672 541 /* If we expect an interrupt for the next entry, tell host ··· 840 709 START_USE(vq); 841 710 842 711 for (i = 0; i < vq->vring.num; i++) { 843 - if (!vq->data[i]) 712 + if (!vq->desc_state[i].data) 844 713 continue; 845 714 /* detach_buf clears data, so grab it now. */ 846 - buf = vq->data[i]; 715 + buf = vq->desc_state[i].data; 847 716 detach_buf(vq, i); 848 717 vq->avail_idx_shadow--; 849 718 vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); ··· 897 766 return NULL; 898 767 } 899 768 900 - vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL); 769 + vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state), 770 + GFP_KERNEL); 901 771 if (!vq) 902 772 return NULL; 903 773 ··· 932 800 933 801 /* Put everything in free lists. 
*/ 934 802 vq->free_head = 0; 935 - for (i = 0; i < num-1; i++) { 803 + for (i = 0; i < num-1; i++) 936 804 vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1); 937 - vq->data[i] = NULL; 938 - } 939 - vq->data[i] = NULL; 805 + memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state)); 940 806 941 807 return &vq->vq; 942 808 }
+17
tools/virtio/linux/dma-mapping.h
··· 1 + #ifndef _LINUX_DMA_MAPPING_H 2 + #define _LINUX_DMA_MAPPING_H 3 + 4 + #ifdef CONFIG_HAS_DMA 5 + # error Virtio userspace code does not support CONFIG_HAS_DMA 6 + #endif 7 + 8 + #define PCI_DMA_BUS_IS_PHYS 1 9 + 10 + enum dma_data_direction { 11 + DMA_BIDIRECTIONAL = 0, 12 + DMA_TO_DEVICE = 1, 13 + DMA_FROM_DEVICE = 2, 14 + DMA_NONE = 3, 15 + }; 16 + 17 + #endif