Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'kvm-x86-generic-6.12' of https://github.com/kvm-x86/linux into HEAD

KVM generic changes for 6.12:

- Fix a bug that results in KVM prematurely exiting to userspace for coalesced
MMIO/PIO in many cases, clean up the related code, and add a testcase.

- Fix a bug in kvm_clear_guest() where it would trigger a buffer overflow _if_
the gpa+len crosses a page boundary, which thankfully is guaranteed to not
happen in the current code base. Add WARNs in more helpers that read/write
guest memory to detect similar bugs.

+283 -24
+3
tools/testing/selftests/kvm/Makefile
··· 130 130 TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test 131 131 TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test 132 132 TEST_GEN_PROGS_x86_64 += access_tracking_perf_test 133 + TEST_GEN_PROGS_x86_64 += coalesced_io_test 133 134 TEST_GEN_PROGS_x86_64 += demand_paging_test 134 135 TEST_GEN_PROGS_x86_64 += dirty_log_test 135 136 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test ··· 168 167 TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 169 168 TEST_GEN_PROGS_aarch64 += access_tracking_perf_test 170 169 TEST_GEN_PROGS_aarch64 += arch_timer 170 + TEST_GEN_PROGS_aarch64 += coalesced_io_test 171 171 TEST_GEN_PROGS_aarch64 += demand_paging_test 172 172 TEST_GEN_PROGS_aarch64 += dirty_log_test 173 173 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test ··· 203 201 TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test 204 202 TEST_GEN_PROGS_riscv += riscv/ebreak_test 205 203 TEST_GEN_PROGS_riscv += arch_timer 204 + TEST_GEN_PROGS_riscv += coalesced_io_test 206 205 TEST_GEN_PROGS_riscv += demand_paging_test 207 206 TEST_GEN_PROGS_riscv += dirty_log_test 208 207 TEST_GEN_PROGS_riscv += get-reg-list
+236
tools/testing/selftests/kvm/coalesced_io_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <signal.h> 3 + #include <stdio.h> 4 + #include <stdlib.h> 5 + #include <string.h> 6 + #include <sys/ioctl.h> 7 + 8 + #include <linux/sizes.h> 9 + 10 + #include <kvm_util.h> 11 + #include <processor.h> 12 + 13 + #include "ucall_common.h" 14 + 15 + struct kvm_coalesced_io { 16 + struct kvm_coalesced_mmio_ring *ring; 17 + uint32_t ring_size; 18 + uint64_t mmio_gpa; 19 + uint64_t *mmio; 20 + 21 + /* 22 + * x86-only, but define pio_port for all architectures to minimize the 23 + * amount of #ifdeffery and complexity, without having to sacrifice 24 + * verbose error messages. 25 + */ 26 + uint8_t pio_port; 27 + }; 28 + 29 + static struct kvm_coalesced_io kvm_builtin_io_ring; 30 + 31 + #ifdef __x86_64__ 32 + static const int has_pio = 1; 33 + #else 34 + static const int has_pio = 0; 35 + #endif 36 + 37 + static void guest_code(struct kvm_coalesced_io *io) 38 + { 39 + int i, j; 40 + 41 + for (;;) { 42 + for (j = 0; j < 1 + has_pio; j++) { 43 + /* 44 + * KVM always leaves one free entry, i.e. exits to 45 + * userspace before the last entry is filled. 
46 + */ 47 + for (i = 0; i < io->ring_size - 1; i++) { 48 + #ifdef __x86_64__ 49 + if (i & 1) 50 + outl(io->pio_port, io->pio_port + i); 51 + else 52 + #endif 53 + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); 54 + } 55 + #ifdef __x86_64__ 56 + if (j & 1) 57 + outl(io->pio_port, io->pio_port + i); 58 + else 59 + #endif 60 + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); 61 + } 62 + GUEST_SYNC(0); 63 + 64 + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); 65 + #ifdef __x86_64__ 66 + outl(io->pio_port, io->pio_port + i); 67 + #endif 68 + } 69 + } 70 + 71 + static void vcpu_run_and_verify_io_exit(struct kvm_vcpu *vcpu, 72 + struct kvm_coalesced_io *io, 73 + uint32_t ring_start, 74 + uint32_t expected_exit) 75 + { 76 + const bool want_pio = expected_exit == KVM_EXIT_IO; 77 + struct kvm_coalesced_mmio_ring *ring = io->ring; 78 + struct kvm_run *run = vcpu->run; 79 + uint32_t pio_value; 80 + 81 + WRITE_ONCE(ring->first, ring_start); 82 + WRITE_ONCE(ring->last, ring_start); 83 + 84 + vcpu_run(vcpu); 85 + 86 + /* 87 + * Annoyingly, reading PIO data is safe only for PIO exits, otherwise 88 + * data_offset is garbage, e.g. an MMIO gpa. 89 + */ 90 + if (run->exit_reason == KVM_EXIT_IO) 91 + pio_value = *(uint32_t *)((void *)run + run->io.data_offset); 92 + else 93 + pio_value = 0; 94 + 95 + TEST_ASSERT((!want_pio && (run->exit_reason == KVM_EXIT_MMIO && run->mmio.is_write && 96 + run->mmio.phys_addr == io->mmio_gpa && run->mmio.len == 8 && 97 + *(uint64_t *)run->mmio.data == io->mmio_gpa + io->ring_size - 1)) || 98 + (want_pio && (run->exit_reason == KVM_EXIT_IO && run->io.port == io->pio_port && 99 + run->io.direction == KVM_EXIT_IO_OUT && run->io.count == 1 && 100 + pio_value == io->pio_port + io->ring_size - 1)), 101 + "For start = %u, expected exit on %u-byte %s write 0x%llx = %lx, got exit_reason = %u (%s)\n " 102 + "(MMIO addr = 0x%llx, write = %u, len = %u, data = %lx)\n " 103 + "(PIO port = 0x%x, write = %u, len = %u, count = %u, data = %x", 104 + ring_start, want_pio ? 
4 : 8, want_pio ? "PIO" : "MMIO", 105 + want_pio ? (unsigned long long)io->pio_port : io->mmio_gpa, 106 + (want_pio ? io->pio_port : io->mmio_gpa) + io->ring_size - 1, run->exit_reason, 107 + run->exit_reason == KVM_EXIT_MMIO ? "MMIO" : run->exit_reason == KVM_EXIT_IO ? "PIO" : "other", 108 + run->mmio.phys_addr, run->mmio.is_write, run->mmio.len, *(uint64_t *)run->mmio.data, 109 + run->io.port, run->io.direction, run->io.size, run->io.count, pio_value); 110 + } 111 + 112 + static void vcpu_run_and_verify_coalesced_io(struct kvm_vcpu *vcpu, 113 + struct kvm_coalesced_io *io, 114 + uint32_t ring_start, 115 + uint32_t expected_exit) 116 + { 117 + struct kvm_coalesced_mmio_ring *ring = io->ring; 118 + int i; 119 + 120 + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, expected_exit); 121 + 122 + TEST_ASSERT((ring->last + 1) % io->ring_size == ring->first, 123 + "Expected ring to be full (minus 1), first = %u, last = %u, max = %u, start = %u", 124 + ring->first, ring->last, io->ring_size, ring_start); 125 + 126 + for (i = 0; i < io->ring_size - 1; i++) { 127 + uint32_t idx = (ring->first + i) % io->ring_size; 128 + struct kvm_coalesced_mmio *entry = &ring->coalesced_mmio[idx]; 129 + 130 + #ifdef __x86_64__ 131 + if (i & 1) 132 + TEST_ASSERT(entry->phys_addr == io->pio_port && 133 + entry->len == 4 && entry->pio && 134 + *(uint32_t *)entry->data == io->pio_port + i, 135 + "Wanted 4-byte port I/O 0x%x = 0x%x in entry %u, got %u-byte %s 0x%llx = 0x%x", 136 + io->pio_port, io->pio_port + i, i, 137 + entry->len, entry->pio ? "PIO" : "MMIO", 138 + entry->phys_addr, *(uint32_t *)entry->data); 139 + else 140 + #endif 141 + TEST_ASSERT(entry->phys_addr == io->mmio_gpa && 142 + entry->len == 8 && !entry->pio, 143 + "Wanted 8-byte MMIO to 0x%lx = %lx in entry %u, got %u-byte %s 0x%llx = 0x%lx", 144 + io->mmio_gpa, io->mmio_gpa + i, i, 145 + entry->len, entry->pio ? 
"PIO" : "MMIO", 146 + entry->phys_addr, *(uint64_t *)entry->data); 147 + } 148 + } 149 + 150 + static void test_coalesced_io(struct kvm_vcpu *vcpu, 151 + struct kvm_coalesced_io *io, uint32_t ring_start) 152 + { 153 + struct kvm_coalesced_mmio_ring *ring = io->ring; 154 + 155 + kvm_vm_register_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); 156 + #ifdef __x86_64__ 157 + kvm_vm_register_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); 158 + #endif 159 + 160 + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_MMIO); 161 + #ifdef __x86_64__ 162 + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_IO); 163 + #endif 164 + 165 + /* 166 + * Verify ucall, which may use non-coalesced MMIO or PIO, generates an 167 + * immediate exit. 168 + */ 169 + WRITE_ONCE(ring->first, ring_start); 170 + WRITE_ONCE(ring->last, ring_start); 171 + vcpu_run(vcpu); 172 + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); 173 + TEST_ASSERT_EQ(ring->first, ring_start); 174 + TEST_ASSERT_EQ(ring->last, ring_start); 175 + 176 + /* Verify that non-coalesced MMIO/PIO generates an exit to userspace. 
*/ 177 + kvm_vm_unregister_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); 178 + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_MMIO); 179 + 180 + #ifdef __x86_64__ 181 + kvm_vm_unregister_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); 182 + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_IO); 183 + #endif 184 + } 185 + 186 + int main(int argc, char *argv[]) 187 + { 188 + struct kvm_vcpu *vcpu; 189 + struct kvm_vm *vm; 190 + int i; 191 + 192 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_MMIO)); 193 + 194 + #ifdef __x86_64__ 195 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_PIO)); 196 + #endif 197 + 198 + vm = vm_create_with_one_vcpu(&vcpu, guest_code); 199 + 200 + kvm_builtin_io_ring = (struct kvm_coalesced_io) { 201 + /* 202 + * The I/O ring is a kernel-allocated page whose address is 203 + * relative to each vCPU's run page, with the page offset 204 + * provided by KVM in the return of KVM_CAP_COALESCED_MMIO. 205 + */ 206 + .ring = (void *)vcpu->run + 207 + (kvm_check_cap(KVM_CAP_COALESCED_MMIO) * getpagesize()), 208 + 209 + /* 210 + * The size of the I/O ring is fixed, but KVM defines the sized 211 + * based on the kernel's PAGE_SIZE. Thus, userspace must query 212 + * the host's page size at runtime to compute the ring size. 213 + */ 214 + .ring_size = (getpagesize() - sizeof(struct kvm_coalesced_mmio_ring)) / 215 + sizeof(struct kvm_coalesced_mmio), 216 + 217 + /* 218 + * Arbitrary address+port (MMIO mustn't overlap memslots), with 219 + * the MMIO GPA identity mapped in the guest. 
220 + */ 221 + .mmio_gpa = 4ull * SZ_1G, 222 + .mmio = (uint64_t *)(4ull * SZ_1G), 223 + .pio_port = 0x80, 224 + }; 225 + 226 + virt_map(vm, (uint64_t)kvm_builtin_io_ring.mmio, kvm_builtin_io_ring.mmio_gpa, 1); 227 + 228 + sync_global_to_guest(vm, kvm_builtin_io_ring); 229 + vcpu_args_set(vcpu, 1, &kvm_builtin_io_ring); 230 + 231 + for (i = 0; i < kvm_builtin_io_ring.ring_size; i++) 232 + test_coalesced_io(vcpu, &kvm_builtin_io_ring, i); 233 + 234 + kvm_vm_free(vm); 235 + return 0; 236 + }
+26
tools/testing/selftests/kvm/include/kvm_util.h
··· 460 460 return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); 461 461 } 462 462 463 + static inline void kvm_vm_register_coalesced_io(struct kvm_vm *vm, 464 + uint64_t address, 465 + uint64_t size, bool pio) 466 + { 467 + struct kvm_coalesced_mmio_zone zone = { 468 + .addr = address, 469 + .size = size, 470 + .pio = pio, 471 + }; 472 + 473 + vm_ioctl(vm, KVM_REGISTER_COALESCED_MMIO, &zone); 474 + } 475 + 476 + static inline void kvm_vm_unregister_coalesced_io(struct kvm_vm *vm, 477 + uint64_t address, 478 + uint64_t size, bool pio) 479 + { 480 + struct kvm_coalesced_mmio_zone zone = { 481 + .addr = address, 482 + .size = size, 483 + .pio = pio, 484 + }; 485 + 486 + vm_ioctl(vm, KVM_UNREGISTER_COALESCED_MMIO, &zone); 487 + } 488 + 463 489 static inline int vm_get_stats_fd(struct kvm_vm *vm) 464 490 { 465 491 int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL);
+8 -23
virt/kvm/coalesced_mmio.c
··· 40 40 return 1; 41 41 } 42 42 43 - static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev, u32 last) 44 - { 45 - struct kvm_coalesced_mmio_ring *ring; 46 - unsigned avail; 47 - 48 - /* Are we able to batch it ? */ 49 - 50 - /* last is the first free entry 51 - * check if we don't meet the first used entry 52 - * there is always one unused entry in the buffer 53 - */ 54 - ring = dev->kvm->coalesced_mmio_ring; 55 - avail = (ring->first - last - 1) % KVM_COALESCED_MMIO_MAX; 56 - if (avail == 0) { 57 - /* full */ 58 - return 0; 59 - } 60 - 61 - return 1; 62 - } 63 - 64 43 static int coalesced_mmio_write(struct kvm_vcpu *vcpu, 65 44 struct kvm_io_device *this, gpa_t addr, 66 45 int len, const void *val) ··· 53 74 54 75 spin_lock(&dev->kvm->ring_lock); 55 76 77 + /* 78 + * last is the index of the entry to fill. Verify userspace hasn't 79 + * set last to be out of range, and that there is room in the ring. 80 + * Leave one entry free in the ring so that userspace can differentiate 81 + * between an empty ring and a full ring. 82 + */ 56 83 insert = READ_ONCE(ring->last); 57 - if (!coalesced_mmio_has_room(dev, insert) || 58 - insert >= KVM_COALESCED_MMIO_MAX) { 84 + if (insert >= KVM_COALESCED_MMIO_MAX || 85 + (insert + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first)) { 59 86 spin_unlock(&dev->kvm->ring_lock); 60 87 return -EOPNOTSUPP; 61 88 }
+10 -1
virt/kvm/kvm_main.c
··· 3275 3275 int r; 3276 3276 unsigned long addr; 3277 3277 3278 + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) 3279 + return -EFAULT; 3280 + 3278 3281 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 3279 3282 if (kvm_is_error_hva(addr)) 3280 3283 return -EFAULT; ··· 3351 3348 int r; 3352 3349 unsigned long addr; 3353 3350 3351 + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) 3352 + return -EFAULT; 3353 + 3354 3354 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 3355 3355 if (kvm_is_error_hva(addr)) 3356 3356 return -EFAULT; ··· 3383 3377 { 3384 3378 int r; 3385 3379 unsigned long addr; 3380 + 3381 + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) 3382 + return -EFAULT; 3386 3383 3387 3384 addr = gfn_to_hva_memslot(memslot, gfn); 3388 3385 if (kvm_is_error_hva(addr)) ··· 3590 3581 int ret; 3591 3582 3592 3583 while ((seg = next_segment(len, offset)) != 0) { 3593 - ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 3584 + ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); 3594 3585 if (ret < 0) 3595 3586 return ret; 3596 3587 offset = 0;