Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: selftests: Verify KVM correctly handles mprotect(PROT_READ)

Add two phases to mmu_stress_test to verify that KVM correctly handles
guest memory that was writable, and then made read-only in the primary MMU,
and then made writable again.

Add bonus coverage for x86 and arm64 to verify that all of guest memory was
marked read-only. Making forward progress (without making memory writable)
requires arch-specific code to skip over the faulting instruction, but the
test can at least verify each vCPU's starting page was made read-only for
other architectures.

Link: https://lore.kernel.org/r/20241128005547.4077116-14-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>

+102 -4
+102 -4
tools/testing/selftests/kvm/mmu_stress_test.c
··· 17 17 #include "processor.h" 18 18 #include "ucall_common.h" 19 19 20 + static bool mprotect_ro_done; 21 + 20 22 static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) 21 23 { 22 24 uint64_t gpa; ··· 33 31 for (gpa = start_gpa; gpa < end_gpa; gpa += stride) 34 32 *((volatile uint64_t *)gpa); 35 33 GUEST_SYNC(2); 34 + 35 + /* 36 + * Write to the region while mprotect(PROT_READ) is underway. Keep 37 + * looping until the memory is guaranteed to be read-only, otherwise 38 + * vCPUs may complete their writes and advance to the next stage 39 + * prematurely. 40 + * 41 + * For architectures that support skipping the faulting instruction, 42 + * generate the store via inline assembly to ensure the exact length 43 + * of the instruction is known and stable (vcpu_arch_put_guest() on 44 + * fixed-length architectures should work, but the cost of paranoia 45 + * is low in this case). For x86, hand-code the exact opcode so that 46 + * there is no room for variability in the generated instruction. 47 + */ 48 + do { 49 + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) 50 + #ifdef __x86_64__ 51 + asm volatile(".byte 0x48,0x89,0x00" :: "a"(gpa) : "memory"); /* mov %rax, (%rax) */ 52 + #elif defined(__aarch64__) 53 + asm volatile("str %0, [%0]" :: "r" (gpa) : "memory"); 54 + #else 55 + vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa); 56 + #endif 57 + } while (!READ_ONCE(mprotect_ro_done)); 58 + 59 + /* 60 + * Only architectures that write the entire range can explicitly sync, 61 + * as other architectures will be stuck on the write fault. 
62 + */ 63 + #if defined(__x86_64__) || defined(__aarch64__) 64 + GUEST_SYNC(3); 65 + #endif 66 + 67 + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) 68 + vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa); 69 + GUEST_SYNC(4); 36 70 37 71 GUEST_ASSERT(0); 38 72 } ··· 117 79 struct vcpu_info *info = data; 118 80 struct kvm_vcpu *vcpu = info->vcpu; 119 81 struct kvm_vm *vm = vcpu->vm; 82 + int r; 120 83 121 84 vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); 122 85 ··· 140 101 141 102 /* Stage 2, read all of guest memory, which is now read-only. */ 142 103 run_vcpu(vcpu, 2); 104 + 105 + /* 106 + * Stage 3, write guest memory and verify KVM returns -EFAULT once 107 + * the mprotect(PROT_READ) lands. Only architectures that support 108 + * validating *all* of guest memory sync for this stage, as vCPUs will 109 + * be stuck on the faulting instruction for other architectures. Go to 110 + * stage 3 without a rendezvous. 111 + */ 112 + do { 113 + r = _vcpu_run(vcpu); 114 + } while (!r); 115 + TEST_ASSERT(r == -1 && errno == EFAULT, 116 + "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno); 117 + 118 + #if defined(__x86_64__) || defined(__aarch64__) 119 + /* 120 + * Verify *all* writes from the guest hit EFAULT due to the VMA now 121 + * being read-only. x86 and arm64 only at this time as skipping the 122 + * instruction that hits the EFAULT requires advancing the program 123 + * counter, which is arch-specific and relies on inline assembly. 
124 + */ 125 + #ifdef __x86_64__ 126 + vcpu->run->kvm_valid_regs = KVM_SYNC_X86_REGS; 127 + #endif 128 + for (;;) { 129 + r = _vcpu_run(vcpu); 130 + if (!r) 131 + break; 132 + TEST_ASSERT_EQ(errno, EFAULT); 133 + #if defined(__x86_64__) 134 + WRITE_ONCE(vcpu->run->kvm_dirty_regs, KVM_SYNC_X86_REGS); 135 + vcpu->run->s.regs.regs.rip += 3; 136 + #elif defined(__aarch64__) 137 + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), 138 + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)) + 4); 139 + #endif 140 + 141 + } 142 + assert_sync_stage(vcpu, 3); 143 + #endif /* __x86_64__ || __aarch64__ */ 144 + rendezvous_with_boss(); 145 + 146 + /* 147 + * Stage 4. Run to completion, waiting for mprotect(PROT_WRITE) to 148 + * make the memory writable again. 149 + */ 150 + do { 151 + r = _vcpu_run(vcpu); 152 + } while (r && errno == EFAULT); 153 + TEST_ASSERT_EQ(r, 0); 154 + assert_sync_stage(vcpu, 4); 143 155 rendezvous_with_boss(); 144 156 145 157 return NULL; ··· 273 183 const uint64_t start_gpa = SZ_4G; 274 184 const int first_slot = 1; 275 185 276 - struct timespec time_start, time_run1, time_reset, time_run2, time_ro; 186 + struct timespec time_start, time_run1, time_reset, time_run2, time_ro, time_rw; 277 187 uint64_t max_gpa, gpa, slot_size, max_mem, i; 278 188 int max_slots, slot, opt, fd; 279 189 bool hugepages = false; ··· 378 288 rendezvous_with_vcpus(&time_run2, "run 2"); 379 289 380 290 mprotect(mem, slot_size, PROT_READ); 381 - rendezvous_with_vcpus(&time_ro, "mprotect RO"); 291 + usleep(10); 292 + mprotect_ro_done = true; 293 + sync_global_to_guest(vm, mprotect_ro_done); 382 294 295 + rendezvous_with_vcpus(&time_ro, "mprotect RO"); 296 + mprotect(mem, slot_size, PROT_READ | PROT_WRITE); 297 + rendezvous_with_vcpus(&time_rw, "mprotect RW"); 298 + 299 + time_rw = timespec_sub(time_rw, time_ro); 383 300 time_ro = timespec_sub(time_ro, time_run2); 384 301 time_run2 = timespec_sub(time_run2, time_reset); 385 302 time_reset = timespec_sub(time_reset, time_run1); 386 303 time_run1 = 
timespec_sub(time_run1, time_start); 387 304 388 305 pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 = %ld.%.9lds, " 389 - "ro = %ld.%.9lds\n", 306 + "ro = %ld.%.9lds, rw = %ld.%.9lds\n", 390 307 time_run1.tv_sec, time_run1.tv_nsec, 391 308 time_reset.tv_sec, time_reset.tv_nsec, 392 309 time_run2.tv_sec, time_run2.tv_nsec, 393 - time_ro.tv_sec, time_ro.tv_nsec); 310 + time_ro.tv_sec, time_ro.tv_nsec, 311 + time_rw.tv_sec, time_rw.tv_nsec); 394 312 395 313 /* 396 314 * Delete even numbered slots (arbitrary) and unmap the first half of