Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Book3S: Add hack for split real mode

Today we handle split real mode by mapping both instruction and data faults
into a special virtual address space that only exists during the split mode
phase.

This is good enough to catch 32-bit Linux guests that use split real mode for
copy_from/to_user. In this case we're always prefixed with 0xc0000000 for our
instruction pointer and can map the user space process freely below there.

However, that approach fails when we're running KVM inside of KVM. Here the 1st
level last_inst reader may well be in the same virtual page as a 2nd level
interrupt handler.

It also fails when running Mac OS X guests. Here we have a 4G/4G split, so a
kernel copy_from/to_user implementation can easily overlap with user space
addresses.

The architecturally correct way to fix this would be to implement an instruction
interpreter in KVM that kicks in whenever we go into split real mode. This
interpreter however would not receive a great amount of testing and be a lot of
bloat for a reasonably isolated corner case.

So I went back to the drawing board and tried to come up with a way to make
split real mode work with a single flat address space. And then I realized that
we could get away with the same trick that makes it work for Linux:

Whenever we see an instruction address during split real mode that may collide,
we just move it higher up the virtual address space to a place that hopefully
does not collide (keep your fingers crossed!).

That approach does work surprisingly well. I am able to successfully run
Mac OS X guests with KVM and QEMU (no split real mode hacks like MOL) when I
apply a tiny timing probe hack to QEMU. I'd say this is a win over even more
broken split real mode :).

Signed-off-by: Alexander Graf <agraf@suse.de>

+71
+1
arch/powerpc/include/asm/kvm_asm.h
@@ -131,6 +131,7 @@
 #define BOOK3S_HFLAG_NATIVE_PS		0x8
 #define BOOK3S_HFLAG_MULTI_PGSIZE	0x10
 #define BOOK3S_HFLAG_NEW_TLBIE		0x20
+#define BOOK3S_HFLAG_SPLIT_HACK		0x40

 #define RESUME_FLAG_NV	(1<<0)	/* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST	(1<<1)	/* Resume host? */
+3
arch/powerpc/include/asm/kvm_book3s.h
@@ -324,4 +324,7 @@
 /* LPIDs we support with this build -- runtime limit may be lower */
 #define KVMPPC_NR_LPIDS	(LPID_RSVD + 1)

+#define SPLIT_HACK_MASK			0xff000000
+#define SPLIT_HACK_OFFS			0xfb000000
+
 #endif /* __ASM_KVM_BOOK3S_H__ */
+19
arch/powerpc/kvm/book3s.c
@@ -72,6 +72,17 @@
 {
 }

+void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
+		ulong pc = kvmppc_get_pc(vcpu);
+		if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+			kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
+		vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
+	}
+}
+EXPORT_SYMBOL_GPL(kvmppc_unfixup_split_real);
+
 static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 {
 	if (!is_kvmppc_hv_enabled(vcpu->kvm))
@@ kvmppc_inject_interrupt @@

 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
+	kvmppc_unfixup_split_real(vcpu);
 	kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
 	kvmppc_set_srr1(vcpu, kvmppc_get_msr(vcpu) | flags);
 	kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);
@@ kvmppc_xlate @@
 		pte->may_write = true;
 		pte->may_execute = true;
 		r = 0;
+
+		if ((kvmppc_get_msr(vcpu) & (MSR_IR | MSR_DR)) == MSR_DR &&
+		    !data) {
+			if ((vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) &&
+			    ((eaddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS))
+				pte->raddr &= ~SPLIT_HACK_MASK;
+		}
 	}

 	return r;
+48
arch/powerpc/kvm/book3s_pr.c
@@ -62,6 +62,35 @@
 #define HW_PAGE_SIZE PAGE_SIZE
 #endif

+static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu)
+{
+	ulong msr = kvmppc_get_msr(vcpu);
+	return (msr & (MSR_IR|MSR_DR)) == MSR_DR;
+}
+
+static void kvmppc_fixup_split_real(struct kvm_vcpu *vcpu)
+{
+	ulong msr = kvmppc_get_msr(vcpu);
+	ulong pc = kvmppc_get_pc(vcpu);
+
+	/* We are in DR only split real mode */
+	if ((msr & (MSR_IR|MSR_DR)) != MSR_DR)
+		return;
+
+	/* We have not fixed up the guest already */
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK)
+		return;
+
+	/* The code is in fixupable address space */
+	if (pc & SPLIT_HACK_MASK)
+		return;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SPLIT_HACK;
+	kvmppc_set_pc(vcpu, pc | SPLIT_HACK_OFFS);
+}
+
+void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu);
+
 static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu)
 {
 #ifdef CONFIG_PPC_BOOK3S_64
@@ kvmppc_core_vcpu_load_pr @@
 #ifdef CONFIG_PPC_BOOK3S_32
 	current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu;
 #endif
+
+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_fixup_split_real(vcpu);
 }

 static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
@@ kvmppc_core_vcpu_put_pr @@
 	to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max;
 	svcpu_put(svcpu);
 #endif
+
+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_unfixup_split_real(vcpu);

 	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
 	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
@@ kvmppc_set_msr_pr @@
 		}
 	}

+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_fixup_split_real(vcpu);
+	else
+		kvmppc_unfixup_split_real(vcpu);
+
 	if ((kvmppc_get_msr(vcpu) & (MSR_PR|MSR_IR|MSR_DR)) !=
 	    (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
 		kvmppc_mmu_flush_segments(vcpu);
@@ kvmppc_handle_pagefault @@
 		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
 		break;
 	case MSR_DR:
+		if (!data &&
+		    (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) &&
+		    ((pte.raddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS))
+			pte.raddr &= ~SPLIT_HACK_MASK;
+		/* fall through */
 	case MSR_IR:
 		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
@@ BOOK3S_INTERRUPT_INST_STORAGE handler @@
 {
 	ulong shadow_srr1 = vcpu->arch.shadow_srr1;
 	vcpu->stat.pf_instruc++;
+
+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_fixup_split_real(vcpu);

 #ifdef CONFIG_PPC_BOOK3S_32
 	/* We set segments as unused segments when invalidating them. So