Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm:
KVM: Use new smp_call_function_mask() in kvm_flush_remote_tlbs()
sched: don't clear PF_VCPU in scheduler
KVM: Improve local apic timer wraparound handling
KVM: Fix local apic timer divide by zero
KVM: Move kvm_guest_exit() after local_irq_enable()
KVM: x86 emulator: fix access registers for instructions with ModR/M byte and Mod = 3
KVM: VMX: Force vm86 mode if setting flags during real mode
KVM: x86 emulator: implement 'movnti mem, reg'
KVM: VMX: Reset mmu context when entering real mode
KVM: VMX: Handle NMIs before enabling interrupts and preemption
KVM: MMU: Set shadow pte atomically in mmu_pte_write_zap_pte()
KVM: x86 emulator: fix repne/repnz decoding
KVM: x86 emulator: fix merge screwup due to emulator split

 6 files changed, 103 insertions(+), 69 deletions(-)

drivers/kvm/kvm_main.c | +13 -24
···
 
 static void ack_flush(void *_completed)
 {
-        atomic_t *completed = _completed;
-
-        atomic_inc(completed);
 }
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-        int i, cpu, needed;
+        int i, cpu;
         cpumask_t cpus;
         struct kvm_vcpu *vcpu;
-        atomic_t completed;
 
-        atomic_set(&completed, 0);
         cpus_clear(cpus);
-        needed = 0;
         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                 vcpu = kvm->vcpus[i];
                 if (!vcpu)
                         continue;
                 cpu = vcpu->cpu;
                 if (cpu != -1 && cpu != raw_smp_processor_id())
-                        if (!cpu_isset(cpu, cpus)) {
-                                cpu_set(cpu, cpus);
-                                ++needed;
-                        }
+                        cpu_set(cpu, cpus);
         }
-
-        /*
-         * We really want smp_call_function_mask() here. But that's not
-         * available, so ipi all cpus in parallel and wait for them
-         * to complete.
-         */
-        for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
-                smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
-        while (atomic_read(&completed) != needed) {
-                cpu_relax();
-                barrier();
-        }
+        smp_call_function_mask(cpus, ack_flush, NULL, 1);
 }
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
···
 
         kvm_x86_ops->run(vcpu, kvm_run);
 
-        kvm_guest_exit();
         vcpu->guest_mode = 0;
         local_irq_enable();
 
         ++vcpu->stat.exits;
+
+        /*
+         * We must have an instruction between local_irq_enable() and
+         * kvm_guest_exit(), so the timer interrupt isn't delayed by
+         * the interrupt shadow. The stat.exits increment will do nicely.
+         * But we need to prevent reordering, hence this barrier():
+         */
+        barrier();
+
+        kvm_guest_exit();
 
         preempt_enable();
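For reference, the helper the first hunk switches to is brand new at this point; the assumed interface (check include/linux/smp.h of this release for the authoritative prototype) is roughly:

    /*
     * Assumed prototype of the then-new helper: run func(info) on each online
     * CPU set in mask and, when wait is non-zero, return only after every
     * handler has completed -- the same IPI-and-wait dance the removed
     * open-coded loop performed with an atomic counter.
     */
    int smp_call_function_mask(cpumask_t mask, void (*func)(void *info),
                               void *info, int wait);

With wait = 1 an empty ack_flush() is enough: the IPI itself kicks each targeted CPU out of guest mode, and returning from the handler is the acknowledgement the old code counted by hand.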

drivers/kvm/lapic.c | +27 -11
···
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
 {
-        u32 counter_passed;
-        ktime_t passed, now = apic->timer.dev.base->get_time();
-        u32 tmcct = apic_get_reg(apic, APIC_TMICT);
+        u64 counter_passed;
+        ktime_t passed, now;
+        u32 tmcct;
 
         ASSERT(apic != NULL);
+
+        now = apic->timer.dev.base->get_time();
+        tmcct = apic_get_reg(apic, APIC_TMICT);
+
+        /* if initial count is 0, current count should also be 0 */
+        if (tmcct == 0)
+                return 0;
 
         if (unlikely(ktime_to_ns(now) <=
                 ktime_to_ns(apic->timer.last_update))) {
···
 
         counter_passed = div64_64(ktime_to_ns(passed),
                                 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
-        tmcct -= counter_passed;
 
-        if (tmcct <= 0) {
-                if (unlikely(!apic_lvtt_period(apic)))
+        if (counter_passed > tmcct) {
+                if (unlikely(!apic_lvtt_period(apic))) {
+                        /* one-shot timers stick at 0 until reset */
                         tmcct = 0;
-                else
-                        do {
-                                tmcct += apic_get_reg(apic, APIC_TMICT);
-                        } while (tmcct <= 0);
+                } else {
+                        /*
+                         * periodic timers reset to APIC_TMICT when they
+                         * hit 0. The while loop simulates this happening N
+                         * times. (counter_passed %= tmcct) would also work,
+                         * but might be slower or not work on 32-bit??
+                         */
+                        while (counter_passed > tmcct)
+                                counter_passed -= tmcct;
+                        tmcct -= counter_passed;
+                }
+        } else {
+                tmcct -= counter_passed;
         }
 
         return tmcct;
···
                 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
                 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
         }
-        apic->timer.divide_count = 0;
+        update_divide_count(apic);
         atomic_set(&apic->timer.pending, 0);
         if (vcpu->vcpu_id == 0)
                 vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
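The new comment wonders whether (counter_passed %= tmcct) would do instead of the while loop. Since counter_passed is u64, a plain '%' on 32-bit hosts would pull in libgcc's 64-bit division; the kernel idiom is do_div() from <asm/div64.h>. A hypothetical sketch, not part of the patch:

    /*
     * do_div(n, base) divides the u64 n by the u32 base in place (n becomes
     * the quotient) and returns the u32 remainder.  Boundary caveat: when
     * counter_passed is an exact multiple of tmcct this reports a freshly
     * reloaded count (tmcct), whereas the '>' loop above reports 0.
     */
    static u32 wrapped_periodic_count(u64 counter_passed, u32 tmcct)
    {
            u32 rem = do_div(counter_passed, tmcct);

            return tmcct - rem;
    }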

drivers/kvm/mmu.c | +2 -1
···
         destroy_kvm_mmu(vcpu);
         return init_kvm_mmu(vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
···
                         mmu_page_remove_parent_pte(child, spte);
                 }
         }
-        *spte = 0;
+        set_shadow_pte(spte, 0);
         kvm_flush_remote_tlbs(vcpu->kvm);
 }
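The mmu_pte_write_zap_pte() hunk matters on 32-bit PAE hosts, where a shadow pte is 64 bits wide and a plain *spte = 0 compiles to two 32-bit stores, letting another CPU walk a half-zeroed entry. set_shadow_pte() is presumably a set_64bit()-style wrapper along these lines (a sketch, not the file's actual body):

    /*
     * Sketch of the assumed helper: make the 64-bit store single-copy atomic
     * even on 32-bit hosts (cmpxchg8b-based set_64bit()); on x86-64 an
     * aligned 64-bit write is already atomic.
     */
    static void set_shadow_pte(u64 *sptep, u64 spte)
    {
    #ifdef CONFIG_X86_64
            set_64bit((unsigned long *)sptep, spte);
    #else
            set_64bit((unsigned long long *)sptep, spte);
    #endif
    }

The EXPORT_SYMBOL_GPL() goes with the vmx.c hunks below: kvm_mmu_reset_context() is now called from real-mode entry in the separately built kvm-intel module.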

drivers/kvm/vmx.c | +12 -4
···
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+        if (vcpu->rmode.active)
+                rflags |= IOPL_MASK | X86_EFLAGS_VM;
         vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
···
         fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
         fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
 
+        kvm_mmu_reset_context(vcpu);
         init_rmode_tss(vcpu->kvm);
 }
 
···
                 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
         }
 
-        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
-                asm ("int $2");
-                return 1;
-        }
+        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+                return 1;  /* already handled by vmx_vcpu_run() */
 
         if (is_no_device(intr_info)) {
                 vmx_fpu_activate(vcpu);
···
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
+        u32 intr_info;
 
         /*
          * Loading guest fpu may have cleared host cr0.ts
···
 
         asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
         vmx->launched = 1;
+
+        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+        /* We need to handle NMIs before interrupts are enabled */
+        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+                asm("int $2");
 }
 
 static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
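A note on the magic 0x200 in both NMI hunks: the VM-exit interruption-information field packs the vector into bits 7:0 and the interruption type into bits 10:8 (Intel SDM), and type 2 is NMI, so the masked compare selects exactly the NMI exits. With a hypothetical named constant:

    /* 0x200 unpacked: the interruption type lives in bits 10:8; type 2 == NMI. */
    #define INTR_TYPE_NMI   (2 << 8)        /* illustrative name only */

    if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI)   /* nmi */
            asm("int $2");  /* reflect the NMI into the host handler */

Doing this at the end of vmx_vcpu_run(), while interrupts are still off and preemption is disabled, is the point of the patch; handle_exception() then only has to return 1 for an exit it knows has already been dealt with.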

drivers/kvm/x86_emulate.c | +49 -28
···
         0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
             DstReg | SrcMem16 | ModRM | Mov,
         /* 0xC0 - 0xCF */
-        0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+        0, 0, 0, 0, 0, 0, 0, 0,
         /* 0xD0 - 0xDF */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         /* 0xE0 - 0xEF */
···
         case 0xf0:      /* LOCK */
                 lock_prefix = 1;
                 break;
+        case 0xf2:      /* REPNE/REPNZ */
         case 0xf3:      /* REP/REPE/REPZ */
                 rep_prefix = 1;
-                break;
-        case 0xf2:      /* REPNE/REPNZ */
                 break;
         default:
                 goto done_prefixes;
···
                 if (twobyte && b == 0x01 && modrm_reg == 7)
                         break;
       srcmem_common:
+                /*
+                 * For instructions with a ModR/M byte, switch to register
+                 * access if Mod = 3.
+                 */
+                if ((d & ModRM) && modrm_mod == 3) {
+                        src.type = OP_REG;
+                        break;
+                }
                 src.type = OP_MEM;
                 src.ptr = (unsigned long *)cr2;
                 src.val = 0;
···
                 dst.ptr = (unsigned long *)cr2;
                 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
                 dst.val = 0;
+                /*
+                 * For instructions with a ModR/M byte, switch to register
+                 * access if Mod = 3.
+                 */
+                if ((d & ModRM) && modrm_mod == 3) {
+                        dst.type = OP_REG;
+                        break;
+                }
                 if (d & BitOp) {
                         unsigned long mask = ~(dst.bytes * 8 - 1);
···
         case 0xd2 ... 0xd3:     /* Grp2 */
                 src.val = _regs[VCPU_REGS_RCX];
                 goto grp2;
-        case 0xe8: /* call (near) */ {
-                long int rel;
-                switch (op_bytes) {
-                case 2:
-                        rel = insn_fetch(s16, 2, _eip);
-                        break;
-                case 4:
-                        rel = insn_fetch(s32, 4, _eip);
-                        break;
-                case 8:
-                        rel = insn_fetch(s64, 8, _eip);
-                        break;
-                default:
-                        DPRINTF("Call: Invalid op_bytes\n");
-                        goto cannot_emulate;
-                }
-                src.val = (unsigned long) _eip;
-                JMP_REL(rel);
-                goto push;
-        }
-        case 0xe9: /* jmp rel */
-        case 0xeb: /* jmp rel short */
-                JMP_REL(src.val);
-                no_wb = 1; /* Disable writeback. */
-                break;
         case 0xf6 ... 0xf7:     /* Grp3 */
                 switch (modrm_reg) {
                 case 0 ... 1:   /* test */
···
         case 0xae ... 0xaf:     /* scas */
                 DPRINTF("Urk! I don't handle SCAS.\n");
                 goto cannot_emulate;
+        case 0xe8: /* call (near) */ {
+                long int rel;
+                switch (op_bytes) {
+                case 2:
+                        rel = insn_fetch(s16, 2, _eip);
+                        break;
+                case 4:
+                        rel = insn_fetch(s32, 4, _eip);
+                        break;
+                case 8:
+                        rel = insn_fetch(s64, 8, _eip);
+                        break;
+                default:
+                        DPRINTF("Call: Invalid op_bytes\n");
+                        goto cannot_emulate;
+                }
+                src.val = (unsigned long) _eip;
+                JMP_REL(rel);
+                goto push;
+        }
+        case 0xe9: /* jmp rel */
+        case 0xeb: /* jmp rel short */
+                JMP_REL(src.val);
+                no_wb = 1; /* Disable writeback. */
+                break;
+
 
         }
         goto writeback;
···
         case 0xbe ... 0xbf:     /* movsx */
                 dst.bytes = op_bytes;
                 dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
+                break;
+        case 0xc3:              /* movnti */
+                dst.bytes = op_bytes;
+                dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
                 break;
         }
         goto writeback;
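The two modrm_mod == 3 hunks are the "Mod = 3" fix from the shortlog. ModR/M is mod (bits 7:6) | reg (bits 5:3) | rm (bits 2:0); with mod = 3 the rm field names a register rather than a memory operand, so building an OP_MEM operand at cr2 (the faulting guest address) made the emulator read or write memory for a register-only instruction. A hand-decoded example (worth double-checking against the SDM):

    /*
     * Opcode 0x21 is AND r/m32, r32.  Two encodings of the same opcode:
     *
     *   21 19   ModR/M = 00 011 001 (mod=0, reg=EBX, rm=ECX)
     *           -> and %ebx, (%ecx)   memory destination
     *   21 d9   ModR/M = 11 011 001 (mod=3, reg=EBX, rm=ECX)
     *           -> and %ebx, %ecx     register destination
     *
     * Before the fix both forms were treated as OP_MEM at cr2; with the
     * hunks above, mod = 3 switches src/dst to OP_REG.
     */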

kernel/sched.c | +0 -1
···
 
         if (p->flags & PF_VCPU) {
                 account_guest_time(p, cputime);
-                p->flags &= ~PF_VCPU;
                 return;
         }
 
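The one-line sched.c change only makes sense together with the kvm_main.c hunk above: a timer tick is accounted as guest time when it lands while PF_VCPU is set, and ownership of the flag is assumed to move entirely into KVM's enter/exit helpers (that header change is not part of this diffstat). A sketch of the assumed arrangement:

    /* Illustrative only; the kvm.h side is not shown in this merge. */
    static inline void kvm_guest_enter(void)
    {
            current->flags |= PF_VCPU;      /* ticks now count as guest time */
    }

    static inline void kvm_guest_exit(void)
    {
            current->flags &= ~PF_VCPU;     /* scheduler no longer clears it */
    }

Which is also why kvm_main.c now insists on an instruction plus barrier() between local_irq_enable() and kvm_guest_exit(): the pending tick has to be delivered while PF_VCPU is still set for account_guest_time() to see it.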