Merge commit 'origin/next' into kvm-ppc-next

+6 -19

Documentation/virtual/kvm/api.txt

··· 219 219 single-threaded guest vcpus, it should make all vcpu ids be a multiple 220 220 of the number of vcpus per vcore. 221 221 222 - On powerpc using book3s_hv mode, the vcpus are mapped onto virtual 223 - threads in one or more virtual CPU cores. (This is because the 224 - hardware requires all the hardware threads in a CPU core to be in the 225 - same partition.) The KVM_CAP_PPC_SMT capability indicates the number 226 - of vcpus per virtual core (vcore). The vcore id is obtained by 227 - dividing the vcpu id by the number of vcpus per vcore. The vcpus in a 228 - given vcore will always be in the same physical core as each other 229 - (though that might be a different physical core from time to time). 230 - Userspace can control the threading (SMT) mode of the guest by its 231 - allocation of vcpu ids. For example, if userspace wants 232 - single-threaded guest vcpus, it should make all vcpu ids be a multiple 233 - of the number of vcpus per vcore. 234 - 235 222 For virtual cpus that have been created with S390 user controlled virtual 236 223 machines, the resulting vcpu fd can be memory mapped at page offset 237 224 KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual ··· 861 874 be identical. This allows large pages in the guest to be backed by large 862 875 pages in the host. 863 876 864 - The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which instructs 865 - kvm to keep track of writes to memory within the slot. See KVM_GET_DIRTY_LOG 866 - ioctl. The KVM_CAP_READONLY_MEM capability indicates the availability of the 867 - KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM only 868 - allows read accesses. Writes will be posted to userspace as KVM_EXIT_MMIO 869 - exits. 877 + The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and 878 + KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of 879 + writes to memory within the slot. 
See KVM_GET_DIRTY_LOG ioctl to know how to 880 + use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, 881 + to make a new slot read-only. In this case, writes to this memory will be 882 + posted to userspace as KVM_EXIT_MMIO exits. 870 883 871 884 When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of 872 885 the memory region are automatically reflected into the guest. For example, an

+6

arch/ia64/kvm/lapic.h

··· 27 27 #define kvm_apic_present(x) (true) 28 28 #define kvm_lapic_enabled(x) (true) 29 29 30 + static inline bool kvm_apic_vid_enabled(void) 31 + { 32 + /* IA64 has no apicv supporting, do nothing here */ 33 + return false; 34 + } 35 + 30 36 #endif

+8

arch/s390/kvm/kvm-s390.c

··· 770 770 } else 771 771 prefix = 0; 772 772 773 + /* 774 + * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy 775 + * copying in vcpu load/put. Lets update our copies before we save 776 + * it into the save area 777 + */ 778 + save_fp_regs(&vcpu->arch.guest_fpregs); 779 + save_access_regs(vcpu->run->s.regs.acrs); 780 + 773 781 if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs), 774 782 vcpu->arch.guest_fpregs.fprs, 128, prefix)) 775 783 return -EFAULT;

+14 -11

arch/s390/kvm/kvm-s390.h

··· 67 67 68 68 static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu) 69 69 { 70 - int base2 = vcpu->arch.sie_block->ipb >> 28; 71 - int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); 70 + u32 base2 = vcpu->arch.sie_block->ipb >> 28; 71 + u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); 72 72 73 73 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; 74 74 } ··· 76 76 static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, 77 77 u64 *address1, u64 *address2) 78 78 { 79 - int base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; 80 - int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; 81 - int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12; 82 - int disp2 = vcpu->arch.sie_block->ipb & 0x0fff; 79 + u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; 80 + u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; 81 + u32 base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12; 82 + u32 disp2 = vcpu->arch.sie_block->ipb & 0x0fff; 83 83 84 84 *address1 = (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1; 85 85 *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; ··· 87 87 88 88 static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu) 89 89 { 90 - int base2 = vcpu->arch.sie_block->ipb >> 28; 91 - int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + 90 + u32 base2 = vcpu->arch.sie_block->ipb >> 28; 91 + u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + 92 92 ((vcpu->arch.sie_block->ipb & 0xff00) << 4); 93 + /* The displacement is a 20bit _SIGNED_ value */ 94 + if (disp2 & 0x80000) 95 + disp2+=0xfff00000; 93 96 94 - return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; 97 + return (base2 ? 
vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2; 95 98 } 96 99 97 100 static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu) 98 101 { 99 - int base2 = vcpu->arch.sie_block->ipb >> 28; 100 - int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); 102 + u32 base2 = vcpu->arch.sie_block->ipb >> 28; 103 + u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); 101 104 102 105 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; 103 106 }

+6

arch/x86/include/asm/kvm_host.h

··· 699 699 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 700 700 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 701 701 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 702 + int (*vm_has_apicv)(struct kvm *kvm); 703 + void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); 704 + void (*hwapic_isr_update)(struct kvm *kvm, int isr); 705 + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); 706 + void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); 702 707 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 703 708 int (*get_tdp_level)(void); 704 709 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); ··· 998 993 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 999 994 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 1000 995 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 996 + int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); 1001 997 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1002 998 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1003 999 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);

+20 -1

arch/x86/include/asm/vmx.h

··· 62 62 #define EXIT_REASON_MCE_DURING_VMENTRY 41 63 63 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 64 64 #define EXIT_REASON_APIC_ACCESS 44 65 + #define EXIT_REASON_EOI_INDUCED 45 65 66 #define EXIT_REASON_EPT_VIOLATION 48 66 67 #define EXIT_REASON_EPT_MISCONFIG 49 67 68 #define EXIT_REASON_WBINVD 54 68 69 #define EXIT_REASON_XSETBV 55 70 + #define EXIT_REASON_APIC_WRITE 56 69 71 #define EXIT_REASON_INVPCID 58 70 72 71 73 #define VMX_EXIT_REASONS \ ··· 105 103 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ 106 104 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ 107 105 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ 108 - { EXIT_REASON_WBINVD, "WBINVD" } 106 + { EXIT_REASON_WBINVD, "WBINVD" }, \ 107 + { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ 108 + { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ 109 + { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ 110 + { EXIT_REASON_INVD, "INVD" }, \ 111 + { EXIT_REASON_INVPCID, "INVPCID" } 109 112 110 113 #ifdef __KERNEL__ 111 114 ··· 145 138 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 146 139 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 147 140 #define SECONDARY_EXEC_RDTSCP 0x00000008 141 + #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 148 142 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 149 143 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 150 144 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 145 + #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 146 + #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 151 147 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 152 148 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 153 149 ··· 188 178 GUEST_GS_SELECTOR = 0x0000080a, 189 179 GUEST_LDTR_SELECTOR = 0x0000080c, 190 180 GUEST_TR_SELECTOR = 0x0000080e, 181 + GUEST_INTR_STATUS = 0x00000810, 191 182 HOST_ES_SELECTOR = 0x00000c00, 192 183 HOST_CS_SELECTOR = 0x00000c02, 193 184 HOST_SS_SELECTOR = 0x00000c04, ··· 216 205 APIC_ACCESS_ADDR_HIGH = 0x00002015, 217 206 EPT_POINTER = 
0x0000201a, 218 207 EPT_POINTER_HIGH = 0x0000201b, 208 + EOI_EXIT_BITMAP0 = 0x0000201c, 209 + EOI_EXIT_BITMAP0_HIGH = 0x0000201d, 210 + EOI_EXIT_BITMAP1 = 0x0000201e, 211 + EOI_EXIT_BITMAP1_HIGH = 0x0000201f, 212 + EOI_EXIT_BITMAP2 = 0x00002020, 213 + EOI_EXIT_BITMAP2_HIGH = 0x00002021, 214 + EOI_EXIT_BITMAP3 = 0x00002022, 215 + EOI_EXIT_BITMAP3_HIGH = 0x00002023, 219 216 GUEST_PHYSICAL_ADDRESS = 0x00002400, 220 217 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, 221 218 VMCS_LINK_POINTER = 0x00002800,

+1 -1

arch/x86/kvm/emulate.c

··· 1013 1013 void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); 1014 1014 1015 1015 flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; 1016 - asm("pushq %[flags]; popf; call *%[fastop]" 1016 + asm("push %[flags]; popf; call *%[fastop]" 1017 1017 : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); 1018 1018 return rc; 1019 1019 }

+51 -5

arch/x86/kvm/irq.c

··· 38 38 EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 39 39 40 40 /* 41 + * check if there is pending interrupt from 42 + * non-APIC source without intack. 43 + */ 44 + static int kvm_cpu_has_extint(struct kvm_vcpu *v) 45 + { 46 + if (kvm_apic_accept_pic_intr(v)) 47 + return pic_irqchip(v->kvm)->output; /* PIC */ 48 + else 49 + return 0; 50 + } 51 + 52 + /* 53 + * check if there is injectable interrupt: 54 + * when virtual interrupt delivery enabled, 55 + * interrupt from apic will handled by hardware, 56 + * we don't need to check it here. 57 + */ 58 + int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) 59 + { 60 + if (!irqchip_in_kernel(v->kvm)) 61 + return v->arch.interrupt.pending; 62 + 63 + if (kvm_cpu_has_extint(v)) 64 + return 1; 65 + 66 + if (kvm_apic_vid_enabled(v->kvm)) 67 + return 0; 68 + 69 + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ 70 + } 71 + 72 + /* 41 73 * check if there is pending interrupt without 42 74 * intack. 43 75 */ ··· 78 46 if (!irqchip_in_kernel(v->kvm)) 79 47 return v->arch.interrupt.pending; 80 48 81 - if (kvm_apic_accept_pic_intr(v) && pic_irqchip(v->kvm)->output) 82 - return pic_irqchip(v->kvm)->output; /* PIC */ 49 + if (kvm_cpu_has_extint(v)) 50 + return 1; 83 51 84 52 return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ 85 53 } 86 54 EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); 87 55 88 56 /* 57 + * Read pending interrupt(from non-APIC source) 58 + * vector and intack. 59 + */ 60 + static int kvm_cpu_get_extint(struct kvm_vcpu *v) 61 + { 62 + if (kvm_cpu_has_extint(v)) 63 + return kvm_pic_read_irq(v->kvm); /* PIC */ 64 + return -1; 65 + } 66 + 67 + /* 89 68 * Read pending interrupt vector and intack. 
90 69 */ 91 70 int kvm_cpu_get_interrupt(struct kvm_vcpu *v) 92 71 { 72 + int vector; 73 + 93 74 if (!irqchip_in_kernel(v->kvm)) 94 75 return v->arch.interrupt.nr; 95 76 96 - if (kvm_apic_accept_pic_intr(v) && pic_irqchip(v->kvm)->output) 97 - return kvm_pic_read_irq(v->kvm); /* PIC */ 77 + vector = kvm_cpu_get_extint(v); 78 + 79 + if (kvm_apic_vid_enabled(v->kvm) || vector != -1) 80 + return vector; /* PIC */ 98 81 99 82 return kvm_get_apic_interrupt(v); /* APIC */ 100 83 } 101 - EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); 102 84 103 85 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 104 86 {

+108 -32

arch/x86/kvm/lapic.c

··· 140 140 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ 141 141 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) 142 142 143 - static inline int apic_x2apic_mode(struct kvm_lapic *apic) 144 - { 145 - return apic->vcpu->arch.apic_base & X2APIC_ENABLE; 146 - } 147 - 148 143 static inline int kvm_apic_id(struct kvm_lapic *apic) 149 144 { 150 145 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 151 146 } 152 147 153 - static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) 148 + void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 149 + struct kvm_lapic_irq *irq, 150 + u64 *eoi_exit_bitmap) 154 151 { 155 - u16 cid; 156 - ldr >>= 32 - map->ldr_bits; 157 - cid = (ldr >> map->cid_shift) & map->cid_mask; 152 + struct kvm_lapic **dst; 153 + struct kvm_apic_map *map; 154 + unsigned long bitmap = 1; 155 + int i; 158 156 159 - BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); 157 + rcu_read_lock(); 158 + map = rcu_dereference(vcpu->kvm->arch.apic_map); 160 159 161 - return cid; 162 - } 160 + if (unlikely(!map)) { 161 + __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap); 162 + goto out; 163 + } 163 164 164 - static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) 165 - { 166 - ldr >>= (32 - map->ldr_bits); 167 - return ldr & map->lid_mask; 165 + if (irq->dest_mode == 0) { /* physical mode */ 166 + if (irq->delivery_mode == APIC_DM_LOWEST || 167 + irq->dest_id == 0xff) { 168 + __set_bit(irq->vector, 169 + (unsigned long *)eoi_exit_bitmap); 170 + goto out; 171 + } 172 + dst = &map->phys_map[irq->dest_id & 0xff]; 173 + } else { 174 + u32 mda = irq->dest_id << (32 - map->ldr_bits); 175 + 176 + dst = map->logical_map[apic_cluster_id(map, mda)]; 177 + 178 + bitmap = apic_logical_id(map, mda); 179 + } 180 + 181 + for_each_set_bit(i, &bitmap, 16) { 182 + if (!dst[i]) 183 + continue; 184 + if (dst[i]->vcpu == vcpu) { 185 + __set_bit(irq->vector, 186 + (unsigned long *)eoi_exit_bitmap); 187 + break; 188 + } 189 + } 190 + 191 + out: 192 + 
rcu_read_unlock(); 168 193 } 169 194 170 195 static void recalculate_apic_map(struct kvm *kvm) ··· 255 230 256 231 if (old) 257 232 kfree_rcu(old, rcu); 233 + 234 + kvm_ioapic_make_eoibitmap_request(kvm); 258 235 } 259 236 260 237 static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) ··· 372 345 { 373 346 int result; 374 347 348 + /* 349 + * Note that irr_pending is just a hint. It will be always 350 + * true with virtual interrupt delivery enabled. 351 + */ 375 352 if (!apic->irr_pending) 376 353 return -1; 377 354 ··· 492 461 static inline int apic_find_highest_isr(struct kvm_lapic *apic) 493 462 { 494 463 int result; 464 + 465 + /* Note that isr_count is always 1 with vid enabled */ 495 466 if (!apic->isr_count) 496 467 return -1; 497 468 if (likely(apic->highest_isr_cache != -1)) ··· 773 740 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 774 741 } 775 742 743 + static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) 744 + { 745 + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 746 + kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { 747 + int trigger_mode; 748 + if (apic_test_vector(vector, apic->regs + APIC_TMR)) 749 + trigger_mode = IOAPIC_LEVEL_TRIG; 750 + else 751 + trigger_mode = IOAPIC_EDGE_TRIG; 752 + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 753 + } 754 + } 755 + 776 756 static int apic_set_eoi(struct kvm_lapic *apic) 777 757 { 778 758 int vector = apic_find_highest_isr(apic); ··· 802 756 apic_clear_isr(vector, apic); 803 757 apic_update_ppr(apic); 804 758 805 - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 806 - kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { 807 - int trigger_mode; 808 - if (apic_test_vector(vector, apic->regs + APIC_TMR)) 809 - trigger_mode = IOAPIC_LEVEL_TRIG; 810 - else 811 - trigger_mode = IOAPIC_EDGE_TRIG; 812 - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 813 - } 759 + kvm_ioapic_send_eoi(apic, 
vector); 814 760 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 815 761 return vector; 816 762 } 763 + 764 + /* 765 + * this interface assumes a trap-like exit, which has already finished 766 + * desired side effect including vISR and vPPR update. 767 + */ 768 + void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) 769 + { 770 + struct kvm_lapic *apic = vcpu->arch.apic; 771 + 772 + trace_kvm_eoi(apic, vector); 773 + 774 + kvm_ioapic_send_eoi(apic, vector); 775 + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 776 + } 777 + EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); 817 778 818 779 static void apic_send_ipi(struct kvm_lapic *apic) 819 780 { ··· 1265 1212 } 1266 1213 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); 1267 1214 1215 + /* emulate APIC access in a trap manner */ 1216 + void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) 1217 + { 1218 + u32 val = 0; 1219 + 1220 + /* hw has done the conditional check and inst decode */ 1221 + offset &= 0xff0; 1222 + 1223 + apic_reg_read(vcpu->arch.apic, offset, 4, &val); 1224 + 1225 + /* TODO: optimize to just emulate side effect w/o one more write */ 1226 + apic_reg_write(vcpu->arch.apic, offset, val); 1227 + } 1228 + EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); 1229 + 1268 1230 void kvm_free_lapic(struct kvm_vcpu *vcpu) 1269 1231 { 1270 1232 struct kvm_lapic *apic = vcpu->arch.apic; ··· 1356 1288 1357 1289 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) 1358 1290 { 1291 + u64 old_value = vcpu->arch.apic_base; 1359 1292 struct kvm_lapic *apic = vcpu->arch.apic; 1360 1293 1361 1294 if (!apic) { ··· 1378 1309 value &= ~MSR_IA32_APICBASE_BSP; 1379 1310 1380 1311 vcpu->arch.apic_base = value; 1381 - if (apic_x2apic_mode(apic)) { 1382 - u32 id = kvm_apic_id(apic); 1383 - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); 1384 - kvm_apic_set_ldr(apic, ldr); 1312 + if ((old_value ^ value) & X2APIC_ENABLE) { 1313 + if (value & X2APIC_ENABLE) { 1314 + u32 id = kvm_apic_id(apic); 1315 + u32 ldr = ((id >> 4) 
<< 16) | (1 << (id & 0xf)); 1316 + kvm_apic_set_ldr(apic, ldr); 1317 + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); 1318 + } else 1319 + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); 1385 1320 } 1321 + 1386 1322 apic->base_address = apic->vcpu->arch.apic_base & 1387 1323 MSR_IA32_APICBASE_BASE; 1388 1324 ··· 1433 1359 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); 1434 1360 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1435 1361 } 1436 - apic->irr_pending = false; 1437 - apic->isr_count = 0; 1362 + apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); 1363 + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm); 1438 1364 apic->highest_isr_cache = -1; 1439 1365 update_divide_count(apic); 1440 1366 atomic_set(&apic->lapic_timer.pending, 0); ··· 1649 1575 update_divide_count(apic); 1650 1576 start_apic_timer(apic); 1651 1577 apic->irr_pending = true; 1652 - apic->isr_count = count_vectors(apic->regs + APIC_ISR); 1578 + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? 1579 + 1 : count_vectors(apic->regs + APIC_ISR); 1653 1580 apic->highest_isr_cache = -1; 1581 + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); 1654 1582 kvm_make_request(KVM_REQ_EVENT, vcpu); 1655 1583 } 1656 1584

+34

arch/x86/kvm/lapic.h

··· 64 64 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); 65 65 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); 66 66 67 + void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); 68 + void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector); 69 + 67 70 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); 68 71 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); 69 72 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); ··· 126 123 { 127 124 return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); 128 125 } 126 + 127 + static inline int apic_x2apic_mode(struct kvm_lapic *apic) 128 + { 129 + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; 130 + } 131 + 132 + static inline bool kvm_apic_vid_enabled(struct kvm *kvm) 133 + { 134 + return kvm_x86_ops->vm_has_apicv(kvm); 135 + } 136 + 137 + static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) 138 + { 139 + u16 cid; 140 + ldr >>= 32 - map->ldr_bits; 141 + cid = (ldr >> map->cid_shift) & map->cid_mask; 142 + 143 + BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); 144 + 145 + return cid; 146 + } 147 + 148 + static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) 149 + { 150 + ldr >>= (32 - map->ldr_bits); 151 + return ldr & map->lid_mask; 152 + } 153 + 154 + void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 155 + struct kvm_lapic_irq *irq, 156 + u64 *eoi_bitmap); 129 157 130 158 #endif

+9 -23

arch/x86/kvm/mmu.c

··· 448 448 449 449 static bool spte_is_locklessly_modifiable(u64 spte) 450 450 { 451 - return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); 451 + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == 452 + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); 452 453 } 453 454 454 455 static bool spte_has_volatile_bits(u64 spte) ··· 1461 1460 percpu_counter_add(&kvm_total_used_mmu_pages, nr); 1462 1461 } 1463 1462 1464 - /* 1465 - * Remove the sp from shadow page cache, after call it, 1466 - * we can not find this sp from the cache, and the shadow 1467 - * page table is still valid. 1468 - * It should be under the protection of mmu lock. 1469 - */ 1470 - static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) 1463 + static void kvm_mmu_free_page(struct kvm_mmu_page *sp) 1471 1464 { 1472 1465 ASSERT(is_empty_shadow_page(sp->spt)); 1473 1466 hlist_del(&sp->hash_link); 1474 - if (!sp->role.direct) 1475 - free_page((unsigned long)sp->gfns); 1476 - } 1477 - 1478 - /* 1479 - * Free the shadow page table and the sp, we can do it 1480 - * out of the protection of mmu lock. 
1481 - */ 1482 - static void kvm_mmu_free_page(struct kvm_mmu_page *sp) 1483 - { 1484 1467 list_del(&sp->link); 1485 1468 free_page((unsigned long)sp->spt); 1469 + if (!sp->role.direct) 1470 + free_page((unsigned long)sp->gfns); 1486 1471 kmem_cache_free(mmu_page_header_cache, sp); 1487 1472 } 1488 1473 ··· 2112 2125 do { 2113 2126 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2114 2127 WARN_ON(!sp->role.invalid || sp->root_count); 2115 - kvm_mmu_isolate_page(sp); 2116 2128 kvm_mmu_free_page(sp); 2117 2129 } while (!list_empty(invalid_list)); 2118 2130 } ··· 2313 2327 if (s->role.level != PT_PAGE_TABLE_LEVEL) 2314 2328 return 1; 2315 2329 2316 - if (!need_unsync && !s->unsync) { 2330 + if (!s->unsync) 2317 2331 need_unsync = true; 2318 - } 2319 2332 } 2320 2333 if (need_unsync) 2321 2334 kvm_unsync_pages(vcpu, gfn); ··· 3672 3687 else 3673 3688 r = paging32_init_context(vcpu, context); 3674 3689 3690 + vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); 3675 3691 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3676 3692 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3677 3693 vcpu->arch.mmu.base_role.smep_andnot_wp ··· 3839 3853 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3840 3854 *gpa &= ~(gpa_t)7; 3841 3855 *bytes = 8; 3842 - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); 3856 + r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); 3843 3857 if (r) 3844 3858 gentry = 0; 3845 3859 new = (const u8 *)&gentry; ··· 3993 4007 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3994 4008 & mask.word) && rmap_can_add(vcpu)) 3995 4009 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 3996 - if (!remote_flush && need_remote_flush(entry, *spte)) 4010 + if (need_remote_flush(entry, *spte)) 3997 4011 remote_flush = true; 3998 4012 ++spte; 3999 4013 }

-3

arch/x86/kvm/paging_tmpl.h

··· 409 409 unsigned direct_access, access = gw->pt_access; 410 410 int top_level, emulate = 0; 411 411 412 - if (!is_present_gpte(gw->ptes[gw->level - 1])) 413 - return 0; 414 - 415 412 direct_access = gw->pte_access; 416 413 417 414 top_level = vcpu->arch.mmu.root_level;

+24

arch/x86/kvm/svm.c

··· 3571 3571 set_cr_intercept(svm, INTERCEPT_CR8_WRITE); 3572 3572 } 3573 3573 3574 + static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) 3575 + { 3576 + return; 3577 + } 3578 + 3579 + static int svm_vm_has_apicv(struct kvm *kvm) 3580 + { 3581 + return 0; 3582 + } 3583 + 3584 + static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 3585 + { 3586 + return; 3587 + } 3588 + 3589 + static void svm_hwapic_isr_update(struct kvm *kvm, int isr) 3590 + { 3591 + return; 3592 + } 3593 + 3574 3594 static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3575 3595 { 3576 3596 struct vcpu_svm *svm = to_svm(vcpu); ··· 4310 4290 .enable_nmi_window = enable_nmi_window, 4311 4291 .enable_irq_window = enable_irq_window, 4312 4292 .update_cr8_intercept = update_cr8_intercept, 4293 + .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, 4294 + .vm_has_apicv = svm_vm_has_apicv, 4295 + .load_eoi_exitmap = svm_load_eoi_exitmap, 4296 + .hwapic_isr_update = svm_hwapic_isr_update, 4313 4297 4314 4298 .set_tss_addr = svm_set_tss_addr, 4315 4299 .get_tdp_level = get_npt_level,

+316 -20

arch/x86/kvm/vmx.c

··· 84 84 static bool __read_mostly fasteoi = 1; 85 85 module_param(fasteoi, bool, S_IRUGO); 86 86 87 + static bool __read_mostly enable_apicv_reg_vid = 1; 88 + module_param(enable_apicv_reg_vid, bool, S_IRUGO); 89 + 87 90 /* 88 91 * If nested=1, nested virtualization is supported, i.e., guests may use 89 92 * VMX and be a hypervisor for its own guests. If nested=0, guests may not ··· 643 640 static unsigned long *vmx_io_bitmap_b; 644 641 static unsigned long *vmx_msr_bitmap_legacy; 645 642 static unsigned long *vmx_msr_bitmap_longmode; 643 + static unsigned long *vmx_msr_bitmap_legacy_x2apic; 644 + static unsigned long *vmx_msr_bitmap_longmode_x2apic; 646 645 647 646 static bool cpu_has_load_ia32_efer; 648 647 static bool cpu_has_load_perf_global_ctrl; ··· 767 762 { 768 763 return vmcs_config.cpu_based_2nd_exec_ctrl & 769 764 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 765 + } 766 + 767 + static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) 768 + { 769 + return vmcs_config.cpu_based_2nd_exec_ctrl & 770 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 771 + } 772 + 773 + static inline bool cpu_has_vmx_apic_register_virt(void) 774 + { 775 + return vmcs_config.cpu_based_2nd_exec_ctrl & 776 + SECONDARY_EXEC_APIC_REGISTER_VIRT; 777 + } 778 + 779 + static inline bool cpu_has_vmx_virtual_intr_delivery(void) 780 + { 781 + return vmcs_config.cpu_based_2nd_exec_ctrl & 782 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 770 783 } 771 784 772 785 static inline bool cpu_has_vmx_flexpriority(void) ··· 1844 1821 vmx->guest_msrs[from] = tmp; 1845 1822 } 1846 1823 1824 + static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) 1825 + { 1826 + unsigned long *msr_bitmap; 1827 + 1828 + if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { 1829 + if (is_long_mode(vcpu)) 1830 + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; 1831 + else 1832 + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; 1833 + } else { 1834 + if (is_long_mode(vcpu)) 1835 + msr_bitmap = vmx_msr_bitmap_longmode; 1836 + 
else 1837 + msr_bitmap = vmx_msr_bitmap_legacy; 1838 + } 1839 + 1840 + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); 1841 + } 1842 + 1847 1843 /* 1848 1844 * Set up the vmcs to automatically save and restore system 1849 1845 * msrs. Don't touch the 64-bit msrs if the guest is in legacy ··· 1871 1829 static void setup_msrs(struct vcpu_vmx *vmx) 1872 1830 { 1873 1831 int save_nmsrs, index; 1874 - unsigned long *msr_bitmap; 1875 1832 1876 1833 save_nmsrs = 0; 1877 1834 #ifdef CONFIG_X86_64 ··· 1902 1861 1903 1862 vmx->save_nmsrs = save_nmsrs; 1904 1863 1905 - if (cpu_has_vmx_msr_bitmap()) { 1906 - if (is_long_mode(&vmx->vcpu)) 1907 - msr_bitmap = vmx_msr_bitmap_longmode; 1908 - else 1909 - msr_bitmap = vmx_msr_bitmap_legacy; 1910 - 1911 - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); 1912 - } 1864 + if (cpu_has_vmx_msr_bitmap()) 1865 + vmx_set_msr_bitmap(&vmx->vcpu); 1913 1866 } 1914 1867 1915 1868 /* ··· 2569 2534 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2570 2535 min2 = 0; 2571 2536 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2537 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2572 2538 SECONDARY_EXEC_WBINVD_EXITING | 2573 2539 SECONDARY_EXEC_ENABLE_VPID | 2574 2540 SECONDARY_EXEC_ENABLE_EPT | 2575 2541 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2576 2542 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2577 2543 SECONDARY_EXEC_RDTSCP | 2578 - SECONDARY_EXEC_ENABLE_INVPCID; 2544 + SECONDARY_EXEC_ENABLE_INVPCID | 2545 + SECONDARY_EXEC_APIC_REGISTER_VIRT | 2546 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 2579 2547 if (adjust_vmx_controls(min2, opt2, 2580 2548 MSR_IA32_VMX_PROCBASED_CTLS2, 2581 2549 &_cpu_based_2nd_exec_control) < 0) ··· 2589 2551 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2590 2552 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2591 2553 #endif 2554 + 2555 + if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2556 + _cpu_based_2nd_exec_control &= ~( 2557 + SECONDARY_EXEC_APIC_REGISTER_VIRT | 2558 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2559 + 
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2560 + 2592 2561 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 2593 2562 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 2594 2563 enabled */ ··· 2792 2747 2793 2748 if (!cpu_has_vmx_ple()) 2794 2749 ple_gap = 0; 2750 + 2751 + if (!cpu_has_vmx_apic_register_virt() || 2752 + !cpu_has_vmx_virtual_intr_delivery()) 2753 + enable_apicv_reg_vid = 0; 2754 + 2755 + if (enable_apicv_reg_vid) 2756 + kvm_x86_ops->update_cr8_intercept = NULL; 2757 + else 2758 + kvm_x86_ops->hwapic_irr_update = NULL; 2795 2759 2796 2760 if (nested) 2797 2761 nested_vmx_setup_ctls_msrs(); ··· 3227 3173 if (!is_paging(vcpu)) { 3228 3174 hw_cr4 &= ~X86_CR4_PAE; 3229 3175 hw_cr4 |= X86_CR4_PSE; 3176 + /* 3177 + * SMEP is disabled if CPU is in non-paging mode in 3178 + * hardware. However KVM always uses paging mode to 3179 + * emulate guest non-paging mode with TDP. 3180 + * To emulate this behavior, SMEP needs to be manually 3181 + * disabled when guest switches to non-paging mode. 3182 + */ 3183 + hw_cr4 &= ~X86_CR4_SMEP; 3230 3184 } else if (!(cr4 & X86_CR4_PAE)) { 3231 3185 hw_cr4 &= ~X86_CR4_PAE; 3232 3186 } ··· 3769 3707 spin_unlock(&vmx_vpid_lock); 3770 3708 } 3771 3709 3772 - static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) 3710 + #define MSR_TYPE_R 1 3711 + #define MSR_TYPE_W 2 3712 + static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, 3713 + u32 msr, int type) 3773 3714 { 3774 3715 int f = sizeof(unsigned long); 3775 3716 ··· 3785 3720 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
3786 3721 */ 3787 3722 if (msr <= 0x1fff) { 3788 - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ 3789 - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ 3723 + if (type & MSR_TYPE_R) 3724 + /* read-low */ 3725 + __clear_bit(msr, msr_bitmap + 0x000 / f); 3726 + 3727 + if (type & MSR_TYPE_W) 3728 + /* write-low */ 3729 + __clear_bit(msr, msr_bitmap + 0x800 / f); 3730 + 3790 3731 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 3791 3732 msr &= 0x1fff; 3792 - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ 3793 - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ 3733 + if (type & MSR_TYPE_R) 3734 + /* read-high */ 3735 + __clear_bit(msr, msr_bitmap + 0x400 / f); 3736 + 3737 + if (type & MSR_TYPE_W) 3738 + /* write-high */ 3739 + __clear_bit(msr, msr_bitmap + 0xc00 / f); 3740 + 3741 + } 3742 + } 3743 + 3744 + static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, 3745 + u32 msr, int type) 3746 + { 3747 + int f = sizeof(unsigned long); 3748 + 3749 + if (!cpu_has_vmx_msr_bitmap()) 3750 + return; 3751 + 3752 + /* 3753 + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 3754 + * have the write-low and read-high bitmap offsets the wrong way round. 3755 + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
3756 + */ 3757 + if (msr <= 0x1fff) { 3758 + if (type & MSR_TYPE_R) 3759 + /* read-low */ 3760 + __set_bit(msr, msr_bitmap + 0x000 / f); 3761 + 3762 + if (type & MSR_TYPE_W) 3763 + /* write-low */ 3764 + __set_bit(msr, msr_bitmap + 0x800 / f); 3765 + 3766 + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 3767 + msr &= 0x1fff; 3768 + if (type & MSR_TYPE_R) 3769 + /* read-high */ 3770 + __set_bit(msr, msr_bitmap + 0x400 / f); 3771 + 3772 + if (type & MSR_TYPE_W) 3773 + /* write-high */ 3774 + __set_bit(msr, msr_bitmap + 0xc00 / f); 3775 + 3794 3776 } 3795 3777 } 3796 3778 3797 3779 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 3798 3780 { 3799 3781 if (!longmode_only) 3800 - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); 3801 - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); 3782 + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, 3783 + msr, MSR_TYPE_R | MSR_TYPE_W); 3784 + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, 3785 + msr, MSR_TYPE_R | MSR_TYPE_W); 3786 + } 3787 + 3788 + static void vmx_enable_intercept_msr_read_x2apic(u32 msr) 3789 + { 3790 + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 3791 + msr, MSR_TYPE_R); 3792 + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 3793 + msr, MSR_TYPE_R); 3794 + } 3795 + 3796 + static void vmx_disable_intercept_msr_read_x2apic(u32 msr) 3797 + { 3798 + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 3799 + msr, MSR_TYPE_R); 3800 + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 3801 + msr, MSR_TYPE_R); 3802 + } 3803 + 3804 + static void vmx_disable_intercept_msr_write_x2apic(u32 msr) 3805 + { 3806 + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 3807 + msr, MSR_TYPE_W); 3808 + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 3809 + msr, MSR_TYPE_W); 3802 3810 } 3803 3811 3804 3812 /* ··· 3950 3812 return exec_control; 3951 3813 } 3952 3814 3815 + static int 
vmx_vm_has_apicv(struct kvm *kvm) 3816 + { 3817 + return enable_apicv_reg_vid && irqchip_in_kernel(kvm); 3818 + } 3819 + 3953 3820 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 3954 3821 { 3955 3822 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; ··· 3972 3829 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 3973 3830 if (!ple_gap) 3974 3831 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 3832 + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 3833 + exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 3834 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 3835 + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 3975 3836 return exec_control; 3976 3837 } 3977 3838 ··· 4018 3871 if (cpu_has_secondary_exec_ctrls()) { 4019 3872 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 4020 3873 vmx_secondary_exec_control(vmx)); 3874 + } 3875 + 3876 + if (enable_apicv_reg_vid) { 3877 + vmcs_write64(EOI_EXIT_BITMAP0, 0); 3878 + vmcs_write64(EOI_EXIT_BITMAP1, 0); 3879 + vmcs_write64(EOI_EXIT_BITMAP2, 0); 3880 + vmcs_write64(EOI_EXIT_BITMAP3, 0); 3881 + 3882 + vmcs_write16(GUEST_INTR_STATUS, 0); 4021 3883 } 4022 3884 4023 3885 if (ple_gap) { ··· 4941 4785 } 4942 4786 } 4943 4787 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4788 + } 4789 + 4790 + static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 4791 + { 4792 + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4793 + int vector = exit_qualification & 0xff; 4794 + 4795 + /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 4796 + kvm_apic_set_eoi_accelerated(vcpu, vector); 4797 + return 1; 4798 + } 4799 + 4800 + static int handle_apic_write(struct kvm_vcpu *vcpu) 4801 + { 4802 + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4803 + u32 offset = exit_qualification & 0xfff; 4804 + 4805 + /* APIC-write VM exit is trap-like and thus no need to adjust IP */ 4806 + kvm_apic_write_nodecode(vcpu, offset); 4807 + return 1; 4944 4808 } 4945 4809 4946 4810 static int 
handle_task_switch(struct kvm_vcpu *vcpu) ··· 5897 5721 [EXIT_REASON_VMON] = handle_vmon, 5898 5722 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5899 5723 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5724 + [EXIT_REASON_APIC_WRITE] = handle_apic_write, 5725 + [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 5900 5726 [EXIT_REASON_WBINVD] = handle_wbinvd, 5901 5727 [EXIT_REASON_XSETBV] = handle_xsetbv, 5902 5728 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, ··· 6246 6068 } 6247 6069 6248 6070 vmcs_write32(TPR_THRESHOLD, irr); 6071 + } 6072 + 6073 + static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) 6074 + { 6075 + u32 sec_exec_control; 6076 + 6077 + /* 6078 + * There is no point in enabling virtualized x2apic mode without 6079 + * enabling apicv. 6080 + */ 6081 + if (!cpu_has_vmx_virtualize_x2apic_mode() || 6082 + !vmx_vm_has_apicv(vcpu->kvm)) 6083 + return; 6084 + 6085 + if (!vm_need_tpr_shadow(vcpu->kvm)) 6086 + return; 6087 + 6088 + sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6089 + 6090 + if (set) { 6091 + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6092 + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6093 + } else { 6094 + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6095 + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6096 + } 6097 + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); 6098 + 6099 + vmx_set_msr_bitmap(vcpu); 6100 + } 6101 + 6102 + static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) 6103 + { 6104 + u16 status; 6105 + u8 old; 6106 + 6107 + if (!vmx_vm_has_apicv(kvm)) 6108 + return; 6109 + 6110 + if (isr == -1) 6111 + isr = 0; 6112 + 6113 + status = vmcs_read16(GUEST_INTR_STATUS); 6114 + old = status >> 8; 6115 + if (isr != old) { 6116 + status &= 0xff; 6117 + status |= isr << 8; 6118 + vmcs_write16(GUEST_INTR_STATUS, status); 6119 + } 6120 + } 6121 + 6122 + static void vmx_set_rvi(int vector) 6123 + { 6124 + u16
status; 6125 + u8 old; 6126 + 6127 + status = vmcs_read16(GUEST_INTR_STATUS); 6128 + old = (u8)status & 0xff; 6129 + if ((u8)vector != old) { 6130 + status &= ~0xff; 6131 + status |= (u8)vector; 6132 + vmcs_write16(GUEST_INTR_STATUS, status); 6133 + } 6134 + } 6135 + 6136 + static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 6137 + { 6138 + if (max_irr == -1) 6139 + return; 6140 + 6141 + vmx_set_rvi(max_irr); 6142 + } 6143 + 6144 + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6145 + { 6146 + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6147 + vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6148 + vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6149 + vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6249 6150 } 6250 6151 6251 6152 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) ··· 7590 7333 .enable_nmi_window = enable_nmi_window, 7591 7334 .enable_irq_window = enable_irq_window, 7592 7335 .update_cr8_intercept = update_cr8_intercept, 7336 + .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, 7337 + .vm_has_apicv = vmx_vm_has_apicv, 7338 + .load_eoi_exitmap = vmx_load_eoi_exitmap, 7339 + .hwapic_irr_update = vmx_hwapic_irr_update, 7340 + .hwapic_isr_update = vmx_hwapic_isr_update, 7593 7341 7594 7342 .set_tss_addr = vmx_set_tss_addr, 7595 7343 .get_tdp_level = get_ept_level, ··· 7627 7365 7628 7366 static int __init vmx_init(void) 7629 7367 { 7630 - int r, i; 7368 + int r, i, msr; 7631 7369 7632 7370 rdmsrl_safe(MSR_EFER, &host_efer); 7633 7371 ··· 7648 7386 if (!vmx_msr_bitmap_legacy) 7649 7387 goto out1; 7650 7388 7389 + vmx_msr_bitmap_legacy_x2apic = 7390 + (unsigned long *)__get_free_page(GFP_KERNEL); 7391 + if (!vmx_msr_bitmap_legacy_x2apic) 7392 + goto out2; 7651 7393 7652 7394 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7653 7395 if (!vmx_msr_bitmap_longmode) 7654 - goto out2; 7396 + goto out3; 7655 7397 7398 + vmx_msr_bitmap_longmode_x2apic = 7399 
+ (unsigned long *)__get_free_page(GFP_KERNEL); 7400 + if (!vmx_msr_bitmap_longmode_x2apic) 7401 + goto out4; 7656 7402 7657 7403 /* 7658 7404 * Allow direct access to the PC debug port (it is often used for I/O ··· 7692 7422 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); 7693 7423 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); 7694 7424 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7425 + memcpy(vmx_msr_bitmap_legacy_x2apic, 7426 + vmx_msr_bitmap_legacy, PAGE_SIZE); 7427 + memcpy(vmx_msr_bitmap_longmode_x2apic, 7428 + vmx_msr_bitmap_longmode, PAGE_SIZE); 7429 + 7430 + if (enable_apicv_reg_vid) { 7431 + for (msr = 0x800; msr <= 0x8ff; msr++) 7432 + vmx_disable_intercept_msr_read_x2apic(msr); 7433 + 7434 + /* According to the SDM, in x2apic mode the whole ID register is used. 7435 + * But KVM only uses the highest eight bits, so reads of it need to 7436 + * be intercepted. */ 7437 + vmx_enable_intercept_msr_read_x2apic(0x802); 7438 + /* TMCCT */ 7439 + vmx_enable_intercept_msr_read_x2apic(0x839); 7440 + /* TPR */ 7441 + vmx_disable_intercept_msr_write_x2apic(0x808); 7442 + /* EOI */ 7443 + vmx_disable_intercept_msr_write_x2apic(0x80b); 7444 + /* SELF-IPI */ 7445 + vmx_disable_intercept_msr_write_x2apic(0x83f); 7446 + } 7695 7447 7696 7448 if (enable_ept) { 7697 7449 kvm_mmu_set_mask_ptes(0ull, ··· 7727 7435 7728 7436 return 0; 7729 7437 7730 - out3: 7438 + out4: 7731 7439 free_page((unsigned long)vmx_msr_bitmap_longmode); 7440 + out3: 7441 + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 7732 7442 out2: 7733 7443 free_page((unsigned long)vmx_msr_bitmap_legacy); 7734 7444 out1: ··· 7742 7448 7743 7449 static void __exit vmx_exit(void) 7744 7450 { 7451 + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 7452 + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 7745 7453 free_page((unsigned long)vmx_msr_bitmap_legacy); 7746 7454 free_page((unsigned long)vmx_msr_bitmap_longmode); 7747 7455 free_page((unsigned
long)vmx_io_bitmap_b);

+21 -4

arch/x86/kvm/x86.c

··· 870 870 871 871 kvm_x86_ops->set_efer(vcpu, efer); 872 872 873 - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 874 - 875 873 /* Update reserved bits */ 876 874 if ((efer ^ old_efer) & EFER_NX) 877 875 kvm_mmu_reset_context(vcpu); ··· 5563 5565 vcpu->arch.nmi_injected = true; 5564 5566 kvm_x86_ops->set_nmi(vcpu); 5565 5567 } 5566 - } else if (kvm_cpu_has_interrupt(vcpu)) { 5568 + } else if (kvm_cpu_has_injectable_intr(vcpu)) { 5567 5569 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 5568 5570 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 5569 5571 false); ··· 5631 5633 #endif 5632 5634 } 5633 5635 5636 + static void update_eoi_exitmap(struct kvm_vcpu *vcpu) 5637 + { 5638 + u64 eoi_exit_bitmap[4]; 5639 + 5640 + memset(eoi_exit_bitmap, 0, 32); 5641 + 5642 + kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap); 5643 + kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); 5644 + } 5645 + 5634 5646 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5635 5647 { 5636 5648 int r; ··· 5694 5686 kvm_handle_pmu_event(vcpu); 5695 5687 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 5696 5688 kvm_deliver_pmi(vcpu); 5689 + if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) 5690 + update_eoi_exitmap(vcpu); 5697 5691 } 5698 5692 5699 5693 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { ··· 5704 5694 /* enable NMI/IRQ window open exits if needed */ 5705 5695 if (vcpu->arch.nmi_pending) 5706 5696 kvm_x86_ops->enable_nmi_window(vcpu); 5707 - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5697 + else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) 5708 5698 kvm_x86_ops->enable_irq_window(vcpu); 5709 5699 5710 5700 if (kvm_lapic_enabled(vcpu)) { 5701 + /* 5702 + * Update architecture specific hints for APIC 5703 + * virtual interrupt delivery. 
5704 + */ 5705 + if (kvm_x86_ops->hwapic_irr_update) 5706 + kvm_x86_ops->hwapic_irr_update(vcpu, 5707 + kvm_lapic_find_highest_irr(vcpu)); 5711 5708 update_cr8_intercept(vcpu); 5712 5709 kvm_lapic_sync_to_vapic(vcpu); 5713 5710 }

+11 -9

drivers/s390/kvm/virtio_ccw.c

··· 244 244 { 245 245 struct virtio_ccw_device *vcdev = to_vc_device(vdev); 246 246 int err; 247 - struct virtqueue *vq; 247 + struct virtqueue *vq = NULL; 248 248 struct virtio_ccw_vq_info *info; 249 - unsigned long size; 249 + unsigned long size = 0; /* silence the compiler */ 250 250 unsigned long flags; 251 251 252 252 /* Allocate queue. */ ··· 279 279 /* For now, we fail if we can't get the requested size. */ 280 280 dev_warn(&vcdev->cdev->dev, "no vq\n"); 281 281 err = -ENOMEM; 282 - free_pages_exact(info->queue, size); 283 282 goto out_err; 284 283 } 285 - info->vq = vq; 286 - vq->priv = info; 287 284 288 285 /* Register it with the host. */ 289 286 info->info_block->queue = (__u64)info->queue; ··· 294 297 err = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_VQ | i); 295 298 if (err) { 296 299 dev_warn(&vcdev->cdev->dev, "SET_VQ failed\n"); 297 - free_pages_exact(info->queue, size); 298 - info->vq = NULL; 299 - vq->priv = NULL; 300 300 goto out_err; 301 301 } 302 + 303 + info->vq = vq; 304 + vq->priv = info; 302 305 303 306 /* Save it to our list. */ 304 307 spin_lock_irqsave(&vcdev->lock, flags); ··· 308 311 return vq; 309 312 310 313 out_err: 311 - if (info) 314 + if (vq) 315 + vring_del_virtqueue(vq); 316 + if (info) { 317 + if (info->queue) 318 + free_pages_exact(info->queue, size); 312 319 kfree(info->info_block); 320 + } 313 321 kfree(info); 314 322 return ERR_PTR(err); 315 323 }

+3

include/linux/kvm_host.h

··· 123 123 #define KVM_REQ_MASTERCLOCK_UPDATE 19 124 124 #define KVM_REQ_MCLOCK_INPROGRESS 20 125 125 #define KVM_REQ_EPR_EXIT 21 126 + #define KVM_REQ_EOIBITMAP 22 126 127 127 128 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 128 129 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 ··· 539 538 void kvm_flush_remote_tlbs(struct kvm *kvm); 540 539 void kvm_reload_remote_mmus(struct kvm *kvm); 541 540 void kvm_make_mclock_inprogress_request(struct kvm *kvm); 541 + void kvm_make_update_eoibitmap_request(struct kvm *kvm); 542 542 543 543 long kvm_arch_dev_ioctl(struct file *filp, 544 544 unsigned int ioctl, unsigned long arg); ··· 693 691 int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); 694 692 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, 695 693 int irq_source_id, int level); 694 + bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); 696 695 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 697 696 void kvm_register_irq_ack_notifier(struct kvm *kvm, 698 697 struct kvm_irq_ack_notifier *kian);

+19 -6

kernel/sched/core.c

··· 4316 4316 * It's the caller's job to ensure that the target task struct 4317 4317 * can't go away on us before we can do any checks. 4318 4318 * 4319 - * Returns true if we indeed boosted the target task. 4319 + * Returns: 4320 + * true (>0) if we indeed boosted the target task. 4321 + * false (0) if we failed to boost the target. 4322 + * -ESRCH if there's no task to yield to. 4320 4323 */ 4321 4324 bool __sched yield_to(struct task_struct *p, bool preempt) 4322 4325 { ··· 4333 4330 4334 4331 again: 4335 4332 p_rq = task_rq(p); 4333 + /* 4334 + * If we're the only runnable task on the rq and target rq also 4335 + * has only one task, there's absolutely no point in yielding. 4336 + */ 4337 + if (rq->nr_running == 1 && p_rq->nr_running == 1) { 4338 + yielded = -ESRCH; 4339 + goto out_irq; 4340 + } 4341 + 4336 4342 double_rq_lock(rq, p_rq); 4337 4343 while (task_rq(p) != p_rq) { 4338 4344 double_rq_unlock(rq, p_rq); ··· 4349 4337 } 4350 4338 4351 4339 if (!curr->sched_class->yield_to_task) 4352 - goto out; 4340 + goto out_unlock; 4353 4341 4354 4342 if (curr->sched_class != p->sched_class) 4355 - goto out; 4343 + goto out_unlock; 4356 4344 4357 4345 if (task_running(p_rq, p) || p->state) 4358 - goto out; 4346 + goto out_unlock; 4359 4347 4360 4348 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4361 4349 if (yielded) { ··· 4368 4356 resched_task(p_rq->curr); 4369 4357 } 4370 4358 4371 - out: 4359 + out_unlock: 4372 4360 double_rq_unlock(rq, p_rq); 4361 + out_irq: 4373 4362 local_irq_restore(flags); 4374 4363 4375 - if (yielded) 4364 + if (yielded > 0) 4376 4365 schedule(); 4377 4366 4378 4367 return yielded;

+39

virt/kvm/ioapic.c

··· 35 35 #include <linux/hrtimer.h> 36 36 #include <linux/io.h> 37 37 #include <linux/slab.h> 38 + #include <linux/export.h> 38 39 #include <asm/processor.h> 39 40 #include <asm/page.h> 40 41 #include <asm/current.h> ··· 116 115 smp_wmb(); 117 116 } 118 117 118 + void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 119 + u64 *eoi_exit_bitmap) 120 + { 121 + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; 122 + union kvm_ioapic_redirect_entry *e; 123 + struct kvm_lapic_irq irqe; 124 + int index; 125 + 126 + spin_lock(&ioapic->lock); 127 + /* traverse the ioapic entries to set the eoi exit bitmap */ 128 + for (index = 0; index < IOAPIC_NUM_PINS; index++) { 129 + e = &ioapic->redirtbl[index]; 130 + if (!e->fields.mask && 131 + (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || 132 + kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, 133 + index))) { 134 + irqe.dest_id = e->fields.dest_id; 135 + irqe.vector = e->fields.vector; 136 + irqe.dest_mode = e->fields.dest_mode; 137 + irqe.delivery_mode = e->fields.delivery_mode << 8; 138 + kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap); 139 + } 140 + } 141 + spin_unlock(&ioapic->lock); 142 + } 143 + EXPORT_SYMBOL_GPL(kvm_ioapic_calculate_eoi_exitmap); 144 + 145 + void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm) 146 + { 147 + struct kvm_ioapic *ioapic = kvm->arch.vioapic; 148 + 149 + if (!kvm_apic_vid_enabled(kvm) || !ioapic) 150 + return; 151 + kvm_make_update_eoibitmap_request(kvm); 152 + } 153 + 119 154 static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) 120 155 { 121 156 unsigned index; ··· 193 156 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG 194 157 && ioapic->irr & (1 << index)) 195 158 ioapic_service(ioapic, index); 159 + kvm_ioapic_make_eoibitmap_request(ioapic->kvm); 196 160 break; 197 161 } 198 162 } ··· 493 455 spin_lock(&ioapic->lock); 494 456 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); 495 457 update_handled_vectors(ioapic); 458 + kvm_ioapic_make_eoibitmap_request(kvm);
496 459 spin_unlock(&ioapic->lock); 497 460 return 0; 498 461 }

+4

virt/kvm/ioapic.h

··· 82 82 struct kvm_lapic_irq *irq); 83 83 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 84 84 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 85 + void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm); 86 + void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 87 + u64 *eoi_exit_bitmap); 88 + 85 89 86 90 #endif

+3 -1

virt/kvm/iommu.c

··· 76 76 gfn = slot->base_gfn; 77 77 end_gfn = gfn + slot->npages; 78 78 79 - flags = IOMMU_READ | IOMMU_WRITE; 79 + flags = IOMMU_READ; 80 + if (!(slot->flags & KVM_MEM_READONLY)) 81 + flags |= IOMMU_WRITE; 80 82 if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) 81 83 flags |= IOMMU_CACHE; 82 84

+25

virt/kvm/irq_comm.c

··· 22 22 23 23 #include <linux/kvm_host.h> 24 24 #include <linux/slab.h> 25 + #include <linux/export.h> 25 26 #include <trace/events/kvm.h> 26 27 27 28 #include <asm/msidef.h> ··· 238 237 return ret; 239 238 } 240 239 240 + bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) 241 + { 242 + struct kvm_irq_ack_notifier *kian; 243 + struct hlist_node *n; 244 + int gsi; 245 + 246 + rcu_read_lock(); 247 + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; 248 + if (gsi != -1) 249 + hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list, 250 + link) 251 + if (kian->gsi == gsi) { 252 + rcu_read_unlock(); 253 + return true; 254 + } 255 + 256 + rcu_read_unlock(); 257 + 258 + return false; 259 + } 260 + EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); 261 + 241 262 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 242 263 { 243 264 struct kvm_irq_ack_notifier *kian; ··· 284 261 mutex_lock(&kvm->irq_lock); 285 262 hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); 286 263 mutex_unlock(&kvm->irq_lock); 264 + kvm_ioapic_make_eoibitmap_request(kvm); 287 265 } 288 266 289 267 void kvm_unregister_irq_ack_notifier(struct kvm *kvm, ··· 294 270 hlist_del_init_rcu(&kian->link); 295 271 mutex_unlock(&kvm->irq_lock); 296 272 synchronize_rcu(); 273 + kvm_ioapic_make_eoibitmap_request(kvm); 297 274 } 298 275 299 276 int kvm_request_irq_source_id(struct kvm *kvm)

+75 -31

virt/kvm/kvm_main.c

··· 217 217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 218 218 } 219 219 220 + void kvm_make_update_eoibitmap_request(struct kvm *kvm) 221 + { 222 + make_all_cpus_request(kvm, KVM_REQ_EOIBITMAP); 223 + } 224 + 220 225 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 221 226 { 222 227 struct page *page; ··· 719 714 } 720 715 721 716 /* 717 + * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: 718 + * - create a new memory slot 719 + * - delete an existing memory slot 720 + * - modify an existing memory slot 721 + * -- move it in the guest physical memory space 722 + * -- just change its flags 723 + * 724 + * Since flags can be changed by some of these operations, the following 725 + * differentiation is the best we can do for __kvm_set_memory_region(): 726 + */ 727 + enum kvm_mr_change { 728 + KVM_MR_CREATE, 729 + KVM_MR_DELETE, 730 + KVM_MR_MOVE, 731 + KVM_MR_FLAGS_ONLY, 732 + }; 733 + 734 + /* 722 735 * Allocate some memory and give it an address in the guest physical address 723 736 * space. 724 737 * ··· 754 731 struct kvm_memory_slot *slot; 755 732 struct kvm_memory_slot old, new; 756 733 struct kvm_memslots *slots = NULL, *old_memslots; 734 + enum kvm_mr_change change; 757 735 758 736 r = check_memory_region_flags(mem); 759 737 if (r) ··· 796 772 new.npages = npages; 797 773 new.flags = mem->flags; 798 774 799 - /* 800 - * Disallow changing a memory slot's size or changing anything about 801 - * zero sized slots that doesn't involve making them non-zero. 802 - */ 803 775 r = -EINVAL; 804 - if (npages && old.npages && npages != old.npages) 805 - goto out; 806 - if (!npages && !old.npages) 776 + if (npages) { 777 + if (!old.npages) 778 + change = KVM_MR_CREATE; 779 + else { /* Modify an existing slot. 
*/ 780 + if ((mem->userspace_addr != old.userspace_addr) || 781 + (npages != old.npages) || 782 + ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 783 + goto out; 784 + 785 + if (base_gfn != old.base_gfn) 786 + change = KVM_MR_MOVE; 787 + else if (new.flags != old.flags) 788 + change = KVM_MR_FLAGS_ONLY; 789 + else { /* Nothing to change. */ 790 + r = 0; 791 + goto out; 792 + } 793 + } 794 + } else if (old.npages) { 795 + change = KVM_MR_DELETE; 796 + } else /* Modify a non-existent slot: disallowed. */ 807 797 goto out; 808 798 809 - if ((npages && !old.npages) || (base_gfn != old.base_gfn)) { 799 + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 810 800 /* Check for overlaps */ 811 801 r = -EEXIST; 812 802 kvm_for_each_memslot(slot, kvm->memslots) { ··· 838 800 new.dirty_bitmap = NULL; 839 801 840 802 r = -ENOMEM; 841 - 842 - /* 843 - * Allocate if a slot is being created. If modifying a slot, 844 - * the userspace_addr cannot change. 845 - */ 846 - if (!old.npages) { 803 + if (change == KVM_MR_CREATE) { 847 804 new.user_alloc = user_alloc; 848 805 new.userspace_addr = mem->userspace_addr; 849 806 850 807 if (kvm_arch_create_memslot(&new, npages)) 851 808 goto out_free; 852 - } else if (npages && mem->userspace_addr != old.userspace_addr) { 853 - r = -EINVAL; 854 - goto out_free; 855 809 } 856 810 857 811 /* Allocate page dirty bitmap if needed */ ··· 852 822 goto out_free; 853 823 } 854 824 855 - if (!npages || base_gfn != old.base_gfn) { 825 + if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 856 826 r = -ENOMEM; 857 827 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 858 828 GFP_KERNEL); ··· 893 863 goto out_free; 894 864 } 895 865 896 - /* map new memory slot into the iommu */ 897 - if (npages) { 866 + /* 867 + * IOMMU mapping: New slots need to be mapped. Old slots need to be 868 + * un-mapped and re-mapped if their base changes. 
Since base change 869 + * unmapping is handled above with slot deletion, mapping alone is 870 + * needed here. Anything else the iommu might care about for existing 871 + * slots (size changes, userspace addr changes and read-only flag 872 + * changes) is disallowed above, so any other attribute changes getting 873 + * here can be skipped. 874 + */ 875 + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 898 876 r = kvm_iommu_map_pages(kvm, &new); 899 877 if (r) 900 878 goto out_slots; 901 879 } 902 880 903 881 /* actual memory is freed via old in kvm_free_physmem_slot below */ 904 - if (!npages) { 882 + if (change == KVM_MR_DELETE) { 905 883 new.dirty_bitmap = NULL; 906 884 memset(&new.arch, 0, sizeof(new.arch)); 907 885 } ··· 1707 1669 { 1708 1670 struct pid *pid; 1709 1671 struct task_struct *task = NULL; 1672 + bool ret = false; 1710 1673 1711 1674 rcu_read_lock(); 1712 1675 pid = rcu_dereference(target->pid); ··· 1715 1676 task = get_pid_task(target->pid, PIDTYPE_PID); 1716 1677 rcu_read_unlock(); 1717 1678 if (!task) 1718 - return false; 1679 + return ret; 1719 1680 if (task->flags & PF_VCPU) { 1720 1681 put_task_struct(task); 1721 - return false; 1682 + return ret; 1722 1683 } 1723 - if (yield_to(task, 1)) { 1724 - put_task_struct(task); 1725 - return true; 1726 - } 1684 + ret = yield_to(task, 1); 1727 1685 put_task_struct(task); 1728 - return false; 1686 + 1687 + return ret; 1729 1688 } 1730 1689 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 1731 1690 ··· 1764 1727 return eligible; 1765 1728 } 1766 1729 #endif 1730 + 1767 1731 void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1768 1732 { 1769 1733 struct kvm *kvm = me->kvm; 1770 1734 struct kvm_vcpu *vcpu; 1771 1735 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 1772 1736 int yielded = 0; 1737 + int try = 3; 1773 1738 int pass; 1774 1739 int i; 1775 1740 ··· 1783 1744 * VCPU is holding the lock that we need and will release it. 1784 1745 * We approximate round-robin by starting at the last boosted VCPU. 
1785 1746 */ 1786 - for (pass = 0; pass < 2 && !yielded; pass++) { 1747 + for (pass = 0; pass < 2 && !yielded && try; pass++) { 1787 1748 kvm_for_each_vcpu(i, vcpu, kvm) { 1788 1749 if (!pass && i <= last_boosted_vcpu) { 1789 1750 i = last_boosted_vcpu; ··· 1796 1757 continue; 1797 1758 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 1798 1759 continue; 1799 - if (kvm_vcpu_yield_to(vcpu)) { 1760 + 1761 + yielded = kvm_vcpu_yield_to(vcpu); 1762 + if (yielded > 0) { 1800 1763 kvm->last_boosted_vcpu = i; 1801 - yielded = 1; 1802 1764 break; 1765 + } else if (yielded < 0) { 1766 + try--; 1767 + if (!try) 1768 + break; 1803 1769 } 1804 1770 } 1805 1771 }