Merge branch 'kvm-updates/2.6.33' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.33' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86: Fix leak of free lapic data in kvm_arch_vcpu_init()
  KVM: x86: Fix probable memory leak of vcpu->arch.mce_banks
  KVM: S390: fix potential array overrun in intercept handling
  KVM: fix spurious interrupt with irqfd
  eventfd - allow atomic read and waitqueue remove
  KVM: MMU: bail out pagewalk on kvm_read_guest error
  KVM: properly check max PIC pin in irq route setup
  KVM: only allow one gsi per fd
  KVM: x86: Fix host_mapping_level()
  KVM: powerpc: Show timing option only on embedded
  KVM: Fix race between APIC TMR and IRR

10 files changed, 128 insertions(+), 34 deletions(-)
+1 -1
arch/powerpc/kvm/Kconfig
@@ -53,7 +53,7 @@
 
 config KVM_EXIT_TIMING
 	bool "Detailed exit timing"
-	depends on KVM
+	depends on KVM_440 || KVM_E500
 	---help---
 	  Calculate elapsed time for every exit/enter cycle. A per-vcpu
 	  report is available in debugfs kvm/vm#_vcpu#_timing.
+2 -2
arch/s390/kvm/intercept.c
@@ -213,7 +213,7 @@
 	return rc2;
 }
 
-static const intercept_handler_t intercept_funcs[0x48 >> 2] = {
+static const intercept_handler_t intercept_funcs[] = {
 	[0x00 >> 2] = handle_noop,
 	[0x04 >> 2] = handle_instruction,
 	[0x08 >> 2] = handle_prog,
@@ -230,7 +230,7 @@
 	intercept_handler_t func;
 	u8 code = vcpu->arch.sie_block->icptcode;
 
-	if (code & 3 || code > 0x48)
+	if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs))
 		return -ENOTSUPP;
 	func = intercept_funcs[code >> 2];
 	if (func)
+6 -5
arch/x86/kvm/lapic.c
@@ -373,6 +373,12 @@
 		if (unlikely(!apic_enabled(apic)))
 			break;
 
+		if (trig_mode) {
+			apic_debug("level trig mode for vector %d", vector);
+			apic_set_vector(vector, apic->regs + APIC_TMR);
+		} else
+			apic_clear_vector(vector, apic->regs + APIC_TMR);
+
 		result = !apic_test_and_set_irr(vector, apic);
 		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
 					  trig_mode, vector, !result);
@@ -383,11 +389,6 @@
 			break;
 		}
 
-		if (trig_mode) {
-			apic_debug("level trig mode for vector %d", vector);
-			apic_set_vector(vector, apic->regs + APIC_TMR);
-		} else
-			apic_clear_vector(vector, apic->regs + APIC_TMR);
 		kvm_vcpu_kick(vcpu);
 		break;
 
+2 -4
arch/x86/kvm/mmu.c
@@ -477,7 +477,7 @@
 
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
-		return page_size;
+		return PT_PAGE_TABLE_LEVEL;
 
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma(current->mm, addr);
@@ -515,11 +515,9 @@
 	if (host_level == PT_PAGE_TABLE_LEVEL)
 		return host_level;
 
-	for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) {
-
+	for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level)
 		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
 			break;
-	}
 
 	return level - 1;
 }
+3 -1
arch/x86/kvm/paging_tmpl.h
@@ -150,7 +150,9 @@
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
-		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
+			goto not_present;
+
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
 		if (!is_present_gpte(pte))
+4 -2
arch/x86/kvm/x86.c
@@ -5072,12 +5072,13 @@
 				       GFP_KERNEL);
 	if (!vcpu->arch.mce_banks) {
 		r = -ENOMEM;
-		goto fail_mmu_destroy;
+		goto fail_free_lapic;
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
 	return 0;
-
+fail_free_lapic:
+	kvm_free_lapic(vcpu);
 fail_mmu_destroy:
 	kvm_mmu_destroy(vcpu);
 fail_free_pio_data:
@@ -5088,6 +5089,7 @@
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
+	kfree(vcpu->arch.mce_banks);
 	kvm_free_lapic(vcpu);
 	down_read(&vcpu->kvm->slots_lock);
 	kvm_mmu_destroy(vcpu);
+74 -15
fs/eventfd.c
@@ -135,26 +135,71 @@
 	return events;
 }
 
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
-			    loff_t *ppos)
+static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
 {
-	struct eventfd_ctx *ctx = file->private_data;
+	*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+	ctx->count -= *cnt;
+}
+
+/**
+ * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
+ * @ctx: [in] Pointer to eventfd context.
+ * @wait: [in] Wait queue to be removed.
+ * @cnt: [out] Pointer to the 64-bit counter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN      : The operation would have blocked.
+ *
+ * This is used to atomically remove a wait queue entry from the eventfd wait
+ * queue head, and read/reset the counter value.
+ */
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+				  __u64 *cnt)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	eventfd_ctx_do_read(ctx, cnt);
+	__remove_wait_queue(&ctx->wqh, wait);
+	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
+		wake_up_locked_poll(&ctx->wqh, POLLOUT);
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return *cnt != 0 ? 0 : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
+
+/**
+ * eventfd_ctx_read - Reads the eventfd counter or waits if it is zero.
+ * @ctx: [in] Pointer to eventfd context.
+ * @no_wait: [in] Different from zero if the operation should not block.
+ * @cnt: [out] Pointer to the 64-bit counter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN      : The operation would have blocked but @no_wait was nonzero.
+ * -ERESTARTSYS : A signal interrupted the wait operation.
+ *
+ * If @no_wait is zero, the function might sleep until the eventfd internal
+ * counter becomes greater than zero.
+ */
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
+{
 	ssize_t res;
-	__u64 ucnt = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
-	if (count < sizeof(ucnt))
-		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
+	*cnt = 0;
 	res = -EAGAIN;
 	if (ctx->count > 0)
-		res = sizeof(ucnt);
-	else if (!(file->f_flags & O_NONBLOCK)) {
+		res = 0;
+	else if (!no_wait) {
 		__add_wait_queue(&ctx->wqh, &wait);
-		for (res = 0;;) {
+		for (;;) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (ctx->count > 0) {
-				res = sizeof(ucnt);
+				res = 0;
 				break;
 			}
 			if (signal_pending(current)) {
@@ -168,17 +213,31 @@
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (likely(res > 0)) {
-		ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
-		ctx->count -= ucnt;
+	if (likely(res == 0)) {
+		eventfd_ctx_do_read(ctx, cnt);
 		if (waitqueue_active(&ctx->wqh))
 			wake_up_locked_poll(&ctx->wqh, POLLOUT);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
-	if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
-		return -EFAULT;
 
 	return res;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_read);
+
+static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	__u64 cnt;
+
+	if (count < sizeof(cnt))
+		return -EINVAL;
+	res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
+	if (res < 0)
+		return res;
+
+	return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
 }
 
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
+16
include/linux/eventfd.h
@@ -10,6 +10,7 @@
 
 #include <linux/fcntl.h>
 #include <linux/file.h>
+#include <linux/wait.h>
 
 /*
  * CAREFUL: Check include/asm-generic/fcntl.h when defining
@@ -34,6 +35,9 @@
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
 int eventfd_signal(struct eventfd_ctx *ctx, int n);
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+				  __u64 *cnt);
 
 #else /* CONFIG_EVENTFD */
 
@@ -59,6 +63,18 @@
 static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
 {
 
+}
+
+static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait,
+				       __u64 *cnt)
+{
+	return -ENOSYS;
+}
+
+static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
+						wait_queue_t *wait, __u64 *cnt)
+{
+	return -ENOSYS;
 }
 
 #endif
+15 -3
virt/kvm/eventfd.c
@@ -72,12 +72,13 @@
 irqfd_shutdown(struct work_struct *work)
 {
 	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
+	u64 cnt;
 
 	/*
 	 * Synchronize with the wait-queue and unhook ourselves to prevent
 	 * further events.
 	 */
-	remove_wait_queue(irqfd->wqh, &irqfd->wait);
+	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
 
 	/*
 	 * We know no new events will be scheduled at this point, so block
@@ -166,7 +167,7 @@
 static int
 kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 {
-	struct _irqfd *irqfd;
+	struct _irqfd *irqfd, *tmp;
 	struct file *file = NULL;
 	struct eventfd_ctx *eventfd = NULL;
 	int ret;
@@ -203,9 +204,20 @@
 	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
 	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
 
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	ret = 0;
+	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
+		if (irqfd->eventfd != tmp->eventfd)
+			continue;
+		/* This fd is used for another irq already. */
+		ret = -EBUSY;
+		spin_unlock_irq(&kvm->irqfds.lock);
+		goto fail;
+	}
+
 	events = file->f_op->poll(file, &irqfd->pt);
 
-	spin_lock_irq(&kvm->irqfds.lock);
 	list_add_tail(&irqfd->list, &kvm->irqfds.items);
 	spin_unlock_irq(&kvm->irqfds.lock);
 
+5 -1
virt/kvm/irq_comm.c
@@ -302,6 +302,7 @@
 {
 	int r = -EINVAL;
 	int delta;
+	unsigned max_pin;
 	struct kvm_kernel_irq_routing_entry *ei;
 	struct hlist_node *n;
 
@@ -322,12 +323,15 @@
 	switch (ue->u.irqchip.irqchip) {
 	case KVM_IRQCHIP_PIC_MASTER:
 		e->set = kvm_set_pic_irq;
+		max_pin = 16;
 		break;
 	case KVM_IRQCHIP_PIC_SLAVE:
 		e->set = kvm_set_pic_irq;
+		max_pin = 16;
 		delta = 8;
 		break;
 	case KVM_IRQCHIP_IOAPIC:
+		max_pin = KVM_IOAPIC_NUM_PINS;
 		e->set = kvm_set_ioapic_irq;
 		break;
 	default:
@@ -335,7 +339,7 @@
 	}
 	e->irqchip.irqchip = ue->u.irqchip.irqchip;
 	e->irqchip.pin = ue->u.irqchip.pin + delta;
-	if (e->irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+	if (e->irqchip.pin >= max_pin)
 		goto out;
 	rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
 	break;