Merge branch 'kvm-updates/2.6.33' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.33' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: x86: Fix leak of lapic data in kvm_arch_vcpu_init()
KVM: x86: Fix probable memory leak of vcpu->arch.mce_banks
KVM: S390: fix potential array overrun in intercept handling
KVM: fix spurious interrupt with irqfd
eventfd - allow atomic read and waitqueue remove
KVM: MMU: bail out pagewalk on kvm_read_guest error
KVM: properly check max PIC pin in irq route setup
KVM: only allow one gsi per fd
KVM: x86: Fix host_mapping_level()
KVM: powerpc: Show timing option only on embedded
KVM: Fix race between APIC TMR and IRR

10 files changed, 128 insertions(+), 34 deletions(-)
arch/powerpc/kvm/Kconfig (+1 -1)

···
 
 config KVM_EXIT_TIMING
 	bool "Detailed exit timing"
-	depends on KVM
+	depends on KVM_440 || KVM_E500
 	---help---
 	  Calculate elapsed time for every exit/enter cycle. A per-vcpu
 	  report is available in debugfs kvm/vm#_vcpu#_timing.
arch/s390/kvm/intercept.c (+2 -2)

···
 	return rc2;
 }
 
-static const intercept_handler_t intercept_funcs[0x48 >> 2] = {
+static const intercept_handler_t intercept_funcs[] = {
 	[0x00 >> 2] = handle_noop,
 	[0x04 >> 2] = handle_instruction,
 	[0x08 >> 2] = handle_prog,
···
 	intercept_handler_t func;
 	u8 code = vcpu->arch.sie_block->icptcode;
 
-	if (code & 3 || code > 0x48)
+	if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs))
 		return -ENOTSUPP;
 	func = intercept_funcs[code >> 2];
 	if (func)
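
The old bound check was off by one: with the declaration intercept_funcs[0x48 >> 2] the valid indices are 0..17, yet code == 0x48 passed the `code > 0x48` test and indexed slot 18. A minimal userspace sketch of the corrected dispatch pattern (handler names and return values are illustrative):

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

typedef int (*handler_t)(unsigned char code);

static int handle_noop(unsigned char code) { (void)code; return 0; }
static int handle_prog(unsigned char code) { (void)code; return 1; }

/* Sparse table indexed by code >> 2; unset slots stay NULL. */
static const handler_t funcs[] = {
	[0x00 >> 2] = handle_noop,
	[0x08 >> 2] = handle_prog,
};

static int dispatch(unsigned char code)
{
	handler_t func;

	/* Codes must be 4-aligned and the derived index in bounds. */
	if (code & 3 || (code >> 2) >= ARRAY_SIZE(funcs))
		return -1;
	func = funcs[code >> 2];
	return func ? func(code) : -1;
}

int main(void)
{
	printf("%d\n", dispatch(0x08));	/* 1: handled */
	printf("%d\n", dispatch(0x48));	/* -1: rejected, no overrun */
	return 0;
}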
arch/x86/kvm/lapic.c (+6 -5)

···
 		if (unlikely(!apic_enabled(apic)))
 			break;
 
+		if (trig_mode) {
+			apic_debug("level trig mode for vector %d", vector);
+			apic_set_vector(vector, apic->regs + APIC_TMR);
+		} else
+			apic_clear_vector(vector, apic->regs + APIC_TMR);
+
 		result = !apic_test_and_set_irr(vector, apic);
 		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
 					  trig_mode, vector, !result);
···
 			break;
 		}
 
-		if (trig_mode) {
-			apic_debug("level trig mode for vector %d", vector);
-			apic_set_vector(vector, apic->regs + APIC_TMR);
-		} else
-			apic_clear_vector(vector, apic->regs + APIC_TMR);
 		kvm_vcpu_kick(vcpu);
 		break;
arch/x86/kvm/mmu.c (+2 -4)

···
 
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
-		return page_size;
+		return PT_PAGE_TABLE_LEVEL;
 
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma(current->mm, addr);
···
 	if (host_level == PT_PAGE_TABLE_LEVEL)
 		return host_level;
 
-	for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) {
-
+	for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level)
 		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
 			break;
-	}
 
 	return level - 1;
 }
arch/x86/kvm/paging_tmpl.h (+3 -1)

···
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
-		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
+			goto not_present;
+
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
 		if (!is_present_gpte(pte))
arch/x86/kvm/x86.c (+4 -2)

···
 				       GFP_KERNEL);
 	if (!vcpu->arch.mce_banks) {
 		r = -ENOMEM;
-		goto fail_mmu_destroy;
+		goto fail_free_lapic;
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
 	return 0;
-
+fail_free_lapic:
+	kvm_free_lapic(vcpu);
 fail_mmu_destroy:
 	kvm_mmu_destroy(vcpu);
 fail_free_pio_data:
···
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
+	kfree(vcpu->arch.mce_banks);
 	kvm_free_lapic(vcpu);
 	down_read(&vcpu->kvm->slots_lock);
 	kvm_mmu_destroy(vcpu);
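
Both hunks follow the kernel's goto-unwind idiom: each fail_* label releases exactly what was set up before the failing step, in reverse order, so the fix slots a lapic-freeing label in above the existing fail_mmu_destroy unwinding. A reduced userspace sketch of the pattern (struct vcpu_like and the allocations are illustrative stand-ins; the MMU step is omitted):

#include <stdlib.h>

/* Stand-ins for pio_data, the lapic and mce_banks. */
struct vcpu_like {
	void *pio_data;
	void *lapic;
	void *mce_banks;
};

static int setup(struct vcpu_like *v)
{
	v->pio_data = malloc(64);
	if (!v->pio_data)
		return -1;

	v->lapic = malloc(64);
	if (!v->lapic)
		goto fail_free_pio_data;

	v->mce_banks = calloc(4, sizeof(long));
	if (!v->mce_banks)
		goto fail_free_lapic;	/* the fix: unwind the lapic too */

	return 0;

fail_free_lapic:
	free(v->lapic);
fail_free_pio_data:
	free(v->pio_data);
	return -1;
}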
fs/eventfd.c (+74 -15)

···
 	return events;
 }
 
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
-			    loff_t *ppos)
+static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
+{
+	*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+	ctx->count -= *cnt;
+}
+
+/**
+ * eventfd_ctx_remove_wait_queue - Read the current counter and remove the wait queue.
+ * @ctx: [in] Pointer to eventfd context.
+ * @wait: [in] Wait queue to be removed.
+ * @cnt: [out] Pointer to the 64bit counter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN      : The operation would have blocked.
+ *
+ * This is used to atomically remove a wait queue entry from the eventfd wait
+ * queue head, and read/reset the counter value.
+ */
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+				  __u64 *cnt)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	eventfd_ctx_do_read(ctx, cnt);
+	__remove_wait_queue(&ctx->wqh, wait);
+	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
+		wake_up_locked_poll(&ctx->wqh, POLLOUT);
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return *cnt != 0 ? 0 : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
+
+/**
+ * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
+ * @ctx: [in] Pointer to eventfd context.
+ * @no_wait: [in] Different from zero if the operation should not block.
+ * @cnt: [out] Pointer to the 64bit counter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN      : The operation would have blocked but @no_wait was nonzero.
+ * -ERESTARTSYS : A signal interrupted the wait operation.
+ *
+ * If @no_wait is zero, the function might sleep until the eventfd internal
+ * counter becomes greater than zero.
+ */
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	ssize_t res;
-	__u64 ucnt = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
-	if (count < sizeof(ucnt))
-		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
+	*cnt = 0;
 	res = -EAGAIN;
 	if (ctx->count > 0)
-		res = sizeof(ucnt);
-	else if (!(file->f_flags & O_NONBLOCK)) {
+		res = 0;
+	else if (!no_wait) {
 		__add_wait_queue(&ctx->wqh, &wait);
-		for (res = 0;;) {
+		for (;;) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (ctx->count > 0) {
-				res = sizeof(ucnt);
+				res = 0;
 				break;
 			}
 			if (signal_pending(current)) {
···
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (likely(res > 0)) {
-		ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
-		ctx->count -= ucnt;
+	if (likely(res == 0)) {
+		eventfd_ctx_do_read(ctx, cnt);
 		if (waitqueue_active(&ctx->wqh))
 			wake_up_locked_poll(&ctx->wqh, POLLOUT);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
-	if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
-		return -EFAULT;
 
 	return res;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_read);
+
+static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	__u64 cnt;
+
+	if (count < sizeof(cnt))
+		return -EINVAL;
+	res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
+	if (res < 0)
+		return res;
+
+	return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
 }
 
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
include/linux/eventfd.h (+16 -0)

···
 
 #include <linux/fcntl.h>
 #include <linux/file.h>
+#include <linux/wait.h>
 
 /*
  * CAREFUL: Check include/asm-generic/fcntl.h when defining
···
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
 int eventfd_signal(struct eventfd_ctx *ctx, int n);
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+				  __u64 *cnt);
 
 #else /* CONFIG_EVENTFD */
 
···
 static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
 {
 
+}
+
+static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait,
+				       __u64 *cnt)
+{
+	return -ENOSYS;
+}
+
+static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
+						wait_queue_t *wait, __u64 *cnt)
+{
+	return -ENOSYS;
 }
 
 #endif
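
The kernel-doc in fs/eventfd.c above describes the two new entry points; a hedged kernel-side sketch of how a consumer might use them (it assumes a context already obtained with eventfd_ctx_fdget(), and builds only inside a kernel tree):

#include <linux/kernel.h>
#include <linux/eventfd.h>

/* Non-blocking read: pass no_wait != 0 so a zero counter yields
 * -EAGAIN instead of sleeping. */
static void drain_counter(struct eventfd_ctx *ctx)
{
	__u64 cnt;

	if (eventfd_ctx_read(ctx, 1, &cnt) == 0)
		pr_info("eventfd counter was %llu\n", (unsigned long long)cnt);
}

/* Tear-down: atomically unhook a previously registered wait-queue
 * entry and collect any pending count, as irqfd_shutdown() does in
 * virt/kvm/eventfd.c below. */
static void detach(struct eventfd_ctx *ctx, wait_queue_t *wait)
{
	__u64 cnt;

	eventfd_ctx_remove_wait_queue(ctx, wait, &cnt);
}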
virt/kvm/eventfd.c (+15 -3)

···
 irqfd_shutdown(struct work_struct *work)
 {
 	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
+	u64 cnt;
 
 	/*
 	 * Synchronize with the wait-queue and unhook ourselves to prevent
 	 * further events.
 	 */
-	remove_wait_queue(irqfd->wqh, &irqfd->wait);
+	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
 
 	/*
 	 * We know no new events will be scheduled at this point, so block
···
 static int
 kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 {
-	struct _irqfd *irqfd;
+	struct _irqfd *irqfd, *tmp;
 	struct file *file = NULL;
 	struct eventfd_ctx *eventfd = NULL;
 	int ret;
···
 	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
 	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
 
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	ret = 0;
+	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
+		if (irqfd->eventfd != tmp->eventfd)
+			continue;
+		/* This fd is used for another irq already. */
+		ret = -EBUSY;
+		spin_unlock_irq(&kvm->irqfds.lock);
+		goto fail;
+	}
+
 	events = file->f_op->poll(file, &irqfd->pt);
 
-	spin_lock_irq(&kvm->irqfds.lock);
 	list_add_tail(&irqfd->list, &kvm->irqfds.items);
 	spin_unlock_irq(&kvm->irqfds.lock);
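
One consequence visible from userspace: a given eventfd can now back at most one irqfd per VM. A hedged sketch against the existing KVM_IRQFD ioctl (vm_fd is assumed to come from KVM_CREATE_VM; descriptor and GSI values are illustrative):

#include <string.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <linux/kvm.h>

/* Bind an eventfd to a guest GSI; returns the raw ioctl result. */
static int assign_irqfd(int vm_fd, int efd, int gsi)
{
	struct kvm_irqfd irqfd;

	memset(&irqfd, 0, sizeof(irqfd));
	irqfd.fd = efd;
	irqfd.gsi = gsi;
	return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}

/*
 * With the duplicate check above in place:
 *
 *   int efd = eventfd(0, 0);
 *   assign_irqfd(vm_fd, efd, 10);   // succeeds
 *   assign_irqfd(vm_fd, efd, 11);   // now fails with errno == EBUSY
 */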
virt/kvm/irq_comm.c (+5 -1)

···
 {
 	int r = -EINVAL;
 	int delta;
+	unsigned max_pin;
 	struct kvm_kernel_irq_routing_entry *ei;
 	struct hlist_node *n;
 
···
 	switch (ue->u.irqchip.irqchip) {
 	case KVM_IRQCHIP_PIC_MASTER:
 		e->set = kvm_set_pic_irq;
+		max_pin = 16;
 		break;
 	case KVM_IRQCHIP_PIC_SLAVE:
 		e->set = kvm_set_pic_irq;
+		max_pin = 16;
 		delta = 8;
 		break;
 	case KVM_IRQCHIP_IOAPIC:
+		max_pin = KVM_IOAPIC_NUM_PINS;
 		e->set = kvm_set_ioapic_irq;
 		break;
 	default:
···
 	}
 	e->irqchip.irqchip = ue->u.irqchip.irqchip;
 	e->irqchip.pin = ue->u.irqchip.pin + delta;
-	if (e->irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+	if (e->irqchip.pin >= max_pin)
 		goto out;
 	rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
 	break;
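
Why a single KVM_IOAPIC_NUM_PINS bound was wrong: the two PIC halves together only span pins 0..15 (the slave's pins sit at offset 8), so a pin that passes a 24-pin IOAPIC check can still name a PIC pin that doesn't exist. A minimal userspace sketch of the corrected per-chip bound (names and values are illustrative):

#include <stdio.h>

enum chip { PIC_MASTER, PIC_SLAVE, IOAPIC };

#define IOAPIC_NUM_PINS 24	/* stands in for KVM_IOAPIC_NUM_PINS */

/* Mirror of the max_pin/delta logic above: returns the translated
 * pin, or -1 when the pin is out of range for the chosen chip. */
static int route_pin(enum chip chip, unsigned int pin)
{
	unsigned int delta = 0, max_pin;

	switch (chip) {
	case PIC_MASTER:
		max_pin = 16;
		break;
	case PIC_SLAVE:
		max_pin = 16;
		delta = 8;	/* slave pins live at 8..15 */
		break;
	case IOAPIC:
		max_pin = IOAPIC_NUM_PINS;
		break;
	default:
		return -1;
	}
	pin += delta;
	return pin >= max_pin ? -1 : (int)pin;
}

int main(void)
{
	printf("%d\n", route_pin(PIC_SLAVE, 7));	/* 15: last valid PIC pin */
	printf("%d\n", route_pin(PIC_SLAVE, 8));	/* -1: would overflow the PIC */
	printf("%d\n", route_pin(IOAPIC, 20));		/* 20: valid IOAPIC pin */
	return 0;
}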