Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: x86: Prevent deadlock against tk_core.seq

syzbot reported a possible deadlock in pvclock_gtod_notify():

CPU 0                                         CPU 1
write_seqcount_begin(&tk_core.seq);
  pvclock_gtod_notify()                       spin_lock(&pool->lock);
    queue_work(..., &pvclock_gtod_work)       ktime_get()
      spin_lock(&pool->lock);                   do {
                                                  seq = read_seqcount_begin(&tk_core.seq)
                                                  ...
                                                } while (read_seqcount_retry(&tk_core.seq, seq));

While this is unlikely to happen, it's possible.

Delegate queue_work() to irq_work, which postpones it until the
tk_core.seq write-held region has been left and interrupts are re-enabled.

Fixes: 16e8d74d2da9 ("KVM: x86: notifier for clocksource changes")
Reported-by: syzbot+6beae4000559d41d80f8@syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Message-Id: <87h7jgm1zy.ffs@nanos.tec.linutronix.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

authored by

Thomas Gleixner and committed by
Paolo Bonzini
3f804f6d 594b27e6

+18 -4
+18 -4
arch/x86/kvm/x86.c
···
8095 8095   static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
8096 8096
8097 8097   /*
8098      +  * Indirection to move queue_work() out of the tk_core.seq write held
8099      +  * region to prevent possible deadlocks against time accessors which
8100      +  * are invoked with work related locks held.
8101      +  */
8102      + static void pvclock_irq_work_fn(struct irq_work *w)
8103      + {
8104      +         queue_work(system_long_wq, &pvclock_gtod_work);
8105      + }
8106      +
8107      + static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
8108      +
8109      + /*
8098 8110    * Notification about pvclock gtod data update.
8099 8111    */
8100 8112   static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
···
8117 8105
8118 8106           update_pvclock_gtod(tk);
8119 8107
8120      -         /* disable master clock if host does not trust, or does not
8121      -          * use, TSC based clocksource.
8108      +         /*
8109      +          * Disable master clock if host does not trust, or does not use,
8110      +          * TSC based clocksource. Delegate queue_work() to irq_work as
8111      +          * this is invoked with tk_core.seq write held.
8122 8112            */
8123 8113           if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
8124 8114               atomic_read(&kvm_guest_has_master_clock) != 0)
8125      -                 queue_work(system_long_wq, &pvclock_gtod_work);
8126      -
8115      +                 irq_work_queue(&pvclock_irq_work);
8127 8116           return 0;
8128 8117   }
8129 8118
···
8237 8224           cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
8238 8225   #ifdef CONFIG_X86_64
8239 8226           pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
8227      +         irq_work_sync(&pvclock_irq_work);
8240 8228           cancel_work_sync(&pvclock_gtod_work);
8241 8229   #endif
8242 8230           kvm_x86_ops.hardware_enable = NULL;