Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86: kvm: Revert "remove sched notifier for cross-cpu migrations"

The following point:

2. per-CPU pvclock time info is updated if the
underlying CPU changes.

is no longer true since commit "KVM: x86: update pvclock area conditionally,
on cpu migration".

Add task migration notification back.

Problem noticed by Andy Lutomirski.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: stable@kernel.org # 3.11+

+76 -8
+1
arch/x86/include/asm/pvclock.h
··· 95 95 96 96 struct pvclock_vsyscall_time_info { 97 97 struct pvclock_vcpu_time_info pvti; 98 + u32 migrate_count; 98 99 } __attribute__((__aligned__(SMP_CACHE_BYTES))); 99 100 100 101 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+44
arch/x86/kernel/pvclock.c
··· 141 141 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 142 142 } 143 143 144 + static struct pvclock_vsyscall_time_info *pvclock_vdso_info; 145 + 146 + static struct pvclock_vsyscall_time_info * 147 + pvclock_get_vsyscall_user_time_info(int cpu) 148 + { 149 + if (!pvclock_vdso_info) { 150 + BUG(); 151 + return NULL; 152 + } 153 + 154 + return &pvclock_vdso_info[cpu]; 155 + } 156 + 157 + struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) 158 + { 159 + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; 160 + } 161 + 144 162 #ifdef CONFIG_X86_64 163 + static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, 164 + void *v) 165 + { 166 + struct task_migration_notifier *mn = v; 167 + struct pvclock_vsyscall_time_info *pvti; 168 + 169 + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); 170 + 171 + /* this is NULL when pvclock vsyscall is not initialized */ 172 + if (unlikely(pvti == NULL)) 173 + return NOTIFY_DONE; 174 + 175 + pvti->migrate_count++; 176 + 177 + return NOTIFY_DONE; 178 + } 179 + 180 + static struct notifier_block pvclock_migrate = { 181 + .notifier_call = pvclock_task_migrate, 182 + }; 183 + 145 184 /* 146 185 * Initialize the generic pvclock vsyscall state. This will allocate 147 186 * a/some page(s) for the per-vcpu pvclock information, set up a ··· 194 155 195 156 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 196 157 158 + pvclock_vdso_info = i; 159 + 197 160 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 198 161 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 199 162 __pa(i) + (idx*PAGE_SIZE), 200 163 PAGE_KERNEL_VVAR); 201 164 } 165 + 166 + 167 + register_task_migration_notifier(&pvclock_migrate); 202 168 203 169 return 0; 204 170 }
+8 -8
arch/x86/vdso/vclock_gettime.c
··· 82 82 cycle_t ret; 83 83 u64 last; 84 84 u32 version; 85 + u32 migrate_count; 85 86 u8 flags; 86 87 unsigned cpu, cpu1; 87 88 88 89 89 90 /* 90 - * Note: hypervisor must guarantee that: 91 - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. 92 - * 2. that per-CPU pvclock time info is updated if the 93 - * underlying CPU changes. 94 - * 3. that version is increased whenever underlying CPU 95 - * changes. 96 - * 91 + * When looping to get a consistent (time-info, tsc) pair, we 92 + * also need to deal with the possibility we can switch vcpus, 93 + * so make sure we always re-fetch time-info for the current vcpu. 97 94 */ 98 95 do { 99 96 cpu = __getcpu() & VGETCPU_CPU_MASK; ··· 100 103 */ 101 104 102 105 pvti = get_pvti(cpu); 106 + 107 + migrate_count = pvti->migrate_count; 103 108 104 109 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); 105 110 ··· 114 115 cpu1 = __getcpu() & VGETCPU_CPU_MASK; 115 116 } while (unlikely(cpu != cpu1 || 116 117 (pvti->pvti.version & 1) || 117 - pvti->pvti.version != version)); 118 + pvti->pvti.version != version || 119 + pvti->migrate_count != migrate_count)); 118 120 119 121 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) 120 122 *mode = VCLOCK_NONE;
+8
include/linux/sched.h
··· 176 176 extern void calc_global_load(unsigned long ticks); 177 177 extern void update_cpu_load_nohz(void); 178 178 179 + /* Notifier for when a task gets migrated to a new CPU */ 180 + struct task_migration_notifier { 181 + struct task_struct *task; 182 + int from_cpu; 183 + int to_cpu; 184 + }; 185 + extern void register_task_migration_notifier(struct notifier_block *n); 186 + 179 187 extern unsigned long get_parent_ip(unsigned long addr); 180 188 181 189 extern void dump_cpu_task(int cpu);
+15
kernel/sched/core.c
··· 996 996 rq_clock_skip_update(rq, true); 997 997 } 998 998 999 + static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); 1000 + 1001 + void register_task_migration_notifier(struct notifier_block *n) 1002 + { 1003 + atomic_notifier_chain_register(&task_migration_notifier, n); 1004 + } 1005 + 999 1006 #ifdef CONFIG_SMP 1000 1007 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1001 1008 { ··· 1033 1026 trace_sched_migrate_task(p, new_cpu); 1034 1027 1035 1028 if (task_cpu(p) != new_cpu) { 1029 + struct task_migration_notifier tmn; 1030 + 1036 1031 if (p->sched_class->migrate_task_rq) 1037 1032 p->sched_class->migrate_task_rq(p, new_cpu); 1038 1033 p->se.nr_migrations++; 1039 1034 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); 1035 + 1036 + tmn.task = p; 1037 + tmn.from_cpu = task_cpu(p); 1038 + tmn.to_cpu = new_cpu; 1039 + 1040 + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); 1040 1041 } 1041 1042 1042 1043 __set_task_cpu(p, new_cpu);