Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 vdso updates from Ingo Molnar:
"Two main changes:

- Cleanups, simplifications and CLOCK_TAI support (Thomas Gleixner)

- Improve code generation (Andy Lutomirski)"

* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/vdso: Rearrange do_hres() to improve code generation
x86/vdso: Document vgtod_ts better
x86/vdso: Remove "memory" clobbers in the vDSO syscall fallbacks
x86/vdso: Add CLOCK_TAI support
x86/vdso: Move cycle_last handling into the caller
x86/vdso: Simplify the invalid vclock case
x86/vdso: Replace the clockid switch case
x86/vdso: Collapse coarse functions
x86/vdso: Collapse high resolution functions
x86/vdso: Introduce and use vgtod_ts
x86/vdso: Use unsigned int consistently for vsyscall_gtod_data::seq
x86/vdso: Enforce 64bit clocksource
x86/time: Implement clocksource_arch_init()
clocksource: Provide clocksource_arch_init()

+154 -192
+1
arch/x86/Kconfig
··· 48 48 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI 49 49 select ANON_INODES 50 50 select ARCH_CLOCKSOURCE_DATA 51 + select ARCH_CLOCKSOURCE_INIT 51 52 select ARCH_DISCARD_MEMBLOCK 52 53 select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI 53 54 select ARCH_HAS_DEBUG_VIRTUAL
+61 -151
arch/x86/entry/vdso/vclock_gettime.c
··· 45 45 long ret; 46 46 asm ("syscall" : "=a" (ret), "=m" (*ts) : 47 47 "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : 48 - "memory", "rcx", "r11"); 48 + "rcx", "r11"); 49 49 return ret; 50 50 } 51 - 52 - notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) 53 - { 54 - long ret; 55 - 56 - asm ("syscall" : "=a" (ret), "=m" (*tv), "=m" (*tz) : 57 - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : 58 - "memory", "rcx", "r11"); 59 - return ret; 60 - } 61 - 62 51 63 52 #else 64 53 ··· 62 73 "mov %%edx, %%ebx \n" 63 74 : "=a" (ret), "=m" (*ts) 64 75 : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) 65 - : "memory", "edx"); 66 - return ret; 67 - } 68 - 69 - notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) 70 - { 71 - long ret; 72 - 73 - asm ( 74 - "mov %%ebx, %%edx \n" 75 - "mov %[tv], %%ebx \n" 76 - "call __kernel_vsyscall \n" 77 - "mov %%edx, %%ebx \n" 78 - : "=a" (ret), "=m" (*tv), "=m" (*tz) 79 - : "0" (__NR_gettimeofday), [tv] "g" (tv), "c" (tz) 80 - : "memory", "edx"); 76 + : "edx"); 81 77 return ret; 82 78 } 83 79 ··· 74 100 return (const struct pvclock_vsyscall_time_info *)&pvclock_page; 75 101 } 76 102 77 - static notrace u64 vread_pvclock(int *mode) 103 + static notrace u64 vread_pvclock(void) 78 104 { 79 105 const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; 80 - u64 ret; 81 - u64 last; 82 106 u32 version; 107 + u64 ret; 83 108 84 109 /* 85 110 * Note: The kernel and hypervisor must guarantee that cpu ID ··· 105 132 do { 106 133 version = pvclock_read_begin(pvti); 107 134 108 - if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { 109 - *mode = VCLOCK_NONE; 110 - return 0; 111 - } 135 + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) 136 + return U64_MAX; 112 137 113 138 ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); 114 139 } while (pvclock_read_retry(pvti, version)); 115 140 116 - /* refer to vread_tsc() comment for rationale */ 117 - last = gtod->cycle_last; 118 - 
119 - if (likely(ret >= last)) 120 - return ret; 121 - 122 - return last; 141 + return ret; 123 142 } 124 143 #endif 125 144 #ifdef CONFIG_HYPERV_TSCPAGE 126 - static notrace u64 vread_hvclock(int *mode) 145 + static notrace u64 vread_hvclock(void) 127 146 { 128 147 const struct ms_hyperv_tsc_page *tsc_pg = 129 148 (const struct ms_hyperv_tsc_page *)&hvclock_page; 130 - u64 current_tick = hv_read_tsc_page(tsc_pg); 131 149 132 - if (current_tick != U64_MAX) 133 - return current_tick; 134 - 135 - *mode = VCLOCK_NONE; 136 - return 0; 150 + return hv_read_tsc_page(tsc_pg); 137 151 } 138 152 #endif 139 153 140 - notrace static u64 vread_tsc(void) 154 + notrace static inline u64 vgetcyc(int mode) 141 155 { 142 - u64 ret = (u64)rdtsc_ordered(); 143 - u64 last = gtod->cycle_last; 144 - 145 - if (likely(ret >= last)) 146 - return ret; 147 - 148 - /* 149 - * GCC likes to generate cmov here, but this branch is extremely 150 - * predictable (it's just a function of time and the likely is 151 - * very likely) and there's a data dependence, so force GCC 152 - * to generate a branch instead. I don't barrier() because 153 - * we don't actually need a barrier, and if this function 154 - * ever gets inlined it will generate worse code. 
155 - */ 156 - asm volatile (""); 157 - return last; 158 - } 159 - 160 - notrace static inline u64 vgetsns(int *mode) 161 - { 162 - u64 v; 163 - cycles_t cycles; 164 - 165 - if (gtod->vclock_mode == VCLOCK_TSC) 166 - cycles = vread_tsc(); 156 + if (mode == VCLOCK_TSC) 157 + return (u64)rdtsc_ordered(); 167 158 #ifdef CONFIG_PARAVIRT_CLOCK 168 - else if (gtod->vclock_mode == VCLOCK_PVCLOCK) 169 - cycles = vread_pvclock(mode); 159 + else if (mode == VCLOCK_PVCLOCK) 160 + return vread_pvclock(); 170 161 #endif 171 162 #ifdef CONFIG_HYPERV_TSCPAGE 172 - else if (gtod->vclock_mode == VCLOCK_HVCLOCK) 173 - cycles = vread_hvclock(mode); 163 + else if (mode == VCLOCK_HVCLOCK) 164 + return vread_hvclock(); 174 165 #endif 175 - else 176 - return 0; 177 - v = (cycles - gtod->cycle_last) & gtod->mask; 178 - return v * gtod->mult; 166 + return U64_MAX; 179 167 } 180 168 181 - /* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ 182 - notrace static int __always_inline do_realtime(struct timespec *ts) 169 + notrace static int do_hres(clockid_t clk, struct timespec *ts) 183 170 { 184 - unsigned long seq; 185 - u64 ns; 186 - int mode; 171 + struct vgtod_ts *base = &gtod->basetime[clk]; 172 + u64 cycles, last, sec, ns; 173 + unsigned int seq; 187 174 188 175 do { 189 176 seq = gtod_read_begin(gtod); 190 - mode = gtod->vclock_mode; 191 - ts->tv_sec = gtod->wall_time_sec; 192 - ns = gtod->wall_time_snsec; 193 - ns += vgetsns(&mode); 177 + cycles = vgetcyc(gtod->vclock_mode); 178 + ns = base->nsec; 179 + last = gtod->cycle_last; 180 + if (unlikely((s64)cycles < 0)) 181 + return vdso_fallback_gettime(clk, ts); 182 + if (cycles > last) 183 + ns += (cycles - last) * gtod->mult; 194 184 ns >>= gtod->shift; 185 + sec = base->sec; 195 186 } while (unlikely(gtod_read_retry(gtod, seq))); 196 187 197 - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); 188 + /* 189 + * Do this outside the loop: a race inside the loop could result 190 + * in __iter_div_u64_rem() being 
extremely slow. 191 + */ 192 + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); 198 193 ts->tv_nsec = ns; 199 194 200 - return mode; 195 + return 0; 201 196 } 202 197 203 - notrace static int __always_inline do_monotonic(struct timespec *ts) 198 + notrace static void do_coarse(clockid_t clk, struct timespec *ts) 204 199 { 205 - unsigned long seq; 206 - u64 ns; 207 - int mode; 200 + struct vgtod_ts *base = &gtod->basetime[clk]; 201 + unsigned int seq; 208 202 209 203 do { 210 204 seq = gtod_read_begin(gtod); 211 - mode = gtod->vclock_mode; 212 - ts->tv_sec = gtod->monotonic_time_sec; 213 - ns = gtod->monotonic_time_snsec; 214 - ns += vgetsns(&mode); 215 - ns >>= gtod->shift; 216 - } while (unlikely(gtod_read_retry(gtod, seq))); 217 - 218 - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); 219 - ts->tv_nsec = ns; 220 - 221 - return mode; 222 - } 223 - 224 - notrace static void do_realtime_coarse(struct timespec *ts) 225 - { 226 - unsigned long seq; 227 - do { 228 - seq = gtod_read_begin(gtod); 229 - ts->tv_sec = gtod->wall_time_coarse_sec; 230 - ts->tv_nsec = gtod->wall_time_coarse_nsec; 231 - } while (unlikely(gtod_read_retry(gtod, seq))); 232 - } 233 - 234 - notrace static void do_monotonic_coarse(struct timespec *ts) 235 - { 236 - unsigned long seq; 237 - do { 238 - seq = gtod_read_begin(gtod); 239 - ts->tv_sec = gtod->monotonic_time_coarse_sec; 240 - ts->tv_nsec = gtod->monotonic_time_coarse_nsec; 205 + ts->tv_sec = base->sec; 206 + ts->tv_nsec = base->nsec; 241 207 } while (unlikely(gtod_read_retry(gtod, seq))); 242 208 } 243 209 244 210 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 245 211 { 246 - switch (clock) { 247 - case CLOCK_REALTIME: 248 - if (do_realtime(ts) == VCLOCK_NONE) 249 - goto fallback; 250 - break; 251 - case CLOCK_MONOTONIC: 252 - if (do_monotonic(ts) == VCLOCK_NONE) 253 - goto fallback; 254 - break; 255 - case CLOCK_REALTIME_COARSE: 256 - do_realtime_coarse(ts); 257 - break; 258 - case 
CLOCK_MONOTONIC_COARSE: 259 - do_monotonic_coarse(ts); 260 - break; 261 - default: 262 - goto fallback; 263 - } 212 + unsigned int msk; 264 213 265 - return 0; 266 - fallback: 214 + /* Sort out negative (CPU/FD) and invalid clocks */ 215 + if (unlikely((unsigned int) clock >= MAX_CLOCKS)) 216 + return vdso_fallback_gettime(clock, ts); 217 + 218 + /* 219 + * Convert the clockid to a bitmask and use it to check which 220 + * clocks are handled in the VDSO directly. 221 + */ 222 + msk = 1U << clock; 223 + if (likely(msk & VGTOD_HRES)) { 224 + return do_hres(clock, ts); 225 + } else if (msk & VGTOD_COARSE) { 226 + do_coarse(clock, ts); 227 + return 0; 228 + } 267 229 return vdso_fallback_gettime(clock, ts); 268 230 } 231 + 269 232 int clock_gettime(clockid_t, struct timespec *) 270 233 __attribute__((weak, alias("__vdso_clock_gettime"))); 271 234 272 235 notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) 273 236 { 274 237 if (likely(tv != NULL)) { 275 - if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) 276 - return vdso_fallback_gtod(tv, tz); 238 + struct timespec *ts = (struct timespec *) tv; 239 + 240 + do_hres(CLOCK_REALTIME, ts); 277 241 tv->tv_usec /= 1000; 278 242 } 279 243 if (unlikely(tz != NULL)) { ··· 230 320 notrace time_t __vdso_time(time_t *t) 231 321 { 232 322 /* This is atomic on x86 so we don't need any locks. */ 233 - time_t result = READ_ONCE(gtod->wall_time_sec); 323 + time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec); 234 324 235 325 if (t) 236 326 *t = result;
+28 -23
arch/x86/entry/vsyscall/vsyscall_gtod.c
··· 31 31 { 32 32 int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; 33 33 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; 34 + struct vgtod_ts *base; 35 + u64 nsec; 34 36 35 37 /* Mark the new vclock used. */ 36 38 BUILD_BUG_ON(VCLOCK_MAX >= 32); ··· 47 45 vdata->mult = tk->tkr_mono.mult; 48 46 vdata->shift = tk->tkr_mono.shift; 49 47 50 - vdata->wall_time_sec = tk->xtime_sec; 51 - vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; 48 + base = &vdata->basetime[CLOCK_REALTIME]; 49 + base->sec = tk->xtime_sec; 50 + base->nsec = tk->tkr_mono.xtime_nsec; 52 51 53 - vdata->monotonic_time_sec = tk->xtime_sec 54 - + tk->wall_to_monotonic.tv_sec; 55 - vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec 56 - + ((u64)tk->wall_to_monotonic.tv_nsec 57 - << tk->tkr_mono.shift); 58 - while (vdata->monotonic_time_snsec >= 59 - (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { 60 - vdata->monotonic_time_snsec -= 61 - ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; 62 - vdata->monotonic_time_sec++; 52 + base = &vdata->basetime[CLOCK_TAI]; 53 + base->sec = tk->xtime_sec + (s64)tk->tai_offset; 54 + base->nsec = tk->tkr_mono.xtime_nsec; 55 + 56 + base = &vdata->basetime[CLOCK_MONOTONIC]; 57 + base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 58 + nsec = tk->tkr_mono.xtime_nsec; 59 + nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); 60 + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { 61 + nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; 62 + base->sec++; 63 63 } 64 + base->nsec = nsec; 64 65 65 - vdata->wall_time_coarse_sec = tk->xtime_sec; 66 - vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> 67 - tk->tkr_mono.shift); 66 + base = &vdata->basetime[CLOCK_REALTIME_COARSE]; 67 + base->sec = tk->xtime_sec; 68 + base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; 68 69 69 - vdata->monotonic_time_coarse_sec = 70 - vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; 71 - vdata->monotonic_time_coarse_nsec = 
72 - vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; 73 - 74 - while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { 75 - vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; 76 - vdata->monotonic_time_coarse_sec++; 70 + base = &vdata->basetime[CLOCK_MONOTONIC_COARSE]; 71 + base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 72 + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; 73 + nsec += tk->wall_to_monotonic.tv_nsec; 74 + while (nsec >= NSEC_PER_SEC) { 75 + nsec -= NSEC_PER_SEC; 76 + base->sec++; 77 77 } 78 + base->nsec = nsec; 78 79 79 80 gtod_write_end(vdata); 80 81 }
+31 -18
arch/x86/include/asm/vgtod.h
··· 5 5 #include <linux/compiler.h> 6 6 #include <linux/clocksource.h> 7 7 8 + #include <uapi/linux/time.h> 9 + 8 10 #ifdef BUILD_VDSO32_64 9 11 typedef u64 gtod_long_t; 10 12 #else 11 13 typedef unsigned long gtod_long_t; 12 14 #endif 15 + 16 + /* 17 + * There is one of these objects in the vvar page for each 18 + * vDSO-accelerated clockid. For high-resolution clocks, this encodes 19 + * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse 20 + * clocks, this encodes the actual time. 21 + * 22 + * To confuse the reader, for high-resolution clocks, nsec is left-shifted 23 + * by vsyscall_gtod_data.shift. 24 + */ 25 + struct vgtod_ts { 26 + u64 sec; 27 + u64 nsec; 28 + }; 29 + 30 + #define VGTOD_BASES (CLOCK_TAI + 1) 31 + #define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | BIT(CLOCK_TAI)) 32 + #define VGTOD_COARSE (BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE)) 33 + 13 34 /* 14 35 * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time 15 36 * so be carefull by modifying this structure. 
16 37 */ 17 38 struct vsyscall_gtod_data { 18 - unsigned seq; 39 + unsigned int seq; 19 40 20 - int vclock_mode; 21 - u64 cycle_last; 22 - u64 mask; 23 - u32 mult; 24 - u32 shift; 41 + int vclock_mode; 42 + u64 cycle_last; 43 + u64 mask; 44 + u32 mult; 45 + u32 shift; 25 46 26 - /* open coded 'struct timespec' */ 27 - u64 wall_time_snsec; 28 - gtod_long_t wall_time_sec; 29 - gtod_long_t monotonic_time_sec; 30 - u64 monotonic_time_snsec; 31 - gtod_long_t wall_time_coarse_sec; 32 - gtod_long_t wall_time_coarse_nsec; 33 - gtod_long_t monotonic_time_coarse_sec; 34 - gtod_long_t monotonic_time_coarse_nsec; 47 + struct vgtod_ts basetime[VGTOD_BASES]; 35 48 36 49 int tz_minuteswest; 37 50 int tz_dsttime; ··· 57 44 return READ_ONCE(vclocks_used) & (1 << vclock); 58 45 } 59 46 60 - static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) 47 + static inline unsigned int gtod_read_begin(const struct vsyscall_gtod_data *s) 61 48 { 62 - unsigned ret; 49 + unsigned int ret; 63 50 64 51 repeat: 65 52 ret = READ_ONCE(s->seq); ··· 72 59 } 73 60 74 61 static inline int gtod_read_retry(const struct vsyscall_gtod_data *s, 75 - unsigned start) 62 + unsigned int start) 76 63 { 77 64 smp_rmb(); 78 65 return unlikely(s->seq != start);
+22
arch/x86/kernel/time.c
··· 10 10 * 11 11 */ 12 12 13 + #include <linux/clocksource.h> 13 14 #include <linux/clockchips.h> 14 15 #include <linux/interrupt.h> 15 16 #include <linux/irq.h> ··· 105 104 void __init time_init(void) 106 105 { 107 106 late_time_init = x86_late_time_init; 107 + } 108 + 109 + /* 110 + * Sanity check the vdso related archdata content. 111 + */ 112 + void clocksource_arch_init(struct clocksource *cs) 113 + { 114 + if (cs->archdata.vclock_mode == VCLOCK_NONE) 115 + return; 116 + 117 + if (cs->archdata.vclock_mode > VCLOCK_MAX) { 118 + pr_warn("clocksource %s registered with invalid vclock_mode %d. Disabling vclock.\n", 119 + cs->name, cs->archdata.vclock_mode); 120 + cs->archdata.vclock_mode = VCLOCK_NONE; 121 + } 122 + 123 + if (cs->mask != CLOCKSOURCE_MASK(64)) { 124 + pr_warn("clocksource %s registered with invalid mask %016llx. Disabling vclock.\n", 125 + cs->name, cs->mask); 126 + cs->archdata.vclock_mode = VCLOCK_NONE; 127 + } 108 128 }
+5
include/linux/clocksource.h
··· 241 241 __clocksource_update_freq_scale(cs, 1000, khz); 242 242 } 243 243 244 + #ifdef CONFIG_ARCH_CLOCKSOURCE_INIT 245 + extern void clocksource_arch_init(struct clocksource *cs); 246 + #else 247 + static inline void clocksource_arch_init(struct clocksource *cs) { } 248 + #endif 244 249 245 250 extern int timekeeping_notify(struct clocksource *clock); 246 251
+4
kernel/time/Kconfig
··· 12 12 config ARCH_CLOCKSOURCE_DATA 13 13 bool 14 14 15 + # Architecture has extra clocksource init called from registration 16 + config ARCH_CLOCKSOURCE_INIT 17 + bool 18 + 15 19 # Clocksources require validation of the clocksource against the last 16 20 # cycle update - x86/TSC misfeature 17 21 config CLOCKSOURCE_VALIDATE_LAST_CYCLE
+2
kernel/time/clocksource.c
··· 937 937 { 938 938 unsigned long flags; 939 939 940 + clocksource_arch_init(cs); 941 + 940 942 /* Initialize mult/shift and max_idle_ns */ 941 943 __clocksource_update_freq_scale(cs, scale, freq); 942 944