x86/tsc: Make calibration refinement more robust

The threshold in tsc_read_refs() is a constant, which may favor slower CPUs
but may not be optimal for a simple read of the reference clock on faster ones.

Hence, make it proportional to tsc_khz, when available, to compensate for
this. The threshold guards against any disturbance such as IRQs, NMIs, SMIs,
or CPU stealing by the host on guest systems, so rename it accordingly and
fix the comments as well.

Also, on some systems there is noticeable DMI bus contention at some point
during boot, causing the readout to fail repeatedly (observed in about one
out of ~300 boots during testing). In that case, also retry the second
readout instead of simply bailing out with the calibration unrefined.
Usually, one second later, the readout completes quickly without any issues.

Signed-off-by: Daniel Vacek <neelx@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lkml.kernel.org/r/1541437840-29293-1-git-send-email-neelx@redhat.com

authored by Daniel Vacek and committed by Thomas Gleixner a786ef15 65102238

Changed files
+16 -14
arch
x86
kernel
+16 -14
arch/x86/kernel/tsc.c
··· 297 298 __setup("tsc=", tsc_setup); 299 300 - #define MAX_RETRIES 5 301 - #define SMI_TRESHOLD 50000 302 303 /* 304 - * Read TSC and the reference counters. Take care of SMI disturbance 305 */ 306 static u64 tsc_read_refs(u64 *p, int hpet) 307 { 308 u64 t1, t2; 309 int i; 310 311 for (i = 0; i < MAX_RETRIES; i++) { ··· 316 else 317 *p = acpi_pm_read_early(); 318 t2 = get_cycles(); 319 - if ((t2 - t1) < SMI_TRESHOLD) 320 return t2; 321 } 322 return ULLONG_MAX; ··· 704 * zero. In each wait loop iteration we read the TSC and check 705 * the delta to the previous read. We keep track of the min 706 * and max values of that delta. The delta is mostly defined 707 - * by the IO time of the PIT access, so we can detect when a 708 - * SMI/SMM disturbance happened between the two reads. If the 709 * maximum time is significantly larger than the minimum time, 710 * then we discard the result and have another try. 711 * 712 * 2) Reference counter. If available we use the HPET or the 713 * PMTIMER as a reference to check the sanity of that value. 714 * We use separate TSC readouts and check inside of the 715 - * reference read for a SMI/SMM disturbance. We dicard 716 * disturbed values here as well. We do that around the PIT 717 * calibration delay loop as we have to wait for a certain 718 * amount of time anyway. ··· 745 if (ref1 == ref2) 746 continue; 747 748 - /* Check, whether the sampling was disturbed by an SMI */ 749 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) 750 continue; 751 ··· 1269 */ 1270 static void tsc_refine_calibration_work(struct work_struct *work) 1271 { 1272 - static u64 tsc_start = -1, ref_start; 1273 static int hpet; 1274 u64 tsc_stop, ref_stop, delta; 1275 unsigned long freq; ··· 1284 * delayed the first time we expire. So set the workqueue 1285 * again once we know timers are working. 1286 */ 1287 - if (tsc_start == -1) { 1288 /* 1289 * Only set hpet once, to avoid mixing hardware 1290 * if the hpet becomes enabled later. 
1291 */ 1292 hpet = is_hpet_enabled(); 1293 - schedule_delayed_work(&tsc_irqwork, HZ); 1294 tsc_start = tsc_read_refs(&ref_start, hpet); 1295 return; 1296 } 1297 ··· 1302 if (ref_start == ref_stop) 1303 goto out; 1304 1305 - /* Check, whether the sampling was disturbed by an SMI */ 1306 - if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) 1307 - goto out; 1308 1309 delta = tsc_stop - tsc_start; 1310 delta *= 1000000LL;
··· 297 298 __setup("tsc=", tsc_setup); 299 300 + #define MAX_RETRIES 5 301 + #define TSC_DEFAULT_THRESHOLD 0x20000 302 303 /* 304 + * Read TSC and the reference counters. Take care of any disturbances 305 */ 306 static u64 tsc_read_refs(u64 *p, int hpet) 307 { 308 u64 t1, t2; 309 + u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD; 310 int i; 311 312 for (i = 0; i < MAX_RETRIES; i++) { ··· 315 else 316 *p = acpi_pm_read_early(); 317 t2 = get_cycles(); 318 + if ((t2 - t1) < thresh) 319 return t2; 320 } 321 return ULLONG_MAX; ··· 703 * zero. In each wait loop iteration we read the TSC and check 704 * the delta to the previous read. We keep track of the min 705 * and max values of that delta. The delta is mostly defined 706 + * by the IO time of the PIT access, so we can detect when 707 + * any disturbance happened between the two reads. If the 708 * maximum time is significantly larger than the minimum time, 709 * then we discard the result and have another try. 710 * 711 * 2) Reference counter. If available we use the HPET or the 712 * PMTIMER as a reference to check the sanity of that value. 713 * We use separate TSC readouts and check inside of the 714 + * reference read for any possible disturbance. We dicard 715 * disturbed values here as well. We do that around the PIT 716 * calibration delay loop as we have to wait for a certain 717 * amount of time anyway. ··· 744 if (ref1 == ref2) 745 continue; 746 747 + /* Check, whether the sampling was disturbed */ 748 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) 749 continue; 750 ··· 1268 */ 1269 static void tsc_refine_calibration_work(struct work_struct *work) 1270 { 1271 + static u64 tsc_start = ULLONG_MAX, ref_start; 1272 static int hpet; 1273 u64 tsc_stop, ref_stop, delta; 1274 unsigned long freq; ··· 1283 * delayed the first time we expire. So set the workqueue 1284 * again once we know timers are working. 
1285 */ 1286 + if (tsc_start == ULLONG_MAX) { 1287 + restart: 1288 /* 1289 * Only set hpet once, to avoid mixing hardware 1290 * if the hpet becomes enabled later. 1291 */ 1292 hpet = is_hpet_enabled(); 1293 tsc_start = tsc_read_refs(&ref_start, hpet); 1294 + schedule_delayed_work(&tsc_irqwork, HZ); 1295 return; 1296 } 1297 ··· 1300 if (ref_start == ref_stop) 1301 goto out; 1302 1303 + /* Check, whether the sampling was disturbed */ 1304 + if (tsc_stop == ULLONG_MAX) 1305 + goto restart; 1306 1307 delta = tsc_stop - tsc_start; 1308 delta *= 1000000LL;