
sched/cputime: Improve cputime_adjust()

People report that utime and stime from /proc/<pid>/stat become very
wrong when the numbers are big enough, especially if you watch these
counters incrementally.

Specifically, the current implementation of stime*rtime/total results in
a saw-tooth function on top of the desired line, where the teeth grow in
size the larger the values become. IOW, it has a relative error.

The result is that, when watching incrementally as time progresses
(for large values), we'll see periods of pure stime or utime increase,
irrespective of the actual ratio we're striving for.
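
To make the effect concrete, here is an illustrative user-space sketch
(not part of the patch): scale_stime() is a port of the helper this
commit removes, and the reference value uses a 128-bit intermediate
(assumes GCC/Clang's unsigned __int128). For large inputs the two
results diverge; watched incrementally, that divergence is the
saw-tooth described above.

#include <stdio.h>
#include <stdint.h>

/* User-space port of the removed kernel helper, for demonstration. */
static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime) {
			uint64_t tmp = rtime;
			rtime = stime;
			stime = tmp;
		}

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* Grow stime and shrink rtime, try to make both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* Drop bits from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/* 32x32->64 multiply, 64/32->64 divide, as in the kernel version */
	return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
}

int main(void)
{
	/* roughly 100 days of CPU time in nanoseconds, 1:3 stime:utime */
	uint64_t stime = 2160000000000000ULL;
	uint64_t utime = 6480000000000000ULL;
	uint64_t rtime = stime + utime + 9999999ULL;
	uint64_t approx = scale_stime(stime, rtime, stime + utime);
	uint64_t exact  = (uint64_t)((unsigned __int128)stime * rtime /
				     (stime + utime));

	printf("approx=%llu exact=%llu diff=%lld\n",
	       (unsigned long long)approx, (unsigned long long)exact,
	       (long long)(exact - approx));
	return 0;
}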

Replace scale_stime() with a math64.h helper: mul_u64_u64_div_u64()
that is far more accurate. This also allows architectures to override
the implementation -- for instance they can opt for the old algorithm
if this new one turns out to be too expensive for them.
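
For reference, the semantics of the new helper can be sketched in user
space with a 128-bit intermediate (illustrative only; assumes GCC/Clang's
unsigned __int128, which the kernel's generic code does not rely on):

#include <stdint.h>

/*
 * Reference semantics: form the full 128-bit product before dividing, so
 * the quotient is exact whenever it fits in u64. The x86 inline asm added
 * by this patch (mulq/divq) computes exactly this and, as its comment
 * notes, raises #DE when the quotient does not fit; the generic
 * lib/math/div64.c fallback approximates it without 128-bit arithmetic.
 */
static inline uint64_t mul_u64_u64_div_u64_ref(uint64_t a, uint64_t mul,
					       uint64_t div)
{
	return (uint64_t)((unsigned __int128)a * mul / div);
}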

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200519172506.GA317395@hirez.programming.kicks-ass.net

Authored by Oleg Nesterov, committed by Peter Zijlstra
commit 3dc167ba, parent b3a9e3b9

---
 arch/x86/include/asm/div64.h | 14 ++++++++++++--
 include/linux/math64.h       |  2 ++
 kernel/sched/cputime.c       | 46 +---------------------
 lib/math/div64.c             | 41 ++++++++++++++++++++++
 4 files changed, 56 insertions(+), 47 deletions(-)
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -74,15 +74,25 @@
 #else
 # include <asm-generic/div64.h>
 
-static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+/*
+ * Will generate an #DE when the result doesn't fit u64, could fix with an
+ * __ex_table[] entry when it becomes an issue.
+ */
+static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
 {
 	u64 q;
 
 	asm ("mulq %2; divq %3" : "=a" (q)
-	     : "a" (a), "rm" ((u64)mul), "rm" ((u64)div)
+	     : "a" (a), "rm" (mul), "rm" (div)
 	     : "rdx");
 
 	return q;
+}
+#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+
+static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+{
+	return mul_u64_u64_div_u64(a, mul, div);
 }
 #define mul_u64_u32_div mul_u64_u32_div
 
diff --git a/include/linux/math64.h b/include/linux/math64.h
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -263,6 +263,8 @@
 }
 #endif /* mul_u64_u32_div */
 
+u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+
 #define DIV64_U64_ROUND_UP(ll, d)	\
 	({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -520,50 +520,6 @@
 }
 
 /*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
-	u64 scaled;
-
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime)
-			swap(rtime, stime);
-
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
-
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
-		rtime >>= 1;
-		total >>= 1;
-	}
-
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
-}
-
-/*
  * Adjust tick based cputime random precision against scheduler runtime
  * accounting.
  *
@@ -578,7 +622,7 @@
 		goto update;
 	}
 
-	stime = scale_stime(stime, rtime, stime + utime);
+	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
 
 update:
 	/*
diff --git a/lib/math/div64.c b/lib/math/div64.c
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -190,3 +190,44 @@
 	return __iter_div_u64_rem(dividend, divisor, remainder);
 }
 EXPORT_SYMBOL(iter_div_u64_rem);
+
+#ifndef mul_u64_u64_div_u64
+u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+{
+	u64 res = 0, div, rem;
+	int shift;
+
+	/* can a * b overflow ? */
+	if (ilog2(a) + ilog2(b) > 62) {
+		/*
+		 * (b * a) / c is equal to
+		 *
+		 *      (b / c) * a +
+		 *      (b % c) * a / c
+		 *
+		 * if nothing overflows. Can the 1st multiplication
+		 * overflow? Yes, but we do not care: this can only
+		 * happen if the end result can't fit in u64 anyway.
+		 *
+		 * So the code below does
+		 *
+		 *      res = (b / c) * a;
+		 *      b = b % c;
+		 */
+		div = div64_u64_rem(b, c, &rem);
+		res = div * a;
+		b = rem;
+
+		shift = ilog2(a) + ilog2(b) - 62;
+		if (shift > 0) {
+			/* drop precision */
+			b >>= shift;
+			c >>= shift;
+			if (!c)
+				return res;
+		}
+	}
+
+	return res + div64_u64(a * b, c);
+}
+#endif
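
The generic fallback above relies on the identity, exact under integer
(floor) division:

    a * b / c  ==  (b / c) * a  +  ((b % c) * a) / c

and only drops precision when (b % c) * a would itself still overflow.
A minimal user-space self-check of the identity (illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/*
	 * e.g. a = 7, b = 100, c = 9:
	 *   (100 / 9) * 7       = 77
	 *   ((100 % 9) * 7) / 9 = 7 / 9 = 0
	 *   77 + 0 = 77         = 700 / 9
	 */
	uint64_t a = 7, b = 100, c = 9;
	uint64_t res = (b / c) * a + ((b % c) * a) / c;

	assert(res == a * b / c);
	return 0;
}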