vsprintf: further optimize decimal conversion

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Previous code was using optimizations which were developed to work well
even on narrow-word CPUs (by today's standards). But Linux runs only on
32-bit and wider CPUs. We can use that.

First: using 32x32->64 multiply and trivial 32-bit shift, we can correctly
divide by 10 much larger numbers, and thus we can print groups of 9 digits
instead of groups of 5 digits.

Next: there are two algorithms to print larger numbers. One is generic:
divide by 1000000000 and repeatedly print groups of (up to) 9 digits.
It's conceptually simple, but requires an (unsigned long long) /
1000000000 division.

Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
manipulates them cleverly and generates groups of 4 decimal digits. It so
happens that it does NOT require long long division.

If long is > 32 bits, division of 64-bit values is relatively easy, and we
will use the first algorithm. If long long is > 64 bits (strange
architecture with VERY large long long), second algorithm can't be used,
and we again use the first one.

Else (if long is 32 bits and long long is 64 bits) we use second one.

And third: there is a simple optimization which takes fast path not only
for zero as was done before, but for all one-digit numbers.

In all tested cases new code is faster than old one, in many cases by 30%,
in few cases by more than 50% (for example, on x86-32, conversion of
12345678). Code growth is ~0 in 32-bit case and ~130 bytes in 64-bit
case.

This patch is based upon an original from Michal Nazarewicz.

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Douglas W Jones <jones@cs.uiowa.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Denys Vlasenko and committed by

Linus Torvalds 14 years ago 133fd9f5 725fe002

+196 -93

2 changed files

expand all

include

asm-generic

bitsperlong.h

lib

vsprintf.c

include/asm-generic/bitsperlong.h

··· 28 28 #error Inconsistent word size. Check asm/bitsperlong.h 29 29 #endif 30 30 31 + #ifndef BITS_PER_LONG_LONG 32 + #define BITS_PER_LONG_LONG 64 33 + #endif 34 + 31 35 #endif /* __KERNEL__ */ 32 36 #endif /* __ASM_GENERIC_BITS_PER_LONG */

+192 -93

lib/vsprintf.c

··· 112 112 /* Decimal conversion is by far the most typical, and is used 113 113 * for /proc and /sys data. This directly impacts e.g. top performance 114 114 * with many processes running. We optimize it for speed 115 - * using code from 116 - * http://www.cs.uiowa.edu/~jones/bcd/decimal.html 117 - * (with permission from the author, Douglas W. Jones). */ 115 + * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html> 116 + * (with permission from the author, Douglas W. Jones). 117 + */ 118 118 119 - /* Formats correctly any integer in [0,99999]. 120 - * Outputs from one to five digits depending on input. 121 - * On i386 gcc 4.1.2 -O2: ~250 bytes of code. */ 119 + #if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 120 + /* Formats correctly any integer in [0, 999999999] */ 122 121 static noinline_for_stack 123 - char *put_dec_trunc(char *buf, unsigned q) 122 + char *put_dec_full9(char *buf, unsigned q) 124 123 { 125 - unsigned d3, d2, d1, d0; 126 - d1 = (q>>4) & 0xf; 127 - d2 = (q>>8) & 0xf; 128 - d3 = (q>>12); 129 - 130 - d0 = 6*(d3 + d2 + d1) + (q & 0xf); 131 - q = (d0 * 0xcd) >> 11; 132 - d0 = d0 - 10*q; 133 - *buf++ = d0 + '0'; /* least significant digit */ 134 - d1 = q + 9*d3 + 5*d2 + d1; 135 - if (d1 != 0) { 136 - q = (d1 * 0xcd) >> 11; 137 - d1 = d1 - 10*q; 138 - *buf++ = d1 + '0'; /* next digit */ 139 - 140 - d2 = q + 2*d2; 141 - if ((d2 != 0) || (d3 != 0)) { 142 - q = (d2 * 0xd) >> 7; 143 - d2 = d2 - 10*q; 144 - *buf++ = d2 + '0'; /* next digit */ 145 - 146 - d3 = q + 4*d3; 147 - if (d3 != 0) { 148 - q = (d3 * 0xcd) >> 11; 149 - d3 = d3 - 10*q; 150 - *buf++ = d3 + '0'; /* next digit */ 151 - if (q != 0) 152 - *buf++ = q + '0'; /* most sign. digit */ 153 - } 154 - } 155 - } 156 - 157 - return buf; 158 - } 159 - /* Same with if's removed. Always emits five digits */ 160 - static noinline_for_stack 161 - char *put_dec_full(char *buf, unsigned q) 162 - { 163 - /* BTW, if q is in [0,9999], 8-bit ints will be enough, */ 164 - /* but anyway, gcc produces better code with full-sized ints */ 165 - unsigned d3, d2, d1, d0; 166 - d1 = (q>>4) & 0xf; 167 - d2 = (q>>8) & 0xf; 168 - d3 = (q>>12); 124 + unsigned r; 169 125 170 126 /* 171 127 * Possible ways to approx. divide by 10 172 - * gcc -O2 replaces multiply with shifts and adds 173 - * (x * 0xcd) >> 11: 11001101 - shorter code than * 0x67 (on i386) 174 - * (x * 0x67) >> 10: 1100111 175 - * (x * 0x34) >> 9: 110100 - same 176 - * (x * 0x1a) >> 8: 11010 - same 177 - * (x * 0x0d) >> 7: 1101 - same, shortest code (on i386) 128 + * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit) 129 + * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul) 130 + * (x * 0x6667) >> 18 x < 43699 131 + * (x * 0x3334) >> 17 x < 16389 132 + * (x * 0x199a) >> 16 x < 16389 133 + * (x * 0x0ccd) >> 15 x < 16389 134 + * (x * 0x0667) >> 14 x < 2739 135 + * (x * 0x0334) >> 13 x < 1029 136 + * (x * 0x019a) >> 12 x < 1029 137 + * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386) 138 + * (x * 0x0067) >> 10 x < 179 139 + * (x * 0x0034) >> 9 x < 69 same 140 + * (x * 0x001a) >> 8 x < 69 same 141 + * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386) 142 + * (x * 0x0007) >> 6 x < 19 143 + * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html> 178 144 */ 179 - d0 = 6*(d3 + d2 + d1) + (q & 0xf); 180 - q = (d0 * 0xcd) >> 11; 181 - d0 = d0 - 10*q; 182 - *buf++ = d0 + '0'; 183 - d1 = q + 9*d3 + 5*d2 + d1; 184 - q = (d1 * 0xcd) >> 11; 185 - d1 = d1 - 10*q; 186 - *buf++ = d1 + '0'; 145 + r = (q * (uint64_t)0x1999999a) >> 32; 146 + *buf++ = (q - 10 * r) + '0'; /* 1 */ 147 + q = (r * (uint64_t)0x1999999a) >> 32; 148 + *buf++ = (r - 10 * q) + '0'; /* 2 */ 149 + r = (q * (uint64_t)0x1999999a) >> 32; 150 + *buf++ = (q - 10 * r) + '0'; /* 3 */ 151 + q = (r * (uint64_t)0x1999999a) >> 32; 152 + *buf++ = (r - 10 * q) + '0'; /* 4 */ 153 + r = (q * (uint64_t)0x1999999a) >> 32; 154 + *buf++ = (q - 10 * r) + '0'; /* 5 */ 155 + /* Now value is under 10000, can avoid 64-bit multiply */ 156 + q = (r * 0x199a) >> 16; 157 + *buf++ = (r - 10 * q) + '0'; /* 6 */ 158 + r = (q * 0xcd) >> 11; 159 + *buf++ = (q - 10 * r) + '0'; /* 7 */ 160 + q = (r * 0xcd) >> 11; 161 + *buf++ = (r - 10 * q) + '0'; /* 8 */ 162 + *buf++ = q + '0'; /* 9 */ 163 + return buf; 164 + } 165 + #endif 187 166 188 - d2 = q + 2*d2; 189 - q = (d2 * 0xd) >> 7; 190 - d2 = d2 - 10*q; 191 - *buf++ = d2 + '0'; 167 + /* Similar to above but do not pad with zeros. 168 + * Code can be easily arranged to print 9 digits too, but our callers 169 + * always call put_dec_full9() instead when the number has 9 decimal digits. 170 + */ 171 + static noinline_for_stack 172 + char *put_dec_trunc8(char *buf, unsigned r) 173 + { 174 + unsigned q; 192 175 193 - d3 = q + 4*d3; 194 - q = (d3 * 0xcd) >> 11; /* - shorter code */ 195 - /* q = (d3 * 0x67) >> 10; - would also work */ 196 - d3 = d3 - 10*q; 197 - *buf++ = d3 + '0'; 198 - *buf++ = q + '0'; 176 + /* Copy of previous function's body with added early returns */ 177 + q = (r * (uint64_t)0x1999999a) >> 32; 178 + *buf++ = (r - 10 * q) + '0'; /* 2 */ 179 + if (q == 0) 180 + return buf; 181 + r = (q * (uint64_t)0x1999999a) >> 32; 182 + *buf++ = (q - 10 * r) + '0'; /* 3 */ 183 + if (r == 0) 184 + return buf; 185 + q = (r * (uint64_t)0x1999999a) >> 32; 186 + *buf++ = (r - 10 * q) + '0'; /* 4 */ 187 + if (q == 0) 188 + return buf; 189 + r = (q * (uint64_t)0x1999999a) >> 32; 190 + *buf++ = (q - 10 * r) + '0'; /* 5 */ 191 + if (r == 0) 192 + return buf; 193 + q = (r * 0x199a) >> 16; 194 + *buf++ = (r - 10 * q) + '0'; /* 6 */ 195 + if (q == 0) 196 + return buf; 197 + r = (q * 0xcd) >> 11; 198 + *buf++ = (q - 10 * r) + '0'; /* 7 */ 199 + if (r == 0) 200 + return buf; 201 + q = (r * 0xcd) >> 11; 202 + *buf++ = (r - 10 * q) + '0'; /* 8 */ 203 + if (q == 0) 204 + return buf; 205 + *buf++ = q + '0'; /* 9 */ 206 + return buf; 207 + } 208 + 209 + /* There are two algorithms to print larger numbers. 210 + * One is generic: divide by 1000000000 and repeatedly print 211 + * groups of (up to) 9 digits. It's conceptually simple, 212 + * but requires a (unsigned long long) / 1000000000 division. 213 + * 214 + * Second algorithm splits 64-bit unsigned long long into 16-bit chunks, 215 + * manipulates them cleverly and generates groups of 4 decimal digits. 216 + * It so happens that it does NOT require long long division. 217 + * 218 + * If long is > 32 bits, division of 64-bit values is relatively easy, 219 + * and we will use the first algorithm. 220 + * If long long is > 64 bits (strange architecture with VERY large long long), 221 + * second algorithm can't be used, and we again use the first one. 222 + * 223 + * Else (if long is 32 bits and long long is 64 bits) we use second one. 224 + */ 225 + 226 + #if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 227 + 228 + /* First algorithm: generic */ 229 + 230 + static 231 + char *put_dec(char *buf, unsigned long long n) 232 + { 233 + if (n >= 100*1000*1000) { 234 + while (n >= 1000*1000*1000) 235 + buf = put_dec_full9(buf, do_div(n, 1000*1000*1000)); 236 + if (n >= 100*1000*1000) 237 + return put_dec_full9(buf, n); 238 + } 239 + return put_dec_trunc8(buf, n); 240 + } 241 + 242 + #else 243 + 244 + /* Second algorithm: valid only for 64-bit long longs */ 245 + 246 + static noinline_for_stack 247 + char *put_dec_full4(char *buf, unsigned q) 248 + { 249 + unsigned r; 250 + r = (q * 0xcccd) >> 19; 251 + *buf++ = (q - 10 * r) + '0'; 252 + q = (r * 0x199a) >> 16; 253 + *buf++ = (r - 10 * q) + '0'; 254 + r = (q * 0xcd) >> 11; 255 + *buf++ = (q - 10 * r) + '0'; 256 + *buf++ = r + '0'; 257 + return buf; 258 + } 259 + 260 + /* Based on code by Douglas W. Jones found at 261 + * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour> 262 + * (with permission from the author). 263 + * Performs no 64-bit division and hence should be fast on 32-bit machines. 264 + */ 265 + static 266 + char *put_dec(char *buf, unsigned long long n) 267 + { 268 + uint32_t d3, d2, d1, q, h; 269 + 270 + if (n < 100*1000*1000) 271 + return put_dec_trunc8(buf, n); 272 + 273 + d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */ 274 + h = (n >> 32); 275 + d2 = (h ) & 0xffff; 276 + d3 = (h >> 16); /* implicit "& 0xffff" */ 277 + 278 + q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); 279 + 280 + buf = put_dec_full4(buf, q % 10000); 281 + q = q / 10000; 282 + 283 + d1 = q + 7671 * d3 + 9496 * d2 + 6 * d1; 284 + buf = put_dec_full4(buf, d1 % 10000); 285 + q = d1 / 10000; 286 + 287 + d2 = q + 4749 * d3 + 42 * d2; 288 + buf = put_dec_full4(buf, d2 % 10000); 289 + q = d2 / 10000; 290 + 291 + d3 = q + 281 * d3; 292 + if (!d3) 293 + goto done; 294 + buf = put_dec_full4(buf, d3 % 10000); 295 + q = d3 / 10000; 296 + if (!q) 297 + goto done; 298 + buf = put_dec_full4(buf, q); 299 + done: 300 + while (buf[-1] == '0') 301 + --buf; 199 302 200 303 return buf; 201 304 } 202 - /* No inlining helps gcc to use registers better */ 203 - static noinline_for_stack 204 - char *put_dec(char *buf, unsigned long long num) 205 - { 206 - while (1) { 207 - unsigned rem; 208 - if (num < 100000) 209 - return put_dec_trunc(buf, num); 210 - rem = do_div(num, 100000); 211 - buf = put_dec_full(buf, rem); 212 - } 213 - } 305 + 306 + #endif 214 307 215 308 /* 216 309 * Convert passed number to decimal string. ··· 313 220 */ 314 221 int num_to_str(char *buf, int size, unsigned long long num) 315 222 { 316 - char tmp[21]; /* Enough for 2^64 in decimal */ 223 + char tmp[sizeof(num) * 3]; 317 224 int idx, len; 318 225 319 - len = put_dec(tmp, num) - tmp; 226 + /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ 227 + if (num <= 9) { 228 + tmp[0] = '0' + num; 229 + len = 1; 230 + } else { 231 + len = put_dec(tmp, num) - tmp; 232 + } 320 233 321 234 if (len > size) 322 235 return 0; 323 236 for (idx = 0; idx < len; ++idx) 324 237 buf[idx] = tmp[len - idx - 1]; 325 - return len; 238 + return len; 326 239 } 327 240 328 241 #define ZEROPAD 1 /* pad with zero */ ··· 413 314 414 315 /* generate full string in tmp[], in reverse order */ 415 316 i = 0; 416 - if (num == 0) 417 - tmp[i++] = '0'; 317 + if (num < spec.base) 318 + tmp[i++] = digits[num] | locase; 418 319 /* Generic code, for any base: 419 320 else do { 420 321 tmp[i++] = (digits[do_div(num,base)] | locase); ··· 710 611 } 711 612 for (i = 0; i < 4; i++) { 712 613 char temp[3]; /* hold each IP quad in reverse order */ 713 - int digits = put_dec_trunc(temp, addr[index]) - temp; 614 + int digits = put_dec_trunc8(temp, addr[index]) - temp; 714 615 if (leading_zeros) { 715 616 if (digits < 3) 716 617 *p++ = '0';