Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

lib/GCD.c: use binary GCD algorithm instead of Euclidean

The binary GCD algorithm is based on the following facts:
1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
3. If a and b are both odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidean algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a fast
__ffs (find least significant set bit) instruction is available. This
allows the unpredictable branches in the bit-at-a-time shifting loop to
be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/*
 * Exchange the values of a and b.
 *
 * Uses a temporary instead of the XOR trick: the XOR version silently
 * zeroes the operand when a and b name the same object, and is no
 * faster on any modern compiler.  __typeof__ keeps the macro generic
 * (GCC/Clang extension; the file already relies on GCC builtins).
 */
#define swap(a, b) \
do { \
	__typeof__(a) __swap_tmp = (a); \
	(a) = (b); \
	(b) = __swap_tmp; \
} while (0)

/*
 * Division-based Euclidean GCD -- the reference implementation the
 * binary variants are benchmarked against.  gcd0(x, 0) == gcd0(0, x) == x.
 */
unsigned long gcd0(unsigned long a, unsigned long b)
{
	/* Standard remainder loop; a % b handles a < b automatically. */
	while (b != 0) {
		unsigned long rem = a % b;

		a = b;
		b = rem;
	}
	return a;
}

/*
 * Binary GCD using __builtin_ctzl to strip all factors of two in one
 * step.  gcd1(x, 0) == gcd1(0, x) == x.
 */
unsigned long gcd1(unsigned long a, unsigned long b)
{
	unsigned long both = a | b;

	if (a == 0 || b == 0)
		return both;	/* gcd(x, 0) == x; covers (0, 0) too */

	b >>= __builtin_ctzl(b);	/* make b odd */

	while (1) {
		a >>= __builtin_ctzl(a);	/* make a odd */
		if (a == b)
			return a << __builtin_ctzl(both); /* restore shared 2s */

		if (a < b) {
			unsigned long t = a;

			a = b;
			b = t;
		}
		a -= b;		/* odd - odd: even, shifted next pass */
	}
}

/*
 * "Even/odd" binary GCD for CPUs without a fast ctz: normalization is
 * done one bit at a time, and the subtract step halves twice per
 * iteration using fact 3 of the algorithm
 * (gcd(a,b) == gcd((a-b)/2, b) == gcd((a+b)/2, b) for odd a, b).
 */
unsigned long gcd2(unsigned long a, unsigned long b)
{
	unsigned long bit = a | b;

	if (a == 0 || b == 0)
		return bit;	/* gcd(x, 0) == x */

	bit &= -bit;	/* lowest set bit of a|b == shared power of two */

	while (!(b & bit))
		b >>= 1;	/* shift b down to the shared bit */

	while (1) {
		while (!(a & bit))
			a >>= 1;
		if (a == b)
			return a;

		if (a < b) {
			unsigned long t = a;

			a = b;
			b = t;
		}
		/*
		 * a and b are both odd multiples of bit, so a - b is an
		 * even multiple; halve it twice, adding b back when the
		 * intermediate still carries bit (the (a+b)/2 identity).
		 */
		a -= b;
		a >>= 1;
		if (a & bit)
			a += b;
		a >>= 1;
	}
}

/*
 * Binary GCD with ctz plus early exits: once either operand reduces
 * to 1, the answer is just the shared power of two (lsbit of a|b).
 */
unsigned long gcd3(unsigned long a, unsigned long b)
{
	unsigned long both = a | b;

	if (a == 0 || b == 0)
		return both;

	b >>= __builtin_ctzl(b);	/* make b odd */
	if (b == 1)
		return both & -both;	/* gcd is the common factor of 2 */

	while (1) {
		a >>= __builtin_ctzl(a);	/* make a odd */
		if (a == 1)
			return both & -both;
		if (a == b)
			return a << __builtin_ctzl(both);

		if (a < b) {
			unsigned long t = a;

			a = b;
			b = t;
		}
		a -= b;
	}
}

/*
 * Even/odd binary GCD (no ctz) with early exits: once either operand
 * shrinks to the shared power of two itself, that is the answer.
 */
unsigned long gcd4(unsigned long a, unsigned long b)
{
	unsigned long bit = a | b;

	if (a == 0 || b == 0)
		return bit;

	bit &= -bit;	/* lowest set bit of a|b == shared power of two */

	while (!(b & bit))
		b >>= 1;
	if (b == bit)
		return bit;

	while (1) {
		while (!(a & bit))
			a >>= 1;
		if (a == bit)
			return bit;
		if (a == b)
			return a;

		if (a < b) {
			unsigned long t = a;

			a = b;
			b = t;
		}
		/*
		 * a, b are both odd multiples of bit: halve the even
		 * difference twice, adding b back when the intermediate
		 * still carries bit (gcd((a-b)/2, b) == gcd((a+b)/2, b)).
		 */
		a -= b;
		a >>= 1;
		if (a & bit)
			a += b;
		a >>= 1;
	}
}

/* Table of all GCD implementations under test; gcd0 (Euclid) is the reference. */
static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
gcd0, gcd1, gcd2, gcd3, gcd4,
};

/* Number of implementations in gcd_func. */
#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

#if defined(__x86_64__)

/*
 * Read the x86-64 time-stamp counter into val.  The rdtsc instruction
 * returns the low 32 bits in EAX and the high 32 bits in EDX; the two
 * halves are recombined into a single 64-bit cycle count.
 */
#define rdtscll(val) do { \
unsigned long __a,__d; \
__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
} while(0)

/*
 * Time one gcd(a, b) call in TSC cycles (x86-64 variant) and store the
 * gcd result through res.  Returns the elapsed cycle count.
 */
static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
unsigned long a, unsigned long b, unsigned long *res)
{
unsigned long long start, end;
unsigned long long ret;
unsigned long gcd_res;

rdtscll(start);
gcd_res = gcd(a, b);
rdtscll(end);

/* end < start can only happen if the 64-bit counter wrapped between reads. */
if (end >= start)
ret = end - start;
else
ret = ~0ULL - start + 1 + end;

*res = gcd_res;
return ret;
}

#else

/* Sample the per-process CPU-time clock (generic, non-x86 timing path). */
static inline struct timespec read_time(void)
{
	struct timespec now;

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &now);
	return now;
}

/*
 * Nanoseconds elapsed from start to end.  Assumes end >= start; the
 * nanosecond field is normalized by borrowing one second when needed.
 */
static inline unsigned long long diff_time(struct timespec start, struct timespec end)
{
	unsigned long long sec = end.tv_sec - start.tv_sec;
	long nsec = end.tv_nsec - start.tv_nsec;

	if (nsec < 0) {
		sec -= 1;		/* borrow */
		nsec += 1000000000L;
	}

	return sec * 1000000000ULL + nsec;
}

static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
unsigned long a, unsigned long b, unsigned long *res)
{
struct timespec start, end;
unsigned long gcd_res;

start = read_time();
gcd_res = gcd(a, b);
end = read_time();

*res = gcd_res;
return diff_time(start, end);
}

#endif

/*
 * Produce a random unsigned long spanning (most of) the type's range.
 * On 64-bit longs two rand() results are combined; note rand() yields
 * at most 31 random bits, so a few high bits are never set.
 */
static inline unsigned long get_rand(void)
{
	if (sizeof(long) == 8) {
		/*
		 * Shift in two 16-bit steps: a single "<< 32" is undefined
		 * behavior when long is 32 bits wide, even though this
		 * branch is dead code there.
		 */
		return ((unsigned long)rand() << 16 << 16) | rand();
	}
	return rand();
}

/*
 * Benchmark driver: times every gcd implementation on random (or
 * user-supplied) operands and cross-checks that they all agree.
 *
 * Options: -n loops (input pairs), -r repeats per pair, -s RNG seed.
 * An optional positional argument fixes the second gcd operand.
 *
 * Fixes vs. original: the malloc() result is checked, and mismatch
 * values are printed with %lu (they are unsigned long; %ld was wrong).
 */
int main(int argc, char **argv)
{
	unsigned int seed = time(0);
	int loops = 100;
	int repeats = 1000;
	unsigned long (*res)[TEST_ENTRIES];
	unsigned long long elapsed[TEST_ENTRIES];
	int i, j, k;

	for (;;) {
		int opt = getopt(argc, argv, "n:r:s:");

		/* End condition always first */
		if (opt == -1)
			break;

		switch (opt) {
		case 'n':
			loops = atoi(optarg);
			break;
		case 'r':
			repeats = atoi(optarg);
			break;
		case 's':
			seed = strtoul(optarg, NULL, 10);
			break;
		default:
			/* getopt() already printed a diagnostic. */
			break;
		}
	}

	res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
	if (!res) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	memset(elapsed, 0, sizeof(elapsed));

	/* Pass 1: benchmark.  Keep the minimum time over all repeats. */
	srand(seed);
	for (j = 0; j < loops; j++) {
		unsigned long a = get_rand();
		/* Fixed second operand if given on the command line. */
		unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
		unsigned long long min_elapsed[TEST_ENTRIES];

		for (k = 0; k < repeats; k++) {
			for (i = 0; i < (int)TEST_ENTRIES; i++) {
				unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);

				if (k == 0 || min_elapsed[i] > tmp)
					min_elapsed[i] = tmp;
			}
		}
		for (i = 0; i < (int)TEST_ENTRIES; i++)
			elapsed[i] += min_elapsed[i];
	}

	for (i = 0; i < (int)TEST_ENTRIES; i++)
		printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

	/*
	 * Pass 2: replay the same RNG sequence to recover each (a, b) and
	 * verify every implementation matched gcd0's result.
	 */
	k = 0;	/* becomes 1 once the "Error:" header has been printed */
	srand(seed);
	for (j = 0; j < loops; j++) {
		unsigned long a = get_rand();
		unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();

		for (i = 1; i < (int)TEST_ENTRIES; i++) {
			if (res[j][i] != res[j][0])
				break;
		}
		if (i < (int)TEST_ENTRIES) {
			if (k == 0) {
				k = 1;
				fprintf(stderr, "Error:\n");
			}
			fprintf(stderr, "gcd(%lu, %lu): ", a, b);
			for (i = 0; i < (int)TEST_ENTRIES; i++)
				fprintf(stderr, "%lu%s", res[j][i],
					i < (int)TEST_ENTRIES - 1 ? ", " : "\n");
		}
	}

	if (k == 0)
		fprintf(stderr, "PASS\n");

	free(res);

	return 0;
}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 10174
gcd1: elapsed 2120
gcd2: elapsed 2902
gcd3: elapsed 2039
gcd4: elapsed 2812
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9309
gcd1: elapsed 2280
gcd2: elapsed 2822
gcd3: elapsed 2217
gcd4: elapsed 2710
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9589
gcd1: elapsed 2098
gcd2: elapsed 2815
gcd3: elapsed 2030
gcd4: elapsed 2718
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9914
gcd1: elapsed 2309
gcd2: elapsed 2779
gcd3: elapsed 2228
gcd4: elapsed 2709
PASS

[akpm@linux-foundation.org: avoid #defining a CONFIG_ variable]
Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Zhaoxiu Zeng and committed by
Linus Torvalds
fff7fb0b 3bcadd6f

+107 -10
+3
arch/Kconfig
··· 647 647 config ARCH_NO_COHERENT_DMA_MMAP 648 648 bool 649 649 650 + config CPU_NO_EFFICIENT_FFS 651 + def_bool n 652 + 650 653 source "kernel/gcov/Kconfig"
+1
arch/alpha/Kconfig
··· 26 26 select MODULES_USE_ELF_RELA 27 27 select ODD_RT_SIGACTION 28 28 select OLD_SIGSUSPEND 29 + select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 29 30 help 30 31 The Alpha is a 64-bit general-purpose processor designed and 31 32 marketed by the Digital Equipment Corporation of blessed memory,
+1
arch/arc/Kconfig
··· 107 107 108 108 config ISA_ARCOMPACT 109 109 bool "ARCompact ISA" 110 + select CPU_NO_EFFICIENT_FFS 110 111 help 111 112 The original ARC ISA of ARC600/700 cores 112 113
+3
arch/arm/mm/Kconfig
··· 421 421 select CPU_USE_DOMAINS if MMU 422 422 select NEED_KUSER_HELPERS 423 423 select TLS_REG_EMUL if SMP || !MMU 424 + select CPU_NO_EFFICIENT_FFS 424 425 425 426 config CPU_32v4 426 427 bool 427 428 select CPU_USE_DOMAINS if MMU 428 429 select NEED_KUSER_HELPERS 429 430 select TLS_REG_EMUL if SMP || !MMU 431 + select CPU_NO_EFFICIENT_FFS 430 432 431 433 config CPU_32v4T 432 434 bool 433 435 select CPU_USE_DOMAINS if MMU 434 436 select NEED_KUSER_HELPERS 435 437 select TLS_REG_EMUL if SMP || !MMU 438 + select CPU_NO_EFFICIENT_FFS 436 439 437 440 config CPU_32v5 438 441 bool
+1
arch/h8300/Kconfig
··· 20 20 select HAVE_KERNEL_GZIP 21 21 select HAVE_KERNEL_LZO 22 22 select HAVE_ARCH_KGDB 23 + select CPU_NO_EFFICIENT_FFS 23 24 24 25 config RWSEM_GENERIC_SPINLOCK 25 26 def_bool y
+1
arch/m32r/Kconfig
··· 17 17 select ARCH_USES_GETTIMEOFFSET 18 18 select MODULES_USE_ELF_RELA 19 19 select HAVE_DEBUG_STACKOVERFLOW 20 + select CPU_NO_EFFICIENT_FFS 20 21 21 22 config SBUS 22 23 bool
+11
arch/m68k/Kconfig.cpu
··· 40 40 select CPU_HAS_NO_MULDIV64 41 41 select CPU_HAS_NO_UNALIGNED 42 42 select GENERIC_CSUM 43 + select CPU_NO_EFFICIENT_FFS 43 44 help 44 45 The Freescale (was Motorola) 68000 CPU is the first generation of 45 46 the well known M68K family of processors. The CPU core as well as ··· 52 51 bool 53 52 select CPU_HAS_NO_BITFIELDS 54 53 select CPU_HAS_NO_UNALIGNED 54 + select CPU_NO_EFFICIENT_FFS 55 55 help 56 56 The Freescale (was then Motorola) CPU32 is a CPU core that is 57 57 based on the 68020 processor. For the most part it is used in ··· 132 130 depends on !MMU 133 131 select COLDFIRE_SW_A7 134 132 select HAVE_MBAR 133 + select CPU_NO_EFFICIENT_FFS 135 134 help 136 135 Motorola ColdFire 5206 processor support. 137 136 ··· 141 138 depends on !MMU 142 139 select COLDFIRE_SW_A7 143 140 select HAVE_MBAR 141 + select CPU_NO_EFFICIENT_FFS 144 142 help 145 143 Motorola ColdFire 5206e processor support. 146 144 ··· 167 163 depends on !MMU 168 164 select COLDFIRE_SW_A7 169 165 select HAVE_MBAR 166 + select CPU_NO_EFFICIENT_FFS 170 167 help 171 168 Motorola ColdFire 5249 processor support. 172 169 ··· 176 171 depends on !MMU 177 172 select COLDFIRE_SW_A7 178 173 select HAVE_MBAR 174 + select CPU_NO_EFFICIENT_FFS 179 175 help 180 176 Freescale (Motorola) Coldfire 5251/5253 processor support. 181 177 ··· 195 189 depends on !MMU 196 190 select COLDFIRE_SW_A7 197 191 select HAVE_MBAR 192 + select CPU_NO_EFFICIENT_FFS 198 193 help 199 194 Motorola ColdFire 5272 processor support. 200 195 ··· 224 217 select COLDFIRE_SW_A7 225 218 select HAVE_CACHE_CB 226 219 select HAVE_MBAR 220 + select CPU_NO_EFFICIENT_FFS 227 221 help 228 222 Motorola ColdFire 5307 processor support. 229 223 ··· 250 242 select COLDFIRE_SW_A7 251 243 select HAVE_CACHE_CB 252 244 select HAVE_MBAR 245 + select CPU_NO_EFFICIENT_FFS 253 246 help 254 247 Motorola ColdFire 5407 processor support. 
255 248 ··· 260 251 select MMU_COLDFIRE if MMU 261 252 select HAVE_CACHE_CB 262 253 select HAVE_MBAR 254 + select CPU_NO_EFFICIENT_FFS 263 255 help 264 256 Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support. 265 257 ··· 270 260 select M54xx 271 261 select HAVE_CACHE_CB 272 262 select HAVE_MBAR 263 + select CPU_NO_EFFICIENT_FFS 273 264 help 274 265 Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support. 275 266
+1
arch/metag/Kconfig
··· 30 30 select OF 31 31 select OF_EARLY_FLATTREE 32 32 select SPARSE_IRQ 33 + select CPU_NO_EFFICIENT_FFS 33 34 34 35 config STACKTRACE_SUPPORT 35 36 def_bool y
+1
arch/microblaze/Kconfig
··· 32 32 select OF_EARLY_FLATTREE 33 33 select TRACING_SUPPORT 34 34 select VIRT_TO_BUS 35 + select CPU_NO_EFFICIENT_FFS 35 36 36 37 config SWAP 37 38 def_bool n
+10
arch/mips/include/asm/cpu-features.h
··· 204 204 #endif 205 205 #endif 206 206 207 + /* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */ 208 + #if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \ 209 + (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \ 210 + (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \ 211 + (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \ 212 + (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \ 213 + (defined(cpu_has_mips64r6) && cpu_has_mips64r6)) 214 + #define CPU_NO_EFFICIENT_FFS 1 215 + #endif 216 + 207 217 #ifndef cpu_has_mips_1 208 218 # define cpu_has_mips_1 (!cpu_has_mips_r6) 209 219 #endif
+1
arch/nios2/Kconfig
··· 15 15 select SOC_BUS 16 16 select SPARSE_IRQ 17 17 select USB_ARCH_HAS_HCD if USB_SUPPORT 18 + select CPU_NO_EFFICIENT_FFS 18 19 19 20 config GENERIC_CSUM 20 21 def_bool y
+1
arch/openrisc/Kconfig
··· 25 25 select MODULES_USE_ELF_RELA 26 26 select HAVE_DEBUG_STACKOVERFLOW 27 27 select OR1K_PIC 28 + select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1 28 29 29 30 config MMU 30 31 def_bool y
+1
arch/parisc/Kconfig
··· 32 32 select HAVE_ARCH_AUDITSYSCALL 33 33 select HAVE_ARCH_SECCOMP_FILTER 34 34 select ARCH_NO_COHERENT_DMA_MMAP 35 + select CPU_NO_EFFICIENT_FFS 35 36 36 37 help 37 38 The PA-RISC microprocessor is designed by Hewlett-Packard and used
+1
arch/s390/Kconfig
··· 123 123 select HAVE_ARCH_AUDITSYSCALL 124 124 select HAVE_ARCH_EARLY_PFN_TO_NID 125 125 select HAVE_ARCH_JUMP_LABEL 126 + select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES 126 127 select HAVE_ARCH_SECCOMP_FILTER 127 128 select HAVE_ARCH_SOFT_DIRTY 128 129 select HAVE_ARCH_TRACEHOOK
+1
arch/score/Kconfig
··· 14 14 select VIRT_TO_BUS 15 15 select MODULES_USE_ELF_REL 16 16 select CLONE_BACKWARDS 17 + select CPU_NO_EFFICIENT_FFS 17 18 18 19 choice 19 20 prompt "System type"
+1
arch/sh/Kconfig
··· 20 20 select PERF_USE_VMALLOC 21 21 select HAVE_DEBUG_KMEMLEAK 22 22 select HAVE_KERNEL_GZIP 23 + select CPU_NO_EFFICIENT_FFS 23 24 select HAVE_KERNEL_BZIP2 24 25 select HAVE_KERNEL_LZMA 25 26 select HAVE_KERNEL_XZ
+1
arch/sparc/Kconfig
··· 42 42 select ODD_RT_SIGACTION 43 43 select OLD_SIGSUSPEND 44 44 select ARCH_HAS_SG_CHAIN 45 + select CPU_NO_EFFICIENT_FFS 45 46 46 47 config SPARC32 47 48 def_bool !64BIT
+67 -10
lib/gcd.c
··· 2 2 #include <linux/gcd.h> 3 3 #include <linux/export.h> 4 4 5 - /* Greatest common divisor */ 5 + /* 6 + * This implements the binary GCD algorithm. (Often attributed to Stein, 7 + * but as Knuth has noted, appears in a first-century Chinese math text.) 8 + * 9 + * This is faster than the division-based algorithm even on x86, which 10 + * has decent hardware division. 11 + */ 12 + 13 + #if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS) 14 + 15 + /* If __ffs is available, the even/odd algorithm benchmarks slower. */ 6 16 unsigned long gcd(unsigned long a, unsigned long b) 7 17 { 8 - unsigned long r; 18 + unsigned long r = a | b; 9 19 10 - if (a < b) 11 - swap(a, b); 20 + if (!a || !b) 21 + return r; 12 22 13 - if (!b) 14 - return a; 15 - while ((r = a % b) != 0) { 16 - a = b; 17 - b = r; 23 + b >>= __ffs(b); 24 + if (b == 1) 25 + return r & -r; 26 + 27 + for (;;) { 28 + a >>= __ffs(a); 29 + if (a == 1) 30 + return r & -r; 31 + if (a == b) 32 + return a << __ffs(r); 33 + 34 + if (a < b) 35 + swap(a, b); 36 + a -= b; 18 37 } 19 - return b; 20 38 } 39 + 40 + #else 41 + 42 + /* If normalization is done by loops, the even/odd algorithm is a win. */ 43 + unsigned long gcd(unsigned long a, unsigned long b) 44 + { 45 + unsigned long r = a | b; 46 + 47 + if (!a || !b) 48 + return r; 49 + 50 + /* Isolate lsbit of r */ 51 + r &= -r; 52 + 53 + while (!(b & r)) 54 + b >>= 1; 55 + if (b == r) 56 + return r; 57 + 58 + for (;;) { 59 + while (!(a & r)) 60 + a >>= 1; 61 + if (a == r) 62 + return r; 63 + if (a == b) 64 + return a; 65 + 66 + if (a < b) 67 + swap(a, b); 68 + a -= b; 69 + a >>= 1; 70 + if (a & r) 71 + a += b; 72 + a >>= 1; 73 + } 74 + } 75 + 76 + #endif 77 + 21 78 EXPORT_SYMBOL_GPL(gcd);