Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

lib: optimize cpumask_next_and()

We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
It's essentially a joined iteration in search of a non-zero bit, which is
currently implemented as a lookup join (find a nonzero bit on the lhs,
lookup the rhs to see if it's set there).

Implement a direct join (find a nonzero bit on the incrementally built
join). Also add generic bitmap benchmarks in the new `test_find_bit`
module for the new function (see `find_next_and_bit` in [2] and [3] below).

For cpumask_next_and, direct benchmarking shows that it's 1.17x to 14x
faster with a geometric mean of 2.1 on 32 CPUs [1]. No impact on memory
usage. Note that on Arm, the new pure-C implementation still outperforms
the old one that uses a mix of C and asm (`find_next_bit`) [3].

[1] Approximate benchmark code:

```
unsigned long src1p[nr_cpumask_longs] = {pattern1};
unsigned long src2p[nr_cpumask_longs] = {pattern2};
for (/*a bunch of repetitions*/) {
for (int n = -1; n <= nr_cpu_ids; ++n) {
asm volatile("" : "+rm"(src1p)); // prevent any optimization
asm volatile("" : "+rm"(src2p));
unsigned long result = cpumask_next_and(n, src1p, src2p);
asm volatile("" : "+rm"(result));
}
}
```

Results:
pattern1 pattern2 time_before/time_after
0x0000ffff 0x0000ffff 1.65
0x0000ffff 0x00005555 2.24
0x0000ffff 0x00001111 2.94
0x0000ffff 0x00000000 14.0
0x00005555 0x0000ffff 1.67
0x00005555 0x00005555 1.71
0x00005555 0x00001111 1.90
0x00005555 0x00000000 6.58
0x00001111 0x0000ffff 1.46
0x00001111 0x00005555 1.49
0x00001111 0x00001111 1.45
0x00001111 0x00000000 3.10
0x00000000 0x0000ffff 1.18
0x00000000 0x00005555 1.18
0x00000000 0x00001111 1.17
0x00000000 0x00000000 1.25
-----------------------------
geo.mean 2.06

[2] test_find_next_bit, X86 (skylake)

[ 3913.477422] Start testing find_bit() with random-filled bitmap
[ 3913.477847] find_next_bit: 160868 cycles, 16484 iterations
[ 3913.477933] find_next_zero_bit: 169542 cycles, 16285 iterations
[ 3913.478036] find_last_bit: 201638 cycles, 16483 iterations
[ 3913.480214] find_first_bit: 4353244 cycles, 16484 iterations
[ 3913.480216] Start testing find_next_and_bit() with random-filled bitmap
[ 3913.481074] find_next_and_bit: 89604 cycles, 8216 iterations
[ 3913.481075] Start testing find_bit() with sparse bitmap
[ 3913.481078] find_next_bit: 2536 cycles, 66 iterations
[ 3913.481252] find_next_zero_bit: 344404 cycles, 32703 iterations
[ 3913.481255] find_last_bit: 2006 cycles, 66 iterations
[ 3913.481265] find_first_bit: 17488 cycles, 66 iterations
[ 3913.481266] Start testing find_next_and_bit() with sparse bitmap
[ 3913.481272] find_next_and_bit: 764 cycles, 1 iterations

[3] test_find_next_bit, arm (v7 odroid XU3).

[ 267.206928] Start testing find_bit() with random-filled bitmap
[ 267.214752] find_next_bit: 4474 cycles, 16419 iterations
[ 267.221850] find_next_zero_bit: 5976 cycles, 16350 iterations
[ 267.229294] find_last_bit: 4209 cycles, 16419 iterations
[ 267.279131] find_first_bit: 1032991 cycles, 16420 iterations
[ 267.286265] Start testing find_next_and_bit() with random-filled bitmap
[ 267.302386] find_next_and_bit: 2290 cycles, 8140 iterations
[ 267.309422] Start testing find_bit() with sparse bitmap
[ 267.316054] find_next_bit: 191 cycles, 66 iterations
[ 267.322726] find_next_zero_bit: 8758 cycles, 32703 iterations
[ 267.329803] find_last_bit: 84 cycles, 66 iterations
[ 267.336169] find_first_bit: 4118 cycles, 66 iterations
[ 267.342627] Start testing find_next_and_bit() with sparse bitmap
[ 267.356919] find_next_and_bit: 91 cycles, 1 iterations

[courbet@google.com: v6]
Link: http://lkml.kernel.org/r/20171129095715.23430-1-courbet@google.com
[geert@linux-m68k.org: m68k/bitops: always include <asm-generic/bitops/find.h>]
Link: http://lkml.kernel.org/r/1512556816-28627-1-git-send-email-geert@linux-m68k.org
Link: http://lkml.kernel.org/r/20171128131334.23491-1-courbet@google.com
Signed-off-by: Clement Courbet <courbet@google.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Clement Courbet; committed by Linus Torvalds.
0ade34c3 15ff67bf

+147 -33
+1
arch/arm/include/asm/bitops.h
··· 338 338 339 339 #endif 340 340 341 + #include <asm-generic/bitops/find.h> 341 342 #include <asm-generic/bitops/le.h> 342 343 343 344 /*
+2 -1
arch/m68k/include/asm/bitops.h
··· 311 311 * functions. 312 312 */ 313 313 #if defined(CONFIG_CPU_HAS_NO_BITFIELDS) 314 - #include <asm-generic/bitops/find.h> 315 314 #include <asm-generic/bitops/ffz.h> 316 315 #else 317 316 ··· 439 440 } 440 441 441 442 #endif 443 + 444 + #include <asm-generic/bitops/find.h> 442 445 443 446 #ifdef __KERNEL__ 444 447
+2
arch/unicore32/include/asm/bitops.h
··· 44 44 #define find_first_bit find_first_bit 45 45 #define find_first_zero_bit find_first_zero_bit 46 46 47 + #include <asm-generic/bitops/find.h> 48 + 47 49 #endif /* __UNICORE_BITOPS_H__ */
+20
include/asm-generic/bitops/find.h
··· 16 16 size, unsigned long offset); 17 17 #endif 18 18 19 + #ifndef find_next_and_bit 20 + /** 21 + * find_next_and_bit - find the next set bit in both memory regions 22 + * @addr1: The first address to base the search on 23 + * @addr2: The second address to base the search on 24 + * @offset: The bitnumber to start searching at 25 + * @size: The bitmap size in bits 26 + * 27 + * Returns the bit number for the next set bit 28 + * If no bits are set, returns @size. 29 + */ 30 + extern unsigned long find_next_and_bit(const unsigned long *addr1, 31 + const unsigned long *addr2, unsigned long size, 32 + unsigned long offset); 33 + #endif 34 + 19 35 #ifndef find_next_zero_bit 20 36 /** 21 37 * find_next_zero_bit - find the next cleared bit in a memory region ··· 71 55 unsigned long size); 72 56 #else /* CONFIG_GENERIC_FIND_FIRST_BIT */ 73 57 58 + #ifndef find_first_bit 74 59 #define find_first_bit(addr, size) find_next_bit((addr), (size), 0) 60 + #endif 61 + #ifndef find_first_zero_bit 75 62 #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) 63 + #endif 76 64 77 65 #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ 78 66
+5 -1
include/linux/bitmap.h
··· 88 88 * test_and_change_bit(bit, addr) Change bit and return old value 89 89 * find_first_zero_bit(addr, nbits) Position first zero bit in *addr 90 90 * find_first_bit(addr, nbits) Position first set bit in *addr 91 - * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit 91 + * find_next_zero_bit(addr, nbits, bit) 92 + * Position next zero bit in *addr >= bit 92 93 * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit 94 + * find_next_and_bit(addr1, addr2, nbits, bit) 95 + * Same as find_next_bit, but in 96 + * (*addr1 & *addr2) 93 97 * 94 98 */ 95 99
+5 -4
lib/cpumask.c
··· 33 33 int cpumask_next_and(int n, const struct cpumask *src1p, 34 34 const struct cpumask *src2p) 35 35 { 36 - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) 37 - if (cpumask_test_cpu(n, src2p)) 38 - break; 39 - return n; 36 + /* -1 is a legal arg here. */ 37 + if (n != -1) 38 + cpumask_check(n); 39 + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), 40 + nr_cpumask_bits, n + 1); 40 41 } 41 42 EXPORT_SYMBOL(cpumask_next_and); 42 43
+43 -16
lib/find_bit.c
··· 21 21 #include <linux/export.h> 22 22 #include <linux/kernel.h> 23 23 24 - #if !defined(find_next_bit) || !defined(find_next_zero_bit) 24 + #if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ 25 + !defined(find_next_and_bit) 25 26 26 27 /* 27 - * This is a common helper function for find_next_bit and 28 - * find_next_zero_bit. The difference is the "invert" argument, which 29 - * is XORed with each fetched word before searching it for one bits. 28 + * This is a common helper function for find_next_bit, find_next_zero_bit, and 29 + * find_next_and_bit. The differences are: 30 + * - The "invert" argument, which is XORed with each fetched word before 31 + * searching it for one bits. 32 + * - The optional "addr2", which is anded with "addr1" if present. 30 33 */ 31 - static unsigned long _find_next_bit(const unsigned long *addr, 32 - unsigned long nbits, unsigned long start, unsigned long invert) 34 + static inline unsigned long _find_next_bit(const unsigned long *addr1, 35 + const unsigned long *addr2, unsigned long nbits, 36 + unsigned long start, unsigned long invert) 33 37 { 34 38 unsigned long tmp; 35 39 36 40 if (unlikely(start >= nbits)) 37 41 return nbits; 38 42 39 - tmp = addr[start / BITS_PER_LONG] ^ invert; 43 + tmp = addr1[start / BITS_PER_LONG]; 44 + if (addr2) 45 + tmp &= addr2[start / BITS_PER_LONG]; 46 + tmp ^= invert; 40 47 41 48 /* Handle 1st word. 
*/ 42 49 tmp &= BITMAP_FIRST_WORD_MASK(start); ··· 54 47 if (start >= nbits) 55 48 return nbits; 56 49 57 - tmp = addr[start / BITS_PER_LONG] ^ invert; 50 + tmp = addr1[start / BITS_PER_LONG]; 51 + if (addr2) 52 + tmp &= addr2[start / BITS_PER_LONG]; 53 + tmp ^= invert; 58 54 } 59 55 60 56 return min(start + __ffs(tmp), nbits); ··· 71 61 unsigned long find_next_bit(const unsigned long *addr, unsigned long size, 72 62 unsigned long offset) 73 63 { 74 - return _find_next_bit(addr, size, offset, 0UL); 64 + return _find_next_bit(addr, NULL, size, offset, 0UL); 75 65 } 76 66 EXPORT_SYMBOL(find_next_bit); 77 67 #endif ··· 80 70 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, 81 71 unsigned long offset) 82 72 { 83 - return _find_next_bit(addr, size, offset, ~0UL); 73 + return _find_next_bit(addr, NULL, size, offset, ~0UL); 84 74 } 85 75 EXPORT_SYMBOL(find_next_zero_bit); 76 + #endif 77 + 78 + #if !defined(find_next_and_bit) 79 + unsigned long find_next_and_bit(const unsigned long *addr1, 80 + const unsigned long *addr2, unsigned long size, 81 + unsigned long offset) 82 + { 83 + return _find_next_bit(addr1, addr2, size, offset, 0UL); 84 + } 85 + EXPORT_SYMBOL(find_next_and_bit); 86 86 #endif 87 87 88 88 #ifndef find_first_bit ··· 166 146 } 167 147 168 148 #if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) 169 - static unsigned long _find_next_bit_le(const unsigned long *addr, 170 - unsigned long nbits, unsigned long start, unsigned long invert) 149 + static inline unsigned long _find_next_bit_le(const unsigned long *addr1, 150 + const unsigned long *addr2, unsigned long nbits, 151 + unsigned long start, unsigned long invert) 171 152 { 172 153 unsigned long tmp; 173 154 174 155 if (unlikely(start >= nbits)) 175 156 return nbits; 176 157 177 - tmp = addr[start / BITS_PER_LONG] ^ invert; 158 + tmp = addr1[start / BITS_PER_LONG]; 159 + if (addr2) 160 + tmp &= addr2[start / BITS_PER_LONG]; 161 + tmp ^= invert; 178 162 179 163 /* 
Handle 1st word. */ 180 164 tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start)); ··· 189 165 if (start >= nbits) 190 166 return nbits; 191 167 192 - tmp = addr[start / BITS_PER_LONG] ^ invert; 168 + tmp = addr1[start / BITS_PER_LONG]; 169 + if (addr2) 170 + tmp &= addr2[start / BITS_PER_LONG]; 171 + tmp ^= invert; 193 172 } 194 173 195 174 return min(start + __ffs(ext2_swab(tmp)), nbits); ··· 203 176 unsigned long find_next_zero_bit_le(const void *addr, unsigned 204 177 long size, unsigned long offset) 205 178 { 206 - return _find_next_bit_le(addr, size, offset, ~0UL); 179 + return _find_next_bit_le(addr, NULL, size, offset, ~0UL); 207 180 } 208 181 EXPORT_SYMBOL(find_next_zero_bit_le); 209 182 #endif ··· 212 185 unsigned long find_next_bit_le(const void *addr, unsigned 213 186 long size, unsigned long offset) 214 187 { 215 - return _find_next_bit_le(addr, size, offset, 0UL); 188 + return _find_next_bit_le(addr, NULL, size, offset, 0UL); 216 189 } 217 190 EXPORT_SYMBOL(find_next_bit_le); 218 191 #endif
+24 -1
lib/find_bit_benchmark.c
··· 35 35 #define SPARSE 500 36 36 37 37 static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata; 38 + static DECLARE_BITMAP(bitmap2, BITMAP_LEN) __initdata; 38 39 39 40 /* 40 41 * This is Schlemiel the Painter's algorithm. It should be called after ··· 104 103 return 0; 105 104 } 106 105 106 + static int __init test_find_next_and_bit(const void *bitmap, 107 + const void *bitmap2, unsigned long len) 108 + { 109 + unsigned long i, cnt; 110 + cycles_t cycles; 111 + 112 + cycles = get_cycles(); 113 + for (cnt = i = 0; i < BITMAP_LEN; cnt++) 114 + i = find_next_and_bit(bitmap, bitmap2, BITMAP_LEN, i+1); 115 + cycles = get_cycles() - cycles; 116 + pr_err("find_next_and_bit:\t\t%llu cycles, %ld iterations\n", 117 + (u64)cycles, cnt); 118 + 119 + return 0; 120 + } 121 + 107 122 static int __init find_bit_test(void) 108 123 { 109 124 unsigned long nbits = BITMAP_LEN / SPARSE; ··· 127 110 pr_err("\nStart testing find_bit() with random-filled bitmap\n"); 128 111 129 112 get_random_bytes(bitmap, sizeof(bitmap)); 113 + get_random_bytes(bitmap2, sizeof(bitmap2)); 130 114 131 115 test_find_next_bit(bitmap, BITMAP_LEN); 132 116 test_find_next_zero_bit(bitmap, BITMAP_LEN); 133 117 test_find_last_bit(bitmap, BITMAP_LEN); 134 118 test_find_first_bit(bitmap, BITMAP_LEN); 119 + test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); 135 120 136 121 pr_err("\nStart testing find_bit() with sparse bitmap\n"); 137 122 138 123 bitmap_zero(bitmap, BITMAP_LEN); 124 + bitmap_zero(bitmap2, BITMAP_LEN); 139 125 140 - while (nbits--) 126 + while (nbits--) { 141 127 __set_bit(prandom_u32() % BITMAP_LEN, bitmap); 128 + __set_bit(prandom_u32() % BITMAP_LEN, bitmap2); 129 + } 142 130 143 131 test_find_next_bit(bitmap, BITMAP_LEN); 144 132 test_find_next_zero_bit(bitmap, BITMAP_LEN); 145 133 test_find_last_bit(bitmap, BITMAP_LEN); 146 134 test_find_first_bit(bitmap, BITMAP_LEN); 135 + test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); 147 136 148 137 /* 149 138 * Everything is OK. 
Return error just to let user run benchmark
+16
tools/include/asm-generic/bitops/find.h
··· 16 16 size, unsigned long offset); 17 17 #endif 18 18 19 + #ifndef find_next_and_bit 20 + /** 21 + * find_next_and_bit - find the next set bit in both memory regions 22 + * @addr1: The first address to base the search on 23 + * @addr2: The second address to base the search on 24 + * @offset: The bitnumber to start searching at 25 + * @size: The bitmap size in bits 26 + * 27 + * Returns the bit number for the next set bit 28 + * If no bits are set, returns @size. 29 + */ 30 + extern unsigned long find_next_and_bit(const unsigned long *addr1, 31 + const unsigned long *addr2, unsigned long size, 32 + unsigned long offset); 33 + #endif 34 + 19 35 #ifndef find_next_zero_bit 20 36 21 37 /**
+29 -10
tools/lib/find_bit.c
··· 22 22 #include <linux/bitmap.h> 23 23 #include <linux/kernel.h> 24 24 25 - #if !defined(find_next_bit) 25 + #if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ 26 + !defined(find_next_and_bit) 26 27 27 28 /* 28 - * This is a common helper function for find_next_bit and 29 - * find_next_zero_bit. The difference is the "invert" argument, which 30 - * is XORed with each fetched word before searching it for one bits. 29 + * This is a common helper function for find_next_bit, find_next_zero_bit, and 30 + * find_next_and_bit. The differences are: 31 + * - The "invert" argument, which is XORed with each fetched word before 32 + * searching it for one bits. 33 + * - The optional "addr2", which is anded with "addr1" if present. 31 34 */ 32 - static unsigned long _find_next_bit(const unsigned long *addr, 33 - unsigned long nbits, unsigned long start, unsigned long invert) 35 + static inline unsigned long _find_next_bit(const unsigned long *addr1, 36 + const unsigned long *addr2, unsigned long nbits, 37 + unsigned long start, unsigned long invert) 34 38 { 35 39 unsigned long tmp; 36 40 37 41 if (unlikely(start >= nbits)) 38 42 return nbits; 39 43 40 - tmp = addr[start / BITS_PER_LONG] ^ invert; 44 + tmp = addr1[start / BITS_PER_LONG]; 45 + if (addr2) 46 + tmp &= addr2[start / BITS_PER_LONG]; 47 + tmp ^= invert; 41 48 42 49 /* Handle 1st word. 
*/ 43 50 tmp &= BITMAP_FIRST_WORD_MASK(start); ··· 55 48 if (start >= nbits) 56 49 return nbits; 57 50 58 - tmp = addr[start / BITS_PER_LONG] ^ invert; 51 + tmp = addr1[start / BITS_PER_LONG]; 52 + if (addr2) 53 + tmp &= addr2[start / BITS_PER_LONG]; 54 + tmp ^= invert; 59 55 } 60 56 61 57 return min(start + __ffs(tmp), nbits); ··· 72 62 unsigned long find_next_bit(const unsigned long *addr, unsigned long size, 73 63 unsigned long offset) 74 64 { 75 - return _find_next_bit(addr, size, offset, 0UL); 65 + return _find_next_bit(addr, NULL, size, offset, 0UL); 76 66 } 77 67 #endif 78 68 ··· 114 104 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, 115 105 unsigned long offset) 116 106 { 117 - return _find_next_bit(addr, size, offset, ~0UL); 107 + return _find_next_bit(addr, NULL, size, offset, ~0UL); 108 + } 109 + #endif 110 + 111 + #ifndef find_next_and_bit 112 + unsigned long find_next_and_bit(const unsigned long *addr1, 113 + const unsigned long *addr2, unsigned long size, 114 + unsigned long offset) 115 + { 116 + return _find_next_bit(addr1, addr2, size, offset, 0UL); 118 117 } 119 118 #endif