Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/asm/bitops: Force inlining of test_and_set_bit and friends

Sometimes GCC mysteriously doesn't inline very small functions
we expect to be inlined, see:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

Arguably, GCC should do better, but GCC people aren't willing
to invest time into it and are asking to use __always_inline
instead.

With this .config:

http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

here's an example of functions getting deinlined many times:

test_and_set_bit (166 copies, ~1260 calls)
55 push %rbp
48 89 e5 mov %rsp,%rbp
f0 48 0f ab 3e lock bts %rdi,(%rsi)
72 04 jb <test_and_set_bit+0xf>
31 c0 xor %eax,%eax
eb 05 jmp <test_and_set_bit+0x14>
b8 01 00 00 00 mov $0x1,%eax
5d pop %rbp
c3 retq

test_and_clear_bit (124 copies, ~1000 calls)
55 push %rbp
48 89 e5 mov %rsp,%rbp
f0 48 0f b3 3e lock btr %rdi,(%rsi)
72 04 jb <test_and_clear_bit+0xf>
31 c0 xor %eax,%eax
eb 05 jmp <test_and_clear_bit+0x14>
b8 01 00 00 00 mov $0x1,%eax
5d pop %rbp
c3 retq

change_bit (3 copies, 8 calls)
55 push %rbp
48 89 e5 mov %rsp,%rbp
f0 48 0f bb 3e lock btc %rdi,(%rsi)
5d pop %rbp
c3 retq

clear_bit_unlock (2 copies, 11 calls)
55 push %rbp
48 89 e5 mov %rsp,%rbp
f0 48 0f b3 3e lock btr %rdi,(%rsi)
5d pop %rbp
c3 retq

This patch works around it via s/inline/__always_inline/.

Code size decreased by ~13.5k after the patch:

text data bss dec filename
92110727 20826144 36417536 149354407 vmlinux.before
92097234 20826176 36417536 149340946 vmlinux.after

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Graf <tgraf@suug.ch>
Link: http://lkml.kernel.org/r/1454881887-1367-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Denys Vlasenko and committed by Ingo Molnar
(commit 8dd5032d, parent d99e1bd1)

Diffstat: +18 -18 (18 insertions, 18 deletions)
arch/x86/include/asm/bitops.h
··· 91 91 * If it's called on the same region of memory simultaneously, the effect 92 92 * may be that only one operation succeeds. 93 93 */ 94 - static inline void __set_bit(long nr, volatile unsigned long *addr) 94 + static __always_inline void __set_bit(long nr, volatile unsigned long *addr) 95 95 { 96 96 asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); 97 97 } ··· 128 128 * clear_bit() is atomic and implies release semantics before the memory 129 129 * operation. It can be used for an unlock. 130 130 */ 131 - static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) 131 + static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr) 132 132 { 133 133 barrier(); 134 134 clear_bit(nr, addr); 135 135 } 136 136 137 - static inline void __clear_bit(long nr, volatile unsigned long *addr) 137 + static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) 138 138 { 139 139 asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); 140 140 } ··· 151 151 * No memory barrier is required here, because x86 cannot reorder stores past 152 152 * older loads. Same principle as spin_unlock. 153 153 */ 154 - static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) 154 + static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) 155 155 { 156 156 barrier(); 157 157 __clear_bit(nr, addr); ··· 166 166 * If it's called on the same region of memory simultaneously, the effect 167 167 * may be that only one operation succeeds. 168 168 */ 169 - static inline void __change_bit(long nr, volatile unsigned long *addr) 169 + static __always_inline void __change_bit(long nr, volatile unsigned long *addr) 170 170 { 171 171 asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); 172 172 } ··· 180 180 * Note that @nr may be almost arbitrarily large; this function is not 181 181 * restricted to acting on a single-word quantity. 
182 182 */ 183 - static inline void change_bit(long nr, volatile unsigned long *addr) 183 + static __always_inline void change_bit(long nr, volatile unsigned long *addr) 184 184 { 185 185 if (IS_IMMEDIATE(nr)) { 186 186 asm volatile(LOCK_PREFIX "xorb %1,%0" ··· 201 201 * This operation is atomic and cannot be reordered. 202 202 * It also implies a memory barrier. 203 203 */ 204 - static inline int test_and_set_bit(long nr, volatile unsigned long *addr) 204 + static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr) 205 205 { 206 206 GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); 207 207 } ··· 228 228 * If two examples of this operation race, one can appear to succeed 229 229 * but actually fail. You must protect multiple accesses with a lock. 230 230 */ 231 - static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) 231 + static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) 232 232 { 233 233 int oldbit; 234 234 ··· 247 247 * This operation is atomic and cannot be reordered. 248 248 * It also implies a memory barrier. 249 249 */ 250 - static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) 250 + static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr) 251 251 { 252 252 GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); 253 253 } ··· 268 268 * accessed from a hypervisor on the same CPU if running in a VM: don't change 269 269 * this without also updating arch/x86/kernel/kvm.c 270 270 */ 271 - static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) 271 + static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) 272 272 { 273 273 int oldbit; 274 274 ··· 280 280 } 281 281 282 282 /* WARNING: non atomic and it can be reordered! 
*/ 283 - static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) 283 + static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) 284 284 { 285 285 int oldbit; 286 286 ··· 300 300 * This operation is atomic and cannot be reordered. 301 301 * It also implies a memory barrier. 302 302 */ 303 - static inline int test_and_change_bit(long nr, volatile unsigned long *addr) 303 + static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr) 304 304 { 305 305 GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); 306 306 } ··· 311 311 (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; 312 312 } 313 313 314 - static inline int variable_test_bit(long nr, volatile const unsigned long *addr) 314 + static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) 315 315 { 316 316 int oldbit; 317 317 ··· 343 343 * 344 344 * Undefined if no bit exists, so code should check against 0 first. 345 345 */ 346 - static inline unsigned long __ffs(unsigned long word) 346 + static __always_inline unsigned long __ffs(unsigned long word) 347 347 { 348 348 asm("rep; bsf %1,%0" 349 349 : "=r" (word) ··· 357 357 * 358 358 * Undefined if no zero exists, so code should check against ~0UL first. 359 359 */ 360 - static inline unsigned long ffz(unsigned long word) 360 + static __always_inline unsigned long ffz(unsigned long word) 361 361 { 362 362 asm("rep; bsf %1,%0" 363 363 : "=r" (word) ··· 371 371 * 372 372 * Undefined if no set bit exists, so code should check against 0 first. 373 373 */ 374 - static inline unsigned long __fls(unsigned long word) 374 + static __always_inline unsigned long __fls(unsigned long word) 375 375 { 376 376 asm("bsr %1,%0" 377 377 : "=r" (word) ··· 393 393 * set bit if value is nonzero. The first (least significant) bit 394 394 * is at position 1. 
395 395 */ 396 - static inline int ffs(int x) 396 + static __always_inline int ffs(int x) 397 397 { 398 398 int r; 399 399 ··· 434 434 * set bit if value is nonzero. The last (most significant) bit is 435 435 * at position 32. 436 436 */ 437 - static inline int fls(int x) 437 + static __always_inline int fls(int x) 438 438 { 439 439 int r; 440 440