Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86/asm changes from Ingo Molnar:
"Main changes:

- Apply low-level mutex optimization on x86-64, by Wedson Almeida
Filho.

- Change bitops to be naturally 'long', by H. Peter Anvin.

- Add TSX-NI opcode support to the x86 (instrumentation) decoder, by
Masami Hiramatsu.

- Add clang compatibility adjustments/workarounds, by Jan-Simon
Möller"

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86, doc: Update uaccess.h comment to reflect clang changes
x86, asm: Fix a compilation issue with clang
x86, asm: Extend definitions of _ASM_* with a raw format
x86, insn: Add new opcodes as of June, 2013
x86/ia32/asm: Remove unused argument in macro
x86, bitops: Change bitops to be native operand size
x86: Use asm-goto to implement mutex fast path on x86-64

+107 -54
+1 -1
arch/x86/ia32/ia32entry.S
··· 452 452 453 453 CFI_ENDPROC 454 454 455 - .macro PTREGSCALL label, func, arg 455 + .macro PTREGSCALL label, func 456 456 ALIGN 457 457 GLOBAL(\label) 458 458 leaq \func(%rip),%rax
+5 -1
arch/x86/include/asm/asm.h
··· 3 3 4 4 #ifdef __ASSEMBLY__ 5 5 # define __ASM_FORM(x) x 6 + # define __ASM_FORM_RAW(x) x 6 7 # define __ASM_FORM_COMMA(x) x, 7 8 #else 8 9 # define __ASM_FORM(x) " " #x " " 10 + # define __ASM_FORM_RAW(x) #x 9 11 # define __ASM_FORM_COMMA(x) " " #x "," 10 12 #endif 11 13 12 14 #ifdef CONFIG_X86_32 13 15 # define __ASM_SEL(a,b) __ASM_FORM(a) 16 + # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) 14 17 #else 15 18 # define __ASM_SEL(a,b) __ASM_FORM(b) 19 + # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) 16 20 #endif 17 21 18 22 #define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ 19 23 inst##q##__VA_ARGS__) 20 - #define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) 24 + #define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) 21 25 22 26 #define _ASM_PTR __ASM_SEL(.long, .quad) 23 27 #define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8)
+27 -19
arch/x86/include/asm/bitops.h
··· 15 15 #include <linux/compiler.h> 16 16 #include <asm/alternative.h> 17 17 18 + #if BITS_PER_LONG == 32 19 + # define _BITOPS_LONG_SHIFT 5 20 + #elif BITS_PER_LONG == 64 21 + # define _BITOPS_LONG_SHIFT 6 22 + #else 23 + # error "Unexpected BITS_PER_LONG" 24 + #endif 25 + 18 26 #define BIT_64(n) (U64_C(1) << (n)) 19 27 20 28 /* ··· 67 59 * restricted to acting on a single-word quantity. 68 60 */ 69 61 static __always_inline void 70 - set_bit(unsigned int nr, volatile unsigned long *addr) 62 + set_bit(long nr, volatile unsigned long *addr) 71 63 { 72 64 if (IS_IMMEDIATE(nr)) { 73 65 asm volatile(LOCK_PREFIX "orb %1,%0" ··· 89 81 * If it's called on the same region of memory simultaneously, the effect 90 82 * may be that only one operation succeeds. 91 83 */ 92 - static inline void __set_bit(int nr, volatile unsigned long *addr) 84 + static inline void __set_bit(long nr, volatile unsigned long *addr) 93 85 { 94 86 asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); 95 87 } ··· 105 97 * in order to ensure changes are visible on other processors. 106 98 */ 107 99 static __always_inline void 108 - clear_bit(int nr, volatile unsigned long *addr) 100 + clear_bit(long nr, volatile unsigned long *addr) 109 101 { 110 102 if (IS_IMMEDIATE(nr)) { 111 103 asm volatile(LOCK_PREFIX "andb %1,%0" ··· 126 118 * clear_bit() is atomic and implies release semantics before the memory 127 119 * operation. It can be used for an unlock. 
128 120 */ 129 - static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) 121 + static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) 130 122 { 131 123 barrier(); 132 124 clear_bit(nr, addr); 133 125 } 134 126 135 - static inline void __clear_bit(int nr, volatile unsigned long *addr) 127 + static inline void __clear_bit(long nr, volatile unsigned long *addr) 136 128 { 137 129 asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); 138 130 } ··· 149 141 * No memory barrier is required here, because x86 cannot reorder stores past 150 142 * older loads. Same principle as spin_unlock. 151 143 */ 152 - static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) 144 + static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) 153 145 { 154 146 barrier(); 155 147 __clear_bit(nr, addr); ··· 167 159 * If it's called on the same region of memory simultaneously, the effect 168 160 * may be that only one operation succeeds. 169 161 */ 170 - static inline void __change_bit(int nr, volatile unsigned long *addr) 162 + static inline void __change_bit(long nr, volatile unsigned long *addr) 171 163 { 172 164 asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); 173 165 } ··· 181 173 * Note that @nr may be almost arbitrarily large; this function is not 182 174 * restricted to acting on a single-word quantity. 183 175 */ 184 - static inline void change_bit(int nr, volatile unsigned long *addr) 176 + static inline void change_bit(long nr, volatile unsigned long *addr) 185 177 { 186 178 if (IS_IMMEDIATE(nr)) { 187 179 asm volatile(LOCK_PREFIX "xorb %1,%0" ··· 202 194 * This operation is atomic and cannot be reordered. 203 195 * It also implies a memory barrier. 204 196 */ 205 - static inline int test_and_set_bit(int nr, volatile unsigned long *addr) 197 + static inline int test_and_set_bit(long nr, volatile unsigned long *addr) 206 198 { 207 199 int oldbit; 208 200 ··· 220 212 * This is the same as test_and_set_bit on x86. 
221 213 */ 222 214 static __always_inline int 223 - test_and_set_bit_lock(int nr, volatile unsigned long *addr) 215 + test_and_set_bit_lock(long nr, volatile unsigned long *addr) 224 216 { 225 217 return test_and_set_bit(nr, addr); 226 218 } ··· 234 226 * If two examples of this operation race, one can appear to succeed 235 227 * but actually fail. You must protect multiple accesses with a lock. 236 228 */ 237 - static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) 229 + static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) 238 230 { 239 231 int oldbit; 240 232 ··· 253 245 * This operation is atomic and cannot be reordered. 254 246 * It also implies a memory barrier. 255 247 */ 256 - static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) 248 + static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) 257 249 { 258 250 int oldbit; 259 251 ··· 280 272 * accessed from a hypervisor on the same CPU if running in a VM: don't change 281 273 * this without also updating arch/x86/kernel/kvm.c 282 274 */ 283 - static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 275 + static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) 284 276 { 285 277 int oldbit; 286 278 ··· 292 284 } 293 285 294 286 /* WARNING: non atomic and it can be reordered! */ 295 - static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) 287 + static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) 296 288 { 297 289 int oldbit; 298 290 ··· 312 304 * This operation is atomic and cannot be reordered. 313 305 * It also implies a memory barrier. 
314 306 */ 315 - static inline int test_and_change_bit(int nr, volatile unsigned long *addr) 307 + static inline int test_and_change_bit(long nr, volatile unsigned long *addr) 316 308 { 317 309 int oldbit; 318 310 ··· 323 315 return oldbit; 324 316 } 325 317 326 - static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) 318 + static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) 327 319 { 328 - return ((1UL << (nr % BITS_PER_LONG)) & 329 - (addr[nr / BITS_PER_LONG])) != 0; 320 + return ((1UL << (nr & (BITS_PER_LONG-1))) & 321 + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; 330 322 } 331 323 332 - static inline int variable_test_bit(int nr, volatile const unsigned long *addr) 324 + static inline int variable_test_bit(long nr, volatile const unsigned long *addr) 333 325 { 334 326 int oldbit; 335 327
+30
arch/x86/include/asm/mutex_64.h
··· 16 16 * 17 17 * Atomically decrements @v and calls <fail_fn> if the result is negative. 18 18 */ 19 + #ifdef CC_HAVE_ASM_GOTO 20 + static inline void __mutex_fastpath_lock(atomic_t *v, 21 + void (*fail_fn)(atomic_t *)) 22 + { 23 + asm volatile goto(LOCK_PREFIX " decl %0\n" 24 + " jns %l[exit]\n" 25 + : : "m" (v->counter) 26 + : "memory", "cc" 27 + : exit); 28 + fail_fn(v); 29 + exit: 30 + return; 31 + } 32 + #else 19 33 #define __mutex_fastpath_lock(v, fail_fn) \ 20 34 do { \ 21 35 unsigned long dummy; \ ··· 46 32 : "rax", "rsi", "rdx", "rcx", \ 47 33 "r8", "r9", "r10", "r11", "memory"); \ 48 34 } while (0) 35 + #endif 49 36 50 37 /** 51 38 * __mutex_fastpath_lock_retval - try to take the lock by moving the count ··· 71 56 * 72 57 * Atomically increments @v and calls <fail_fn> if the result is nonpositive. 73 58 */ 59 + #ifdef CC_HAVE_ASM_GOTO 60 + static inline void __mutex_fastpath_unlock(atomic_t *v, 61 + void (*fail_fn)(atomic_t *)) 62 + { 63 + asm volatile goto(LOCK_PREFIX " incl %0\n" 64 + " jg %l[exit]\n" 65 + : : "m" (v->counter) 66 + : "memory", "cc" 67 + : exit); 68 + fail_fn(v); 69 + exit: 70 + return; 71 + } 72 + #else 74 73 #define __mutex_fastpath_unlock(v, fail_fn) \ 75 74 do { \ 76 75 unsigned long dummy; \ ··· 101 72 : "rax", "rsi", "rdx", "rcx", \ 102 73 "r8", "r9", "r10", "r11", "memory"); \ 103 74 } while (0) 75 + #endif 104 76 105 77 #define __mutex_slowpath_needs_to_unlock() 1 106 78
+12 -12
arch/x86/include/asm/sync_bitops.h
··· 26 26 * Note that @nr may be almost arbitrarily large; this function is not 27 27 * restricted to acting on a single-word quantity. 28 28 */ 29 - static inline void sync_set_bit(int nr, volatile unsigned long *addr) 29 + static inline void sync_set_bit(long nr, volatile unsigned long *addr) 30 30 { 31 - asm volatile("lock; btsl %1,%0" 31 + asm volatile("lock; bts %1,%0" 32 32 : "+m" (ADDR) 33 33 : "Ir" (nr) 34 34 : "memory"); ··· 44 44 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() 45 45 * in order to ensure changes are visible on other processors. 46 46 */ 47 - static inline void sync_clear_bit(int nr, volatile unsigned long *addr) 47 + static inline void sync_clear_bit(long nr, volatile unsigned long *addr) 48 48 { 49 - asm volatile("lock; btrl %1,%0" 49 + asm volatile("lock; btr %1,%0" 50 50 : "+m" (ADDR) 51 51 : "Ir" (nr) 52 52 : "memory"); ··· 61 61 * Note that @nr may be almost arbitrarily large; this function is not 62 62 * restricted to acting on a single-word quantity. 63 63 */ 64 - static inline void sync_change_bit(int nr, volatile unsigned long *addr) 64 + static inline void sync_change_bit(long nr, volatile unsigned long *addr) 65 65 { 66 - asm volatile("lock; btcl %1,%0" 66 + asm volatile("lock; btc %1,%0" 67 67 : "+m" (ADDR) 68 68 : "Ir" (nr) 69 69 : "memory"); ··· 77 77 * This operation is atomic and cannot be reordered. 78 78 * It also implies a memory barrier. 79 79 */ 80 - static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) 80 + static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) 81 81 { 82 82 int oldbit; 83 83 84 - asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0" 84 + asm volatile("lock; bts %2,%1\n\tsbbl %0,%0" 85 85 : "=r" (oldbit), "+m" (ADDR) 86 86 : "Ir" (nr) : "memory"); 87 87 return oldbit; ··· 95 95 * This operation is atomic and cannot be reordered. 96 96 * It also implies a memory barrier. 
97 97 */ 98 - static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) 98 + static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) 99 99 { 100 100 int oldbit; 101 101 102 - asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0" 102 + asm volatile("lock; btr %2,%1\n\tsbbl %0,%0" 103 103 : "=r" (oldbit), "+m" (ADDR) 104 104 : "Ir" (nr) : "memory"); 105 105 return oldbit; ··· 113 113 * This operation is atomic and cannot be reordered. 114 114 * It also implies a memory barrier. 115 115 */ 116 - static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr) 116 + static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) 117 117 { 118 118 int oldbit; 119 119 120 - asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0" 120 + asm volatile("lock; btc %2,%1\n\tsbbl %0,%0" 121 121 : "=r" (oldbit), "+m" (ADDR) 122 122 : "Ir" (nr) : "memory"); 123 123 return oldbit;
+5 -2
arch/x86/include/asm/uaccess.h
··· 153 153 * Careful: we have to cast the result to the type of the pointer 154 154 * for sign reasons. 155 155 * 156 - * The use of %edx as the register specifier is a bit of a 156 + * The use of _ASM_DX as the register specifier is a bit of a 157 157 * simplification, as gcc only cares about it as the starting point 158 158 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits 159 159 * (%ecx being the next register in gcc's x86 register sequence), and 160 160 * %rdx on 64 bits. 161 + * 162 + * Clang/LLVM cares about the size of the register, but still wants 163 + * the base register for something that ends up being a pair. 161 164 */ 162 165 #define get_user(x, ptr) \ 163 166 ({ \ 164 167 int __ret_gu; \ 165 - register __inttype(*(ptr)) __val_gu asm("%edx"); \ 168 + register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ 166 169 __chk_user_ptr(ptr); \ 167 170 might_fault(); \ 168 171 asm volatile("call __get_user_%P3" \
+24 -18
arch/x86/lib/x86-opcode-map.txt
··· 1 1 # x86 Opcode Maps 2 2 # 3 3 # This is (mostly) based on following documentations. 4 - # - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2 5 - # (#325383-040US, October 2011) 6 - # - Intel(R) Advanced Vector Extensions Programming Reference 7 - # (#319433-011,JUNE 2011). 4 + # - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C 5 + # (#326018-047US, June 2013) 8 6 # 9 7 #<Opcode maps> 10 8 # Table: table-name ··· 27 29 # - (F3): the last prefix is 0xF3 28 30 # - (F2): the last prefix is 0xF2 29 31 # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) 32 + # - (66&F2): Both 0x66 and 0xF2 prefixes are specified. 30 33 31 34 Table: one byte opcode 32 35 Referrer: ··· 245 246 c3: RETN 246 247 c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) 247 248 c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) 248 - c6: Grp11 Eb,Ib (1A) 249 - c7: Grp11 Ev,Iz (1A) 249 + c6: Grp11A Eb,Ib (1A) 250 + c7: Grp11B Ev,Iz (1A) 250 251 c8: ENTER Iw,Ib 251 252 c9: LEAVE (d64) 252 253 ca: RETF Iw ··· 292 293 # 0xf0 - 0xff 293 294 f0: LOCK (Prefix) 294 295 f1: 295 - f2: REPNE (Prefix) 296 - f3: REP/REPE (Prefix) 296 + f2: REPNE (Prefix) | XACQUIRE (Prefix) 297 + f3: REP/REPE (Prefix) | XRELEASE (Prefix) 297 298 f4: HLT 298 299 f5: CMC 299 300 f6: Grp3_1 Eb (1A) ··· 325 326 0a: 326 327 0b: UD2 (1B) 327 328 0c: 328 - 0d: NOP Ev | GrpP 329 + # AMD's prefetch group. Intel supports prefetchw(/1) only. 330 + 0d: GrpP 329 331 0e: FEMMS 330 332 # 3DNow! uses the last imm byte as opcode extension. 331 333 0f: 3DNow! 
Pq,Qq,Ib ··· 729 729 dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) 730 730 de: VAESDEC Vdq,Hdq,Wdq (66),(v1) 731 731 df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) 732 - f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) 733 - f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) 732 + f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) 733 + f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) 734 734 f2: ANDN Gy,By,Ey (v) 735 735 f3: Grp17 (1A) 736 736 f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) 737 - f6: MULX By,Gy,rDX,Ey (F2),(v) 737 + f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) 738 738 f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) 739 739 EndTable 740 740 ··· 861 861 862 862 GrpTable: Grp7 863 863 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) 864 - 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) 865 - 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) 864 + 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) 865 + 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 866 866 3: LIDT Ms 867 867 4: SMSW Mw/Rv 868 868 5: ··· 880 880 GrpTable: Grp9 881 881 1: CMPXCHG8B/16B Mq/Mdq 882 882 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) 883 - 7: VMPTRST Mq | VMPTRST Mq (F3) 883 + 7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) 884 884 EndTable 885 885 886 886 GrpTable: Grp10 887 887 EndTable 888 888 889 - GrpTable: Grp11 890 - # Note: the operands are given by group opcode 891 - 0: MOV 889 + # Grp11A and Grp11B are expressed as Grp11 in Intel SDM 890 + GrpTable: Grp11A 891 + 0: MOV Eb,Ib 892 + 7: XABORT Ib (000),(11B) 893 + EndTable 894 + 895 + GrpTable: Grp11B 896 + 0: MOV Eb,Iz 897 + 7: XBEGIN Jz (000),(11B) 892 898 EndTable 893 899 894 900 
GrpTable: Grp12
+3 -1
arch/x86/tools/gen-insn-attr-x86.awk
··· 68 68 69 69 lprefix1_expr = "\\((66|!F3)\\)" 70 70 lprefix2_expr = "\\(F3\\)" 71 - lprefix3_expr = "\\((F2|!F3)\\)" 71 + lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" 72 72 lprefix_expr = "\\((66|F2|F3)\\)" 73 73 max_lprefix = 4 74 74 ··· 83 83 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" 84 84 prefix_num["REPNE"] = "INAT_PFX_REPNE" 85 85 prefix_num["REP/REPE"] = "INAT_PFX_REPE" 86 + prefix_num["XACQUIRE"] = "INAT_PFX_REPNE" 87 + prefix_num["XRELEASE"] = "INAT_PFX_REPE" 86 88 prefix_num["LOCK"] = "INAT_PFX_LOCK" 87 89 prefix_num["SEG=CS"] = "INAT_PFX_CS" 88 90 prefix_num["SEG=DS"] = "INAT_PFX_DS"