[SPARC64]: Avoid membar instructions in delay slots.

In particular, avoid membar instructions in the delay
slot of a jmpl instruction.

UltraSPARC-I, II, IIi, and IIe have a bug, documented in
the UltraSPARC-IIi User's Manual, Appendix K, Erratum 51.

The long and short of it is that if the IMU misses on a
branch or jmpl, and there is a store-buffer-synchronizing
membar in the delay slot, the chip can stop fetching instructions.

If interrupts or some other trap source are enabled, the
chip will eventually unwedge itself, but performance will suffer.

We already had a workaround for this bug in a few spots, but
it's better to have the entire tree sanitized for this rule.
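
Concretely, the rule applied throughout the diff is to issue the
membar before the branch and leave a nop in the delay slot.  This is
the shape of the rwsem.S and semaphore.c hunks below (instruction
spacing approximate):

	/* before: the membar sits in the conditional branch's delay slot */
	cmp	%g7, 0
	bl,pn	%icc, 3f
	membar	#StoreLoad | #StoreStore

	/* after: the membar issues before the branch, the delay slot gets a nop */
	cmp	%g7, 0
	membar	#StoreLoad | #StoreStore
	bl,pn	%icc, 3f
	nop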

Signed-off-by: David S. Miller <davem@davemloft.net>

+172 -111
+4 -2
arch/sparc64/kernel/entry.S
···
 	fmuld		%f0, %f2, %f26
 	faddd		%f0, %f2, %f28
 	fmuld		%f0, %f2, %f30
+	membar		#Sync
 	b,pt		%xcc, fpdis_exit
-	membar		#Sync
+	nop
 2:	andcc		%g5, FPRS_DU, %g0
 	bne,pt		%icc, 3f
 	fzero		%f32
···
 	fmuld		%f32, %f34, %f58
 	faddd		%f32, %f34, %f60
 	fmuld		%f32, %f34, %f62
+	membar		#Sync
 	ba,pt		%xcc, fpdis_exit
-	membar		#Sync
+	nop
 3:	mov		SECONDARY_CONTEXT, %g3
 	add		%g6, TI_FPREGS, %g1
 	ldxa		[%g3] ASI_DMMU, %g5
+8 -4
arch/sparc64/kernel/semaphore.c
···
 	"	add	%1, %4, %1\n"
 	"	cas	[%3], %0, %1\n"
 	"	cmp	%0, %1\n"
+	"	membar	#StoreLoad | #StoreStore\n"
 	"	bne,pn	%%icc, 1b\n"
-	"	membar	#StoreLoad | #StoreStore\n"
+	"	nop\n"
 	: "=&r" (old_count), "=&r" (tmp), "=m" (sem->count)
 	: "r" (&sem->count), "r" (incr), "m" (sem->count)
 	: "cc");
···
 	"	cmp	%%g1, %%g7\n"
 	"	bne,pn	%%icc, 1b\n"
 	"	addcc	%%g7, 1, %%g0\n"
+	"	membar	#StoreLoad | #StoreStore\n"
 	"	ble,pn	%%icc, 3f\n"
-	"	membar	#StoreLoad | #StoreStore\n"
+	"	nop\n"
 	"2:\n"
 	"	.subsection 2\n"
 	"3:	mov	%0, %%g1\n"
···
 	"	cmp	%%g1, %%g7\n"
 	"	bne,pn	%%icc, 1b\n"
 	"	cmp	%%g7, 1\n"
+	"	membar	#StoreLoad | #StoreStore\n"
 	"	bl,pn	%%icc, 3f\n"
-	"	membar	#StoreLoad | #StoreStore\n"
+	"	nop\n"
 	"2:\n"
 	"	.subsection 2\n"
 	"3:	mov	%0, %%g1\n"
···
 	"	cmp	%%g1, %%g7\n"
 	"	bne,pn	%%icc, 1b\n"
 	"	cmp	%%g7, 1\n"
+	"	membar	#StoreLoad | #StoreStore\n"
 	"	bl,pn	%%icc, 3f\n"
-	"	membar	#StoreLoad | #StoreStore\n"
+	"	nop\n"
 	"2:\n"
 	"	.subsection 2\n"
 	"3:	mov	%2, %%g1\n"
+2 -1
arch/sparc64/kernel/trampoline.S
···
 
 	sethi		%hi(prom_entry_lock), %g2
 1:	ldstub		[%g2 + %lo(prom_entry_lock)], %g1
+	membar		#StoreLoad | #StoreStore
 	brnz,pn		%g1, 1b
-	membar		#StoreLoad | #StoreStore
+	nop
 
 	sethi		%hi(p1275buf), %g2
 	or		%g2, %lo(p1275buf), %g2
+53 -50
arch/sparc64/lib/U1memcpy.S
···
 #define LOOP_CHUNK3(src, dest, len, branch_dest)	\
 	MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
 
+#define DO_SYNC			membar	#Sync;
 #define STORE_SYNC(dest, fsrc)				\
 	EX_ST(STORE_BLK(%fsrc, %dest));			\
-	add			%dest, 0x40, %dest;
+	add			%dest, 0x40, %dest;	\
+	DO_SYNC
 
 #define STORE_JUMP(dest, fsrc, target)			\
 	EX_ST(STORE_BLK(%fsrc, %dest));			\
 	add			%dest, 0x40, %dest;	\
-	ba,pt			%xcc, target;
+	ba,pt			%xcc, target;		\
+	nop;
 
 #define FINISH_VISCHUNK(dest, f0, f1, left)	\
 	subcc			%left, 8, %left;\
···
 	ba,pt		%xcc, 1b+4
 	faligndata	%f0, %f2, %f48
 1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
-	STORE_JUMP(o0, f48, 40f) membar #Sync
+	STORE_JUMP(o0, f48, 40f)
 2:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
-	STORE_JUMP(o0, f48, 48f) membar #Sync
+	STORE_JUMP(o0, f48, 48f)
 3:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
-	STORE_JUMP(o0, f48, 56f) membar #Sync
+	STORE_JUMP(o0, f48, 56f)
 
 1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
 	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
···
 	ba,pt		%xcc, 1b+4
 	faligndata	%f2, %f4, %f48
 1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
-	STORE_JUMP(o0, f48, 41f) membar #Sync
+	STORE_JUMP(o0, f48, 41f)
 2:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
-	STORE_JUMP(o0, f48, 49f) membar #Sync
+	STORE_JUMP(o0, f48, 49f)
 3:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
-	STORE_JUMP(o0, f48, 57f) membar #Sync
+	STORE_JUMP(o0, f48, 57f)
 
 1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
 	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
···
 	ba,pt		%xcc, 1b+4
 	faligndata	%f4, %f6, %f48
 1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
-	STORE_JUMP(o0, f48, 42f) membar #Sync
+	STORE_JUMP(o0, f48, 42f)
 2:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
-	STORE_JUMP(o0, f48, 50f) membar #Sync
+	STORE_JUMP(o0, f48, 50f)
 3:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
-	STORE_JUMP(o0, f48, 58f) membar #Sync
+	STORE_JUMP(o0, f48, 58f)
 
 1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
 	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
···
 	ba,pt		%xcc, 1b+4
 	faligndata	%f6, %f8, %f48
 1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
-	STORE_JUMP(o0, f48, 43f) membar #Sync
+	STORE_JUMP(o0, f48, 43f)
 2:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
-	STORE_JUMP(o0, f48, 51f) membar #Sync
+	STORE_JUMP(o0, f48, 51f)
 3:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
-	STORE_JUMP(o0, f48, 59f) membar #Sync
+	STORE_JUMP(o0, f48, 59f)
 
 1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
 	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
···
 	ba,pt		%xcc, 1b+4
 	faligndata	%f8, %f10, %f48
 1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
-	STORE_JUMP(o0, f48, 44f) membar #Sync
+	STORE_JUMP(o0, f48, 44f)
 2:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
-	STORE_JUMP(o0, f48, 52f) membar #Sync
+	STORE_JUMP(o0, f48, 52f)
 3:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
-	STORE_JUMP(o0, f48, 60f) membar #Sync
+	STORE_JUMP(o0, f48, 60f)
 
 1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
 	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
···
 	ba,pt		%xcc, 1b+4
 	faligndata	%f10, %f12, %f48
 1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
-	STORE_JUMP(o0, f48, 45f) membar #Sync
+	STORE_JUMP(o0, f48, 45f)
 2:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
-	STORE_JUMP(o0, f48, 53f) membar #Sync
+	STORE_JUMP(o0, f48, 53f)
 3:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
-	STORE_JUMP(o0, f48, 61f) membar #Sync
+	STORE_JUMP(o0, f48, 61f)
 
 1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
 	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
···
 	ba,pt		%xcc, 1b+4
 	faligndata	%f12, %f14, %f48
 1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
-	STORE_JUMP(o0, f48, 46f) membar #Sync
+	STORE_JUMP(o0, f48, 46f)
 2:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
-	STORE_JUMP(o0, f48, 54f) membar #Sync
+	STORE_JUMP(o0, f48, 54f)
 3:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
-	STORE_JUMP(o0, f48, 62f) membar #Sync
+	STORE_JUMP(o0, f48, 62f)
 
 1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
 	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
···
 	ba,pt		%xcc, 1b+4
 	faligndata	%f14, %f16, %f48
 1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
-	STORE_JUMP(o0, f48, 47f) membar #Sync
+	STORE_JUMP(o0, f48, 47f)
 2:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
-	STORE_JUMP(o0, f48, 55f) membar #Sync
+	STORE_JUMP(o0, f48, 55f)
 3:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
-	STORE_SYNC(o0, f48) membar #Sync
+	STORE_SYNC(o0, f48)
 	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
-	STORE_JUMP(o0, f48, 63f) membar #Sync
+	STORE_JUMP(o0, f48, 63f)
 
 40:	FINISH_VISCHUNK(o0, f0, f2, g3)
 41:	FINISH_VISCHUNK(o0, f2, f4, g3)
+13 -2
arch/sparc64/lib/VISsave.S
···
 
 	stda		%f48, [%g3 + %g1] ASI_BLK_P
 5:	membar		#Sync
-	jmpl		%g7 + %g0, %g0
+	ba,pt		%xcc, 80f
+	nop
+
+	.align		32
+80:	jmpl		%g7 + %g0, %g0
 	nop
 
 6:	ldub		[%g3 + TI_FPSAVED], %o5
···
 	stda		%f32, [%g2 + %g1] ASI_BLK_P
 	stda		%f48, [%g3 + %g1] ASI_BLK_P
 	membar		#Sync
-	jmpl		%g7 + %g0, %g0
+	ba,pt		%xcc, 80f
+	nop
 
+	.align		32
+80:	jmpl		%g7 + %g0, %g0
 	nop
 
 	.align		32
···
 	stda		%f0, [%g2 + %g1] ASI_BLK_P
 	stda		%f16, [%g3 + %g1] ASI_BLK_P
 	membar		#Sync
+	ba,pt		%xcc, 4f
+	nop
+
+	.align		32
 4:	and		%o5, FPRS_DU, %o5
 	jmpl		%g7 + %g0, %g0
 	wr		%o5, FPRS_FEF, %fprs
+26 -16
arch/sparc64/lib/atomic.S
···
 #include <linux/config.h>
 #include <asm/asi.h>
 
-/* On SMP we need to use memory barriers to ensure
- * correct memory operation ordering, nop these out
- * for uniprocessor.
- */
-#ifdef CONFIG_SMP
-#define ATOMIC_PRE_BARRIER	membar #StoreLoad | #LoadLoad
-#define ATOMIC_POST_BARRIER	membar #StoreLoad | #StoreStore
-#else
-#define ATOMIC_PRE_BARRIER	nop
-#define ATOMIC_POST_BARRIER	nop
-#endif
-
 	.text
 
 /* Two versions of the atomic routines, one that
···
 	nop
 	.size	atomic_sub, .-atomic_sub
 
+/* On SMP we need to use memory barriers to ensure
+ * correct memory operation ordering, nop these out
+ * for uniprocessor.
+ */
+#ifdef CONFIG_SMP
+
+#define ATOMIC_PRE_BARRIER	membar #StoreLoad | #LoadLoad;
+#define ATOMIC_POST_BARRIER	\
+	ba,pt %xcc, 80b;	\
+	membar #StoreLoad | #StoreStore
+
+80:	retl
+	nop
+#else
+#define ATOMIC_PRE_BARRIER
+#define ATOMIC_POST_BARRIER
+#endif
+
 	.globl	atomic_add_ret
 	.type	atomic_add_ret,#function
 atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
···
 	cmp	%g1, %g7
 	bne,pn	%icc, 1b
 	add	%g7, %o0, %g7
+	sra	%g7, 0, %o0
 	ATOMIC_POST_BARRIER
 	retl
-	sra	%g7, 0, %o0
+	nop
 	.size	atomic_add_ret, .-atomic_add_ret
 
 	.globl	atomic_sub_ret
···
 	cmp	%g1, %g7
 	bne,pn	%icc, 1b
 	sub	%g7, %o0, %g7
+	sra	%g7, 0, %o0
 	ATOMIC_POST_BARRIER
 	retl
-	sra	%g7, 0, %o0
+	nop
 	.size	atomic_sub_ret, .-atomic_sub_ret
 
 	.globl	atomic64_add
···
 	cmp	%g1, %g7
 	bne,pn	%xcc, 1b
 	add	%g7, %o0, %g7
+	mov	%g7, %o0
 	ATOMIC_POST_BARRIER
 	retl
-	mov	%g7, %o0
+	nop
 	.size	atomic64_add_ret, .-atomic64_add_ret
 
 	.globl	atomic64_sub_ret
···
 	cmp	%g1, %g7
 	bne,pn	%xcc, 1b
 	sub	%g7, %o0, %g7
+	mov	%g7, %o0
 	ATOMIC_POST_BARRIER
 	retl
-	mov	%g7, %o0
+	nop
 	.size	atomic64_sub_ret, .-atomic64_sub_ret
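
For the value-returning atomics, the SMP-only post barrier now expands to a
branch to a shared return stub, so the membar ends up in the delay slot of an
always-taken ba,pt rather than in the retl delay slot.  Roughly, the tail of
atomic_add_ret becomes the following with CONFIG_SMP (spacing approximate):

	sra	%g7, 0, %o0			! return value computed first
	ba,pt	%xcc, 80b			! branch to the shared "80: retl; nop" stub
	membar	#StoreLoad | #StoreStore	! barrier rides in the ba,pt delay slot
	retl					! only reached on !SMP, where the macro is empty
	nop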
+21 -12
arch/sparc64/lib/bitops.S
···
 #include <linux/config.h>
 #include <asm/asi.h>
 
+	.text
+
 /* On SMP we need to use memory barriers to ensure
  * correct memory operation ordering, nop these out
  * for uniprocessor.
  */
+
 #ifdef CONFIG_SMP
 #define BITOP_PRE_BARRIER	membar #StoreLoad | #LoadLoad
-#define BITOP_POST_BARRIER	membar #StoreLoad | #StoreStore
-#else
-#define BITOP_PRE_BARRIER	nop
-#define BITOP_POST_BARRIER	nop
-#endif
+#define BITOP_POST_BARRIER	\
+	ba,pt %xcc, 80b;	\
+	membar #StoreLoad | #StoreStore
 
-	.text
+80:	retl
+	nop
+#else
+#define BITOP_PRE_BARRIER
+#define BITOP_POST_BARRIER
+#endif
 
 	.globl	test_and_set_bit
 	.type	test_and_set_bit,#function
···
 	cmp	%g7, %g1
 	bne,pn	%xcc, 1b
 	and	%g7, %o2, %g2
-	BITOP_POST_BARRIER
 	clr	%o0
+	movrne	%g2, 1, %o0
+	BITOP_POST_BARRIER
 	retl
-	movrne	%g2, 1, %o0
+	nop
 	.size	test_and_set_bit, .-test_and_set_bit
 
 	.globl	test_and_clear_bit
···
 	cmp	%g7, %g1
 	bne,pn	%xcc, 1b
 	and	%g7, %o2, %g2
-	BITOP_POST_BARRIER
 	clr	%o0
+	movrne	%g2, 1, %o0
+	BITOP_POST_BARRIER
 	retl
-	movrne	%g2, 1, %o0
+	nop
 	.size	test_and_clear_bit, .-test_and_clear_bit
 
 	.globl	test_and_change_bit
···
 	cmp	%g7, %g1
 	bne,pn	%xcc, 1b
 	and	%g7, %o2, %g2
-	BITOP_POST_BARRIER
 	clr	%o0
+	movrne	%g2, 1, %o0
+	BITOP_POST_BARRIER
 	retl
-	movrne	%g2, 1, %o0
+	nop
 	.size	test_and_change_bit, .-test_and_change_bit
 
 	.globl	set_bit
+4 -2
arch/sparc64/lib/debuglocks.c
···
 	"	andn	%%g1, %%g3, %%g7\n"
 	"	casx	[%0], %%g1, %%g7\n"
 	"	cmp	%%g1, %%g7\n"
+	"	membar	#StoreLoad | #StoreStore\n"
 	"	bne,pn	%%xcc, 1b\n"
-	"	membar	#StoreLoad | #StoreStore"
+	"	nop"
 	: /* no outputs */
 	: "r" (&(rw->lock))
 	: "g3", "g1", "g7", "cc", "memory");
···
 	"	andn	%%g1, %%g3, %%g7\n"
 	"	casx	[%0], %%g1, %%g7\n"
 	"	cmp	%%g1, %%g7\n"
+	"	membar	#StoreLoad | #StoreStore\n"
 	"	bne,pn	%%xcc, 1b\n"
-	"	membar	#StoreLoad | #StoreStore"
+	"	nop"
 	: /* no outputs */
 	: "r" (&(rw->lock))
 	: "g3", "g1", "g7", "cc", "memory");
+4 -2
arch/sparc64/lib/dec_and_lock.S
···
 #endif
 to_zero:
 	ldstub	[%o1], %g3
+	membar	#StoreLoad | #StoreStore
 	brnz,pn	%g3, spin_on_lock
-	membar	#StoreLoad | #StoreStore
+	nop
 loop2:	cas	[%o0], %g2, %g7	/* ASSERT(g7 == 0) */
 	cmp	%g2, %g7
 
···
 	nop
 spin_on_lock:
 	ldub	[%o1], %g3
+	membar	#LoadLoad
 	brnz,pt	%g3, spin_on_lock
-	membar	#LoadLoad
+	nop
 	ba,pt	%xcc, to_zero
 	nop
 	nop
+10 -5
arch/sparc64/lib/rwsem.S
···
 	bne,pn	%icc, 1b
 	add	%g7, 1, %g7
 	cmp	%g7, 0
+	membar	#StoreLoad | #StoreStore
 	bl,pn	%icc, 3f
-	membar	#StoreLoad | #StoreStore
+	nop
 2:
 	retl
 	nop
···
 	cmp	%g3, %g7
 	bne,pn	%icc, 1b
 	cmp	%g7, 0
+	membar	#StoreLoad | #StoreStore
 	bne,pn	%icc, 3f
-	membar	#StoreLoad | #StoreStore
+	nop
 2:	retl
 	nop
 3:
···
 	cmp	%g1, %g7
 	bne,pn	%icc, 1b
 	cmp	%g7, 0
+	membar	#StoreLoad | #StoreStore
 	bl,pn	%icc, 3f
-	membar	#StoreLoad | #StoreStore
+	nop
 2:	retl
 	nop
 3:	sethi	%hi(RWSEM_ACTIVE_MASK), %g1
···
 	bne,pn	%icc, 1b
 	sub	%g7, %g1, %g7
 	cmp	%g7, 0
+	membar	#StoreLoad | #StoreStore
 	bl,pn	%icc, 3f
-	membar	#StoreLoad | #StoreStore
+	nop
 2:
 	retl
 	nop
···
 	bne,pn	%icc, 1b
 	sub	%g7, %g1, %g7
 	cmp	%g7, 0
+	membar	#StoreLoad | #StoreStore
 	bl,pn	%icc, 3f
-	membar	#StoreLoad | #StoreStore
+	nop
 2:
 	retl
 	nop
+4 -2
arch/sparc64/mm/init.c
···
 	"or	%%g1, %0, %%g1\n\t"
 	"casx	[%2], %%g7, %%g1\n\t"
 	"cmp	%%g7, %%g1\n\t"
+	"membar	#StoreLoad | #StoreStore\n\t"
 	"bne,pn	%%xcc, 1b\n\t"
-	" membar #StoreLoad | #StoreStore"
+	" nop"
 	: /* no outputs */
 	: "r" (mask), "r" (non_cpu_bits), "r" (&page->flags)
 	: "g1", "g7");
···
 	" andn	%%g7, %1, %%g1\n\t"
 	"casx	[%2], %%g7, %%g1\n\t"
 	"cmp	%%g7, %%g1\n\t"
+	"membar	#StoreLoad | #StoreStore\n\t"
 	"bne,pn	%%xcc, 1b\n\t"
-	" membar #StoreLoad | #StoreStore\n"
+	" nop\n"
 	"2:"
 	: /* no outputs */
 	: "r" (cpu), "r" (mask), "r" (&page->flags),
+2 -1
arch/sparc64/mm/ultra.S
···
 	andn	%o3, 1, %o3
 	stxa	%g0, [%o3] ASI_IMMU_DEMAP
 2:	stxa	%g0, [%o3] ASI_DMMU_DEMAP
+	membar	#Sync
 	brnz,pt	%o1, 1b
-	membar	#Sync
+	nop
 	stxa	%g2, [%o4] ASI_DMMU
 	flush	%g6
 	wrpr	%g0, 0, %tl
+2 -1
include/asm-sparc64/rwsem.h
···
 	"add	%%g1, %1, %%g7\n\t"
 	"cas	[%2], %%g1, %%g7\n\t"
 	"cmp	%%g1, %%g7\n\t"
+	"membar	#StoreLoad | #StoreStore\n\t"
 	"bne,pn	%%icc, 1b\n\t"
-	" membar #StoreLoad | #StoreStore\n\t"
+	" nop\n\t"
 	"mov	%%g7, %0\n\t"
 	: "=&r" (tmp)
 	: "0" (tmp), "r" (sem)
+19 -10
include/asm-sparc64/spinlock.h
···
 
 	__asm__ __volatile__(
 "1:	ldstub		[%1], %0\n"
+"	membar		#StoreLoad | #StoreStore\n"
 "	brnz,pn		%0, 2f\n"
-"	membar		#StoreLoad | #StoreStore\n"
+"	nop\n"
 "	.subsection	2\n"
 "2:	ldub		[%1], %0\n"
+"	membar		#LoadLoad\n"
 "	brnz,pt		%0, 2b\n"
-"	membar		#LoadLoad\n"
+"	nop\n"
 "	ba,a,pt		%%xcc, 1b\n"
 "	.previous"
 	: "=&r" (tmp)
···
 
 	__asm__ __volatile__(
 "1:	ldstub		[%2], %0\n"
-"	brnz,pn		%0, 2f\n"
 "	membar		#StoreLoad | #StoreStore\n"
+"	brnz,pn		%0, 2f\n"
+"	nop\n"
 "	.subsection	2\n"
 "2:	rdpr		%%pil, %1\n"
 "	wrpr		%3, %%pil\n"
 "3:	ldub		[%2], %0\n"
-"	brnz,pt		%0, 3b\n"
 "	membar		#LoadLoad\n"
+"	brnz,pt		%0, 3b\n"
+"	nop\n"
 "	ba,pt		%%xcc, 1b\n"
-"	wrpr		%1, %%pil\n"
+"	 wrpr		%1, %%pil\n"
 "	.previous"
···
 "4:	add		%0, 1, %1\n"
 "	cas		[%2], %0, %1\n"
 "	cmp		%0, %1\n"
+"	membar		#StoreLoad | #StoreStore\n"
 "	bne,pn		%%icc, 1b\n"
-"	membar		#StoreLoad | #StoreStore\n"
+"	nop\n"
 "	.subsection	2\n"
 "2:	ldsw		[%2], %0\n"
+"	membar		#LoadLoad\n"
 "	brlz,pt		%0, 2b\n"
-"	membar		#LoadLoad\n"
+"	nop\n"
 "	ba,a,pt		%%xcc, 4b\n"
 "	.previous"
 	: "=&r" (tmp1), "=&r" (tmp2)
···
 "4:	or		%0, %3, %1\n"
 "	cas		[%2], %0, %1\n"
 "	cmp		%0, %1\n"
+"	membar		#StoreLoad | #StoreStore\n"
 "	bne,pn		%%icc, 1b\n"
-"	membar		#StoreLoad | #StoreStore\n"
+"	nop\n"
 "	.subsection	2\n"
 "2:	lduw		[%2], %0\n"
+"	membar		#LoadLoad\n"
 "	brnz,pt		%0, 2b\n"
-"	membar		#LoadLoad\n"
+"	nop\n"
 "	ba,a,pt		%%xcc, 4b\n"
 "	.previous"
 	: "=&r" (tmp1), "=&r" (tmp2)
···
 "	or		%0, %4, %1\n"
 "	cas		[%3], %0, %1\n"
 "	cmp		%0, %1\n"
+"	membar		#StoreLoad | #StoreStore\n"
 "	bne,pn		%%icc, 1b\n"
-"	membar		#StoreLoad | #StoreStore\n"
+"	nop\n"
 "	mov		1, %2\n"
 "2:"
 	: "=&r" (tmp1), "=&r" (tmp2), "=&r" (result)
-1
include/asm-sparc64/spitfire.h
···
 			     "membar	#Sync"
 			     : /* No outputs */
 			     : "r" (tag), "r" (addr), "i" (ASI_DCACHE_TAG));
-	__asm__ __volatile__ ("membar #Sync" : : : "memory");
 }
 
 /* The instruction cache lines are flushed with this, but note that