Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arch/tile: allow nonatomic stores to interoperate with fast atomic syscalls

This semantic -- skipping the write-back to memory when the atomic
operation does not actually change the value -- was already true for
atomic operations within the kernel, and this change makes it true for
the fast atomic syscalls (__NR_cmpxchg and __NR_atomic_update) as well.
Previously, user-space had to use the fast atomic syscalls exclusively
to update memory, since raw stores could lose a race with the atomic
update code even when the atomic update hadn't actually modified the value.

With this change, we no longer write back the value to memory if it
hasn't changed. This allows certain types of idioms in user space to
work as expected, e.g. "atomic exchange" to acquire a spinlock, followed
by a raw store of zero to release the lock.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>

+27 -36
+26 -35
arch/tile/kernel/intvec_32.S
··· 1470 1470 * We place it in the __HEAD section to ensure it is relatively 1471 1471 * near to the intvec_SWINT_1 code (reachable by a conditional branch). 1472 1472 * 1473 - * Must match register usage in do_page_fault(). 1473 + * Our use of ATOMIC_LOCK_REG here must match do_page_fault_ics(). 1474 + * 1475 + * As we do in lib/atomic_asm_32.S, we bypass a store if the value we 1476 + * would store is the same as the value we just loaded. 1474 1477 */ 1475 1478 __HEAD 1476 1479 .align 64 ··· 1534 1531 { 1535 1532 shri r20, r25, 32 - ATOMIC_HASH_L1_SHIFT 1536 1533 slt_u r23, r0, r23 1537 - 1538 - /* 1539 - * Ensure that the TLB is loaded before we take out the lock. 1540 - * On TILEPro, this will start fetching the value all the way 1541 - * into our L1 as well (and if it gets modified before we 1542 - * grab the lock, it will be invalidated from our cache 1543 - * before we reload it). On tile64, we'll start fetching it 1544 - * into our L1 if we're the home, and if we're not, we'll 1545 - * still at least start fetching it into the home's L2. 1546 - */ 1547 - lw r26, r0 1534 + lw r26, r0 /* see comment in the "#else" for the "lw r26". */ 1548 1535 } 1549 1536 { 1550 1537 s2a r21, r20, r21 ··· 1550 1557 bbs r23, .Lcmpxchg64 1551 1558 andi r23, r0, 7 /* Precompute alignment for cmpxchg64. */ 1552 1559 } 1553 - 1554 1560 { 1555 - /* 1556 - * We very carefully align the code that actually runs with 1557 - * the lock held (nine bundles) so that we know it is all in 1558 - * the icache when we start. This instruction (the jump) is 1559 - * at the start of the first cache line, address zero mod 64; 1560 - * we jump to somewhere in the second cache line to issue the 1561 - * tns, then jump back to finish up. 1562 - */ 1563 1561 s2a ATOMIC_LOCK_REG_NAME, r25, r21 1564 - j .Lcmpxchg32_tns 1562 + j .Lcmpxchg32_tns /* see comment in the #else for the jump. 
*/ 1565 1563 } 1566 1564 1567 1565 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ ··· 1617 1633 { 1618 1634 /* 1619 1635 * We very carefully align the code that actually runs with 1620 - * the lock held (nine bundles) so that we know it is all in 1636 + * the lock held (twelve bundles) so that we know it is all in 1621 1637 * the icache when we start. This instruction (the jump) is 1622 1638 * at the start of the first cache line, address zero mod 64; 1623 - * we jump to somewhere in the second cache line to issue the 1624 - * tns, then jump back to finish up. 1639 + * we jump to the very end of the second cache line to get that 1640 + * line loaded in the icache, then fall through to issue the tns 1641 + * in the third cache line, at which point it's all cached. 1642 + * Note that is for performance, not correctness. 1625 1643 */ 1626 1644 j .Lcmpxchg32_tns 1627 1645 } 1628 1646 1629 1647 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ 1630 1648 1631 - ENTRY(__sys_cmpxchg_grab_lock) 1649 + /* Symbol for do_page_fault_ics() to use to compare against the PC. */ 1650 + .global __sys_cmpxchg_grab_lock 1651 + __sys_cmpxchg_grab_lock: 1632 1652 1633 1653 /* 1634 1654 * Perform the actual cmpxchg or atomic_update. 1635 - * Note that the system <arch/atomic.h> header relies on 1636 - * atomic_update() to always perform an "mf", so don't make 1637 - * it optional or conditional without modifying that code. 1638 1655 */ 1639 1656 .Ldo_cmpxchg32: 1640 1657 { ··· 1653 1668 } 1654 1669 { 1655 1670 mvnz r24, r23, r25 /* Use atomic_update value if appropriate. */ 1656 - bbns r22, .Lcmpxchg32_mismatch 1671 + bbns r22, .Lcmpxchg32_nostore 1657 1672 } 1673 + seq r22, r24, r21 /* Are we storing the value we loaded? */ 1674 + bbs r22, .Lcmpxchg32_nostore 1658 1675 sw r0, r24 1659 1676 1677 + /* The following instruction is the start of the second cache line. */ 1660 1678 /* Do slow mtspr here so the following "mf" waits less. 
*/ 1661 1679 { 1662 1680 move sp, r27 ··· 1667 1679 } 1668 1680 mf 1669 1681 1670 - /* The following instruction is the start of the second cache line. */ 1671 1682 { 1672 1683 move r0, r21 1673 1684 sw ATOMIC_LOCK_REG_NAME, zero ··· 1674 1687 iret 1675 1688 1676 1689 /* Duplicated code here in the case where we don't overlap "mf" */ 1677 - .Lcmpxchg32_mismatch: 1690 + .Lcmpxchg32_nostore: 1678 1691 { 1679 1692 move r0, r21 1680 1693 sw ATOMIC_LOCK_REG_NAME, zero ··· 1690 1703 * and for 64-bit cmpxchg. We provide it as a macro and put 1691 1704 * it into both versions. We can't share the code literally 1692 1705 * since it depends on having the right branch-back address. 1693 - * Note that the first few instructions should share the cache 1694 - * line with the second half of the actual locked code. 1695 1706 */ 1696 1707 .macro cmpxchg_lock, bitwidth 1697 1708 ··· 1715 1730 } 1716 1731 /* 1717 1732 * The preceding instruction is the last thing that must be 1718 - * on the second cache line. 1733 + * hot in the icache before we do the "tns" above. 1719 1734 */ 1720 1735 1721 1736 #ifdef CONFIG_SMP ··· 1746 1761 .endm 1747 1762 1748 1763 .Lcmpxchg32_tns: 1764 + /* 1765 + * This is the last instruction on the second cache line. 1766 + * The nop here loads the second line, then we fall through 1767 + * to the tns to load the third line before we take the lock. 1768 + */ 1769 + nop 1749 1770 cmpxchg_lock 32 1750 1771 1751 1772 /*
+1 -1
arch/tile/lib/atomic_asm_32.S
··· 59 59 * bad kernel addresses). 60 60 * 61 61 * Note that if the value we would store is the same as what we 62 - * loaded, we bypass the load. Other platforms with true atomics can 62 + * loaded, we bypass the store. Other platforms with true atomics can 63 63 * make the guarantee that a non-atomic __clear_bit(), for example, 64 64 * can safely race with an atomic test_and_set_bit(); this example is 65 65 * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do