Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

futex: Use RCU-based per-CPU reference counting instead of rcuref_t

The use of rcuref_t for reference counting introduces a performance bottleneck
when accessed concurrently by multiple threads during futex operations.

Replace rcuref_t with specially crafted per-CPU reference counters. The
lifetime logic remains the same.

The newly allocated private hash starts in FR_PERCPU state. In this state, each
futex operation that requires the private hash uses a per-CPU counter (an
unsigned int) for incrementing or decrementing the reference count.

When the private hash is about to be replaced, the per-CPU counters are
migrated to an atomic_long_t counter, mm_struct::futex_atomic.
The migration process:
- Waiting for one RCU grace period to ensure all users observe the
current private hash. This can be skipped if a grace period elapsed
since the private hash was assigned.

- futex_private_hash::state is set to FR_ATOMIC, forcing all users to
use mm_struct::futex_atomic for reference counting.

- After an RCU grace period, all users are guaranteed to be using the
atomic counter. The per-CPU counters can now be summed up and added to
the atomic_long_t counter. If the resulting count is zero, the hash can be
safely replaced. Otherwise, active users still hold a valid reference.

- Once the atomic reference count drops to zero, the next futex
operation will switch to the new private hash.

call_rcu_hurry() is used to speed up the transition, which otherwise might be
delayed with RCU_LAZY. There is nothing wrong with using call_rcu(). The
side effects would be that on auto scaling the new hash is used later
and the SET_SLOTS prctl() will block longer.

[bigeasy: commit description + mmget/mmput_async]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250710110011.384614-3-bigeasy@linutronix.de

+243 -35
+5 -11
include/linux/futex.h
··· 85 85 #ifdef CONFIG_FUTEX_PRIVATE_HASH 86 86 int futex_hash_allocate_default(void); 87 87 void futex_hash_free(struct mm_struct *mm); 88 - 89 - static inline void futex_mm_init(struct mm_struct *mm) 90 - { 91 - RCU_INIT_POINTER(mm->futex_phash, NULL); 92 - mm->futex_phash_new = NULL; 93 - mutex_init(&mm->futex_hash_lock); 94 - } 88 + int futex_mm_init(struct mm_struct *mm); 95 89 96 90 #else /* !CONFIG_FUTEX_PRIVATE_HASH */ 97 91 static inline int futex_hash_allocate_default(void) { return 0; } 98 - static inline void futex_hash_free(struct mm_struct *mm) { } 99 - static inline void futex_mm_init(struct mm_struct *mm) { } 92 + static inline int futex_hash_free(struct mm_struct *mm) { return 0; } 93 + static inline int futex_mm_init(struct mm_struct *mm) { return 0; } 100 94 #endif /* CONFIG_FUTEX_PRIVATE_HASH */ 101 95 102 96 #else /* !CONFIG_FUTEX */ ··· 112 118 { 113 119 return 0; 114 120 } 115 - static inline void futex_hash_free(struct mm_struct *mm) { } 116 - static inline void futex_mm_init(struct mm_struct *mm) { } 121 + static inline int futex_hash_free(struct mm_struct *mm) { return 0; } 122 + static inline int futex_mm_init(struct mm_struct *mm) { return 0; } 117 123 118 124 #endif 119 125
+5
include/linux/mm_types.h
··· 1070 1070 struct mutex futex_hash_lock; 1071 1071 struct futex_private_hash __rcu *futex_phash; 1072 1072 struct futex_private_hash *futex_phash_new; 1073 + /* futex-ref */ 1074 + unsigned long futex_batches; 1075 + struct rcu_head futex_rcu; 1076 + atomic_long_t futex_atomic; 1077 + unsigned int __percpu *futex_ref; 1073 1078 #endif 1074 1079 1075 1080 unsigned long hiwater_rss; /* High-watermark of RSS usage */
+1 -1
include/linux/sched/mm.h
··· 140 140 141 141 /* mmput gets rid of the mappings and all user-space */ 142 142 extern void mmput(struct mm_struct *); 143 - #ifdef CONFIG_MMU 143 + #if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) 144 144 /* same as above but performs the slow path from the async context. Can 145 145 * be called from the atomic context as well 146 146 */
-4
init/Kconfig
··· 1716 1716 depends on FUTEX && RT_MUTEXES 1717 1717 default y 1718 1718 1719 - # 1720 - # marked broken for performance reasons; gives us one more cycle to sort things out. 1721 - # 1722 1719 config FUTEX_PRIVATE_HASH 1723 1720 bool 1724 1721 depends on FUTEX && !BASE_SMALL && MMU 1725 - depends on BROKEN 1726 1722 default y 1727 1723 1728 1724 config FUTEX_MPOL
+6 -2
kernel/fork.c
··· 1046 1046 RCU_INIT_POINTER(mm->exe_file, NULL); 1047 1047 mmu_notifier_subscriptions_init(mm); 1048 1048 init_tlb_flush_pending(mm); 1049 - futex_mm_init(mm); 1050 1049 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) 1051 1050 mm->pmd_huge_pte = NULL; 1052 1051 #endif ··· 1059 1060 mm->flags = default_dump_filter; 1060 1061 mm->def_flags = 0; 1061 1062 } 1063 + 1064 + if (futex_mm_init(mm)) 1065 + goto fail_mm_init; 1062 1066 1063 1067 if (mm_alloc_pgd(mm)) 1064 1068 goto fail_nopgd; ··· 1092 1090 fail_noid: 1093 1091 mm_free_pgd(mm); 1094 1092 fail_nopgd: 1093 + futex_hash_free(mm); 1094 + fail_mm_init: 1095 1095 free_mm(mm); 1096 1096 return NULL; 1097 1097 } ··· 1149 1145 } 1150 1146 EXPORT_SYMBOL_GPL(mmput); 1151 1147 1152 - #ifdef CONFIG_MMU 1148 + #if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) 1153 1149 static void mmput_async_fn(struct work_struct *work) 1154 1150 { 1155 1151 struct mm_struct *mm = container_of(work, struct mm_struct,
+226 -17
kernel/futex/core.c
··· 42 42 #include <linux/fault-inject.h> 43 43 #include <linux/slab.h> 44 44 #include <linux/prctl.h> 45 - #include <linux/rcuref.h> 46 45 #include <linux/mempolicy.h> 47 46 #include <linux/mmap_lock.h> 48 47 ··· 64 65 #define futex_queues (__futex_data.queues) 65 66 66 67 struct futex_private_hash { 67 - rcuref_t users; 68 + int state; 68 69 unsigned int hash_mask; 69 70 struct rcu_head rcu; 70 71 void *mm; ··· 128 129 __futex_hash(union futex_key *key, struct futex_private_hash *fph); 129 130 130 131 #ifdef CONFIG_FUTEX_PRIVATE_HASH 132 + static bool futex_ref_get(struct futex_private_hash *fph); 133 + static bool futex_ref_put(struct futex_private_hash *fph); 134 + static bool futex_ref_is_dead(struct futex_private_hash *fph); 135 + 136 + enum { FR_PERCPU = 0, FR_ATOMIC }; 137 + 131 138 static inline bool futex_key_is_private(union futex_key *key) 132 139 { 133 140 /* ··· 147 142 { 148 143 if (fph->immutable) 149 144 return true; 150 - return rcuref_get(&fph->users); 145 + return futex_ref_get(fph); 151 146 } 152 147 153 148 void futex_private_hash_put(struct futex_private_hash *fph) 154 149 { 155 - /* Ignore return value, last put is verified via rcuref_is_dead() */ 156 150 if (fph->immutable) 157 151 return; 158 - if (rcuref_put(&fph->users)) 152 + if (futex_ref_put(fph)) 159 153 wake_up_var(fph->mm); 160 154 } 161 155 ··· 247 243 fph = rcu_dereference_protected(mm->futex_phash, 248 244 lockdep_is_held(&mm->futex_hash_lock)); 249 245 if (fph) { 250 - if (!rcuref_is_dead(&fph->users)) { 246 + if (!futex_ref_is_dead(fph)) { 251 247 mm->futex_phash_new = new; 252 248 return false; 253 249 } 254 250 255 251 futex_rehash_private(fph, new); 256 252 } 257 - rcu_assign_pointer(mm->futex_phash, new); 253 + new->state = FR_PERCPU; 254 + scoped_guard(rcu) { 255 + mm->futex_batches = get_state_synchronize_rcu(); 256 + rcu_assign_pointer(mm->futex_phash, new); 257 + } 258 258 kvfree_rcu(fph, rcu); 259 259 return true; 260 260 } ··· 297 289 if (!fph) 298 290 return NULL; 
299 291 300 - if (fph->immutable) 301 - return fph; 302 - if (rcuref_get(&fph->users)) 292 + if (futex_private_hash_get(fph)) 303 293 return fph; 304 294 } 305 295 futex_pivot_hash(mm); ··· 1533 1527 #define FH_IMMUTABLE 0x02 1534 1528 1535 1529 #ifdef CONFIG_FUTEX_PRIVATE_HASH 1530 + 1531 + /* 1532 + * futex-ref 1533 + * 1534 + * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that 1535 + * code because it just doesn't fit right. 1536 + * 1537 + * Dual counter, per-cpu / atomic approach like percpu-refcount, except it 1538 + * re-initializes the state automatically, such that the fph swizzle is also a 1539 + * transition back to per-cpu. 1540 + */ 1541 + 1542 + static void futex_ref_rcu(struct rcu_head *head); 1543 + 1544 + static void __futex_ref_atomic_begin(struct futex_private_hash *fph) 1545 + { 1546 + struct mm_struct *mm = fph->mm; 1547 + 1548 + /* 1549 + * The counter we're about to switch to must have fully switched; 1550 + * otherwise it would be impossible for it to have reported success 1551 + * from futex_ref_is_dead(). 1552 + */ 1553 + WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); 1554 + 1555 + /* 1556 + * Set the atomic to the bias value such that futex_ref_{get,put}() 1557 + * will never observe 0. Will be fixed up in __futex_ref_atomic_end() 1558 + * when folding in the percpu count. 1559 + */ 1560 + atomic_long_set(&mm->futex_atomic, LONG_MAX); 1561 + smp_store_release(&fph->state, FR_ATOMIC); 1562 + 1563 + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); 1564 + } 1565 + 1566 + static void __futex_ref_atomic_end(struct futex_private_hash *fph) 1567 + { 1568 + struct mm_struct *mm = fph->mm; 1569 + unsigned int count = 0; 1570 + long ret; 1571 + int cpu; 1572 + 1573 + /* 1574 + * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC 1575 + * and per this RCU callback, everybody must now observe this state and 1576 + * use the atomic variable. 
1577 + */ 1578 + WARN_ON_ONCE(fph->state != FR_ATOMIC); 1579 + 1580 + /* 1581 + * Therefore the per-cpu counter is now stable, sum and reset. 1582 + */ 1583 + for_each_possible_cpu(cpu) { 1584 + unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); 1585 + count += *ptr; 1586 + *ptr = 0; 1587 + } 1588 + 1589 + /* 1590 + * Re-init for the next cycle. 1591 + */ 1592 + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ 1593 + 1594 + /* 1595 + * Add actual count, subtract bias and initial refcount. 1596 + * 1597 + * The moment this atomic operation happens, futex_ref_is_dead() can 1598 + * become true. 1599 + */ 1600 + ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); 1601 + if (!ret) 1602 + wake_up_var(mm); 1603 + 1604 + WARN_ON_ONCE(ret < 0); 1605 + mmput_async(mm); 1606 + } 1607 + 1608 + static void futex_ref_rcu(struct rcu_head *head) 1609 + { 1610 + struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); 1611 + struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); 1612 + 1613 + if (fph->state == FR_PERCPU) { 1614 + /* 1615 + * Per this extra grace-period, everybody must now observe 1616 + * fph as the current fph and no previously observed fph's 1617 + * are in-flight. 1618 + * 1619 + * Notably, nobody will now rely on the atomic 1620 + * futex_ref_is_dead() state anymore so we can begin the 1621 + * migration of the per-cpu counter into the atomic. 1622 + */ 1623 + __futex_ref_atomic_begin(fph); 1624 + return; 1625 + } 1626 + 1627 + __futex_ref_atomic_end(fph); 1628 + } 1629 + 1630 + /* 1631 + * Drop the initial refcount and transition to atomics. 1632 + */ 1633 + static void futex_ref_drop(struct futex_private_hash *fph) 1634 + { 1635 + struct mm_struct *mm = fph->mm; 1636 + 1637 + /* 1638 + * Can only transition the current fph; 1639 + */ 1640 + WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); 1641 + /* 1642 + * We enqueue at least one RCU callback. 
Ensure mm stays if the task 1643 + * exits before the transition is completed. 1644 + */ 1645 + mmget(mm); 1646 + 1647 + /* 1648 + * In order to avoid the following scenario: 1649 + * 1650 + * futex_hash() __futex_pivot_hash() 1651 + * guard(rcu); guard(mm->futex_hash_lock); 1652 + * fph = mm->futex_phash; 1653 + * rcu_assign_pointer(&mm->futex_phash, new); 1654 + * futex_hash_allocate() 1655 + * futex_ref_drop() 1656 + * fph->state = FR_ATOMIC; 1657 + * atomic_set(, BIAS); 1658 + * 1659 + * futex_private_hash_get(fph); // OOPS 1660 + * 1661 + * Where an old fph (which is FR_ATOMIC) and should fail on 1662 + * inc_not_zero, will succeed because a new transition is started and 1663 + * the atomic is bias'ed away from 0. 1664 + * 1665 + * There must be at least one full grace-period between publishing a 1666 + * new fph and trying to replace it. 1667 + */ 1668 + if (poll_state_synchronize_rcu(mm->futex_batches)) { 1669 + /* 1670 + * There was a grace-period, we can begin now. 1671 + */ 1672 + __futex_ref_atomic_begin(fph); 1673 + return; 1674 + } 1675 + 1676 + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); 1677 + } 1678 + 1679 + static bool futex_ref_get(struct futex_private_hash *fph) 1680 + { 1681 + struct mm_struct *mm = fph->mm; 1682 + 1683 + guard(rcu)(); 1684 + 1685 + if (smp_load_acquire(&fph->state) == FR_PERCPU) { 1686 + this_cpu_inc(*mm->futex_ref); 1687 + return true; 1688 + } 1689 + 1690 + return atomic_long_inc_not_zero(&mm->futex_atomic); 1691 + } 1692 + 1693 + static bool futex_ref_put(struct futex_private_hash *fph) 1694 + { 1695 + struct mm_struct *mm = fph->mm; 1696 + 1697 + guard(rcu)(); 1698 + 1699 + if (smp_load_acquire(&fph->state) == FR_PERCPU) { 1700 + this_cpu_dec(*mm->futex_ref); 1701 + return false; 1702 + } 1703 + 1704 + return atomic_long_dec_and_test(&mm->futex_atomic); 1705 + } 1706 + 1707 + static bool futex_ref_is_dead(struct futex_private_hash *fph) 1708 + { 1709 + struct mm_struct *mm = fph->mm; 1710 + 1711 + guard(rcu)(); 1712 + 
1713 + if (smp_load_acquire(&fph->state) == FR_PERCPU) 1714 + return false; 1715 + 1716 + return atomic_long_read(&mm->futex_atomic) == 0; 1717 + } 1718 + 1719 + int futex_mm_init(struct mm_struct *mm) 1720 + { 1721 + mutex_init(&mm->futex_hash_lock); 1722 + RCU_INIT_POINTER(mm->futex_phash, NULL); 1723 + mm->futex_phash_new = NULL; 1724 + /* futex-ref */ 1725 + atomic_long_set(&mm->futex_atomic, 0); 1726 + mm->futex_batches = get_state_synchronize_rcu(); 1727 + mm->futex_ref = alloc_percpu(unsigned int); 1728 + if (!mm->futex_ref) 1729 + return -ENOMEM; 1730 + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ 1731 + return 0; 1732 + } 1733 + 1536 1734 void futex_hash_free(struct mm_struct *mm) 1537 1735 { 1538 1736 struct futex_private_hash *fph; 1539 1737 1738 + free_percpu(mm->futex_ref); 1540 1739 kvfree(mm->futex_phash_new); 1541 1740 fph = rcu_dereference_raw(mm->futex_phash); 1542 - if (fph) { 1543 - WARN_ON_ONCE(rcuref_read(&fph->users) > 1); 1741 + if (fph) 1544 1742 kvfree(fph); 1545 - } 1546 1743 } 1547 1744 1548 1745 static bool futex_pivot_pending(struct mm_struct *mm) ··· 1758 1549 return true; 1759 1550 1760 1551 fph = rcu_dereference(mm->futex_phash); 1761 - return rcuref_is_dead(&fph->users); 1552 + return futex_ref_is_dead(fph); 1762 1553 } 1763 1554 1764 1555 static bool futex_hash_less(struct futex_private_hash *a, ··· 1807 1598 } 1808 1599 } 1809 1600 1810 - fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 1601 + fph = kvzalloc(struct_size(fph, queues, hash_slots), 1602 + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 1811 1603 if (!fph) 1812 1604 return -ENOMEM; 1813 1605 1814 - rcuref_init(&fph->users, 1); 1815 1606 fph->hash_mask = hash_slots ? hash_slots - 1 : 0; 1816 1607 fph->custom = custom; 1817 1608 fph->immutable = !!(flags & FH_IMMUTABLE); ··· 1854 1645 * allocated a replacement hash, drop the initial 1855 1646 * reference on the existing hash. 
1856 1647 */ 1857 - futex_private_hash_put(cur); 1648 + futex_ref_drop(cur); 1858 1649 } 1859 1650 1860 1651 if (new) {