Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

futex: Allow to resize the private local hash

The mm_struct::futex_hash_lock guards the futex_hash_bucket assignment/
replacement. The futex_hash_allocate() / PR_FUTEX_HASH_SET_SLOTS
operation can now be invoked at runtime and resize an already existing
internal private futex_hash_bucket to another size.

The reallocation is based on an idea by Thomas Gleixner: The initial
allocation of struct futex_private_hash sets the reference count
to one. Every user acquires a reference on the local hash before using
it and drops it after it has enqueued itself on the hash bucket. There is no
reference held while the task is scheduled out while waiting for the
wake up.
The resize process allocates a new struct futex_private_hash and drops
the initial reference. Synchronized with mm_struct::futex_hash_lock it
is checked if the reference counter for the currently used
mm_struct::futex_phash is marked as DEAD. If so, then all users enqueued
on the current private hash are requeued on the new private hash and the
new private hash is set to mm_struct::futex_phash. Otherwise the newly
allocated private hash is saved as mm_struct::futex_phash_new and the
rehashing and reassigning is delayed to the futex_hash() caller once the
reference counter is marked DEAD.
The replacement is not performed at rcuref_put() time because certain
callers, such as futex_wait_queue(), drop their reference after changing
the task state. That task-state change would be destroyed once the
futex_hash_lock is acquired.

The user can change the number of slots with PR_FUTEX_HASH_SET_SLOTS
multiple times. Both an increase and a decrease are allowed, and the
request blocks until the assignment is done.

The private hash allocated at thread creation is changed from 16 to
16 <= 4 * number_of_threads <= global_hash_size
where number_of_threads cannot exceed the number of online CPUs. Should
the user invoke PR_FUTEX_HASH_SET_SLOTS, the auto scaling is disabled.

[peterz: reorganize the code to avoid state tracking and simplify new
object handling, block the user until changes are in effect, allow
increase and decrease of the hash].

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250416162921.513656-15-bigeasy@linutronix.de

authored by

Sebastian Andrzej Siewior and committed by
Peter Zijlstra
bd54df5e 7c4f75a2

+281 -21
+2 -1
include/linux/futex.h
··· 85 85 86 86 static inline void futex_mm_init(struct mm_struct *mm) 87 87 { 88 - mm->futex_phash = NULL; 88 + rcu_assign_pointer(mm->futex_phash, NULL); 89 + mutex_init(&mm->futex_hash_lock); 89 90 } 90 91 91 92 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
+3 -1
include/linux/mm_types.h
··· 1033 1033 seqcount_t mm_lock_seq; 1034 1034 #endif 1035 1035 #ifdef CONFIG_FUTEX_PRIVATE_HASH 1036 - struct futex_private_hash *futex_phash; 1036 + struct mutex futex_hash_lock; 1037 + struct futex_private_hash __rcu *futex_phash; 1038 + struct futex_private_hash *futex_phash_new; 1037 1039 #endif 1038 1040 1039 1041 unsigned long hiwater_rss; /* High-watermark of RSS usage */
+271 -19
kernel/futex/core.c
··· 40 40 #include <linux/fault-inject.h> 41 41 #include <linux/slab.h> 42 42 #include <linux/prctl.h> 43 + #include <linux/rcuref.h> 43 44 44 45 #include "futex.h" 45 46 #include "../locking/rtmutex_common.h" ··· 58 57 #define futex_hashmask (__futex_data.hashmask) 59 58 60 59 struct futex_private_hash { 60 + rcuref_t users; 61 61 unsigned int hash_mask; 62 + struct rcu_head rcu; 62 63 void *mm; 63 64 bool custom; 64 65 struct futex_hash_bucket queues[]; ··· 132 129 133 130 bool futex_private_hash_get(struct futex_private_hash *fph) 134 131 { 135 - return false; 132 + return rcuref_get(&fph->users); 136 133 } 137 134 138 135 void futex_private_hash_put(struct futex_private_hash *fph) 139 136 { 137 + /* Ignore return value, last put is verified via rcuref_is_dead() */ 138 + if (rcuref_put(&fph->users)) 139 + wake_up_var(fph->mm); 140 140 } 141 141 142 142 /** ··· 149 143 * Obtain an additional reference for the already obtained hash bucket. The 150 144 * caller must already own an reference. 
151 145 */ 152 - void futex_hash_get(struct futex_hash_bucket *hb) { } 153 - void futex_hash_put(struct futex_hash_bucket *hb) { } 146 + void futex_hash_get(struct futex_hash_bucket *hb) 147 + { 148 + struct futex_private_hash *fph = hb->priv; 149 + 150 + if (!fph) 151 + return; 152 + WARN_ON_ONCE(!futex_private_hash_get(fph)); 153 + } 154 + 155 + void futex_hash_put(struct futex_hash_bucket *hb) 156 + { 157 + struct futex_private_hash *fph = hb->priv; 158 + 159 + if (!fph) 160 + return; 161 + futex_private_hash_put(fph); 162 + } 154 163 155 164 static struct futex_hash_bucket * 156 165 __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) ··· 176 155 return NULL; 177 156 178 157 if (!fph) 179 - fph = key->private.mm->futex_phash; 158 + fph = rcu_dereference(key->private.mm->futex_phash); 180 159 if (!fph || !fph->hash_mask) 181 160 return NULL; 182 161 ··· 186 165 return &fph->queues[hash & fph->hash_mask]; 187 166 } 188 167 168 + static void futex_rehash_private(struct futex_private_hash *old, 169 + struct futex_private_hash *new) 170 + { 171 + struct futex_hash_bucket *hb_old, *hb_new; 172 + unsigned int slots = old->hash_mask + 1; 173 + unsigned int i; 174 + 175 + for (i = 0; i < slots; i++) { 176 + struct futex_q *this, *tmp; 177 + 178 + hb_old = &old->queues[i]; 179 + 180 + spin_lock(&hb_old->lock); 181 + plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { 182 + 183 + plist_del(&this->list, &hb_old->chain); 184 + futex_hb_waiters_dec(hb_old); 185 + 186 + WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); 187 + 188 + hb_new = __futex_hash(&this->key, new); 189 + futex_hb_waiters_inc(hb_new); 190 + /* 191 + * The new pointer isn't published yet but an already 192 + * moved user can be unqueued due to timeout or signal. 
193 + */ 194 + spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING); 195 + plist_add(&this->list, &hb_new->chain); 196 + this->lock_ptr = &hb_new->lock; 197 + spin_unlock(&hb_new->lock); 198 + } 199 + spin_unlock(&hb_old->lock); 200 + } 201 + } 202 + 203 + static bool __futex_pivot_hash(struct mm_struct *mm, 204 + struct futex_private_hash *new) 205 + { 206 + struct futex_private_hash *fph; 207 + 208 + WARN_ON_ONCE(mm->futex_phash_new); 209 + 210 + fph = rcu_dereference_protected(mm->futex_phash, 211 + lockdep_is_held(&mm->futex_hash_lock)); 212 + if (fph) { 213 + if (!rcuref_is_dead(&fph->users)) { 214 + mm->futex_phash_new = new; 215 + return false; 216 + } 217 + 218 + futex_rehash_private(fph, new); 219 + } 220 + rcu_assign_pointer(mm->futex_phash, new); 221 + kvfree_rcu(fph, rcu); 222 + return true; 223 + } 224 + 225 + static void futex_pivot_hash(struct mm_struct *mm) 226 + { 227 + scoped_guard(mutex, &mm->futex_hash_lock) { 228 + struct futex_private_hash *fph; 229 + 230 + fph = mm->futex_phash_new; 231 + if (fph) { 232 + mm->futex_phash_new = NULL; 233 + __futex_pivot_hash(mm, fph); 234 + } 235 + } 236 + } 237 + 189 238 struct futex_private_hash *futex_private_hash(void) 190 239 { 191 240 struct mm_struct *mm = current->mm; 192 - struct futex_private_hash *fph; 241 + /* 242 + * Ideally we don't loop. If there is a replacement in progress 243 + * then a new private hash is already prepared and a reference can't be 244 + * obtained once the last user dropped it's. 245 + * In that case we block on mm_struct::futex_hash_lock and either have 246 + * to perform the replacement or wait while someone else is doing the 247 + * job. Eitherway, on the second iteration we acquire a reference on the 248 + * new private hash or loop again because a new replacement has been 249 + * requested. 
250 + */ 251 + again: 252 + scoped_guard(rcu) { 253 + struct futex_private_hash *fph; 193 254 194 - fph = mm->futex_phash; 195 - return fph; 255 + fph = rcu_dereference(mm->futex_phash); 256 + if (!fph) 257 + return NULL; 258 + 259 + if (rcuref_get(&fph->users)) 260 + return fph; 261 + } 262 + futex_pivot_hash(mm); 263 + goto again; 196 264 } 197 265 198 266 struct futex_hash_bucket *futex_hash(union futex_key *key) 199 267 { 268 + struct futex_private_hash *fph; 200 269 struct futex_hash_bucket *hb; 201 270 202 - hb = __futex_hash(key, NULL); 203 - return hb; 271 + again: 272 + scoped_guard(rcu) { 273 + hb = __futex_hash(key, NULL); 274 + fph = hb->priv; 275 + 276 + if (!fph || futex_private_hash_get(fph)) 277 + return hb; 278 + } 279 + futex_pivot_hash(key->private.mm); 280 + goto again; 204 281 } 205 282 206 283 #else /* !CONFIG_FUTEX_PRIVATE_HASH */ ··· 783 664 spinlock_t *lock_ptr; 784 665 int ret = 0; 785 666 667 + /* RCU so lock_ptr is not going away during locking. */ 668 + guard(rcu)(); 786 669 /* In the common case we don't take the spinlock, which is nice. */ 787 670 retry: 788 671 /* ··· 1187 1066 union futex_key key = FUTEX_KEY_INIT; 1188 1067 1189 1068 /* 1069 + * The mutex mm_struct::futex_hash_lock might be acquired. 1070 + */ 1071 + might_sleep(); 1072 + /* 1190 1073 * Ensure the hash remains stable (no resize) during the while loop 1191 1074 * below. The hb pointer is acquired under the pi_lock so we can't block 1192 1075 * on the mutex. 
··· 1386 1261 #ifdef CONFIG_FUTEX_PRIVATE_HASH 1387 1262 void futex_hash_free(struct mm_struct *mm) 1388 1263 { 1389 - kvfree(mm->futex_phash); 1264 + struct futex_private_hash *fph; 1265 + 1266 + kvfree(mm->futex_phash_new); 1267 + fph = rcu_dereference_raw(mm->futex_phash); 1268 + if (fph) { 1269 + WARN_ON_ONCE(rcuref_read(&fph->users) > 1); 1270 + kvfree(fph); 1271 + } 1272 + } 1273 + 1274 + static bool futex_pivot_pending(struct mm_struct *mm) 1275 + { 1276 + struct futex_private_hash *fph; 1277 + 1278 + guard(rcu)(); 1279 + 1280 + if (!mm->futex_phash_new) 1281 + return true; 1282 + 1283 + fph = rcu_dereference(mm->futex_phash); 1284 + return rcuref_is_dead(&fph->users); 1285 + } 1286 + 1287 + static bool futex_hash_less(struct futex_private_hash *a, 1288 + struct futex_private_hash *b) 1289 + { 1290 + /* user provided always wins */ 1291 + if (!a->custom && b->custom) 1292 + return true; 1293 + if (a->custom && !b->custom) 1294 + return false; 1295 + 1296 + /* zero-sized hash wins */ 1297 + if (!b->hash_mask) 1298 + return true; 1299 + if (!a->hash_mask) 1300 + return false; 1301 + 1302 + /* keep the biggest */ 1303 + if (a->hash_mask < b->hash_mask) 1304 + return true; 1305 + if (a->hash_mask > b->hash_mask) 1306 + return false; 1307 + 1308 + return false; /* equal */ 1390 1309 } 1391 1310 1392 1311 static int futex_hash_allocate(unsigned int hash_slots, bool custom) ··· 1442 1273 if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) 1443 1274 return -EINVAL; 1444 1275 1445 - if (mm->futex_phash) 1446 - return -EALREADY; 1447 - 1448 - if (!thread_group_empty(current)) 1449 - return -EINVAL; 1276 + /* 1277 + * Once we've disabled the global hash there is no way back. 
1278 + */ 1279 + scoped_guard(rcu) { 1280 + fph = rcu_dereference(mm->futex_phash); 1281 + if (fph && !fph->hash_mask) { 1282 + if (custom) 1283 + return -EBUSY; 1284 + return 0; 1285 + } 1286 + } 1450 1287 1451 1288 fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 1452 1289 if (!fph) 1453 1290 return -ENOMEM; 1454 1291 1292 + rcuref_init(&fph->users, 1); 1455 1293 fph->hash_mask = hash_slots ? hash_slots - 1 : 0; 1456 1294 fph->custom = custom; 1457 1295 fph->mm = mm; ··· 1466 1290 for (i = 0; i < hash_slots; i++) 1467 1291 futex_hash_bucket_init(&fph->queues[i], fph); 1468 1292 1469 - mm->futex_phash = fph; 1293 + if (custom) { 1294 + /* 1295 + * Only let prctl() wait / retry; don't unduly delay clone(). 1296 + */ 1297 + again: 1298 + wait_var_event(mm, futex_pivot_pending(mm)); 1299 + } 1300 + 1301 + scoped_guard(mutex, &mm->futex_hash_lock) { 1302 + struct futex_private_hash *free __free(kvfree) = NULL; 1303 + struct futex_private_hash *cur, *new; 1304 + 1305 + cur = rcu_dereference_protected(mm->futex_phash, 1306 + lockdep_is_held(&mm->futex_hash_lock)); 1307 + new = mm->futex_phash_new; 1308 + mm->futex_phash_new = NULL; 1309 + 1310 + if (fph) { 1311 + if (cur && !new) { 1312 + /* 1313 + * If we have an existing hash, but do not yet have 1314 + * allocated a replacement hash, drop the initial 1315 + * reference on the existing hash. 1316 + */ 1317 + futex_private_hash_put(cur); 1318 + } 1319 + 1320 + if (new) { 1321 + /* 1322 + * Two updates raced; throw out the lesser one. 1323 + */ 1324 + if (futex_hash_less(new, fph)) { 1325 + free = new; 1326 + new = fph; 1327 + } else { 1328 + free = fph; 1329 + } 1330 + } else { 1331 + new = fph; 1332 + } 1333 + fph = NULL; 1334 + } 1335 + 1336 + if (new) { 1337 + /* 1338 + * Will set mm->futex_phash_new on failure; 1339 + * futex_private_hash_get() will try again. 
1340 + */ 1341 + if (!__futex_pivot_hash(mm, new) && custom) 1342 + goto again; 1343 + } 1344 + } 1470 1345 return 0; 1471 1346 } 1472 1347 1473 1348 int futex_hash_allocate_default(void) 1474 1349 { 1350 + unsigned int threads, buckets, current_buckets = 0; 1351 + struct futex_private_hash *fph; 1352 + 1475 1353 if (!current->mm) 1476 1354 return 0; 1477 1355 1478 - if (current->mm->futex_phash) 1356 + scoped_guard(rcu) { 1357 + threads = min_t(unsigned int, 1358 + get_nr_threads(current), 1359 + num_online_cpus()); 1360 + 1361 + fph = rcu_dereference(current->mm->futex_phash); 1362 + if (fph) { 1363 + if (fph->custom) 1364 + return 0; 1365 + 1366 + current_buckets = fph->hash_mask + 1; 1367 + } 1368 + } 1369 + 1370 + /* 1371 + * The default allocation will remain within 1372 + * 16 <= threads * 4 <= global hash size 1373 + */ 1374 + buckets = roundup_pow_of_two(4 * threads); 1375 + buckets = clamp(buckets, 16, futex_hashmask + 1); 1376 + 1377 + if (current_buckets >= buckets) 1479 1378 return 0; 1480 1379 1481 - return futex_hash_allocate(16, false); 1380 + return futex_hash_allocate(buckets, false); 1482 1381 } 1483 1382 1484 1383 static int futex_hash_get_slots(void) 1485 1384 { 1486 1385 struct futex_private_hash *fph; 1487 1386 1488 - fph = current->mm->futex_phash; 1387 + guard(rcu)(); 1388 + fph = rcu_dereference(current->mm->futex_phash); 1489 1389 if (fph && fph->hash_mask) 1490 1390 return fph->hash_mask + 1; 1491 1391 return 0;
+5
kernel/futex/requeue.c
··· 87 87 futex_hb_waiters_inc(hb2); 88 88 plist_add(&q->list, &hb2->chain); 89 89 q->lock_ptr = &hb2->lock; 90 + /* 91 + * hb1 and hb2 belong to the same futex_hash_bucket_private 92 + * because if we managed get a reference on hb1 then it can't be 93 + * replaced. Therefore we avoid put(hb1)+get(hb2) here. 94 + */ 90 95 } 91 96 q->key = *key2; 92 97 }