Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

futex: Implement FUTEX2_NUMA

Extend the futex2 interface to be numa aware.

When FUTEX2_NUMA is specified for a futex, the user value is extended
to two words (of the same size). The first is the user value we all
know; the second one is the node on which to place this futex.

struct futex_numa_32 {
u32 val;
u32 node;
};

When node is set to ~0, WAIT will set it to the current node_id such
that WAKE knows where to find it. If userspace corrupts the node value
between WAIT and WAKE, the futex will not be found and no wakeup will
happen.

When FUTEX2_NUMA is not set, the node is simply an extension of the
hash, such that traditional futexes are still interleaved over the
nodes.

This is done to avoid needing a separate !numa hash-table.

[bigeasy: ensure a hashsize of at least 4 in futex_init(), add
pr_info() for size and allocation information. Cast the naddr math to
void*]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250416162921.513656-17-bigeasy@linutronix.de

+123 -20
+3
include/linux/futex.h
··· 34 34 u64 i_seq; 35 35 unsigned long pgoff; 36 36 unsigned int offset; 37 + /* unsigned int node; */ 37 38 } shared; 38 39 struct { 39 40 union { ··· 43 42 }; 44 43 unsigned long address; 45 44 unsigned int offset; 45 + /* unsigned int node; */ 46 46 } private; 47 47 struct { 48 48 u64 ptr; 49 49 unsigned long word; 50 50 unsigned int offset; 51 + unsigned int node; /* NOT hashed! */ 51 52 } both; 52 53 }; 53 54
+7
include/uapi/linux/futex.h
··· 75 75 #define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */ 76 76 77 77 /* 78 + * When FUTEX2_NUMA doubles the futex word, the second word is a node value. 79 + * The special value -1 indicates no-node. This is the same value as 80 + * NUMA_NO_NODE, except that value is not ABI, this is. 81 + */ 82 + #define FUTEX_NO_NODE (-1) 83 + 84 + /* 78 85 * Max numbers of elements in a futex_waitv array 79 86 */ 80 87 #define FUTEX_WAITV_MAX 128
+83 -17
kernel/futex/core.c
··· 36 36 #include <linux/pagemap.h> 37 37 #include <linux/debugfs.h> 38 38 #include <linux/plist.h> 39 + #include <linux/gfp.h> 40 + #include <linux/vmalloc.h> 39 41 #include <linux/memblock.h> 40 42 #include <linux/fault-inject.h> 41 43 #include <linux/slab.h> ··· 53 51 * reside in the same cacheline. 54 52 */ 55 53 static struct { 56 - struct futex_hash_bucket *queues; 57 54 unsigned long hashmask; 55 + unsigned int hashshift; 56 + struct futex_hash_bucket *queues[MAX_NUMNODES]; 58 57 } __futex_data __read_mostly __aligned(2*sizeof(long)); 59 - #define futex_queues (__futex_data.queues) 60 - #define futex_hashmask (__futex_data.hashmask) 58 + 59 + #define futex_hashmask (__futex_data.hashmask) 60 + #define futex_hashshift (__futex_data.hashshift) 61 + #define futex_queues (__futex_data.queues) 61 62 62 63 struct futex_private_hash { 63 64 rcuref_t users; ··· 344 339 { 345 340 struct futex_hash_bucket *hb; 346 341 u32 hash; 342 + int node; 347 343 348 344 hb = __futex_hash_private(key, fph); 349 345 if (hb) 350 346 return hb; 351 347 352 348 hash = jhash2((u32 *)key, 353 - offsetof(typeof(*key), both.offset) / 4, 349 + offsetof(typeof(*key), both.offset) / sizeof(u32), 354 350 key->both.offset); 355 - return &futex_queues[hash & futex_hashmask]; 351 + node = key->both.node; 352 + 353 + if (node == FUTEX_NO_NODE) { 354 + /* 355 + * In case of !FLAGS_NUMA, use some unused hash bits to pick a 356 + * node -- this ensures regular futexes are interleaved across 357 + * the nodes and avoids having to allocate multiple 358 + * hash-tables. 359 + * 360 + * NOTE: this isn't perfectly uniform, but it is fast and 361 + * handles sparse node masks. 
362 + */ 363 + node = (hash >> futex_hashshift) % nr_node_ids; 364 + if (!node_possible(node)) { 365 + node = find_next_bit_wrap(node_possible_map.bits, 366 + nr_node_ids, node); 367 + } 368 + } 369 + 370 + return &futex_queues[node][hash & futex_hashmask]; 356 371 } 357 372 358 373 /** ··· 479 454 struct page *page; 480 455 struct folio *folio; 481 456 struct address_space *mapping; 482 - int err, ro = 0; 457 + int node, err, size, ro = 0; 483 458 bool fshared; 484 459 485 460 fshared = flags & FLAGS_SHARED; 461 + size = futex_size(flags); 462 + if (flags & FLAGS_NUMA) 463 + size *= 2; 486 464 487 465 /* 488 466 * The futex address must be "naturally" aligned. 489 467 */ 490 468 key->both.offset = address % PAGE_SIZE; 491 - if (unlikely((address % sizeof(u32)) != 0)) 469 + if (unlikely((address % size) != 0)) 492 470 return -EINVAL; 493 471 address -= key->both.offset; 494 472 495 - if (unlikely(!access_ok(uaddr, sizeof(u32)))) 473 + if (unlikely(!access_ok(uaddr, size))) 496 474 return -EFAULT; 497 475 498 476 if (unlikely(should_fail_futex(fshared))) 499 477 return -EFAULT; 478 + 479 + if (flags & FLAGS_NUMA) { 480 + u32 __user *naddr = (void *)uaddr + size / 2; 481 + 482 + if (futex_get_value(&node, naddr)) 483 + return -EFAULT; 484 + 485 + if (node == FUTEX_NO_NODE) { 486 + node = numa_node_id(); 487 + if (futex_put_value(node, naddr)) 488 + return -EFAULT; 489 + 490 + } else if (node >= MAX_NUMNODES || !node_possible(node)) { 491 + return -EINVAL; 492 + } 493 + 494 + key->both.node = node; 495 + 496 + } else { 497 + key->both.node = FUTEX_NO_NODE; 498 + } 500 499 501 500 /* 502 501 * PROCESS_PRIVATE futexes are fast. 
··· 1691 1642 static int __init futex_init(void) 1692 1643 { 1693 1644 unsigned long hashsize, i; 1694 - unsigned int futex_shift; 1645 + unsigned int order, n; 1646 + unsigned long size; 1695 1647 1696 1648 #ifdef CONFIG_BASE_SMALL 1697 1649 hashsize = 16; 1698 1650 #else 1699 - hashsize = roundup_pow_of_two(256 * num_possible_cpus()); 1651 + hashsize = 256 * num_possible_cpus(); 1652 + hashsize /= num_possible_nodes(); 1653 + hashsize = max(4, hashsize); 1654 + hashsize = roundup_pow_of_two(hashsize); 1700 1655 #endif 1656 + futex_hashshift = ilog2(hashsize); 1657 + size = sizeof(struct futex_hash_bucket) * hashsize; 1658 + order = get_order(size); 1701 1659 1702 - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), 1703 - hashsize, 0, 0, 1704 - &futex_shift, NULL, 1705 - hashsize, hashsize); 1706 - hashsize = 1UL << futex_shift; 1660 + for_each_node(n) { 1661 + struct futex_hash_bucket *table; 1707 1662 1708 - for (i = 0; i < hashsize; i++) 1709 - futex_hash_bucket_init(&futex_queues[i], NULL); 1663 + if (order > MAX_PAGE_ORDER) 1664 + table = vmalloc_huge_node(size, GFP_KERNEL, n); 1665 + else 1666 + table = alloc_pages_exact_nid(n, size, GFP_KERNEL); 1667 + 1668 + BUG_ON(!table); 1669 + 1670 + for (i = 0; i < hashsize; i++) 1671 + futex_hash_bucket_init(&table[i], NULL); 1672 + 1673 + futex_queues[n] = table; 1674 + } 1710 1675 1711 1676 futex_hashmask = hashsize - 1; 1677 + pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", 1678 + hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, 1679 + order > MAX_PAGE_ORDER ? "vmalloc" : "linear"); 1712 1680 return 0; 1713 1681 } 1714 1682 core_initcall(futex_init);
+30 -3
kernel/futex/futex.h
··· 54 54 return flags; 55 55 } 56 56 57 - #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) 57 + #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_PRIVATE) 58 58 59 59 /* FUTEX2_ to FLAGS_ */ 60 60 static inline unsigned int futex2_to_flags(unsigned int flags2) ··· 86 86 /* Only 32bit futexes are implemented -- for now */ 87 87 if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) 88 88 return false; 89 + 90 + /* 91 + * Must be able to represent both FUTEX_NO_NODE and every valid nodeid 92 + * in a futex word. 93 + */ 94 + if (flags & FLAGS_NUMA) { 95 + int bits = 8 * futex_size(flags); 96 + u64 max = ~0ULL; 97 + 98 + max >>= 64 - bits; 99 + if (nr_node_ids >= max) 100 + return false; 101 + } 89 102 90 103 return true; 91 104 } ··· 295 282 * This looks a bit overkill, but generally just results in a couple 296 283 * of instructions. 297 284 */ 298 - static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) 285 + static __always_inline int futex_get_value(u32 *dest, u32 __user *from) 299 286 { 300 287 u32 val; 301 288 ··· 312 299 return -EFAULT; 313 300 } 314 301 302 + static __always_inline int futex_put_value(u32 val, u32 __user *to) 303 + { 304 + if (can_do_masked_user_access()) 305 + to = masked_user_access_begin(to); 306 + else if (!user_read_access_begin(to, sizeof(*to))) 307 + return -EFAULT; 308 + unsafe_put_user(val, to, Efault); 309 + user_read_access_end(); 310 + return 0; 311 + Efault: 312 + user_read_access_end(); 313 + return -EFAULT; 314 + } 315 + 315 316 static inline int futex_get_value_locked(u32 *dest, u32 __user *from) 316 317 { 317 318 int ret; 318 319 319 320 pagefault_disable(); 320 - ret = futex_read_inatomic(dest, from); 321 + ret = futex_get_value(dest, from); 321 322 pagefault_enable(); 322 323 323 324 return ret;