Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

futex: Implement FUTEX2_MPOL

Extend the futex2 interface to be aware of mempolicy.

When FUTEX2_MPOL is specified and there is an MPOL_PREFERRED policy or a
home_node specified covering the futex address, use that node's hash-map.

Notably, in this case the futex will go to the global node hashtable,
even if it is a PRIVATE futex.

When FUTEX2_NUMA|FUTEX2_MPOL is specified and the user-specified node
value is FUTEX_NO_NODE, the MPOL lookup (as described above) is tried
first, before falling back to setting the node to the local node.

[bigeasy: add CONFIG_FUTEX_MPOL, add MPOL to FUTEX2_VALID_MASK, write
the node back to the user only if FUTEX_NO_NODE was supplied]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250416162921.513656-18-bigeasy@linutronix.de

+115 -18
+4
include/linux/mmap_lock.h
··· 7 7 #include <linux/rwsem.h> 8 8 #include <linux/tracepoint-defs.h> 9 9 #include <linux/types.h> 10 + #include <linux/cleanup.h> 10 11 11 12 #define MMAP_LOCK_INITIALIZER(name) \ 12 13 .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), ··· 211 210 __mmap_lock_trace_released(mm, false); 212 211 up_read(&mm->mmap_lock); 213 212 } 213 + 214 + DEFINE_GUARD(mmap_read_lock, struct mm_struct *, 215 + mmap_read_lock(_T), mmap_read_unlock(_T)) 214 216 215 217 static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) 216 218 {
+1 -1
include/uapi/linux/futex.h
··· 63 63 #define FUTEX2_SIZE_U32 0x02 64 64 #define FUTEX2_SIZE_U64 0x03 65 65 #define FUTEX2_NUMA 0x04 66 - /* 0x08 */ 66 + #define FUTEX2_MPOL 0x08 67 67 /* 0x10 */ 68 68 /* 0x20 */ 69 69 /* 0x40 */
+5
init/Kconfig
··· 1704 1704 depends on FUTEX && !BASE_SMALL && MMU 1705 1705 default y 1706 1706 1707 + config FUTEX_MPOL 1708 + bool 1709 + depends on FUTEX && NUMA 1710 + default y 1711 + 1707 1712 config EPOLL 1708 1713 bool "Enable eventpoll support" if EXPERT 1709 1714 default y
+100 -16
kernel/futex/core.c
··· 43 43 #include <linux/slab.h> 44 44 #include <linux/prctl.h> 45 45 #include <linux/rcuref.h> 46 + #include <linux/mempolicy.h> 47 + #include <linux/mmap_lock.h> 46 48 47 49 #include "futex.h" 48 50 #include "../locking/rtmutex_common.h" ··· 330 328 331 329 #endif /* CONFIG_FUTEX_PRIVATE_HASH */ 332 330 331 + #ifdef CONFIG_FUTEX_MPOL 332 + 333 + static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) 334 + { 335 + struct vm_area_struct *vma = vma_lookup(mm, addr); 336 + struct mempolicy *mpol; 337 + int node = FUTEX_NO_NODE; 338 + 339 + if (!vma) 340 + return FUTEX_NO_NODE; 341 + 342 + mpol = vma_policy(vma); 343 + if (!mpol) 344 + return FUTEX_NO_NODE; 345 + 346 + switch (mpol->mode) { 347 + case MPOL_PREFERRED: 348 + node = first_node(mpol->nodes); 349 + break; 350 + case MPOL_PREFERRED_MANY: 351 + case MPOL_BIND: 352 + if (mpol->home_node != NUMA_NO_NODE) 353 + node = mpol->home_node; 354 + break; 355 + default: 356 + break; 357 + } 358 + 359 + return node; 360 + } 361 + 362 + static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) 363 + { 364 + int seq, node; 365 + 366 + guard(rcu)(); 367 + 368 + if (!mmap_lock_speculate_try_begin(mm, &seq)) 369 + return -EBUSY; 370 + 371 + node = __futex_key_to_node(mm, addr); 372 + 373 + if (mmap_lock_speculate_retry(mm, seq)) 374 + return -EAGAIN; 375 + 376 + return node; 377 + } 378 + 379 + static int futex_mpol(struct mm_struct *mm, unsigned long addr) 380 + { 381 + int node; 382 + 383 + node = futex_key_to_node_opt(mm, addr); 384 + if (node >= FUTEX_NO_NODE) 385 + return node; 386 + 387 + guard(mmap_read_lock)(mm); 388 + return __futex_key_to_node(mm, addr); 389 + } 390 + 391 + #else /* !CONFIG_FUTEX_MPOL */ 392 + 393 + static int futex_mpol(struct mm_struct *mm, unsigned long addr) 394 + { 395 + return FUTEX_NO_NODE; 396 + } 397 + 398 + #endif /* CONFIG_FUTEX_MPOL */ 399 + 333 400 /** 334 401 * __futex_hash - Return the hash bucket 335 402 * @key: Pointer to the futex key for which 
the hash is calculated ··· 413 342 static struct futex_hash_bucket * 414 343 __futex_hash(union futex_key *key, struct futex_private_hash *fph) 415 344 { 416 - struct futex_hash_bucket *hb; 345 + int node = key->both.node; 417 346 u32 hash; 418 - int node; 419 347 420 - hb = __futex_hash_private(key, fph); 421 - if (hb) 422 - return hb; 348 + if (node == FUTEX_NO_NODE) { 349 + struct futex_hash_bucket *hb; 350 + 351 + hb = __futex_hash_private(key, fph); 352 + if (hb) 353 + return hb; 354 + } 423 355 424 356 hash = jhash2((u32 *)key, 425 357 offsetof(typeof(*key), both.offset) / sizeof(u32), 426 358 key->both.offset); 427 - node = key->both.node; 428 359 429 360 if (node == FUTEX_NO_NODE) { 430 361 /* ··· 553 480 struct folio *folio; 554 481 struct address_space *mapping; 555 482 int node, err, size, ro = 0; 483 + bool node_updated = false; 556 484 bool fshared; 557 485 558 486 fshared = flags & FLAGS_SHARED; ··· 575 501 if (unlikely(should_fail_futex(fshared))) 576 502 return -EFAULT; 577 503 504 + node = FUTEX_NO_NODE; 505 + 578 506 if (flags & FLAGS_NUMA) { 579 507 u32 __user *naddr = (void *)uaddr + size / 2; 580 508 581 509 if (futex_get_value(&node, naddr)) 582 510 return -EFAULT; 583 511 512 + if (node != FUTEX_NO_NODE && 513 + (node >= MAX_NUMNODES || !node_possible(node))) 514 + return -EINVAL; 515 + } 516 + 517 + if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { 518 + node = futex_mpol(mm, address); 519 + node_updated = true; 520 + } 521 + 522 + if (flags & FLAGS_NUMA) { 523 + u32 __user *naddr = (void *)uaddr + size / 2; 524 + 584 525 if (node == FUTEX_NO_NODE) { 585 526 node = numa_node_id(); 586 - if (futex_put_value(node, naddr)) 587 - return -EFAULT; 588 - 589 - } else if (node >= MAX_NUMNODES || !node_possible(node)) { 590 - return -EINVAL; 527 + node_updated = true; 591 528 } 592 - 593 - key->both.node = node; 594 - 595 - } else { 596 - key->both.node = FUTEX_NO_NODE; 529 + if (node_updated && futex_put_value(node, naddr)) 530 + return -EFAULT; 
597 531 } 532 + 533 + key->both.node = node; 598 534 599 535 /* 600 536 * PROCESS_PRIVATE futexes are fast.
+5 -1
kernel/futex/futex.h
··· 39 39 #define FLAGS_HAS_TIMEOUT 0x0040 40 40 #define FLAGS_NUMA 0x0080 41 41 #define FLAGS_STRICT 0x0100 42 + #define FLAGS_MPOL 0x0200 42 43 43 44 /* FUTEX_ to FLAGS_ */ 44 45 static inline unsigned int futex_to_flags(unsigned int op) ··· 55 54 return flags; 56 55 } 57 56 58 - #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_PRIVATE) 57 + #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) 59 58 60 59 /* FUTEX2_ to FLAGS_ */ 61 60 static inline unsigned int futex2_to_flags(unsigned int flags2) ··· 67 66 68 67 if (flags2 & FUTEX2_NUMA) 69 68 flags |= FLAGS_NUMA; 69 + 70 + if (flags2 & FUTEX2_MPOL) 71 + flags |= FLAGS_MPOL; 70 72 71 73 return flags; 72 74 }