Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

futex: Implement sys_futex_waitv()

Add support to wait on multiple futexes. This is the interface
implemented by this syscall:

futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes,
unsigned int flags, struct timespec *timeout, clockid_t clockid)

struct futex_waitv {
__u64 val;
__u64 uaddr;
__u32 flags;
__u32 __reserved;
};

Given an array of struct futex_waitv, wait on each uaddr. The thread
wakes if a futex_wake() is performed at any uaddr. The syscall returns
immediately if any waiter has *uaddr != val. *timeout is an optional
absolute timeout value for the operation. This syscall supports only
64bit sized timeout structs. The flags argument of the syscall should be
empty, but it can be used for future extensions. Flags for shared
futexes, sizes, etc. should be used on the individual flags of each
waiter.

__reserved is used for explicit padding and should be 0, but it might be
used for future extensions. If userspace uses 32-bit pointers, it should
explicitly cast them when assigning to waitv::uaddr.

Returns the array index of one of the woken futexes. No information is
given about how many futexes were woken, or about any particular attribute
of the returned one (whether it was the first woken, whether it has the
smallest index...).

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com

authored by

André Almeida and committed by
Peter Zijlstra
bf69bad3 bff7c57c

+371 -1
+1
MAINTAINERS
··· 7718 7718 R: Peter Zijlstra <peterz@infradead.org> 7719 7719 R: Darren Hart <dvhart@infradead.org> 7720 7720 R: Davidlohr Bueso <dave@stgolabs.net> 7721 + R: André Almeida <andrealmeid@collabora.com> 7721 7722 L: linux-kernel@vger.kernel.org 7722 7723 S: Maintained 7723 7724 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
+5
include/linux/syscalls.h
··· 58 58 struct compat_stat; 59 59 struct old_timeval32; 60 60 struct robust_list_head; 61 + struct futex_waitv; 61 62 struct getcpu_cache; 62 63 struct old_linux_dirent; 63 64 struct perf_event_attr; ··· 623 622 size_t __user *len_ptr); 624 623 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, 625 624 size_t len); 625 + 626 + asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, 627 + unsigned int nr_futexes, unsigned int flags, 628 + struct __kernel_timespec __user *timeout, clockid_t clockid); 626 629 627 630 /* kernel/hrtimer.c */ 628 631 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
+4 -1
include/uapi/asm-generic/unistd.h
··· 880 880 #define __NR_process_mrelease 448 881 881 __SYSCALL(__NR_process_mrelease, sys_process_mrelease) 882 882 883 + #define __NR_futex_waitv 449 884 + __SYSCALL(__NR_futex_waitv, sys_futex_waitv) 885 + 883 886 #undef __NR_syscalls 884 - #define __NR_syscalls 449 887 + #define __NR_syscalls 450 885 888 886 889 /* 887 890 * 32 bit systems traditionally used different
+25
include/uapi/linux/futex.h
··· 44 44 FUTEX_PRIVATE_FLAG) 45 45 46 46 /* 47 + * Flags to specify the bit length of the futex word for futex2 syscalls. 48 + * Currently, only 32 is supported. 49 + */ 50 + #define FUTEX_32 2 51 + 52 + /* 53 + * Maximum number of elements in a futex_waitv array 54 + */ 55 + #define FUTEX_WAITV_MAX 128 56 + 57 + /** 58 + * struct futex_waitv - A waiter for vectorized wait 59 + * @val: Expected value at uaddr 60 + * @uaddr: User address to wait on 61 + * @flags: Flags for this waiter 62 + * @__reserved: Reserved member to preserve data alignment. Should be 0. 63 + */ 64 + struct futex_waitv { 65 + __u64 val; 66 + __u64 uaddr; 67 + __u32 flags; 68 + __u32 __reserved; 69 + }; 70 + 71 + /* 47 72 * Support for robust futexes: the kernel cleans up held futexes at 48 73 * thread exit time. 49 74 */
+15
kernel/futex/futex.h
··· 268 268 extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, 269 269 ktime_t *abs_time, u32 bitset); 270 270 271 + /** 272 + * struct futex_vector - Auxiliary struct for futex_waitv() 273 + * @w: Userspace provided data 274 + * @q: Kernel side data 275 + * 276 + * Struct used to build an array with all data needed for futex_waitv() 277 + */ 278 + struct futex_vector { 279 + struct futex_waitv w; 280 + struct futex_q q; 281 + }; 282 + 283 + extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, 284 + struct hrtimer_sleeper *to); 285 + 271 286 extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); 272 287 273 288 extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
+119
kernel/futex/syscalls.c
··· 199 199 return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); 200 200 } 201 201 202 + /* Mask of available flags for each futex in futex_waitv list */ 203 + #define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) 204 + 205 + /** 206 + * futex_parse_waitv - Parse a waitv array from userspace 207 + * @futexv: Kernel side list of waiters to be filled 208 + * @uwaitv: Userspace list to be parsed 209 + * @nr_futexes: Length of futexv 210 + * 211 + * Return: Error code on failure, 0 on success 212 + */ 213 + static int futex_parse_waitv(struct futex_vector *futexv, 214 + struct futex_waitv __user *uwaitv, 215 + unsigned int nr_futexes) 216 + { 217 + struct futex_waitv aux; 218 + unsigned int i; 219 + 220 + for (i = 0; i < nr_futexes; i++) { 221 + if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) 222 + return -EFAULT; 223 + 224 + if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) 225 + return -EINVAL; 226 + 227 + if (!(aux.flags & FUTEX_32)) 228 + return -EINVAL; 229 + 230 + futexv[i].w.flags = aux.flags; 231 + futexv[i].w.val = aux.val; 232 + futexv[i].w.uaddr = aux.uaddr; 233 + futexv[i].q = futex_q_init; 234 + } 235 + 236 + return 0; 237 + } 238 + 239 + /** 240 + * sys_futex_waitv - Wait on a list of futexes 241 + * @waiters: List of futexes to wait on 242 + * @nr_futexes: Length of futexv 243 + * @flags: Reserved for future extensions, must be 0 244 + * @timeout: Optional absolute timeout. 245 + * @clockid: Clock to be used for the timeout, realtime or monotonic. 246 + * 247 + * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes 248 + * if a futex_wake() is performed at any uaddr. The syscall returns immediately 249 + * if any waiter has *uaddr != val. *timeout is an optional timeout value for 250 + * the operation. Each waiter has individual flags. The `flags` argument for 251 + * the syscall is reserved for future extensions and must be 0 for now. 252 + * Flags for private futexes, sizes, etc. 
should be used on the 253 + * individual flags of each waiter. 254 + * 255 + * Returns the array index of one of the woken futexes. No further information 256 + * is provided: any number of other futexes may also have been woken by the 257 + * same event, and if more than one futex was woken, the returned index may 258 + * refer to any one of them. (It is not necessarily the futex with the 259 + * smallest index, nor the one most recently woken, nor...) 260 + */ 261 + 262 + SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, 263 + unsigned int, nr_futexes, unsigned int, flags, 264 + struct __kernel_timespec __user *, timeout, clockid_t, clockid) 265 + { 266 + struct hrtimer_sleeper to; 267 + struct futex_vector *futexv; 268 + struct timespec64 ts; 269 + ktime_t time; 270 + int ret; 271 + 272 + /* This syscall supports no flags for now */ 273 + if (flags) 274 + return -EINVAL; 275 + 276 + if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) 277 + return -EINVAL; 278 + 279 + if (timeout) { 280 + int flag_clkid = 0, flag_init = 0; 281 + 282 + if (clockid == CLOCK_REALTIME) { 283 + flag_clkid = FLAGS_CLOCKRT; 284 + flag_init = FUTEX_CLOCK_REALTIME; 285 + } 286 + 287 + if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) 288 + return -EINVAL; 289 + 290 + if (get_timespec64(&ts, timeout)) 291 + return -EFAULT; 292 + 293 + /* 294 + * Since there's no opcode for futex_waitv, use 295 + * FUTEX_WAIT_BITSET that uses absolute timeout as well 296 + */ 297 + ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); 298 + if (ret) 299 + return ret; 300 + 301 + futex_setup_timer(&time, &to, flag_clkid, 0); 302 + } 303 + 304 + futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); 305 + if (!futexv) 306 + return -ENOMEM; 307 + 308 + ret = futex_parse_waitv(futexv, waiters, nr_futexes); 309 + if (!ret) 310 + ret = futex_wait_multiple(futexv, nr_futexes, timeout ? 
&to : NULL); 311 + 312 + if (timeout) { 313 + hrtimer_cancel(&to.timer); 314 + destroy_hrtimer_on_stack(&to.timer); 315 + } 316 + 317 + kfree(futexv); 318 + return ret; 319 + } 320 + 202 321 #ifdef CONFIG_COMPAT 203 322 COMPAT_SYSCALL_DEFINE2(set_robust_list, 204 323 struct compat_robust_list_head __user *, head,
+201
kernel/futex/waitwake.c
··· 358 358 } 359 359 360 360 /** 361 + * unqueue_multiple - Remove various futexes from their hash bucket 362 + * @v: The list of futexes to unqueue 363 + * @count: Number of futexes in the list 364 + * 365 + * Helper to unqueue a list of futexes. This can't fail. 366 + * 367 + * Return: 368 + * - >=0 - Index of the last futex that was awoken; 369 + * - -1 - No futex was awoken 370 + */ 371 + static int unqueue_multiple(struct futex_vector *v, int count) 372 + { 373 + int ret = -1, i; 374 + 375 + for (i = 0; i < count; i++) { 376 + if (!futex_unqueue(&v[i].q)) 377 + ret = i; 378 + } 379 + 380 + return ret; 381 + } 382 + 383 + /** 384 + * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes 385 + * @vs: The futex list to wait on 386 + * @count: The size of the list 387 + * @woken: Index of the last woken futex, if any. Used to notify the 388 + * caller that it can return this index to userspace (return parameter) 389 + * 390 + * Prepare multiple futexes in a single step and enqueue them. This may fail if 391 + * the futex list is invalid or if any futex was already awoken. On success the 392 + * task is ready to interruptible sleep. 393 + * 394 + * Return: 395 + * - 1 - One of the futexes was woken by another thread 396 + * - 0 - Success 397 + * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL 398 + */ 399 + static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) 400 + { 401 + struct futex_hash_bucket *hb; 402 + bool retry = false; 403 + int ret, i; 404 + u32 uval; 405 + 406 + /* 407 + * Enqueuing multiple futexes is tricky, because we need to enqueue 408 + * each futex on the list before dealing with the next one to avoid 409 + * deadlocking on the hash bucket. But, before enqueuing, we need to 410 + * make sure that current->state is TASK_INTERRUPTIBLE, so we don't 411 + * lose any wake events, which cannot be done before the get_futex_key 412 + * of the next key, because it calls get_user_pages, which can sleep. 
413 + * Thus, we fetch the list of futexes keys in two steps, by first 414 + * pinning all the memory keys in the futex key, and only then we read 415 + * each key and queue the corresponding futex. 416 + * 417 + * Private futexes doesn't need to recalculate hash in retry, so skip 418 + * get_futex_key() when retrying. 419 + */ 420 + retry: 421 + for (i = 0; i < count; i++) { 422 + if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) 423 + continue; 424 + 425 + ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), 426 + !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), 427 + &vs[i].q.key, FUTEX_READ); 428 + 429 + if (unlikely(ret)) 430 + return ret; 431 + } 432 + 433 + set_current_state(TASK_INTERRUPTIBLE); 434 + 435 + for (i = 0; i < count; i++) { 436 + u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; 437 + struct futex_q *q = &vs[i].q; 438 + u32 val = (u32)vs[i].w.val; 439 + 440 + hb = futex_q_lock(q); 441 + ret = futex_get_value_locked(&uval, uaddr); 442 + 443 + if (!ret && uval == val) { 444 + /* 445 + * The bucket lock can't be held while dealing with the 446 + * next futex. Queue each futex at this moment so hb can 447 + * be unlocked. 448 + */ 449 + futex_queue(q, hb); 450 + continue; 451 + } 452 + 453 + futex_q_unlock(hb); 454 + __set_current_state(TASK_RUNNING); 455 + 456 + /* 457 + * Even if something went wrong, if we find out that a futex 458 + * was woken, we don't return error and return this index to 459 + * userspace 460 + */ 461 + *woken = unqueue_multiple(vs, i); 462 + if (*woken >= 0) 463 + return 1; 464 + 465 + if (ret) { 466 + /* 467 + * If we need to handle a page fault, we need to do so 468 + * without any lock and any enqueued futex (otherwise 469 + * we could lose some wakeup). So we do it here, after 470 + * undoing all the work done so far. In success, we 471 + * retry all the work. 
472 + */ 473 + if (get_user(uval, uaddr)) 474 + return -EFAULT; 475 + 476 + retry = true; 477 + goto retry; 478 + } 479 + 480 + if (uval != val) 481 + return -EWOULDBLOCK; 482 + } 483 + 484 + return 0; 485 + } 486 + 487 + /** 488 + * futex_sleep_multiple - Check sleeping conditions and sleep 489 + * @vs: List of futexes to wait for 490 + * @count: Length of vs 491 + * @to: Timeout 492 + * 493 + * Sleep if and only if the timeout hasn't expired and no futex on the list has 494 + * been woken up. 495 + */ 496 + static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, 497 + struct hrtimer_sleeper *to) 498 + { 499 + if (to && !to->task) 500 + return; 501 + 502 + for (; count; count--, vs++) { 503 + if (!READ_ONCE(vs->q.lock_ptr)) 504 + return; 505 + } 506 + 507 + freezable_schedule(); 508 + } 509 + 510 + /** 511 + * futex_wait_multiple - Prepare to wait on and enqueue several futexes 512 + * @vs: The list of futexes to wait on 513 + * @count: The number of objects 514 + * @to: Timeout before giving up and returning to userspace 515 + * 516 + * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function 517 + * sleeps on a group of futexes and returns on the first futex that is 518 + * woken, or after the timeout has elapsed. 
519 + * 520 + * Return: 521 + * - >=0 - Hint to the futex that was awoken 522 + * - <0 - On error 523 + */ 524 + int futex_wait_multiple(struct futex_vector *vs, unsigned int count, 525 + struct hrtimer_sleeper *to) 526 + { 527 + int ret, hint = 0; 528 + 529 + if (to) 530 + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); 531 + 532 + while (1) { 533 + ret = futex_wait_multiple_setup(vs, count, &hint); 534 + if (ret) { 535 + if (ret > 0) { 536 + /* A futex was woken during setup */ 537 + ret = hint; 538 + } 539 + return ret; 540 + } 541 + 542 + futex_sleep_multiple(vs, count, to); 543 + 544 + __set_current_state(TASK_RUNNING); 545 + 546 + ret = unqueue_multiple(vs, count); 547 + if (ret >= 0) 548 + return ret; 549 + 550 + if (to && !to->task) 551 + return -ETIMEDOUT; 552 + else if (signal_pending(current)) 553 + return -ERESTARTSYS; 554 + /* 555 + * The final case is a spurious wakeup, for 556 + * which just retry. 557 + */ 558 + } 559 + } 560 + 561 + /** 361 562 * futex_wait_setup() - Prepare to wait on a futex 362 563 * @uaddr: the futex userspace address 363 564 * @val: the expected value
+1
kernel/sys_ni.c
··· 150 150 COND_SYSCALL_COMPAT(set_robust_list); 151 151 COND_SYSCALL(get_robust_list); 152 152 COND_SYSCALL_COMPAT(get_robust_list); 153 + COND_SYSCALL(futex_waitv); 153 154 154 155 /* kernel/hrtimer.c */ 155 156