// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/time_namespace.h>

#include <asm/futex.h>

#include "locking/rtmutex_common.h"

/*
 * READ this before attempting to hack on futexes!
 *
 * Basic futex operation and ordering guarantees
 * =============================================
 *
 * The waiter reads the futex value in user space and calls
 * futex_wait(). This function computes the hash bucket and acquires
 * the hash bucket lock. After that it reads the futex user space value
 * again and verifies that the data has not changed. If it has not changed
 * it enqueues itself into the hash bucket, releases the hash bucket lock
 * and schedules.
 *
 * The waker side modifies the user space value of the futex and calls
 * futex_wake(). This function computes the hash bucket and acquires the
 * hash bucket lock. Then it looks for waiters on that futex in the hash
 * bucket and wakes them.
 *
 * In futex wakeup scenarios where no tasks are blocked on a futex, taking
 * the hb spinlock can be avoided and the waker can simply return. In order
 * for this optimization to work, ordering guarantees must exist so that the
 * waiter being added to the list is acknowledged when the list is
 * concurrently being checked by the waker, avoiding scenarios like the
 * following:
 *
 * CPU 0                               CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *   uval = *futex;
 *                                     *futex = newval;
 *                                     sys_futex(WAKE, futex);
 *                                       futex_wake(futex);
 *                                       if (queue_empty())
 *                                         return;
 *   if (uval == val)
 *     lock(hash_bucket(futex));
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();
 *
 * This would cause the waiter on CPU 0 to wait forever because it
 * missed the transition of the user space value from val to newval
 * and the waker did not find the waiter in the hash bucket queue.
 *
 * The correct serialization ensures that a waiter either observes
 * the changed user space value before blocking or is woken by a
 * concurrent waker:
 *
 * CPU 0                               CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *
 *   waiters++; (a)
 *   smp_mb(); (A) <-- paired with -.
 *                                  |
 *   lock(hash_bucket(futex));      |
 *                                  |
 *   uval = *futex;                 |
 *                                  |     *futex = newval;
 *                                  |     sys_futex(WAKE, futex);
 *                                  |       futex_wake(futex);
 *                                  |
 *                                  `--------> smp_mb(); (B)
 *   if (uval == val)
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();                        if (waiters)
 *                                          lock(hash_bucket(futex));
 *   else                                    wake_waiters(futex);
 *     waiters--; (b)                        unlock(hash_bucket(futex));
 *
 * Where (A) orders the waiters increment and the futex value read through
 * atomic operations (see hb_waiters_inc) and where (B) orders the write
 * to futex and the waiters read (see hb_waiters_pending()).
 *
 * This yields the following case (where X:=waiters, Y:=futex):
 *
 *	X = Y = 0
 *
 *	w[X]=1		w[Y]=1
 *	MB		MB
 *	r[Y]=y		r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 * the guarantee that we cannot both miss the futex variable change and the
 * enqueue.
 *
 * Note that a new waiter is accounted for in (a) even when it is possible that
 * the wait call can return error, in which case we backtrack from it in (b).
 * Refer to the comment in queue_lock().
 *
 * Similarly, in order to account for waiters being requeued on another
 * address we always increment the waiters for the destination bucket before
 * acquiring the lock. It then decrements them again after releasing it -
 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
 * will do the additional required waiter count housekeeping. This is done for
 * double_lock_hb() and double_unlock_hb(), respectively.
 */
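
/*
 * Editorial illustration (not kernel code): a minimal userspace sketch of
 * the wait/wake protocol documented above, using the raw futex syscall on
 * a private futex word. The names ("futex_word", "wait_on", "wake_one")
 * are made up for this example; error handling and the retry loop around
 * spurious wakeups are omitted.
 *
 *	#include <linux/futex.h>
 *	#include <stdatomic.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static atomic_uint futex_word;
 *
 *	static void wait_on(unsigned int val)
 *	{
 *		// Sleeps only if futex_word still contains val - the same
 *		// re-check futex_wait() performs under the hash bucket lock.
 *		syscall(SYS_futex, &futex_word, FUTEX_WAIT_PRIVATE, val,
 *			NULL, NULL, 0);
 *	}
 *
 *	static void wake_one(unsigned int newval)
 *	{
 *		// Publish the new value first, then wake - the
 *		// "*futex = newval; sys_futex(WAKE, futex);" order above.
 *		atomic_store(&futex_word, newval);
 *		syscall(SYS_futex, &futex_word, FUTEX_WAKE_PRIVATE, 1,
 *			NULL, NULL, 0);
 *	}
 */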

#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
#define futex_cmpxchg_enabled 1
#else
static int  __read_mostly futex_cmpxchg_enabled;
#endif

/*
 * Futex flags used to encode options to functions and preserve them across
 * restarts.
 */
#ifdef CONFIG_MMU
# define FLAGS_SHARED		0x01
#else
/*
 * NOMMU does not have per process address space. Let the compiler optimize
 * code away.
 */
# define FLAGS_SHARED		0x00
#endif
#define FLAGS_CLOCKRT		0x02
#define FLAGS_HAS_TIMEOUT	0x04

/*
 * Priority Inheritance state:
 */
struct futex_pi_state {
	/*
	 * list of 'owned' pi_state instances - these have to be
	 * cleaned up in do_exit() if the task exits prematurely:
	 */
	struct list_head list;

	/*
	 * The PI object:
	 */
	struct rt_mutex pi_mutex;

	struct task_struct *owner;
	refcount_t refcount;

	union futex_key key;
} __randomize_layout;

/**
 * struct futex_q - The hashed futex queue entry, one per waiting task
 * @list:		priority-sorted list of tasks waiting on this futex
 * @task:		the task waiting on the futex
 * @lock_ptr:		the hash bucket lock
 * @key:		the key the futex is hashed on
 * @pi_state:		optional priority inheritance state
 * @rt_waiter:		rt_waiter storage for use with requeue_pi
 * @requeue_pi_key:	the requeue_pi target futex key
 * @bitset:		bitset for the optional bitmasked wakeup
 *
 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * the second.
 *
 * PI futexes are typically woken before they are removed from the hash list via
 * the rt_mutex code. See unqueue_me_pi().
 */
struct futex_q {
	struct plist_node list;

	struct task_struct *task;
	spinlock_t *lock_ptr;
	union futex_key key;
	struct futex_pi_state *pi_state;
	struct rt_mutex_waiter *rt_waiter;
	union futex_key *requeue_pi_key;
	u32 bitset;
} __randomize_layout;

static const struct futex_q futex_q_init = {
	/* list gets initialized in queue_me() */
	.key		= FUTEX_KEY_INIT,
	.bitset		= FUTEX_BITSET_MATCH_ANY
};

/*
 * Hash buckets are shared by all the futex_keys that hash to the same
 * location. Each key may have multiple futex_q structures, one for each task
 * waiting on a futex.
 */
struct futex_hash_bucket {
	atomic_t waiters;
	spinlock_t lock;
	struct plist_head chain;
} ____cacheline_aligned_in_smp;

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in hash_futex()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long		 hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues	(__futex_data.queues)
#define futex_hashsize	(__futex_data.hashsize)


/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

static bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#else
static inline bool should_fail_futex(bool fshared)
{
	return false;
}
#endif /* CONFIG_FAIL_FUTEX */

#ifdef CONFIG_COMPAT
static void compat_exit_robust_list(struct task_struct *curr);
#endif

/*
 * Reflects a new waiter being added to the waitqueue.
 */
static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_inc(&hb->waiters);
	/*
	 * Full barrier (A), see the ordering comment above.
	 */
	smp_mb__after_atomic();
#endif
}

/*
 * Reflects a waiter being removed from the waitqueue by wakeup
 * paths.
 */
static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_dec(&hb->waiters);
#endif
}

static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	/*
	 * Full barrier (B), see the ordering comment above.
	 */
	smp_mb();
	return atomic_read(&hb->waiters);
#else
	return 1;
#endif
}

/**
 * hash_futex - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
			  key->both.offset);

	return &futex_queues[hash & (futex_hashsize - 1)];
}


/**
 * match_futex - Check whether two futex keys are equal
 * @key1:	Pointer to key1
 * @key2:	Pointer to key2
 *
 * Return 1 if two futex_keys are equal, 0 otherwise.
 */
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
	return (key1 && key2
		&& key1->both.word == key2->both.word
		&& key1->both.ptr == key2->both.ptr
		&& key1->both.offset == key2->both.offset);
}

enum futex_access {
	FUTEX_READ,
	FUTEX_WRITE
};

/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
static inline struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}

/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that match_futex() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that false-negatives
 * are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_add_return(1, &i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
		if (old)
			return old;
		return new;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @fshared:	false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *		FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
			 enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page, *tail;
	struct address_space *mapping;
	int err, ro = 0;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * virtual address, we don't even have to find the underlying vma.
	 * Note: We do have to check 'uaddr' is a valid user address,
	 * but access_ok() should be faster than find_vma().
	 */
	if (!fshared) {
		key->private.mm = mm;
		key->private.address = address;
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The page
	 * lock protects many things but in this context the page lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the page lock is not needed in all cases being
	 * considered here and page lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * page lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the head page for any compound page so the
	 * head page and mapping is looked up now. For anonymous pages, it
	 * does not matter if the page splits in the future as the key is
	 * based on the address. For filesystem-backed pages, the tail is
	 * required as the index of the page determines the key. For
	 * base pages, there is no tail page and tail == page.
	 */
	tail = page;
	page = compound_head(page);
	mapping = READ_ONCE(page->mapping);

	/*
	 * If page->mapping is NULL, then it cannot be a PageAnon
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the page lock (also
	 * cases which we are happy to fail). And we hold a reference,
	 * so refcount care in invalidate_complete_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for page->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Page lock is required to identify which special case above
		 * applies. If this is really a shmem page then the page lock
		 * will prevent unexpected transitions.
		 */
		lock_page(page);
		shmem_swizzled = PageSwapCache(page) || page->mapping;
		unlock_page(page);
		put_page(page);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored on an anonymous page, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (PageAnon(page)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the page->mapping must be traversed. Ordinarily this should
		 * be stabilised under page lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update the radix tree or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(page->mapping) != mapping) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = basepage_index(tail);
		rcu_read_unlock();
	}

out:
	put_page(page);
	return err;
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
static int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
					union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (match_futex(&this->key, key))
			return this;
	}
	return NULL;
}

static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
				      u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

static int get_futex_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __get_user(*dest, from);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}


/*
 * PI code:
 */
static int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

static void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
static void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	if (!futex_cmpxchg_enabled)
		return;
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = hash_futex(&key);

		/*
		 * We can race against put_pi_state() removing itself from the
		 * list (a waiter going away). put_pi_state() will first
		 * decrement the reference count and then modify the list, so
		 * it's possible to see the list entry but fail this reference
		 * acquire.
		 *
		 * In that case; drop the locks to let put_pi_state() make
		 * progress and retry the loop.
		 */
		if (!refcount_inc_not_zero(&pi_state->refcount)) {
			raw_spin_unlock_irq(&curr->pi_lock);
			cpu_relax();
			raw_spin_lock_irq(&curr->pi_lock);
			continue;
		}
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		raw_spin_lock(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* retain curr->pi_lock for the loop invariant */
			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
			put_pi_state(pi_state);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;

		raw_spin_unlock(&curr->pi_lock);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}
#else
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10]	There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
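
/*
 * Editorial illustration (not kernel code): the user space half of a PI
 * futex that the state table above validates against. The futex word
 * holds the owner TID plus the FUTEX_WAITERS/FUTEX_OWNER_DIED bits
 * (uTID/uODIED above). Names ("pi_lock", "pi_acquire", "pi_release") are
 * made up for this sketch; error handling is omitted.
 *
 *	#include <linux/futex.h>
 *	#include <stdatomic.h>
 *	#include <stdint.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static _Atomic uint32_t pi_lock;	// 0 == unowned
 *
 *	static void pi_acquire(void)
 *	{
 *		uint32_t expected = 0;
 *
 *		// Uncontended: 0 -> TID purely in user space.
 *		if (!atomic_compare_exchange_strong(&pi_lock, &expected,
 *						    gettid()))
 *			// Contended: the kernel sets FUTEX_WAITERS, attaches
 *			// a pi_state and blocks us on the rt_mutex.
 *			syscall(SYS_futex, &pi_lock, FUTEX_LOCK_PI, 0,
 *				NULL, NULL, 0);
 *	}
 *
 *	static void pi_release(void)
 *	{
 *		uint32_t expected = gettid();
 *
 *		// Uncontended: TID -> 0 purely in user space.
 *		if (!atomic_compare_exchange_strong(&pi_lock, &expected, 0))
 *			// FUTEX_WAITERS is set: let the kernel hand the
 *			// lock to the top waiter (see wake_futex_pi()).
 *			syscall(SYS_futex, &pi_lock, FUTEX_UNLOCK_PI, 0,
 *				NULL, NULL, 0);
 *	}
 */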

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (get_futex_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state.
[5] 1062 */ 1063 if (pid) 1064 goto out_einval; 1065 /* 1066 * Take a ref on the state and return success. [4] 1067 */ 1068 goto out_attach; 1069 } 1070 1071 /* 1072 * If TID is 0, then either the dying owner has not 1073 * yet executed exit_pi_state_list() or some waiter 1074 * acquired the rtmutex in the pi state, but did not 1075 * yet fixup the TID in user space. 1076 * 1077 * Take a ref on the state and return success. [6] 1078 */ 1079 if (!pid) 1080 goto out_attach; 1081 } else { 1082 /* 1083 * If the owner died bit is not set, then the pi_state 1084 * must have an owner. [7] 1085 */ 1086 if (!pi_state->owner) 1087 goto out_einval; 1088 } 1089 1090 /* 1091 * Bail out if user space manipulated the futex value. If pi 1092 * state exists then the owner TID must be the same as the 1093 * user space TID. [9/10] 1094 */ 1095 if (pid != task_pid_vnr(pi_state->owner)) 1096 goto out_einval; 1097 1098out_attach: 1099 get_pi_state(pi_state); 1100 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1101 *ps = pi_state; 1102 return 0; 1103 1104out_einval: 1105 ret = -EINVAL; 1106 goto out_error; 1107 1108out_eagain: 1109 ret = -EAGAIN; 1110 goto out_error; 1111 1112out_efault: 1113 ret = -EFAULT; 1114 goto out_error; 1115 1116out_error: 1117 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1118 return ret; 1119} 1120 1121/** 1122 * wait_for_owner_exiting - Block until the owner has exited 1123 * @ret: owner's current futex lock status 1124 * @exiting: Pointer to the exiting task 1125 * 1126 * Caller must hold a refcount on @exiting. 1127 */ 1128static void wait_for_owner_exiting(int ret, struct task_struct *exiting) 1129{ 1130 if (ret != -EBUSY) { 1131 WARN_ON_ONCE(exiting); 1132 return; 1133 } 1134 1135 if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) 1136 return; 1137 1138 mutex_lock(&exiting->futex_exit_mutex); 1139 /* 1140 * No point in doing state checking here. If the waiter got here 1141 * while the task was in exec()->exec_futex_release() then it can 1142 * have any FUTEX_STATE_* value when the waiter has acquired the 1143 * mutex. OK, if running, EXITING or DEAD if it reached exit() 1144 * already. Highly unlikely and not a problem. Just one more round 1145 * through the futex maze. 1146 */ 1147 mutex_unlock(&exiting->futex_exit_mutex); 1148 1149 put_task_struct(exiting); 1150} 1151 1152static int handle_exit_race(u32 __user *uaddr, u32 uval, 1153 struct task_struct *tsk) 1154{ 1155 u32 uval2; 1156 1157 /* 1158 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the 1159 * caller that the alleged owner is busy. 1160 */ 1161 if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) 1162 return -EBUSY; 1163 1164 /* 1165 * Reread the user space value to handle the following situation: 1166 * 1167 * CPU0 CPU1 1168 * 1169 * sys_exit() sys_futex() 1170 * do_exit() futex_lock_pi() 1171 * futex_lock_pi_atomic() 1172 * exit_signals(tsk) No waiters: 1173 * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID 1174 * mm_release(tsk) Set waiter bit 1175 * exit_robust_list(tsk) { *uaddr = 0x80000PID; 1176 * Set owner died attach_to_pi_owner() { 1177 * *uaddr = 0xC0000000; tsk = get_task(PID); 1178 * } if (!tsk->flags & PF_EXITING) { 1179 * ... attach(); 1180 * tsk->futex_state = } else { 1181 * FUTEX_STATE_DEAD; if (tsk->futex_state != 1182 * FUTEX_STATE_DEAD) 1183 * return -EAGAIN; 1184 * return -ESRCH; <--- FAIL 1185 * } 1186 * 1187 * Returning ESRCH unconditionally is wrong here because the 1188 * user space value has been changed by the exiting task. 
1189 * 1190 * The same logic applies to the case where the exiting task is 1191 * already gone. 1192 */ 1193 if (get_futex_value_locked(&uval2, uaddr)) 1194 return -EFAULT; 1195 1196 /* If the user space value has changed, try again. */ 1197 if (uval2 != uval) 1198 return -EAGAIN; 1199 1200 /* 1201 * The exiting task did not have a robust list, the robust list was 1202 * corrupted or the user space value in *uaddr is simply bogus. 1203 * Give up and tell user space. 1204 */ 1205 return -ESRCH; 1206} 1207 1208/* 1209 * Lookup the task for the TID provided from user space and attach to 1210 * it after doing proper sanity checks. 1211 */ 1212static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, 1213 struct futex_pi_state **ps, 1214 struct task_struct **exiting) 1215{ 1216 pid_t pid = uval & FUTEX_TID_MASK; 1217 struct futex_pi_state *pi_state; 1218 struct task_struct *p; 1219 1220 /* 1221 * We are the first waiter - try to look up the real owner and attach 1222 * the new pi_state to it, but bail out when TID = 0 [1] 1223 * 1224 * The !pid check is paranoid. None of the call sites should end up 1225 * with pid == 0, but better safe than sorry. Let the caller retry 1226 */ 1227 if (!pid) 1228 return -EAGAIN; 1229 p = find_get_task_by_vpid(pid); 1230 if (!p) 1231 return handle_exit_race(uaddr, uval, NULL); 1232 1233 if (unlikely(p->flags & PF_KTHREAD)) { 1234 put_task_struct(p); 1235 return -EPERM; 1236 } 1237 1238 /* 1239 * We need to look at the task state to figure out, whether the 1240 * task is exiting. To protect against the change of the task state 1241 * in futex_exit_release(), we do this protected by p->pi_lock: 1242 */ 1243 raw_spin_lock_irq(&p->pi_lock); 1244 if (unlikely(p->futex_state != FUTEX_STATE_OK)) { 1245 /* 1246 * The task is on the way out. When the futex state is 1247 * FUTEX_STATE_DEAD, we know that the task has finished 1248 * the cleanup: 1249 */ 1250 int ret = handle_exit_race(uaddr, uval, p); 1251 1252 raw_spin_unlock_irq(&p->pi_lock); 1253 /* 1254 * If the owner task is between FUTEX_STATE_EXITING and 1255 * FUTEX_STATE_DEAD then store the task pointer and keep 1256 * the reference on the task struct. The calling code will 1257 * drop all locks, wait for the task to reach 1258 * FUTEX_STATE_DEAD and then drop the refcount. This is 1259 * required to prevent a live lock when the current task 1260 * preempted the exiting task between the two states. 1261 */ 1262 if (ret == -EBUSY) 1263 *exiting = p; 1264 else 1265 put_task_struct(p); 1266 return ret; 1267 } 1268 1269 /* 1270 * No existing pi state. First waiter. [2] 1271 * 1272 * This creates pi_state, we have hb->lock held, this means nothing can 1273 * observe this state, wait_lock is irrelevant. 1274 */ 1275 pi_state = alloc_pi_state(); 1276 1277 /* 1278 * Initialize the pi_mutex in locked state and make @p 1279 * the owner of it: 1280 */ 1281 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 1282 1283 /* Store the key for possible exit cleanups: */ 1284 pi_state->key = *key; 1285 1286 WARN_ON(!list_empty(&pi_state->list)); 1287 list_add(&pi_state->list, &p->pi_state_list); 1288 /* 1289 * Assignment without holding pi_state->pi_mutex.wait_lock is safe 1290 * because there is no concurrency as the object is not published yet. 
1291 */ 1292 pi_state->owner = p; 1293 raw_spin_unlock_irq(&p->pi_lock); 1294 1295 put_task_struct(p); 1296 1297 *ps = pi_state; 1298 1299 return 0; 1300} 1301 1302static int lookup_pi_state(u32 __user *uaddr, u32 uval, 1303 struct futex_hash_bucket *hb, 1304 union futex_key *key, struct futex_pi_state **ps, 1305 struct task_struct **exiting) 1306{ 1307 struct futex_q *top_waiter = futex_top_waiter(hb, key); 1308 1309 /* 1310 * If there is a waiter on that futex, validate it and 1311 * attach to the pi_state when the validation succeeds. 1312 */ 1313 if (top_waiter) 1314 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); 1315 1316 /* 1317 * We are the first waiter - try to look up the owner based on 1318 * @uval and attach to it. 1319 */ 1320 return attach_to_pi_owner(uaddr, uval, key, ps, exiting); 1321} 1322 1323static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) 1324{ 1325 int err; 1326 u32 curval; 1327 1328 if (unlikely(should_fail_futex(true))) 1329 return -EFAULT; 1330 1331 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); 1332 if (unlikely(err)) 1333 return err; 1334 1335 /* If user space value changed, let the caller retry */ 1336 return curval != uval ? -EAGAIN : 0; 1337} 1338 1339/** 1340 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex 1341 * @uaddr: the pi futex user address 1342 * @hb: the pi futex hash bucket 1343 * @key: the futex key associated with uaddr and hb 1344 * @ps: the pi_state pointer where we store the result of the 1345 * lookup 1346 * @task: the task to perform the atomic lock work for. This will 1347 * be "current" except in the case of requeue pi. 1348 * @exiting: Pointer to store the task pointer of the owner task 1349 * which is in the middle of exiting 1350 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 1351 * 1352 * Return: 1353 * - 0 - ready to wait; 1354 * - 1 - acquired the lock; 1355 * - <0 - error 1356 * 1357 * The hb->lock and futex_key refs shall be held by the caller. 1358 * 1359 * @exiting is only set when the return value is -EBUSY. If so, this holds 1360 * a refcount on the exiting task on return and the caller needs to drop it 1361 * after waiting for the exit to complete. 1362 */ 1363static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, 1364 union futex_key *key, 1365 struct futex_pi_state **ps, 1366 struct task_struct *task, 1367 struct task_struct **exiting, 1368 int set_waiters) 1369{ 1370 u32 uval, newval, vpid = task_pid_vnr(task); 1371 struct futex_q *top_waiter; 1372 int ret; 1373 1374 /* 1375 * Read the user space value first so we can validate a few 1376 * things before proceeding further. 1377 */ 1378 if (get_futex_value_locked(&uval, uaddr)) 1379 return -EFAULT; 1380 1381 if (unlikely(should_fail_futex(true))) 1382 return -EFAULT; 1383 1384 /* 1385 * Detect deadlocks. 1386 */ 1387 if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) 1388 return -EDEADLK; 1389 1390 if ((unlikely(should_fail_futex(true)))) 1391 return -EDEADLK; 1392 1393 /* 1394 * Lookup existing state first. If it exists, try to attach to 1395 * its pi_state. 1396 */ 1397 top_waiter = futex_top_waiter(hb, key); 1398 if (top_waiter) 1399 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); 1400 1401 /* 1402 * No waiter and user TID is 0. We are here because the 1403 * waiters or the owner died bit is set or called from 1404 * requeue_cmp_pi or for whatever reason something took the 1405 * syscall. 
1406 */ 1407 if (!(uval & FUTEX_TID_MASK)) { 1408 /* 1409 * We take over the futex. No other waiters and the user space 1410 * TID is 0. We preserve the owner died bit. 1411 */ 1412 newval = uval & FUTEX_OWNER_DIED; 1413 newval |= vpid; 1414 1415 /* The futex requeue_pi code can enforce the waiters bit */ 1416 if (set_waiters) 1417 newval |= FUTEX_WAITERS; 1418 1419 ret = lock_pi_update_atomic(uaddr, uval, newval); 1420 /* If the take over worked, return 1 */ 1421 return ret < 0 ? ret : 1; 1422 } 1423 1424 /* 1425 * First waiter. Set the waiters bit before attaching ourself to 1426 * the owner. If owner tries to unlock, it will be forced into 1427 * the kernel and blocked on hb->lock. 1428 */ 1429 newval = uval | FUTEX_WAITERS; 1430 ret = lock_pi_update_atomic(uaddr, uval, newval); 1431 if (ret) 1432 return ret; 1433 /* 1434 * If the update of the user space value succeeded, we try to 1435 * attach to the owner. If that fails, no harm done, we only 1436 * set the FUTEX_WAITERS bit in the user space variable. 1437 */ 1438 return attach_to_pi_owner(uaddr, newval, key, ps, exiting); 1439} 1440 1441/** 1442 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket 1443 * @q: The futex_q to unqueue 1444 * 1445 * The q->lock_ptr must not be NULL and must be held by the caller. 1446 */ 1447static void __unqueue_futex(struct futex_q *q) 1448{ 1449 struct futex_hash_bucket *hb; 1450 1451 if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) 1452 return; 1453 lockdep_assert_held(q->lock_ptr); 1454 1455 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); 1456 plist_del(&q->list, &hb->chain); 1457 hb_waiters_dec(hb); 1458} 1459 1460/* 1461 * The hash bucket lock must be held when this is called. 1462 * Afterwards, the futex_q must not be accessed. Callers 1463 * must ensure to later call wake_up_q() for the actual 1464 * wakeups to occur. 1465 */ 1466static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) 1467{ 1468 struct task_struct *p = q->task; 1469 1470 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) 1471 return; 1472 1473 get_task_struct(p); 1474 __unqueue_futex(q); 1475 /* 1476 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL 1477 * is written, without taking any locks. This is possible in the event 1478 * of a spurious wakeup, for example. A memory barrier is required here 1479 * to prevent the following store to lock_ptr from getting ahead of the 1480 * plist_del in __unqueue_futex(). 1481 */ 1482 smp_store_release(&q->lock_ptr, NULL); 1483 1484 /* 1485 * Queue the task for later wakeup for after we've released 1486 * the hb->lock. 1487 */ 1488 wake_q_add_safe(wake_q, p); 1489} 1490 1491/* 1492 * Caller must hold a reference on @pi_state. 1493 */ 1494static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) 1495{ 1496 u32 curval, newval; 1497 struct task_struct *new_owner; 1498 bool postunlock = false; 1499 DEFINE_WAKE_Q(wake_q); 1500 int ret = 0; 1501 1502 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 1503 if (WARN_ON_ONCE(!new_owner)) { 1504 /* 1505 * As per the comment in futex_unlock_pi() this should not happen. 1506 * 1507 * When this happens, give up our locks and try again, giving 1508 * the futex_lock_pi() instance time to complete, either by 1509 * waiting on the rtmutex or removing itself from the futex 1510 * queue. 1511 */ 1512 ret = -EAGAIN; 1513 goto out_unlock; 1514 } 1515 1516 /* 1517 * We pass it to the next owner. 
The WAITERS bit is always kept 1518 * enabled while there is PI state around. We cleanup the owner 1519 * died bit, because we are the owner. 1520 */ 1521 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1522 1523 if (unlikely(should_fail_futex(true))) { 1524 ret = -EFAULT; 1525 goto out_unlock; 1526 } 1527 1528 ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); 1529 if (!ret && (curval != uval)) { 1530 /* 1531 * If a unconditional UNLOCK_PI operation (user space did not 1532 * try the TID->0 transition) raced with a waiter setting the 1533 * FUTEX_WAITERS flag between get_user() and locking the hash 1534 * bucket lock, retry the operation. 1535 */ 1536 if ((FUTEX_TID_MASK & curval) == uval) 1537 ret = -EAGAIN; 1538 else 1539 ret = -EINVAL; 1540 } 1541 1542 if (!ret) { 1543 /* 1544 * This is a point of no return; once we modified the uval 1545 * there is no going back and subsequent operations must 1546 * not fail. 1547 */ 1548 pi_state_update_owner(pi_state, new_owner); 1549 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); 1550 } 1551 1552out_unlock: 1553 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1554 1555 if (postunlock) 1556 rt_mutex_postunlock(&wake_q); 1557 1558 return ret; 1559} 1560 1561/* 1562 * Express the locking dependencies for lockdep: 1563 */ 1564static inline void 1565double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1566{ 1567 if (hb1 <= hb2) { 1568 spin_lock(&hb1->lock); 1569 if (hb1 < hb2) 1570 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); 1571 } else { /* hb1 > hb2 */ 1572 spin_lock(&hb2->lock); 1573 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); 1574 } 1575} 1576 1577static inline void 1578double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1579{ 1580 spin_unlock(&hb1->lock); 1581 if (hb1 != hb2) 1582 spin_unlock(&hb2->lock); 1583} 1584 1585/* 1586 * Wake up waiters matching bitset queued on this futex (uaddr). 
1587 */ 1588static int 1589futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) 1590{ 1591 struct futex_hash_bucket *hb; 1592 struct futex_q *this, *next; 1593 union futex_key key = FUTEX_KEY_INIT; 1594 int ret; 1595 DEFINE_WAKE_Q(wake_q); 1596 1597 if (!bitset) 1598 return -EINVAL; 1599 1600 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); 1601 if (unlikely(ret != 0)) 1602 return ret; 1603 1604 hb = hash_futex(&key); 1605 1606 /* Make sure we really have tasks to wakeup */ 1607 if (!hb_waiters_pending(hb)) 1608 return ret; 1609 1610 spin_lock(&hb->lock); 1611 1612 plist_for_each_entry_safe(this, next, &hb->chain, list) { 1613 if (match_futex (&this->key, &key)) { 1614 if (this->pi_state || this->rt_waiter) { 1615 ret = -EINVAL; 1616 break; 1617 } 1618 1619 /* Check if one of the bits is set in both bitsets */ 1620 if (!(this->bitset & bitset)) 1621 continue; 1622 1623 mark_wake_futex(&wake_q, this); 1624 if (++ret >= nr_wake) 1625 break; 1626 } 1627 } 1628 1629 spin_unlock(&hb->lock); 1630 wake_up_q(&wake_q); 1631 return ret; 1632} 1633 1634static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) 1635{ 1636 unsigned int op = (encoded_op & 0x70000000) >> 28; 1637 unsigned int cmp = (encoded_op & 0x0f000000) >> 24; 1638 int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); 1639 int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); 1640 int oldval, ret; 1641 1642 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { 1643 if (oparg < 0 || oparg > 31) { 1644 char comm[sizeof(current->comm)]; 1645 /* 1646 * kill this print and return -EINVAL when userspace 1647 * is sane again 1648 */ 1649 pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", 1650 get_task_comm(comm, current), oparg); 1651 oparg &= 31; 1652 } 1653 oparg = 1 << oparg; 1654 } 1655 1656 pagefault_disable(); 1657 ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); 1658 pagefault_enable(); 1659 if (ret) 1660 return ret; 1661 1662 switch (cmp) { 1663 case FUTEX_OP_CMP_EQ: 1664 return oldval == cmparg; 1665 case FUTEX_OP_CMP_NE: 1666 return oldval != cmparg; 1667 case FUTEX_OP_CMP_LT: 1668 return oldval < cmparg; 1669 case FUTEX_OP_CMP_GE: 1670 return oldval >= cmparg; 1671 case FUTEX_OP_CMP_LE: 1672 return oldval <= cmparg; 1673 case FUTEX_OP_CMP_GT: 1674 return oldval > cmparg; 1675 default: 1676 return -ENOSYS; 1677 } 1678} 1679 1680/* 1681 * Wake up all waiters hashed on the physical page that is mapped 1682 * to this virtual address: 1683 */ 1684static int 1685futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, 1686 int nr_wake, int nr_wake2, int op) 1687{ 1688 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1689 struct futex_hash_bucket *hb1, *hb2; 1690 struct futex_q *this, *next; 1691 int ret, op_ret; 1692 DEFINE_WAKE_Q(wake_q); 1693 1694retry: 1695 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); 1696 if (unlikely(ret != 0)) 1697 return ret; 1698 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); 1699 if (unlikely(ret != 0)) 1700 return ret; 1701 1702 hb1 = hash_futex(&key1); 1703 hb2 = hash_futex(&key2); 1704 1705retry_private: 1706 double_lock_hb(hb1, hb2); 1707 op_ret = futex_atomic_op_inuser(op, uaddr2); 1708 if (unlikely(op_ret < 0)) { 1709 double_unlock_hb(hb1, hb2); 1710 1711 if (!IS_ENABLED(CONFIG_MMU) || 1712 unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { 1713 /* 1714 * we don't get EFAULT from MMU faults if we don't have 1715 
* an MMU, but we might get them from range checking 1716 */ 1717 ret = op_ret; 1718 return ret; 1719 } 1720 1721 if (op_ret == -EFAULT) { 1722 ret = fault_in_user_writeable(uaddr2); 1723 if (ret) 1724 return ret; 1725 } 1726 1727 if (!(flags & FLAGS_SHARED)) { 1728 cond_resched(); 1729 goto retry_private; 1730 } 1731 1732 cond_resched(); 1733 goto retry; 1734 } 1735 1736 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 1737 if (match_futex (&this->key, &key1)) { 1738 if (this->pi_state || this->rt_waiter) { 1739 ret = -EINVAL; 1740 goto out_unlock; 1741 } 1742 mark_wake_futex(&wake_q, this); 1743 if (++ret >= nr_wake) 1744 break; 1745 } 1746 } 1747 1748 if (op_ret > 0) { 1749 op_ret = 0; 1750 plist_for_each_entry_safe(this, next, &hb2->chain, list) { 1751 if (match_futex (&this->key, &key2)) { 1752 if (this->pi_state || this->rt_waiter) { 1753 ret = -EINVAL; 1754 goto out_unlock; 1755 } 1756 mark_wake_futex(&wake_q, this); 1757 if (++op_ret >= nr_wake2) 1758 break; 1759 } 1760 } 1761 ret += op_ret; 1762 } 1763 1764out_unlock: 1765 double_unlock_hb(hb1, hb2); 1766 wake_up_q(&wake_q); 1767 return ret; 1768} 1769 1770/** 1771 * requeue_futex() - Requeue a futex_q from one hb to another 1772 * @q: the futex_q to requeue 1773 * @hb1: the source hash_bucket 1774 * @hb2: the target hash_bucket 1775 * @key2: the new key for the requeued futex_q 1776 */ 1777static inline 1778void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, 1779 struct futex_hash_bucket *hb2, union futex_key *key2) 1780{ 1781 1782 /* 1783 * If key1 and key2 hash to the same bucket, no need to 1784 * requeue. 1785 */ 1786 if (likely(&hb1->chain != &hb2->chain)) { 1787 plist_del(&q->list, &hb1->chain); 1788 hb_waiters_dec(hb1); 1789 hb_waiters_inc(hb2); 1790 plist_add(&q->list, &hb2->chain); 1791 q->lock_ptr = &hb2->lock; 1792 } 1793 q->key = *key2; 1794} 1795 1796/** 1797 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1798 * @q: the futex_q 1799 * @key: the key of the requeue target futex 1800 * @hb: the hash_bucket of the requeue target futex 1801 * 1802 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1803 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1804 * to the requeue target futex so the waiter can detect the wakeup on the right 1805 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1806 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock 1807 * to protect access to the pi_state to fixup the owner later. Must be called 1808 * with both q->lock_ptr and hb->lock held. 
1809 */ 1810static inline 1811void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, 1812 struct futex_hash_bucket *hb) 1813{ 1814 q->key = *key; 1815 1816 __unqueue_futex(q); 1817 1818 WARN_ON(!q->rt_waiter); 1819 q->rt_waiter = NULL; 1820 1821 q->lock_ptr = &hb->lock; 1822 1823 wake_up_state(q->task, TASK_NORMAL); 1824} 1825 1826/** 1827 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter 1828 * @pifutex: the user address of the to futex 1829 * @hb1: the from futex hash bucket, must be locked by the caller 1830 * @hb2: the to futex hash bucket, must be locked by the caller 1831 * @key1: the from futex key 1832 * @key2: the to futex key 1833 * @ps: address to store the pi_state pointer 1834 * @exiting: Pointer to store the task pointer of the owner task 1835 * which is in the middle of exiting 1836 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 1837 * 1838 * Try and get the lock on behalf of the top waiter if we can do it atomically. 1839 * Wake the top waiter if we succeed. If the caller specified set_waiters, 1840 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. 1841 * hb1 and hb2 must be held by the caller. 1842 * 1843 * @exiting is only set when the return value is -EBUSY. If so, this holds 1844 * a refcount on the exiting task on return and the caller needs to drop it 1845 * after waiting for the exit to complete. 1846 * 1847 * Return: 1848 * - 0 - failed to acquire the lock atomically; 1849 * - >0 - acquired the lock, return value is vpid of the top_waiter 1850 * - <0 - error 1851 */ 1852static int 1853futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, 1854 struct futex_hash_bucket *hb2, union futex_key *key1, 1855 union futex_key *key2, struct futex_pi_state **ps, 1856 struct task_struct **exiting, int set_waiters) 1857{ 1858 struct futex_q *top_waiter = NULL; 1859 u32 curval; 1860 int ret, vpid; 1861 1862 if (get_futex_value_locked(&curval, pifutex)) 1863 return -EFAULT; 1864 1865 if (unlikely(should_fail_futex(true))) 1866 return -EFAULT; 1867 1868 /* 1869 * Find the top_waiter and determine if there are additional waiters. 1870 * If the caller intends to requeue more than 1 waiter to pifutex, 1871 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, 1872 * as we have means to handle the possible fault. If not, don't set 1873 * the bit unecessarily as it will force the subsequent unlock to enter 1874 * the kernel. 1875 */ 1876 top_waiter = futex_top_waiter(hb1, key1); 1877 1878 /* There are no waiters, nothing for us to do. */ 1879 if (!top_waiter) 1880 return 0; 1881 1882 /* Ensure we requeue to the expected futex. */ 1883 if (!match_futex(top_waiter->requeue_pi_key, key2)) 1884 return -EINVAL; 1885 1886 /* 1887 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1888 * the contended case or if set_waiters is 1. The pi_state is returned 1889 * in ps in contended cases. 1890 */ 1891 vpid = task_pid_vnr(top_waiter->task); 1892 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1893 exiting, set_waiters); 1894 if (ret == 1) { 1895 requeue_pi_wake_futex(top_waiter, key2, hb2); 1896 return vpid; 1897 } 1898 return ret; 1899} 1900 1901/** 1902 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1903 * @uaddr1: source futex user address 1904 * @flags: futex flags (FLAGS_SHARED, etc.) 
1905 * @uaddr2: target futex user address 1906 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1907 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1908 * @cmpval: @uaddr1 expected value (or %NULL) 1909 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1910 * pi futex (pi to pi requeue is not supported) 1911 * 1912 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1913 * uaddr2 atomically on behalf of the top waiter. 1914 * 1915 * Return: 1916 * - >=0 - on success, the number of tasks requeued or woken; 1917 * - <0 - on error 1918 */ 1919static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1920 u32 __user *uaddr2, int nr_wake, int nr_requeue, 1921 u32 *cmpval, int requeue_pi) 1922{ 1923 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1924 int task_count = 0, ret; 1925 struct futex_pi_state *pi_state = NULL; 1926 struct futex_hash_bucket *hb1, *hb2; 1927 struct futex_q *this, *next; 1928 DEFINE_WAKE_Q(wake_q); 1929 1930 if (nr_wake < 0 || nr_requeue < 0) 1931 return -EINVAL; 1932 1933 /* 1934 * When PI not supported: return -ENOSYS if requeue_pi is true, 1935 * consequently the compiler knows requeue_pi is always false past 1936 * this point which will optimize away all the conditional code 1937 * further down. 1938 */ 1939 if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) 1940 return -ENOSYS; 1941 1942 if (requeue_pi) { 1943 /* 1944 * Requeue PI only works on two distinct uaddrs. This 1945 * check is only valid for private futexes. See below. 1946 */ 1947 if (uaddr1 == uaddr2) 1948 return -EINVAL; 1949 1950 /* 1951 * requeue_pi requires a pi_state, try to allocate it now 1952 * without any locks in case it fails. 1953 */ 1954 if (refill_pi_state_cache()) 1955 return -ENOMEM; 1956 /* 1957 * requeue_pi must wake as many tasks as it can, up to nr_wake 1958 * + nr_requeue, since it acquires the rt_mutex prior to 1959 * returning to userspace, so as to not leave the rt_mutex with 1960 * waiters and no owner. However, second and third wake-ups 1961 * cannot be predicted as they involve race conditions with the 1962 * first wake and a fault while looking up the pi_state. Both 1963 * pthread_cond_signal() and pthread_cond_broadcast() should 1964 * use nr_wake=1. 1965 */ 1966 if (nr_wake != 1) 1967 return -EINVAL; 1968 } 1969 1970retry: 1971 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); 1972 if (unlikely(ret != 0)) 1973 return ret; 1974 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, 1975 requeue_pi ? FUTEX_WRITE : FUTEX_READ); 1976 if (unlikely(ret != 0)) 1977 return ret; 1978 1979 /* 1980 * The check above which compares uaddrs is not sufficient for 1981 * shared futexes. 
We need to compare the keys: 1982 */ 1983 if (requeue_pi && match_futex(&key1, &key2)) 1984 return -EINVAL; 1985 1986 hb1 = hash_futex(&key1); 1987 hb2 = hash_futex(&key2); 1988 1989retry_private: 1990 hb_waiters_inc(hb2); 1991 double_lock_hb(hb1, hb2); 1992 1993 if (likely(cmpval != NULL)) { 1994 u32 curval; 1995 1996 ret = get_futex_value_locked(&curval, uaddr1); 1997 1998 if (unlikely(ret)) { 1999 double_unlock_hb(hb1, hb2); 2000 hb_waiters_dec(hb2); 2001 2002 ret = get_user(curval, uaddr1); 2003 if (ret) 2004 return ret; 2005 2006 if (!(flags & FLAGS_SHARED)) 2007 goto retry_private; 2008 2009 goto retry; 2010 } 2011 if (curval != *cmpval) { 2012 ret = -EAGAIN; 2013 goto out_unlock; 2014 } 2015 } 2016 2017 if (requeue_pi && (task_count - nr_wake < nr_requeue)) { 2018 struct task_struct *exiting = NULL; 2019 2020 /* 2021 * Attempt to acquire uaddr2 and wake the top waiter. If we 2022 * intend to requeue waiters, force setting the FUTEX_WAITERS 2023 * bit. We force this here where we are able to easily handle 2024 * faults rather in the requeue loop below. 2025 */ 2026 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, 2027 &key2, &pi_state, 2028 &exiting, nr_requeue); 2029 2030 /* 2031 * At this point the top_waiter has either taken uaddr2 or is 2032 * waiting on it. If the former, then the pi_state will not 2033 * exist yet, look it up one more time to ensure we have a 2034 * reference to it. If the lock was taken, ret contains the 2035 * vpid of the top waiter task. 2036 * If the lock was not taken, we have pi_state and an initial 2037 * refcount on it. In case of an error we have nothing. 2038 */ 2039 if (ret > 0) { 2040 WARN_ON(pi_state); 2041 task_count++; 2042 /* 2043 * If we acquired the lock, then the user space value 2044 * of uaddr2 should be vpid. It cannot be changed by 2045 * the top waiter as it is blocked on hb2 lock if it 2046 * tries to do so. If something fiddled with it behind 2047 * our back the pi state lookup might unearth it. So 2048 * we rather use the known value than rereading and 2049 * handing potential crap to lookup_pi_state. 2050 * 2051 * If that call succeeds then we have pi_state and an 2052 * initial refcount on it. 2053 */ 2054 ret = lookup_pi_state(uaddr2, ret, hb2, &key2, 2055 &pi_state, &exiting); 2056 } 2057 2058 switch (ret) { 2059 case 0: 2060 /* We hold a reference on the pi state. */ 2061 break; 2062 2063 /* If the above failed, then pi_state is NULL */ 2064 case -EFAULT: 2065 double_unlock_hb(hb1, hb2); 2066 hb_waiters_dec(hb2); 2067 ret = fault_in_user_writeable(uaddr2); 2068 if (!ret) 2069 goto retry; 2070 return ret; 2071 case -EBUSY: 2072 case -EAGAIN: 2073 /* 2074 * Two reasons for this: 2075 * - EBUSY: Owner is exiting and we just wait for the 2076 * exit to complete. 2077 * - EAGAIN: The user space value changed. 2078 */ 2079 double_unlock_hb(hb1, hb2); 2080 hb_waiters_dec(hb2); 2081 /* 2082 * Handle the case where the owner is in the middle of 2083 * exiting. Wait for the exit to complete otherwise 2084 * this task might loop forever, aka. live lock. 2085 */ 2086 wait_for_owner_exiting(ret, exiting); 2087 cond_resched(); 2088 goto retry; 2089 default: 2090 goto out_unlock; 2091 } 2092 } 2093 2094 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 2095 if (task_count - nr_wake >= nr_requeue) 2096 break; 2097 2098 if (!match_futex(&this->key, &key1)) 2099 continue; 2100 2101 /* 2102 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 2103 * be paired with each other and no other futex ops. 
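		 *
		 * (The expected user space pairing, e.g. for a condvar built
		 *  on top of this, is roughly - an illustrative sketch only,
		 *  names made up:
		 *
		 *	waiter:	syscall(SYS_futex, &cond, FUTEX_WAIT_REQUEUE_PI,
		 *			condval, timeout, &pi_mutex, 0);
		 *	waker:	syscall(SYS_futex, &cond, FUTEX_CMP_REQUEUE_PI,
		 *			1, nr_requeue, &pi_mutex, condval);
		 *
		 *  with nr_requeue passed in the timeout argument slot.)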
2104 * 2105 * We should never be requeueing a futex_q with a pi_state, 2106 * which is awaiting a futex_unlock_pi(). 2107 */ 2108 if ((requeue_pi && !this->rt_waiter) || 2109 (!requeue_pi && this->rt_waiter) || 2110 this->pi_state) { 2111 ret = -EINVAL; 2112 break; 2113 } 2114 2115 /* 2116 * Wake nr_wake waiters. For requeue_pi, if we acquired the 2117 * lock, we already woke the top_waiter. If not, it will be 2118 * woken by futex_unlock_pi(). 2119 */ 2120 if (++task_count <= nr_wake && !requeue_pi) { 2121 mark_wake_futex(&wake_q, this); 2122 continue; 2123 } 2124 2125 /* Ensure we requeue to the expected futex for requeue_pi. */ 2126 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { 2127 ret = -EINVAL; 2128 break; 2129 } 2130 2131 /* 2132 * Requeue nr_requeue waiters and possibly one more in the case 2133 * of requeue_pi if we couldn't acquire the lock atomically. 2134 */ 2135 if (requeue_pi) { 2136 /* 2137 * Prepare the waiter to take the rt_mutex. Take a 2138 * refcount on the pi_state and store the pointer in 2139 * the futex_q object of the waiter. 2140 */ 2141 get_pi_state(pi_state); 2142 this->pi_state = pi_state; 2143 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 2144 this->rt_waiter, 2145 this->task); 2146 if (ret == 1) { 2147 /* 2148 * We got the lock. We do neither drop the 2149 * refcount on pi_state nor clear 2150 * this->pi_state because the waiter needs the 2151 * pi_state for cleaning up the user space 2152 * value. It will drop the refcount after 2153 * doing so. 2154 */ 2155 requeue_pi_wake_futex(this, &key2, hb2); 2156 continue; 2157 } else if (ret) { 2158 /* 2159 * rt_mutex_start_proxy_lock() detected a 2160 * potential deadlock when we tried to queue 2161 * that waiter. Drop the pi_state reference 2162 * which we took above and remove the pointer 2163 * to the state from the waiters futex_q 2164 * object. 2165 */ 2166 this->pi_state = NULL; 2167 put_pi_state(pi_state); 2168 /* 2169 * We stop queueing more waiters and let user 2170 * space deal with the mess. 2171 */ 2172 break; 2173 } 2174 } 2175 requeue_futex(this, hb1, hb2, &key2); 2176 } 2177 2178 /* 2179 * We took an extra initial reference to the pi_state either 2180 * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We 2181 * need to drop it here again. 2182 */ 2183 put_pi_state(pi_state); 2184 2185out_unlock: 2186 double_unlock_hb(hb1, hb2); 2187 wake_up_q(&wake_q); 2188 hb_waiters_dec(hb2); 2189 return ret ? ret : task_count; 2190} 2191 2192/* The key must be already stored in q->key. */ 2193static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 2194 __acquires(&hb->lock) 2195{ 2196 struct futex_hash_bucket *hb; 2197 2198 hb = hash_futex(&q->key); 2199 2200 /* 2201 * Increment the counter before taking the lock so that 2202 * a potential waker won't miss a to-be-slept task that is 2203 * waiting for the spinlock. This is safe as all queue_lock() 2204 * users end up calling queue_me(). Similarly, for housekeeping, 2205 * decrement the counter at queue_unlock() when some error has 2206 * occurred and we don't end up adding the task to the list. 
2207 */ 2208 hb_waiters_inc(hb); /* implies smp_mb(); (A) */ 2209 2210 q->lock_ptr = &hb->lock; 2211 2212 spin_lock(&hb->lock); 2213 return hb; 2214} 2215 2216static inline void 2217queue_unlock(struct futex_hash_bucket *hb) 2218 __releases(&hb->lock) 2219{ 2220 spin_unlock(&hb->lock); 2221 hb_waiters_dec(hb); 2222} 2223 2224static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 2225{ 2226 int prio; 2227 2228 /* 2229 * The priority used to register this element is 2230 * - either the real thread-priority for the real-time threads 2231 * (i.e. threads with a priority lower than MAX_RT_PRIO) 2232 * - or MAX_RT_PRIO for non-RT threads. 2233 * Thus, all RT-threads are woken first in priority order, and 2234 * the others are woken last, in FIFO order. 2235 */ 2236 prio = min(current->normal_prio, MAX_RT_PRIO); 2237 2238 plist_node_init(&q->list, prio); 2239 plist_add(&q->list, &hb->chain); 2240 q->task = current; 2241} 2242 2243/** 2244 * queue_me() - Enqueue the futex_q on the futex_hash_bucket 2245 * @q: The futex_q to enqueue 2246 * @hb: The destination hash bucket 2247 * 2248 * The hb->lock must be held by the caller, and is released here. A call to 2249 * queue_me() is typically paired with exactly one call to unqueue_me(). The 2250 * exceptions involve the PI related operations, which may use unqueue_me_pi() 2251 * or nothing if the unqueue is done as part of the wake process and the unqueue 2252 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for 2253 * an example). 2254 */ 2255static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 2256 __releases(&hb->lock) 2257{ 2258 __queue_me(q, hb); 2259 spin_unlock(&hb->lock); 2260} 2261 2262/** 2263 * unqueue_me() - Remove the futex_q from its futex_hash_bucket 2264 * @q: The futex_q to unqueue 2265 * 2266 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 2267 * be paired with exactly one earlier call to queue_me(). 2268 * 2269 * Return: 2270 * - 1 - if the futex_q was still queued (and we removed unqueued it); 2271 * - 0 - if the futex_q was already removed by the waking thread 2272 */ 2273static int unqueue_me(struct futex_q *q) 2274{ 2275 spinlock_t *lock_ptr; 2276 int ret = 0; 2277 2278 /* In the common case we don't take the spinlock, which is nice. */ 2279retry: 2280 /* 2281 * q->lock_ptr can change between this read and the following spin_lock. 2282 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and 2283 * optimizing lock_ptr out of the logic below. 2284 */ 2285 lock_ptr = READ_ONCE(q->lock_ptr); 2286 if (lock_ptr != NULL) { 2287 spin_lock(lock_ptr); 2288 /* 2289 * q->lock_ptr can change between reading it and 2290 * spin_lock(), causing us to take the wrong lock. This 2291 * corrects the race condition. 2292 * 2293 * Reasoning goes like this: if we have the wrong lock, 2294 * q->lock_ptr must have changed (maybe several times) 2295 * between reading it and the spin_lock(). It can 2296 * change again after the spin_lock() but only if it was 2297 * already changed before the spin_lock(). It cannot, 2298 * however, change back to the original value. Therefore 2299 * we can detect whether we acquired the correct lock. 
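		 *
		 * (q->lock_ptr changes under us when a waker calls
		 *  mark_wake_futex(), which NULLs it, or when futex_requeue()
		 *  moves us to another bucket and points it at the target
		 *  hb->lock - see requeue_futex() above.)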
2300 */ 2301 if (unlikely(lock_ptr != q->lock_ptr)) { 2302 spin_unlock(lock_ptr); 2303 goto retry; 2304 } 2305 __unqueue_futex(q); 2306 2307 BUG_ON(q->pi_state); 2308 2309 spin_unlock(lock_ptr); 2310 ret = 1; 2311 } 2312 2313 return ret; 2314} 2315 2316/* 2317 * PI futexes can not be requeued and must remove themself from the 2318 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry 2319 * and dropped here. 2320 */ 2321static void unqueue_me_pi(struct futex_q *q) 2322 __releases(q->lock_ptr) 2323{ 2324 __unqueue_futex(q); 2325 2326 BUG_ON(!q->pi_state); 2327 put_pi_state(q->pi_state); 2328 q->pi_state = NULL; 2329 2330 spin_unlock(q->lock_ptr); 2331} 2332 2333static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 2334 struct task_struct *argowner) 2335{ 2336 struct futex_pi_state *pi_state = q->pi_state; 2337 struct task_struct *oldowner, *newowner; 2338 u32 uval, curval, newval, newtid; 2339 int err = 0; 2340 2341 oldowner = pi_state->owner; 2342 2343 /* 2344 * We are here because either: 2345 * 2346 * - we stole the lock and pi_state->owner needs updating to reflect 2347 * that (@argowner == current), 2348 * 2349 * or: 2350 * 2351 * - someone stole our lock and we need to fix things to point to the 2352 * new owner (@argowner == NULL). 2353 * 2354 * Either way, we have to replace the TID in the user space variable. 2355 * This must be atomic as we have to preserve the owner died bit here. 2356 * 2357 * Note: We write the user space value _before_ changing the pi_state 2358 * because we can fault here. Imagine swapped out pages or a fork 2359 * that marked all the anonymous memory readonly for cow. 2360 * 2361 * Modifying pi_state _before_ the user space value would leave the 2362 * pi_state in an inconsistent state when we fault here, because we 2363 * need to drop the locks to handle the fault. This might be observed 2364 * in the PID check in lookup_pi_state. 2365 */ 2366retry: 2367 if (!argowner) { 2368 if (oldowner != current) { 2369 /* 2370 * We raced against a concurrent self; things are 2371 * already fixed up. Nothing to do. 2372 */ 2373 return 0; 2374 } 2375 2376 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { 2377 /* We got the lock. pi_state is correct. Tell caller. */ 2378 return 1; 2379 } 2380 2381 /* 2382 * The trylock just failed, so either there is an owner or 2383 * there is a higher priority waiter than this one. 2384 */ 2385 newowner = rt_mutex_owner(&pi_state->pi_mutex); 2386 /* 2387 * If the higher priority waiter has not yet taken over the 2388 * rtmutex then newowner is NULL. We can't return here with 2389 * that state because it's inconsistent vs. the user space 2390 * state. So drop the locks and try again. It's a valid 2391 * situation and not any different from the other retry 2392 * conditions. 2393 */ 2394 if (unlikely(!newowner)) { 2395 err = -EAGAIN; 2396 goto handle_err; 2397 } 2398 } else { 2399 WARN_ON_ONCE(argowner != current); 2400 if (oldowner == current) { 2401 /* 2402 * We raced against a concurrent self; things are 2403 * already fixed up. Nothing to do. 2404 */ 2405 return 1; 2406 } 2407 newowner = argowner; 2408 } 2409 2410 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 2411 /* Owner died? 
*/ 2412 if (!pi_state->owner) 2413 newtid |= FUTEX_OWNER_DIED; 2414 2415 err = get_futex_value_locked(&uval, uaddr); 2416 if (err) 2417 goto handle_err; 2418 2419 for (;;) { 2420 newval = (uval & FUTEX_OWNER_DIED) | newtid; 2421 2422 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); 2423 if (err) 2424 goto handle_err; 2425 2426 if (curval == uval) 2427 break; 2428 uval = curval; 2429 } 2430 2431 /* 2432 * We fixed up user space. Now we need to fix the pi_state 2433 * itself. 2434 */ 2435 pi_state_update_owner(pi_state, newowner); 2436 2437 return argowner == current; 2438 2439 /* 2440 * In order to reschedule or handle a page fault, we need to drop the 2441 * locks here. In the case of a fault, this gives the other task 2442 * (either the highest priority waiter itself or the task which stole 2443 * the rtmutex) the chance to try the fixup of the pi_state. So once we 2444 * are back from handling the fault we need to check the pi_state after 2445 * reacquiring the locks and before trying to do another fixup. When 2446 * the fixup has been done already we simply return. 2447 * 2448 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely 2449 * drop hb->lock since the caller owns the hb -> futex_q relation. 2450 * Dropping the pi_mutex->wait_lock requires the state revalidate. 2451 */ 2452handle_err: 2453 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2454 spin_unlock(q->lock_ptr); 2455 2456 switch (err) { 2457 case -EFAULT: 2458 err = fault_in_user_writeable(uaddr); 2459 break; 2460 2461 case -EAGAIN: 2462 cond_resched(); 2463 err = 0; 2464 break; 2465 2466 default: 2467 WARN_ON_ONCE(1); 2468 break; 2469 } 2470 2471 spin_lock(q->lock_ptr); 2472 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2473 2474 /* 2475 * Check if someone else fixed it for us: 2476 */ 2477 if (pi_state->owner != oldowner) 2478 return argowner == current; 2479 2480 /* Retry if err was -EAGAIN or the fault in succeeded */ 2481 if (!err) 2482 goto retry; 2483 2484 /* 2485 * fault_in_user_writeable() failed so user state is immutable. At 2486 * best we can make the kernel state consistent but user state will 2487 * be most likely hosed and any subsequent unlock operation will be 2488 * rejected due to PI futex rule [10]. 2489 * 2490 * Ensure that the rtmutex owner is also the pi_state owner despite 2491 * the user space value claiming something different. There is no 2492 * point in unlocking the rtmutex if current is the owner as it 2493 * would need to wait until the next waiter has taken the rtmutex 2494 * to guarantee consistent state. Keep it simple. Userspace asked 2495 * for this wreckaged state. 2496 * 2497 * The rtmutex has an owner - either current or some other 2498 * task. See the EAGAIN loop above. 
2499 */ 2500 pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); 2501 2502 return err; 2503} 2504 2505static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 2506 struct task_struct *argowner) 2507{ 2508 struct futex_pi_state *pi_state = q->pi_state; 2509 int ret; 2510 2511 lockdep_assert_held(q->lock_ptr); 2512 2513 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2514 ret = __fixup_pi_state_owner(uaddr, q, argowner); 2515 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2516 return ret; 2517} 2518 2519static long futex_wait_restart(struct restart_block *restart); 2520 2521/** 2522 * fixup_owner() - Post lock pi_state and corner case management 2523 * @uaddr: user address of the futex 2524 * @q: futex_q (contains pi_state and access to the rt_mutex) 2525 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 2526 * 2527 * After attempting to lock an rt_mutex, this function is called to cleanup 2528 * the pi_state owner as well as handle race conditions that may allow us to 2529 * acquire the lock. Must be called with the hb lock held. 2530 * 2531 * Return: 2532 * - 1 - success, lock taken; 2533 * - 0 - success, lock not taken; 2534 * - <0 - on error (-EFAULT) 2535 */ 2536static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 2537{ 2538 if (locked) { 2539 /* 2540 * Got the lock. We might not be the anticipated owner if we 2541 * did a lock-steal - fix up the PI-state in that case: 2542 * 2543 * Speculative pi_state->owner read (we don't hold wait_lock); 2544 * since we own the lock pi_state->owner == current is the 2545 * stable state, anything else needs more attention. 2546 */ 2547 if (q->pi_state->owner != current) 2548 return fixup_pi_state_owner(uaddr, q, current); 2549 return 1; 2550 } 2551 2552 /* 2553 * If we didn't get the lock; check if anybody stole it from us. In 2554 * that case, we need to fix up the uval to point to them instead of 2555 * us, otherwise bad things happen. [10] 2556 * 2557 * Another speculative read; pi_state->owner == current is unstable 2558 * but needs our attention. 2559 */ 2560 if (q->pi_state->owner == current) 2561 return fixup_pi_state_owner(uaddr, q, NULL); 2562 2563 /* 2564 * Paranoia check. If we did not take the lock, then we should not be 2565 * the owner of the rt_mutex. Warn and establish consistent state. 2566 */ 2567 if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) 2568 return fixup_pi_state_owner(uaddr, q, current); 2569 2570 return 0; 2571} 2572 2573/** 2574 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal 2575 * @hb: the futex hash bucket, must be locked by the caller 2576 * @q: the futex_q to queue up on 2577 * @timeout: the prepared hrtimer_sleeper, or null for no timeout 2578 */ 2579static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 2580 struct hrtimer_sleeper *timeout) 2581{ 2582 /* 2583 * The task state is guaranteed to be set before another task can 2584 * wake it. set_current_state() is implemented using smp_store_mb() and 2585 * queue_me() calls spin_unlock() upon completion, both serializing 2586 * access to the hash list and forcing another memory barrier. 
2587 */ 2588 set_current_state(TASK_INTERRUPTIBLE); 2589 queue_me(q, hb); 2590 2591 /* Arm the timer */ 2592 if (timeout) 2593 hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); 2594 2595 /* 2596 * If we have been removed from the hash list, then another task 2597 * has tried to wake us, and we can skip the call to schedule(). 2598 */ 2599 if (likely(!plist_node_empty(&q->list))) { 2600 /* 2601 * If the timer has already expired, current will already be 2602 * flagged for rescheduling. Only call schedule if there 2603 * is no timeout, or if it has yet to expire. 2604 */ 2605 if (!timeout || timeout->task) 2606 freezable_schedule(); 2607 } 2608 __set_current_state(TASK_RUNNING); 2609} 2610 2611/** 2612 * futex_wait_setup() - Prepare to wait on a futex 2613 * @uaddr: the futex userspace address 2614 * @val: the expected value 2615 * @flags: futex flags (FLAGS_SHARED, etc.) 2616 * @q: the associated futex_q 2617 * @hb: storage for hash_bucket pointer to be returned to caller 2618 * 2619 * Setup the futex_q and locate the hash_bucket. Get the futex value and 2620 * compare it with the expected value. Handle atomic faults internally. 2621 * Return with the hb lock held and a q.key reference on success, and unlocked 2622 * with no q.key reference on failure. 2623 * 2624 * Return: 2625 * - 0 - uaddr contains val and hb has been locked; 2626 * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 2627 */ 2628static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 2629 struct futex_q *q, struct futex_hash_bucket **hb) 2630{ 2631 u32 uval; 2632 int ret; 2633 2634 /* 2635 * Access the page AFTER the hash-bucket is locked. 2636 * Order is important: 2637 * 2638 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); 2639 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } 2640 * 2641 * The basic logical guarantee of a futex is that it blocks ONLY 2642 * if cond(var) is known to be true at the time of blocking, for 2643 * any cond. If we locked the hash-bucket after testing *uaddr, that 2644 * would open a race condition where we could block indefinitely with 2645 * cond(var) false, which would violate the guarantee. 2646 * 2647 * On the other hand, we insert q and release the hash-bucket only 2648 * after testing *uaddr. This guarantees that futex_wait() will NOT 2649 * absorb a wakeup if *uaddr does not match the desired values 2650 * while the syscall executes. 2651 */ 2652retry: 2653 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); 2654 if (unlikely(ret != 0)) 2655 return ret; 2656 2657retry_private: 2658 *hb = queue_lock(q); 2659 2660 ret = get_futex_value_locked(&uval, uaddr); 2661 2662 if (ret) { 2663 queue_unlock(*hb); 2664 2665 ret = get_user(uval, uaddr); 2666 if (ret) 2667 return ret; 2668 2669 if (!(flags & FLAGS_SHARED)) 2670 goto retry_private; 2671 2672 goto retry; 2673 } 2674 2675 if (uval != val) { 2676 queue_unlock(*hb); 2677 ret = -EWOULDBLOCK; 2678 } 2679 2680 return ret; 2681} 2682 2683static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, 2684 ktime_t *abs_time, u32 bitset) 2685{ 2686 struct hrtimer_sleeper timeout, *to; 2687 struct restart_block *restart; 2688 struct futex_hash_bucket *hb; 2689 struct futex_q q = futex_q_init; 2690 int ret; 2691 2692 if (!bitset) 2693 return -EINVAL; 2694 q.bitset = bitset; 2695 2696 to = futex_setup_timer(abs_time, &timeout, flags, 2697 current->timer_slack_ns); 2698retry: 2699 /* 2700 * Prepare to wait on uaddr. 
On success, holds hb lock and increments 2701 * q.key refs. 2702 */ 2703 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 2704 if (ret) 2705 goto out; 2706 2707 /* queue_me and wait for wakeup, timeout, or a signal. */ 2708 futex_wait_queue_me(hb, &q, to); 2709 2710 /* If we were woken (and unqueued), we succeeded, whatever. */ 2711 ret = 0; 2712 /* unqueue_me() drops q.key ref */ 2713 if (!unqueue_me(&q)) 2714 goto out; 2715 ret = -ETIMEDOUT; 2716 if (to && !to->task) 2717 goto out; 2718 2719 /* 2720 * We expect signal_pending(current), but we might be the 2721 * victim of a spurious wakeup as well. 2722 */ 2723 if (!signal_pending(current)) 2724 goto retry; 2725 2726 ret = -ERESTARTSYS; 2727 if (!abs_time) 2728 goto out; 2729 2730 restart = &current->restart_block; 2731 restart->fn = futex_wait_restart; 2732 restart->futex.uaddr = uaddr; 2733 restart->futex.val = val; 2734 restart->futex.time = *abs_time; 2735 restart->futex.bitset = bitset; 2736 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; 2737 2738 ret = -ERESTART_RESTARTBLOCK; 2739 2740out: 2741 if (to) { 2742 hrtimer_cancel(&to->timer); 2743 destroy_hrtimer_on_stack(&to->timer); 2744 } 2745 return ret; 2746} 2747 2748 2749static long futex_wait_restart(struct restart_block *restart) 2750{ 2751 u32 __user *uaddr = restart->futex.uaddr; 2752 ktime_t t, *tp = NULL; 2753 2754 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 2755 t = restart->futex.time; 2756 tp = &t; 2757 } 2758 restart->fn = do_no_restart_syscall; 2759 2760 return (long)futex_wait(uaddr, restart->futex.flags, 2761 restart->futex.val, tp, restart->futex.bitset); 2762} 2763 2764 2765/* 2766 * Userspace tried a 0 -> TID atomic transition of the futex value 2767 * and failed. The kernel side here does the whole locking operation: 2768 * if there are waiters then it will block as a consequence of relying 2769 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see 2770 * a 0 value of the futex too.). 2771 * 2772 * Also serves as futex trylock_pi()'ing, and due semantics. 2773 */ 2774static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, 2775 ktime_t *time, int trylock) 2776{ 2777 struct hrtimer_sleeper timeout, *to; 2778 struct task_struct *exiting = NULL; 2779 struct rt_mutex_waiter rt_waiter; 2780 struct futex_hash_bucket *hb; 2781 struct futex_q q = futex_q_init; 2782 int res, ret; 2783 2784 if (!IS_ENABLED(CONFIG_FUTEX_PI)) 2785 return -ENOSYS; 2786 2787 if (refill_pi_state_cache()) 2788 return -ENOMEM; 2789 2790 to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0); 2791 2792retry: 2793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); 2794 if (unlikely(ret != 0)) 2795 goto out; 2796 2797retry_private: 2798 hb = queue_lock(&q); 2799 2800 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 2801 &exiting, 0); 2802 if (unlikely(ret)) { 2803 /* 2804 * Atomic work succeeded and we got the lock, 2805 * or failed. Either way, we do _not_ block. 2806 */ 2807 switch (ret) { 2808 case 1: 2809 /* We got the lock. */ 2810 ret = 0; 2811 goto out_unlock_put_key; 2812 case -EFAULT: 2813 goto uaddr_faulted; 2814 case -EBUSY: 2815 case -EAGAIN: 2816 /* 2817 * Two reasons for this: 2818 * - EBUSY: Task is exiting and we just wait for the 2819 * exit to complete. 2820 * - EAGAIN: The user space value changed. 2821 */ 2822 queue_unlock(hb); 2823 /* 2824 * Handle the case where the owner is in the middle of 2825 * exiting. Wait for the exit to complete otherwise 2826 * this task might loop forever, aka. live lock. 
2827 */ 2828 wait_for_owner_exiting(ret, exiting); 2829 cond_resched(); 2830 goto retry; 2831 default: 2832 goto out_unlock_put_key; 2833 } 2834 } 2835 2836 WARN_ON(!q.pi_state); 2837 2838 /* 2839 * Only actually queue now that the atomic ops are done: 2840 */ 2841 __queue_me(&q, hb); 2842 2843 if (trylock) { 2844 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); 2845 /* Fixup the trylock return value: */ 2846 ret = ret ? 0 : -EWOULDBLOCK; 2847 goto no_block; 2848 } 2849 2850 rt_mutex_init_waiter(&rt_waiter); 2851 2852 /* 2853 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not 2854 * hold it while doing rt_mutex_start_proxy(), because then it will 2855 * include hb->lock in the blocking chain, even through we'll not in 2856 * fact hold it while blocking. This will lead it to report -EDEADLK 2857 * and BUG when futex_unlock_pi() interleaves with this. 2858 * 2859 * Therefore acquire wait_lock while holding hb->lock, but drop the 2860 * latter before calling __rt_mutex_start_proxy_lock(). This 2861 * interleaves with futex_unlock_pi() -- which does a similar lock 2862 * handoff -- such that the latter can observe the futex_q::pi_state 2863 * before __rt_mutex_start_proxy_lock() is done. 2864 */ 2865 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 2866 spin_unlock(q.lock_ptr); 2867 /* 2868 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter 2869 * such that futex_unlock_pi() is guaranteed to observe the waiter when 2870 * it sees the futex_q::pi_state. 2871 */ 2872 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); 2873 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); 2874 2875 if (ret) { 2876 if (ret == 1) 2877 ret = 0; 2878 goto cleanup; 2879 } 2880 2881 if (unlikely(to)) 2882 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); 2883 2884 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 2885 2886cleanup: 2887 spin_lock(q.lock_ptr); 2888 /* 2889 * If we failed to acquire the lock (deadlock/signal/timeout), we must 2890 * first acquire the hb->lock before removing the lock from the 2891 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait 2892 * lists consistent. 2893 * 2894 * In particular; it is important that futex_unlock_pi() can not 2895 * observe this inconsistency. 2896 */ 2897 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) 2898 ret = 0; 2899 2900no_block: 2901 /* 2902 * Fixup the pi_state owner and possibly acquire the lock if we 2903 * haven't already. 2904 */ 2905 res = fixup_owner(uaddr, &q, !ret); 2906 /* 2907 * If fixup_owner() returned an error, proprogate that. If it acquired 2908 * the lock, clear our -ETIMEDOUT or -EINTR. 2909 */ 2910 if (res) 2911 ret = (res < 0) ? res : 0; 2912 2913 /* Unqueue and drop the lock */ 2914 unqueue_me_pi(&q); 2915 goto out; 2916 2917out_unlock_put_key: 2918 queue_unlock(hb); 2919 2920out: 2921 if (to) { 2922 hrtimer_cancel(&to->timer); 2923 destroy_hrtimer_on_stack(&to->timer); 2924 } 2925 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2926 2927uaddr_faulted: 2928 queue_unlock(hb); 2929 2930 ret = fault_in_user_writeable(uaddr); 2931 if (ret) 2932 goto out; 2933 2934 if (!(flags & FLAGS_SHARED)) 2935 goto retry_private; 2936 2937 goto retry; 2938} 2939 2940/* 2941 * Userspace attempted a TID -> 0 atomic transition, and failed. 2942 * This is the in-kernel slowpath: we look up the PI state (if any), 2943 * and do the rt-mutex unlock. 
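 *
 * For reference, the user space fast paths look roughly like this
 * (an illustrative sketch only, not glibc's actual code; "tid" is the
 * caller's thread id):
 *
 *	lock:	if (!__sync_bool_compare_and_swap(futex, 0, tid))
 *			syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *
 *	unlock:	if (!__sync_bool_compare_and_swap(futex, tid, 0))
 *			syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);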
2944 */ 2945static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) 2946{ 2947 u32 curval, uval, vpid = task_pid_vnr(current); 2948 union futex_key key = FUTEX_KEY_INIT; 2949 struct futex_hash_bucket *hb; 2950 struct futex_q *top_waiter; 2951 int ret; 2952 2953 if (!IS_ENABLED(CONFIG_FUTEX_PI)) 2954 return -ENOSYS; 2955 2956retry: 2957 if (get_user(uval, uaddr)) 2958 return -EFAULT; 2959 /* 2960 * We release only a lock we actually own: 2961 */ 2962 if ((uval & FUTEX_TID_MASK) != vpid) 2963 return -EPERM; 2964 2965 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); 2966 if (ret) 2967 return ret; 2968 2969 hb = hash_futex(&key); 2970 spin_lock(&hb->lock); 2971 2972 /* 2973 * Check waiters first. We do not trust user space values at 2974 * all and we at least want to know if user space fiddled 2975 * with the futex value instead of blindly unlocking. 2976 */ 2977 top_waiter = futex_top_waiter(hb, &key); 2978 if (top_waiter) { 2979 struct futex_pi_state *pi_state = top_waiter->pi_state; 2980 2981 ret = -EINVAL; 2982 if (!pi_state) 2983 goto out_unlock; 2984 2985 /* 2986 * If current does not own the pi_state then the futex is 2987 * inconsistent and user space fiddled with the futex value. 2988 */ 2989 if (pi_state->owner != current) 2990 goto out_unlock; 2991 2992 get_pi_state(pi_state); 2993 /* 2994 * By taking wait_lock while still holding hb->lock, we ensure 2995 * there is no point where we hold neither; and therefore 2996 * wake_futex_pi() must observe a state consistent with what we 2997 * observed. 2998 * 2999 * In particular; this forces __rt_mutex_start_proxy() to 3000 * complete such that we're guaranteed to observe the 3001 * rt_waiter. Also see the WARN in wake_futex_pi(). 3002 */ 3003 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 3004 spin_unlock(&hb->lock); 3005 3006 /* drops pi_state->pi_mutex.wait_lock */ 3007 ret = wake_futex_pi(uaddr, uval, pi_state); 3008 3009 put_pi_state(pi_state); 3010 3011 /* 3012 * Success, we're done! No tricky corner cases. 3013 */ 3014 if (!ret) 3015 goto out_putkey; 3016 /* 3017 * The atomic access to the futex value generated a 3018 * pagefault, so retry the user-access and the wakeup: 3019 */ 3020 if (ret == -EFAULT) 3021 goto pi_faulted; 3022 /* 3023 * A unconditional UNLOCK_PI op raced against a waiter 3024 * setting the FUTEX_WAITERS bit. Try again. 3025 */ 3026 if (ret == -EAGAIN) 3027 goto pi_retry; 3028 /* 3029 * wake_futex_pi has detected invalid state. Tell user 3030 * space. 3031 */ 3032 goto out_putkey; 3033 } 3034 3035 /* 3036 * We have no kernel internal state, i.e. no waiters in the 3037 * kernel. Waiters which are about to queue themselves are stuck 3038 * on hb->lock. So we can safely ignore them. We do neither 3039 * preserve the WAITERS bit not the OWNER_DIED one. We are the 3040 * owner. 3041 */ 3042 if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { 3043 spin_unlock(&hb->lock); 3044 switch (ret) { 3045 case -EFAULT: 3046 goto pi_faulted; 3047 3048 case -EAGAIN: 3049 goto pi_retry; 3050 3051 default: 3052 WARN_ON_ONCE(1); 3053 goto out_putkey; 3054 } 3055 } 3056 3057 /* 3058 * If uval has changed, let user space handle it. 3059 */ 3060 ret = (curval == uval) ? 
0 : -EAGAIN; 3061 3062out_unlock: 3063 spin_unlock(&hb->lock); 3064out_putkey: 3065 return ret; 3066 3067pi_retry: 3068 cond_resched(); 3069 goto retry; 3070 3071pi_faulted: 3072 3073 ret = fault_in_user_writeable(uaddr); 3074 if (!ret) 3075 goto retry; 3076 3077 return ret; 3078} 3079 3080/** 3081 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex 3082 * @hb: the hash_bucket futex_q was original enqueued on 3083 * @q: the futex_q woken while waiting to be requeued 3084 * @key2: the futex_key of the requeue target futex 3085 * @timeout: the timeout associated with the wait (NULL if none) 3086 * 3087 * Detect if the task was woken on the initial futex as opposed to the requeue 3088 * target futex. If so, determine if it was a timeout or a signal that caused 3089 * the wakeup and return the appropriate error code to the caller. Must be 3090 * called with the hb lock held. 3091 * 3092 * Return: 3093 * - 0 = no early wakeup detected; 3094 * - <0 = -ETIMEDOUT or -ERESTARTNOINTR 3095 */ 3096static inline 3097int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 3098 struct futex_q *q, union futex_key *key2, 3099 struct hrtimer_sleeper *timeout) 3100{ 3101 int ret = 0; 3102 3103 /* 3104 * With the hb lock held, we avoid races while we process the wakeup. 3105 * We only need to hold hb (and not hb2) to ensure atomicity as the 3106 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. 3107 * It can't be requeued from uaddr2 to something else since we don't 3108 * support a PI aware source futex for requeue. 3109 */ 3110 if (!match_futex(&q->key, key2)) { 3111 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); 3112 /* 3113 * We were woken prior to requeue by a timeout or a signal. 3114 * Unqueue the futex_q and determine which it was. 3115 */ 3116 plist_del(&q->list, &hb->chain); 3117 hb_waiters_dec(hb); 3118 3119 /* Handle spurious wakeups gracefully */ 3120 ret = -EWOULDBLOCK; 3121 if (timeout && !timeout->task) 3122 ret = -ETIMEDOUT; 3123 else if (signal_pending(current)) 3124 ret = -ERESTARTNOINTR; 3125 } 3126 return ret; 3127} 3128 3129/** 3130 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 3131 * @uaddr: the futex we initially wait on (non-pi) 3132 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be 3133 * the same type, no requeueing from private to shared, etc. 3134 * @val: the expected value of uaddr 3135 * @abs_time: absolute timeout 3136 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 3137 * @uaddr2: the pi futex we will take prior to returning to user-space 3138 * 3139 * The caller will wait on uaddr and will be requeued by futex_requeue() to 3140 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake 3141 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to 3142 * userspace. This ensures the rt_mutex maintains an owner when it has waiters; 3143 * without one, the pi logic would not know which task to boost/deboost, if 3144 * there was a need to. 3145 * 3146 * We call schedule in futex_wait_queue_me() when we enqueue and return there 3147 * via the following-- 3148 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 3149 * 2) wakeup on uaddr2 after a requeue 3150 * 3) signal 3151 * 4) timeout 3152 * 3153 * If 3, cleanup and return -ERESTARTNOINTR. 
3154 * 3155 * If 2, we may then block on trying to take the rt_mutex and return via: 3156 * 5) successful lock 3157 * 6) signal 3158 * 7) timeout 3159 * 8) other lock acquisition failure 3160 * 3161 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). 3162 * 3163 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 3164 * 3165 * Return: 3166 * - 0 - On success; 3167 * - <0 - On error 3168 */ 3169static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 3170 u32 val, ktime_t *abs_time, u32 bitset, 3171 u32 __user *uaddr2) 3172{ 3173 struct hrtimer_sleeper timeout, *to; 3174 struct rt_mutex_waiter rt_waiter; 3175 struct futex_hash_bucket *hb; 3176 union futex_key key2 = FUTEX_KEY_INIT; 3177 struct futex_q q = futex_q_init; 3178 int res, ret; 3179 3180 if (!IS_ENABLED(CONFIG_FUTEX_PI)) 3181 return -ENOSYS; 3182 3183 if (uaddr == uaddr2) 3184 return -EINVAL; 3185 3186 if (!bitset) 3187 return -EINVAL; 3188 3189 to = futex_setup_timer(abs_time, &timeout, flags, 3190 current->timer_slack_ns); 3191 3192 /* 3193 * The waiter is allocated on our stack, manipulated by the requeue 3194 * code while we sleep on uaddr. 3195 */ 3196 rt_mutex_init_waiter(&rt_waiter); 3197 3198 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); 3199 if (unlikely(ret != 0)) 3200 goto out; 3201 3202 q.bitset = bitset; 3203 q.rt_waiter = &rt_waiter; 3204 q.requeue_pi_key = &key2; 3205 3206 /* 3207 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 3208 * count. 3209 */ 3210 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 3211 if (ret) 3212 goto out; 3213 3214 /* 3215 * The check above which compares uaddrs is not sufficient for 3216 * shared futexes. We need to compare the keys: 3217 */ 3218 if (match_futex(&q.key, &key2)) { 3219 queue_unlock(hb); 3220 ret = -EINVAL; 3221 goto out; 3222 } 3223 3224 /* Queue the futex_q, drop the hb lock, wait for wakeup. */ 3225 futex_wait_queue_me(hb, &q, to); 3226 3227 spin_lock(&hb->lock); 3228 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); 3229 spin_unlock(&hb->lock); 3230 if (ret) 3231 goto out; 3232 3233 /* 3234 * In order for us to be here, we know our q.key == key2, and since 3235 * we took the hb->lock above, we also know that futex_requeue() has 3236 * completed and we no longer have to concern ourselves with a wakeup 3237 * race with the atomic proxy lock acquisition by the requeue code. The 3238 * futex_requeue dropped our key1 reference and incremented our key2 3239 * reference count. 3240 */ 3241 3242 /* Check if the requeue code acquired the second futex for us. */ 3243 if (!q.rt_waiter) { 3244 /* 3245 * Got the lock. We might not be the anticipated owner if we 3246 * did a lock-steal - fix up the PI-state in that case. 3247 */ 3248 if (q.pi_state && (q.pi_state->owner != current)) { 3249 spin_lock(q.lock_ptr); 3250 ret = fixup_pi_state_owner(uaddr2, &q, current); 3251 /* 3252 * Drop the reference to the pi state which 3253 * the requeue_pi() code acquired for us. 3254 */ 3255 put_pi_state(q.pi_state); 3256 spin_unlock(q.lock_ptr); 3257 /* 3258 * Adjust the return value. It's either -EFAULT or 3259 * success (1) but the caller expects 0 for success. 3260 */ 3261 ret = ret < 0 ? ret : 0; 3262 } 3263 } else { 3264 struct rt_mutex *pi_mutex; 3265 3266 /* 3267 * We have been woken up by futex_unlock_pi(), a timeout, or a 3268 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 3269 * the pi_state. 
		 */
		WARN_ON(!q.pi_state);
		pi_mutex = &q.pi_state->pi_mutex;
		ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);

		spin_lock(q.lock_ptr);
		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
			ret = 0;

		debug_rt_mutex_free_waiter(&rt_waiter);
		/*
		 * Fixup the pi_state owner and possibly acquire the lock if we
		 * haven't already.
		 */
		res = fixup_owner(uaddr2, &q, !ret);
		/*
		 * If fixup_owner() returned an error, propagate that. If it
		 * acquired the lock, clear -ETIMEDOUT or -EINTR.
		 */
		if (res)
			ret = (res < 0) ? res : 0;

		/* Unqueue and drop the lock. */
		unqueue_me_pi(&q);
	}

	if (ret == -EINTR) {
		/*
		 * We've already been requeued, but cannot restart by calling
		 * futex_lock_pi() directly. We could restart this syscall, but
		 * it would detect that the user space "val" changed and return
		 * -EWOULDBLOCK. Save the overhead of the restart and return
		 * -EWOULDBLOCK directly.
		 */
		ret = -EWOULDBLOCK;
	}

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret;
}

/*
 * Support for robust futexes: the kernel cleans up held futexes at
 * thread exit time.
 *
 * Implementation: user-space maintains a per-thread list of locks it
 * is holding. Upon do_exit(), the kernel carefully walks this list,
 * and marks all locks that are owned by this thread with the
 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
 * always manipulated with the lock held, so the list is private and
 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
 * field, to allow the kernel to clean up if the thread dies after
 * acquiring the lock, but just before it could have added itself to
 * the list. There can only be one such pending lock.
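 *
 * Registration from user space is roughly (an illustrative sketch only;
 * glibc does this per thread behind the scenes, 'struct my_lock' is made
 * up):
 *
 *	struct my_lock { struct robust_list list; uint32_t futex; };
 *
 *	static struct robust_list_head head = {
 *		.list		 = { .next = &head.list },	// empty, circular
 *		.futex_offset	 = offsetof(struct my_lock, futex) -
 *				   offsetof(struct my_lock, list),
 *		.list_op_pending = NULL,
 *	};
 *	syscall(SYS_set_robust_list, &head, sizeof(head));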
3328 */ 3329 3330/** 3331 * sys_set_robust_list() - Set the robust-futex list head of a task 3332 * @head: pointer to the list-head 3333 * @len: length of the list-head, as userspace expects 3334 */ 3335SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 3336 size_t, len) 3337{ 3338 if (!futex_cmpxchg_enabled) 3339 return -ENOSYS; 3340 /* 3341 * The kernel knows only one size for now: 3342 */ 3343 if (unlikely(len != sizeof(*head))) 3344 return -EINVAL; 3345 3346 current->robust_list = head; 3347 3348 return 0; 3349} 3350 3351/** 3352 * sys_get_robust_list() - Get the robust-futex list head of a task 3353 * @pid: pid of the process [zero for current task] 3354 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 3355 * @len_ptr: pointer to a length field, the kernel fills in the header size 3356 */ 3357SYSCALL_DEFINE3(get_robust_list, int, pid, 3358 struct robust_list_head __user * __user *, head_ptr, 3359 size_t __user *, len_ptr) 3360{ 3361 struct robust_list_head __user *head; 3362 unsigned long ret; 3363 struct task_struct *p; 3364 3365 if (!futex_cmpxchg_enabled) 3366 return -ENOSYS; 3367 3368 rcu_read_lock(); 3369 3370 ret = -ESRCH; 3371 if (!pid) 3372 p = current; 3373 else { 3374 p = find_task_by_vpid(pid); 3375 if (!p) 3376 goto err_unlock; 3377 } 3378 3379 ret = -EPERM; 3380 if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) 3381 goto err_unlock; 3382 3383 head = p->robust_list; 3384 rcu_read_unlock(); 3385 3386 if (put_user(sizeof(*head), len_ptr)) 3387 return -EFAULT; 3388 return put_user(head, head_ptr); 3389 3390err_unlock: 3391 rcu_read_unlock(); 3392 3393 return ret; 3394} 3395 3396/* Constants for the pending_op argument of handle_futex_death */ 3397#define HANDLE_DEATH_PENDING true 3398#define HANDLE_DEATH_LIST false 3399 3400/* 3401 * Process a futex-list entry, check whether it's owned by the 3402 * dying task, and do notification if so: 3403 */ 3404static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, 3405 bool pi, bool pending_op) 3406{ 3407 u32 uval, nval, mval; 3408 int err; 3409 3410 /* Futex address must be 32bit aligned */ 3411 if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) 3412 return -1; 3413 3414retry: 3415 if (get_user(uval, uaddr)) 3416 return -1; 3417 3418 /* 3419 * Special case for regular (non PI) futexes. The unlock path in 3420 * user space has two race scenarios: 3421 * 3422 * 1. The unlock path releases the user space futex value and 3423 * before it can execute the futex() syscall to wake up 3424 * waiters it is killed. 3425 * 3426 * 2. A woken up waiter is killed before it can acquire the 3427 * futex in user space. 3428 * 3429 * In both cases the TID validation below prevents a wakeup of 3430 * potential waiters which can cause these waiters to block 3431 * forever. 3432 * 3433 * In both cases the following conditions are met: 3434 * 3435 * 1) task->robust_list->list_op_pending != NULL 3436 * @pending_op == true 3437 * 2) User space futex value == 0 3438 * 3) Regular futex: @pi == false 3439 * 3440 * If these conditions are met, it is safe to attempt waking up a 3441 * potential waiter without touching the user space futex value and 3442 * trying to set the OWNER_DIED bit. The user space futex value is 3443 * uncontended and the rest of the user space mutex state is 3444 * consistent, so a woken waiter will just take over the 3445 * uncontended futex. Setting the OWNER_DIED bit would create 3446 * inconsistent state and malfunction of the user space owner died 3447 * handling. 
3448 */ 3449 if (pending_op && !pi && !uval) { 3450 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); 3451 return 0; 3452 } 3453 3454 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) 3455 return 0; 3456 3457 /* 3458 * Ok, this dying thread is truly holding a futex 3459 * of interest. Set the OWNER_DIED bit atomically 3460 * via cmpxchg, and if the value had FUTEX_WAITERS 3461 * set, wake up a waiter (if any). (We have to do a 3462 * futex_wake() even if OWNER_DIED is already set - 3463 * to handle the rare but possible case of recursive 3464 * thread-death.) The rest of the cleanup is done in 3465 * userspace. 3466 */ 3467 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 3468 3469 /* 3470 * We are not holding a lock here, but we want to have 3471 * the pagefault_disable/enable() protection because 3472 * we want to handle the fault gracefully. If the 3473 * access fails we try to fault in the futex with R/W 3474 * verification via get_user_pages. get_user() above 3475 * does not guarantee R/W access. If that fails we 3476 * give up and leave the futex locked. 3477 */ 3478 if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { 3479 switch (err) { 3480 case -EFAULT: 3481 if (fault_in_user_writeable(uaddr)) 3482 return -1; 3483 goto retry; 3484 3485 case -EAGAIN: 3486 cond_resched(); 3487 goto retry; 3488 3489 default: 3490 WARN_ON_ONCE(1); 3491 return err; 3492 } 3493 } 3494 3495 if (nval != uval) 3496 goto retry; 3497 3498 /* 3499 * Wake robust non-PI futexes here. The wakeup of 3500 * PI futexes happens in exit_pi_state(): 3501 */ 3502 if (!pi && (uval & FUTEX_WAITERS)) 3503 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); 3504 3505 return 0; 3506} 3507 3508/* 3509 * Fetch a robust-list pointer. Bit 0 signals PI futexes: 3510 */ 3511static inline int fetch_robust_entry(struct robust_list __user **entry, 3512 struct robust_list __user * __user *head, 3513 unsigned int *pi) 3514{ 3515 unsigned long uentry; 3516 3517 if (get_user(uentry, (unsigned long __user *)head)) 3518 return -EFAULT; 3519 3520 *entry = (void __user *)(uentry & ~1UL); 3521 *pi = uentry & 1; 3522 3523 return 0; 3524} 3525 3526/* 3527 * Walk curr->robust_list (very carefully, it's a userspace list!) 3528 * and mark any locks found there dead, and notify any waiters. 3529 * 3530 * We silently return on any sign of list-walking problem. 
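 *
 * For reference, the user space side of the protocol is roughly (an
 * illustrative sketch only; 'lock' is a user space mutex embedding a
 * struct robust_list node and a futex word):
 *
 *	lock:	head.list_op_pending = &lock->list;
 *		acquire lock->futex;		// cmpxchg 0 -> TID
 *		link &lock->list into head.list;
 *		head.list_op_pending = NULL;
 *
 *	unlock:	head.list_op_pending = &lock->list;
 *		unlink &lock->list;
 *		release lock->futex;		// cmpxchg TID -> 0
 *		head.list_op_pending = NULL;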
3531 */ 3532static void exit_robust_list(struct task_struct *curr) 3533{ 3534 struct robust_list_head __user *head = curr->robust_list; 3535 struct robust_list __user *entry, *next_entry, *pending; 3536 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; 3537 unsigned int next_pi; 3538 unsigned long futex_offset; 3539 int rc; 3540 3541 if (!futex_cmpxchg_enabled) 3542 return; 3543 3544 /* 3545 * Fetch the list head (which was registered earlier, via 3546 * sys_set_robust_list()): 3547 */ 3548 if (fetch_robust_entry(&entry, &head->list.next, &pi)) 3549 return; 3550 /* 3551 * Fetch the relative futex offset: 3552 */ 3553 if (get_user(futex_offset, &head->futex_offset)) 3554 return; 3555 /* 3556 * Fetch any possibly pending lock-add first, and handle it 3557 * if it exists: 3558 */ 3559 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) 3560 return; 3561 3562 next_entry = NULL; /* avoid warning with gcc */ 3563 while (entry != &head->list) { 3564 /* 3565 * Fetch the next entry in the list before calling 3566 * handle_futex_death: 3567 */ 3568 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); 3569 /* 3570 * A pending lock might already be on the list, so 3571 * don't process it twice: 3572 */ 3573 if (entry != pending) { 3574 if (handle_futex_death((void __user *)entry + futex_offset, 3575 curr, pi, HANDLE_DEATH_LIST)) 3576 return; 3577 } 3578 if (rc) 3579 return; 3580 entry = next_entry; 3581 pi = next_pi; 3582 /* 3583 * Avoid excessively long or circular lists: 3584 */ 3585 if (!--limit) 3586 break; 3587 3588 cond_resched(); 3589 } 3590 3591 if (pending) { 3592 handle_futex_death((void __user *)pending + futex_offset, 3593 curr, pip, HANDLE_DEATH_PENDING); 3594 } 3595} 3596 3597static void futex_cleanup(struct task_struct *tsk) 3598{ 3599 if (unlikely(tsk->robust_list)) { 3600 exit_robust_list(tsk); 3601 tsk->robust_list = NULL; 3602 } 3603 3604#ifdef CONFIG_COMPAT 3605 if (unlikely(tsk->compat_robust_list)) { 3606 compat_exit_robust_list(tsk); 3607 tsk->compat_robust_list = NULL; 3608 } 3609#endif 3610 3611 if (unlikely(!list_empty(&tsk->pi_state_list))) 3612 exit_pi_state_list(tsk); 3613} 3614 3615/** 3616 * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD 3617 * @tsk: task to set the state on 3618 * 3619 * Set the futex exit state of the task lockless. The futex waiter code 3620 * observes that state when a task is exiting and loops until the task has 3621 * actually finished the futex cleanup. The worst case for this is that the 3622 * waiter runs through the wait loop until the state becomes visible. 3623 * 3624 * This is called from the recursive fault handling path in do_exit(). 3625 * 3626 * This is best effort. Either the futex exit code has run already or 3627 * not. If the OWNER_DIED bit has been set on the futex then the waiter can 3628 * take it over. If not, the problem is pushed back to user space. If the 3629 * futex exit code did not run yet, then an already queued waiter might 3630 * block forever, but there is nothing which can be done about that. 
 */
void futex_exit_recursive(struct task_struct *tsk)
{
	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
	if (tsk->futex_state == FUTEX_STATE_EXITING)
		mutex_unlock(&tsk->futex_exit_mutex);
	tsk->futex_state = FUTEX_STATE_DEAD;
}

static void futex_cleanup_begin(struct task_struct *tsk)
{
	/*
	 * Prevent various race issues against a concurrent incoming waiter
	 * including live locks by forcing the waiter to block on
	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
	 * attach_to_pi_owner().
	 */
	mutex_lock(&tsk->futex_exit_mutex);

	/*
	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
	 *
	 * This ensures that all subsequent checks of tsk->futex_state in
	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
	 * tsk->pi_lock held.
	 *
	 * It also guarantees that a pi_state which was queued right before
	 * the state change under tsk->pi_lock by a concurrent waiter must
	 * be observed in exit_pi_state_list().
	 */
	raw_spin_lock_irq(&tsk->pi_lock);
	tsk->futex_state = FUTEX_STATE_EXITING;
	raw_spin_unlock_irq(&tsk->pi_lock);
}

static void futex_cleanup_end(struct task_struct *tsk, int state)
{
	/*
	 * Lockless store. The only side effect is that an observer might
	 * take another loop until it becomes visible.
	 */
	tsk->futex_state = state;
	/*
	 * Drop the exit protection. This unblocks waiters which observed
	 * FUTEX_STATE_EXITING to reevaluate the state.
	 */
	mutex_unlock(&tsk->futex_exit_mutex);
}

void futex_exec_release(struct task_struct *tsk)
{
	/*
	 * The state handling is done for consistency, but in the case of
	 * exec() there is no way to prevent further damage as the PID stays
	 * the same. But for the unlikely and arguably buggy case that a
	 * futex is held on exec(), this provides at least as much state
	 * consistency protection as is possible.
	 */
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	/*
	 * Reset the state to FUTEX_STATE_OK. The task is alive and about to
	 * exec a new binary.
3694 */ 3695 futex_cleanup_end(tsk, FUTEX_STATE_OK); 3696} 3697 3698void futex_exit_release(struct task_struct *tsk) 3699{ 3700 futex_cleanup_begin(tsk); 3701 futex_cleanup(tsk); 3702 futex_cleanup_end(tsk, FUTEX_STATE_DEAD); 3703} 3704 3705long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 3706 u32 __user *uaddr2, u32 val2, u32 val3) 3707{ 3708 int cmd = op & FUTEX_CMD_MASK; 3709 unsigned int flags = 0; 3710 3711 if (!(op & FUTEX_PRIVATE_FLAG)) 3712 flags |= FLAGS_SHARED; 3713 3714 if (op & FUTEX_CLOCK_REALTIME) { 3715 flags |= FLAGS_CLOCKRT; 3716 if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ 3717 cmd != FUTEX_WAIT_REQUEUE_PI) 3718 return -ENOSYS; 3719 } 3720 3721 switch (cmd) { 3722 case FUTEX_LOCK_PI: 3723 case FUTEX_UNLOCK_PI: 3724 case FUTEX_TRYLOCK_PI: 3725 case FUTEX_WAIT_REQUEUE_PI: 3726 case FUTEX_CMP_REQUEUE_PI: 3727 if (!futex_cmpxchg_enabled) 3728 return -ENOSYS; 3729 } 3730 3731 switch (cmd) { 3732 case FUTEX_WAIT: 3733 val3 = FUTEX_BITSET_MATCH_ANY; 3734 fallthrough; 3735 case FUTEX_WAIT_BITSET: 3736 return futex_wait(uaddr, flags, val, timeout, val3); 3737 case FUTEX_WAKE: 3738 val3 = FUTEX_BITSET_MATCH_ANY; 3739 fallthrough; 3740 case FUTEX_WAKE_BITSET: 3741 return futex_wake(uaddr, flags, val, val3); 3742 case FUTEX_REQUEUE: 3743 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 3744 case FUTEX_CMP_REQUEUE: 3745 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 3746 case FUTEX_WAKE_OP: 3747 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 3748 case FUTEX_LOCK_PI: 3749 return futex_lock_pi(uaddr, flags, timeout, 0); 3750 case FUTEX_UNLOCK_PI: 3751 return futex_unlock_pi(uaddr, flags); 3752 case FUTEX_TRYLOCK_PI: 3753 return futex_lock_pi(uaddr, flags, NULL, 1); 3754 case FUTEX_WAIT_REQUEUE_PI: 3755 val3 = FUTEX_BITSET_MATCH_ANY; 3756 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 3757 uaddr2); 3758 case FUTEX_CMP_REQUEUE_PI: 3759 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 3760 } 3761 return -ENOSYS; 3762} 3763 3764 3765SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, 3766 struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, 3767 u32, val3) 3768{ 3769 struct timespec64 ts; 3770 ktime_t t, *tp = NULL; 3771 u32 val2 = 0; 3772 int cmd = op & FUTEX_CMD_MASK; 3773 3774 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 3775 cmd == FUTEX_WAIT_BITSET || 3776 cmd == FUTEX_WAIT_REQUEUE_PI)) { 3777 if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) 3778 return -EFAULT; 3779 if (get_timespec64(&ts, utime)) 3780 return -EFAULT; 3781 if (!timespec64_valid(&ts)) 3782 return -EINVAL; 3783 3784 t = timespec64_to_ktime(ts); 3785 if (cmd == FUTEX_WAIT) 3786 t = ktime_add_safe(ktime_get(), t); 3787 else if (!(op & FUTEX_CLOCK_REALTIME)) 3788 t = timens_ktime_to_host(CLOCK_MONOTONIC, t); 3789 tp = &t; 3790 } 3791 /* 3792 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. 3793 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 3794 */ 3795 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 3796 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) 3797 val2 = (u32) (unsigned long) utime; 3798 3799 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 3800} 3801 3802#ifdef CONFIG_COMPAT 3803/* 3804 * Fetch a robust-list pointer. 
#ifdef CONFIG_COMPAT
/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
		   compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*entry = compat_ptr((*uentry) & ~1);
	*pi = (unsigned int)(*uentry) & 1;

	return 0;
}

static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);

	return uaddr;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	compat_uptr_t uentry, next_uentry, upending;
	compat_long_t futex_offset;
	int rc;

	if (!futex_cmpxchg_enabled)
		return;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (compat_fetch_robust_entry(&upending, &pending,
			       &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != (struct robust_list __user *) &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
			(compat_uptr_t __user *)&entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			void __user *uaddr = futex_uaddr(entry, futex_offset);

			if (handle_futex_death(uaddr, curr, pi,
					       HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		uentry = next_uentry;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
	if (pending) {
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
	}
}

COMPAT_SYSCALL_DEFINE2(set_robust_list,
		struct compat_robust_list_head __user *, head,
		compat_size_t, len)
{
	if (!futex_cmpxchg_enabled)
		return -ENOSYS;

	if (unlikely(len != sizeof(*head)))
		return -EINVAL;

	current->compat_robust_list = head;

	return 0;
}
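
/*
 * Illustrative userspace example (not kernel code): the list walked above is
 * ordinary user memory laid out by the C library. A minimal sketch of a
 * native (non-compat) registration, assuming the raw SYS_set_robust_list
 * syscall is used directly; struct my_robust_mutex and register_robust_list()
 * are made-up names for this sketch:
 *
 *	#include <linux/futex.h>
 *	#include <stddef.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct my_robust_mutex {
 *		struct robust_list list;	// link in the per-thread list
 *		int futex;			// futex word, holds the owner TID
 *	};
 *
 *	static struct robust_list_head head = {
 *		.list		 = { .next = &head.list },	// empty list
 *		// Distance from a list entry to its futex word; the kernel
 *		// applies it in futex_uaddr() above:
 *		.futex_offset	 = offsetof(struct my_robust_mutex, futex),
 *		.list_op_pending = NULL,
 *	};
 *
 *	static void register_robust_list(void)
 *	{
 *		syscall(SYS_set_robust_list, &head, sizeof(head));
 *	}
 *
 * A lock operation then stores the entry in head.list_op_pending, acquires
 * the futex word with the thread's TID, links the entry into head.list and
 * clears list_op_pending again, which is the window the pending handling
 * above caters for.
 */
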
COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
			compat_uptr_t __user *, head_ptr,
			compat_size_t __user *, len_ptr)
{
	struct compat_robust_list_head __user *head;
	unsigned long ret;
	struct task_struct *p;

	if (!futex_cmpxchg_enabled)
		return -ENOSYS;

	rcu_read_lock();

	ret = -ESRCH;
	if (!pid)
		p = current;
	else {
		p = find_task_by_vpid(pid);
		if (!p)
			goto err_unlock;
	}

	ret = -EPERM;
	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
		goto err_unlock;

	head = p->compat_robust_list;
	rcu_read_unlock();

	if (put_user(sizeof(*head), len_ptr))
		return -EFAULT;
	return put_user(ptr_to_compat(head), head_ptr);

err_unlock:
	rcu_read_unlock();

	return ret;
}
#endif /* CONFIG_COMPAT */

#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
		struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
		u32, val3)
{
	struct timespec64 ts;
	ktime_t t, *tp = NULL;
	int val2 = 0;
	int cmd = op & FUTEX_CMD_MASK;

	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
		      cmd == FUTEX_WAIT_BITSET ||
		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
		if (get_old_timespec32(&ts, utime))
			return -EFAULT;
		if (!timespec64_valid(&ts))
			return -EINVAL;

		t = timespec64_to_ktime(ts);
		if (cmd == FUTEX_WAIT)
			t = ktime_add_safe(ktime_get(), t);
		else if (!(op & FUTEX_CLOCK_REALTIME))
			t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
		tp = &t;
	}
	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
		val2 = (int) (unsigned long) utime;

	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */

static void __init futex_detect_cmpxchg(void)
{
#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
	u32 curval;

	/*
	 * This will fail and we want it. Some arch implementations do
	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
	 * functionality. We want to know that before we call in any
	 * of the complex code paths. Also we want to prevent
	 * registration of robust lists in that case. NULL is
	 * guaranteed to fault and we get -EFAULT on a functional
	 * implementation, while the non-functional ones return
	 * -ENOSYS.
	 */
	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
		futex_cmpxchg_enabled = 1;
#endif
}

static int __init futex_init(void)
{
	unsigned int futex_shift;
	unsigned long i;

#if CONFIG_BASE_SMALL
	futex_hashsize = 16;
#else
	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       futex_hashsize, 0,
					       futex_hashsize < 256 ? HASH_SMALL : 0,
					       &futex_shift, NULL,
					       futex_hashsize, futex_hashsize);
	futex_hashsize = 1UL << futex_shift;

	futex_detect_cmpxchg();

	for (i = 0; i < futex_hashsize; i++) {
		atomic_set(&futex_queues[i].waiters, 0);
		plist_head_init(&futex_queues[i].chain);
		spin_lock_init(&futex_queues[i].lock);
	}

	return 0;
}
core_initcall(futex_init);
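
/*
 * Worked example (illustrative only) of the hash sizing in futex_init()
 * above, assuming CONFIG_BASE_SMALL=0 and a machine with 8 possible CPUs:
 *
 *	256 * 8 = 2048, already a power of two, so
 *	roundup_pow_of_two(256 * 8) == 2048 buckets are requested.
 *
 * alloc_large_system_hash() reports the actual allocation size via
 * futex_shift, and futex_hashsize is re-derived as 1UL << futex_shift
 * (2048 == 1UL << 11 here), so the bookkeeping always matches what was
 * really allocated.
 */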