futex: Fix inode life-time issue

As reported by Jann, ihold() does not in fact guarantee inode
persistence. Instead of making it do so, replace the usage of inode
pointers with a per-boot, machine-wide, unique inode identifier.

This sequence number is global, but shared (file-backed) futexes are
rare enough that this should not become a performance issue.

Reported-by: Jann Horn <jannh@google.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Changed files
+65 -43
fs
include
linux
kernel
+1
fs/inode.c
··· 138 138 inode->i_sb = sb; 139 139 inode->i_blkbits = sb->s_blocksize_bits; 140 140 inode->i_flags = 0; 141 + atomic64_set(&inode->i_sequence, 0); 141 142 atomic_set(&inode->i_count, 1); 142 143 inode->i_op = &empty_iops; 143 144 inode->i_fop = &no_open_fops;
+1
include/linux/fs.h
··· 698 698 struct rcu_head i_rcu; 699 699 }; 700 700 atomic64_t i_version; 701 + atomic64_t i_sequence; /* see futex */ 701 702 atomic_t i_count; 702 703 atomic_t i_dio_count; 703 704 atomic_t i_writecount;
+10 -7
include/linux/futex.h
··· 31 31 32 32 union futex_key { 33 33 struct { 34 + u64 i_seq; 34 35 unsigned long pgoff; 35 - struct inode *inode; 36 - int offset; 36 + unsigned int offset; 37 37 } shared; 38 38 struct { 39 + union { 40 + struct mm_struct *mm; 41 + u64 __tmp; 42 + }; 39 43 unsigned long address; 40 - struct mm_struct *mm; 41 - int offset; 44 + unsigned int offset; 42 45 } private; 43 46 struct { 47 + u64 ptr; 44 48 unsigned long word; 45 - void *ptr; 46 - int offset; 49 + unsigned int offset; 47 50 } both; 48 51 }; 49 52 50 - #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } 53 + #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } } 51 54 52 55 #ifdef CONFIG_FUTEX 53 56 enum {
+53 -36
kernel/futex.c
··· 429 429 430 430 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 431 431 case FUT_OFF_INODE: 432 - ihold(key->shared.inode); /* implies smp_mb(); (B) */ 432 + smp_mb(); /* explicit smp_mb(); (B) */ 433 433 break; 434 434 case FUT_OFF_MMSHARED: 435 435 futex_get_mm(key); /* implies smp_mb(); (B) */ ··· 463 463 464 464 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 465 465 case FUT_OFF_INODE: 466 - iput(key->shared.inode); 467 466 break; 468 467 case FUT_OFF_MMSHARED: 469 468 mmdrop(key->private.mm); ··· 504 505 return timeout; 505 506 } 506 507 508 + /* 509 + * Generate a machine wide unique identifier for this inode. 510 + * 511 + * This relies on u64 not wrapping in the life-time of the machine; which with 512 + * 1ns resolution means almost 585 years. 513 + * 514 + * This further relies on the fact that a well formed program will not unmap 515 + * the file while it has a (shared) futex waiting on it. This mapping will have 516 + * a file reference which pins the mount and inode. 517 + * 518 + * If for some reason an inode gets evicted and read back in again, it will get 519 + * a new sequence number and will _NOT_ match, even though it is the exact same 520 + * file. 521 + * 522 + * It is important that match_futex() will never have a false-positive, esp. 523 + * for PI futexes that can mess up the state. The above argues that false-negatives 524 + * are only possible for malformed programs. 525 + */ 526 + static u64 get_inode_sequence_number(struct inode *inode) 527 + { 528 + static atomic64_t i_seq; 529 + u64 old; 530 + 531 + /* Does the inode already have a sequence number? 
*/ 532 + old = atomic64_read(&inode->i_sequence); 533 + if (likely(old)) 534 + return old; 535 + 536 + for (;;) { 537 + u64 new = atomic64_add_return(1, &i_seq); 538 + if (WARN_ON_ONCE(!new)) 539 + continue; 540 + 541 + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); 542 + if (old) 543 + return old; 544 + return new; 545 + } 546 + } 547 + 507 548 /** 508 549 * get_futex_key() - Get parameters which are the keys for a futex 509 550 * @uaddr: virtual address of the futex ··· 556 517 * 557 518 * The key words are stored in @key on success. 558 519 * 559 - * For shared mappings, it's (page->index, file_inode(vma->vm_file), 560 - * offset_within_page). For private mappings, it's (uaddr, current->mm). 561 - * We can usually work out the index without swapping in the page. 520 + * For shared mappings (when @fshared), the key is: 521 + * ( inode->i_sequence, page->index, offset_within_page ) 522 + * [ also see get_inode_sequence_number() ] 523 + * 524 + * For private mappings (or when !@fshared), the key is: 525 + * ( current->mm, address, 0 ) 526 + * 527 + * This allows (cross process, where applicable) identification of the futex 528 + * without keeping the page pinned for the duration of the FUTEX_WAIT. 562 529 * 563 530 * lock_page() might sleep, the caller should not hold a spinlock. 564 531 */ ··· 704 659 key->private.mm = mm; 705 660 key->private.address = address; 706 661 707 - get_futex_key_refs(key); /* implies smp_mb(); (B) */ 708 - 709 662 } else { 710 663 struct inode *inode; 711 664 ··· 735 692 goto again; 736 693 } 737 694 738 - /* 739 - * Take a reference unless it is about to be freed. Previously 740 - * this reference was taken by ihold under the page lock 741 - * pinning the inode in place so i_lock was unnecessary. The 742 - * only way for this check to fail is if the inode was 743 - * truncated in parallel which is almost certainly an 744 - * application bug. In such a case, just retry. 
745 - * 746 - * We are not calling into get_futex_key_refs() in file-backed 747 - * cases, therefore a successful atomic_inc return below will 748 - * guarantee that get_futex_key() will still imply smp_mb(); (B). 749 - */ 750 - if (!atomic_inc_not_zero(&inode->i_count)) { 751 - rcu_read_unlock(); 752 - put_page(page); 753 - 754 - goto again; 755 - } 756 - 757 - /* Should be impossible but lets be paranoid for now */ 758 - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { 759 - err = -EFAULT; 760 - rcu_read_unlock(); 761 - iput(inode); 762 - 763 - goto out; 764 - } 765 - 766 695 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 767 - key->shared.inode = inode; 696 + key->shared.i_seq = get_inode_sequence_number(inode); 768 697 key->shared.pgoff = basepage_index(tail); 769 698 rcu_read_unlock(); 770 699 } 700 + 701 + get_futex_key_refs(key); /* implies smp_mb(); (B) */ 771 702 772 703 out: 773 704 put_page(page);