Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: rcu-protected get_mm_exe_file()

This patch removes mm->mmap_sem from the read side of mm->exe_file:
get_mm_exe_file() now takes its file reference under rcu_read_lock().
It also removes dup_mm_exe_file() and moves exe_file duplication into
dup_mmap(), where both mmap_sems are locked.

[akpm@linux-foundation.org: fix comment typo]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Konstantin Khlebnikov and committed by
Linus Torvalds
90f31d0e 0ec62afe

+40 -22
+1 -2
fs/file.c
··· 638 638 file = fcheck_files(files, fd); 639 639 if (file) { 640 640 /* File object ref couldn't be taken */ 641 - if ((file->f_mode & mask) || 642 - !atomic_long_inc_not_zero(&file->f_count)) 641 + if ((file->f_mode & mask) || !get_file_rcu(file)) 643 642 file = NULL; 644 643 } 645 644 rcu_read_unlock();
+1
include/linux/fs.h
··· 870 870 atomic_long_inc(&f->f_count); 871 871 return f; 872 872 } 873 + #define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count) 873 874 #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) 874 875 #define file_count(x) atomic_long_read(&(x)->f_count) 875 876
+1 -1
include/linux/mm_types.h
··· 429 429 #endif 430 430 431 431 /* store ref to file /proc/<pid>/exe symlink points to */ 432 - struct file *exe_file; 432 + struct file __rcu *exe_file; 433 433 #ifdef CONFIG_MMU_NOTIFIER 434 434 struct mmu_notifier_mm *mmu_notifier_mm; 435 435 #endif
+37 -19
kernel/fork.c
··· 403 403 */ 404 404 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); 405 405 406 + /* No ordering required: file already has been exposed. */ 407 + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); 408 + 406 409 mm->total_vm = oldmm->total_vm; 407 410 mm->shared_vm = oldmm->shared_vm; 408 411 mm->exec_vm = oldmm->exec_vm; ··· 531 528 pgd_free(mm, mm->pgd); 532 529 } 533 530 #else 534 - #define dup_mmap(mm, oldmm) (0) 531 + static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 532 + { 533 + down_write(&oldmm->mmap_sem); 534 + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); 535 + up_write(&oldmm->mmap_sem); 536 + return 0; 537 + } 535 538 #define mm_alloc_pgd(mm) (0) 536 539 #define mm_free_pgd(mm) 537 540 #endif /* CONFIG_MMU */ ··· 706 697 } 707 698 EXPORT_SYMBOL_GPL(mmput); 708 699 700 + /** 701 + * set_mm_exe_file - change a reference to the mm's executable file 702 + * 703 + * This changes mm's executable file (shown as symlink /proc/[pid]/exe). 704 + * 705 + * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE). 706 + * Callers prevent concurrent invocations: in mmput() nobody alive left, 707 + * in execve task is single-threaded, prctl holds mmap_sem exclusively. 708 + */ 709 709 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 710 710 { 711 + struct file *old_exe_file = rcu_dereference_protected(mm->exe_file, 712 + !atomic_read(&mm->mm_users) || current->in_execve || 713 + lockdep_is_held(&mm->mmap_sem)); 714 + 711 715 if (new_exe_file) 712 716 get_file(new_exe_file); 713 - if (mm->exe_file) 714 - fput(mm->exe_file); 715 - mm->exe_file = new_exe_file; 717 + rcu_assign_pointer(mm->exe_file, new_exe_file); 718 + if (old_exe_file) 719 + fput(old_exe_file); 716 720 } 717 721 722 + /** 723 + * get_mm_exe_file - acquire a reference to the mm's executable file 724 + * 725 + * Returns %NULL if mm has no associated executable file. 726 + * User must release file via fput(). 
727 + */ 718 728 struct file *get_mm_exe_file(struct mm_struct *mm) 719 729 { 720 730 struct file *exe_file; 721 731 722 - /* We need mmap_sem to protect against races with removal of exe_file */ 723 - down_read(&mm->mmap_sem); 724 - exe_file = mm->exe_file; 725 - if (exe_file) 726 - get_file(exe_file); 727 - up_read(&mm->mmap_sem); 732 + rcu_read_lock(); 733 + exe_file = rcu_dereference(mm->exe_file); 734 + if (exe_file && !get_file_rcu(exe_file)) 735 + exe_file = NULL; 736 + rcu_read_unlock(); 728 737 return exe_file; 729 - } 730 - 731 - static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) 732 - { 733 - /* It's safe to write the exe_file pointer without exe_file_lock because 734 - * this is called during fork when the task is not yet in /proc */ 735 - newmm->exe_file = get_mm_exe_file(oldmm); 736 738 } 737 739 738 740 /** ··· 906 886 907 887 if (!mm_init(mm, tsk)) 908 888 goto fail_nomem; 909 - 910 - dup_mm_exe_file(oldmm, mm); 911 889 912 890 err = dup_mmap(mm, oldmm); 913 891 if (err)