Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

vfs: atomic f_pos accesses as per POSIX

Our write() system call has always been atomic in the sense that you get
the expected thread-safe contiguous write, but we haven't actually
guaranteed that concurrent writes are serialized wrt f_pos accesses, so
threads (or processes) that share a file descriptor and use "write()"
concurrently would quite likely overwrite each others data.

This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says:

"2.9.7 Thread Interactions with Regular File Operations

All of the following functions shall be atomic with respect to each
other in the effects specified in POSIX.1-2008 when they operate on
regular files or symbolic links: [...]"

and one of the effects is the file position update.

This unprotected file position behavior is not new behavior, and nobody
has ever cared. Until now. Yongzhi Pan reported unexpected behavior to
Michael Kerrisk that was due to this.

This resolves the issue with a f_pos-specific lock that is taken by
read/write/lseek on file descriptors that may be shared across threads
or processes.

Reported-by: Yongzhi Pan <panyongzhi@gmail.com>
Reported-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

authored by

Linus Torvalds and committed by
Al Viro
9c225f26 1b56e989

+55 -18
+1
fs/file_table.c
··· 135 135 atomic_long_set(&f->f_count, 1); 136 136 rwlock_init(&f->f_owner.lock); 137 137 spin_lock_init(&f->f_lock); 138 + mutex_init(&f->f_pos_lock); 138 139 eventpoll_init_file(f); 139 140 /* f->f_version: 0 */ 140 141 return f;
+1 -1
fs/namei.c
··· 1884 1884 1885 1885 nd->path = f.file->f_path; 1886 1886 if (flags & LOOKUP_RCU) { 1887 - if (f.need_put) 1887 + if (f.flags & FDPUT_FPUT) 1888 1888 *fp = f.file; 1889 1889 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1890 1890 rcu_read_lock();
+4
fs/open.c
··· 705 705 return 0; 706 706 } 707 707 708 + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ 709 + if (S_ISREG(inode->i_mode)) 710 + f->f_mode |= FMODE_ATOMIC_POS; 711 + 708 712 f->f_op = fops_get(inode->i_fop); 709 713 if (unlikely(WARN_ON(!f->f_op))) { 710 714 error = -ENODEV;
+40 -14
fs/read_write.c
··· 264 264 } 265 265 EXPORT_SYMBOL(vfs_llseek); 266 266 267 + /* 268 + * We only lock f_pos if we have threads or if the file might be 269 + * shared with another process. In both cases we'll have an elevated 270 + * file count (done either by fdget() or by fork()). 271 + */ 272 + static inline struct fd fdget_pos(int fd) 273 + { 274 + struct fd f = fdget(fd); 275 + struct file *file = f.file; 276 + 277 + if (file && (file->f_mode & FMODE_ATOMIC_POS)) { 278 + if (file_count(file) > 1) { 279 + f.flags |= FDPUT_POS_UNLOCK; 280 + mutex_lock(&file->f_pos_lock); 281 + } 282 + } 283 + return f; 284 + } 285 + 286 + static inline void fdput_pos(struct fd f) 287 + { 288 + if (f.flags & FDPUT_POS_UNLOCK) 289 + mutex_unlock(&f.file->f_pos_lock); 290 + fdput(f); 291 + } 292 + 267 293 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) 268 294 { 269 295 off_t retval; 270 - struct fd f = fdget(fd); 296 + struct fd f = fdget_pos(fd); 271 297 if (!f.file) 272 298 return -EBADF; 273 299 ··· 304 278 if (res != (loff_t)retval) 305 279 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ 306 280 } 307 - fdput(f); 281 + fdput_pos(f); 308 282 return retval; 309 283 } 310 284 ··· 524 498 525 499 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 526 500 { 527 - struct fd f = fdget(fd); 501 + struct fd f = fdget_pos(fd); 528 502 ssize_t ret = -EBADF; 529 503 530 504 if (f.file) { ··· 532 506 ret = vfs_read(f.file, buf, count, &pos); 533 507 if (ret >= 0) 534 508 file_pos_write(f.file, pos); 535 - fdput(f); 509 + fdput_pos(f); 536 510 } 537 511 return ret; 538 512 } ··· 540 514 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, 541 515 size_t, count) 542 516 { 543 - struct fd f = fdget(fd); 517 + struct fd f = fdget_pos(fd); 544 518 ssize_t ret = -EBADF; 545 519 546 520 if (f.file) { ··· 548 522 ret = vfs_write(f.file, buf, count, &pos); 549 523 if (ret >= 0) 550 524 file_pos_write(f.file, pos); 551 - fdput(f); 525 + fdput_pos(f); 552 526 } 553 527 554 528 return ret; ··· 823 797 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 824 798 unsigned long, vlen) 825 799 { 826 - struct fd f = fdget(fd); 800 + struct fd f = fdget_pos(fd); 827 801 ssize_t ret = -EBADF; 828 802 829 803 if (f.file) { ··· 831 805 ret = vfs_readv(f.file, vec, vlen, &pos); 832 806 if (ret >= 0) 833 807 file_pos_write(f.file, pos); 834 - fdput(f); 808 + fdput_pos(f); 835 809 } 836 810 837 811 if (ret > 0) ··· 843 817 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 844 818 unsigned long, vlen) 845 819 { 846 - struct fd f = fdget(fd); 820 + struct fd f = fdget_pos(fd); 847 821 ssize_t ret = -EBADF; 848 822 849 823 if (f.file) { ··· 851 825 ret = vfs_writev(f.file, vec, vlen, &pos); 852 826 if (ret >= 0) 853 827 file_pos_write(f.file, pos); 854 - fdput(f); 828 + fdput_pos(f); 855 829 } 856 830 857 831 if (ret > 0) ··· 994 968 const struct compat_iovec __user *,vec, 995 969 compat_ulong_t, vlen) 996 970 { 997 - struct fd f = fdget(fd); 971 + struct fd f = fdget_pos(fd); 998 972 ssize_t ret; 999 973 loff_t pos; 1000 974 ··· 1004 978 ret = compat_readv(f.file, vec, vlen, &pos); 1005 979 if (ret >= 0) 1006 980 f.file->f_pos = pos; 1007 - fdput(f); 981 + fdput_pos(f); 1008 982 return ret; 1009 983 } 1010 984 ··· 1061 1035 const struct compat_iovec __user *, vec, 1062 1036 compat_ulong_t, vlen) 1063 1037 { 1064 - struct fd f = fdget(fd); 1038 + struct fd f = fdget_pos(fd); 1065 1039 ssize_t ret; 1066 1040 loff_t pos; 1067 1041 ··· 1071 1045 ret = compat_writev(f.file, vec, vlen, &pos); 1072 1046 if (ret >= 0) 1073 1047 f.file->f_pos = pos; 1074 - fdput(f); 1048 + fdput_pos(f); 1075 1049 return ret; 1076 1050 } 1077 1051
+4 -2
include/linux/file.h
··· 28 28 29 29 struct fd { 30 30 struct file *file; 31 - int need_put; 31 + unsigned int flags; 32 32 }; 33 + #define FDPUT_FPUT 1 34 + #define FDPUT_POS_UNLOCK 2 33 35 34 36 static inline void fdput(struct fd fd) 35 37 { 36 - if (fd.need_put) 38 + if (fd.flags & FDPUT_FPUT) 37 39 fput(fd.file); 38 40 } 39 41
+5 -1
include/linux/fs.h
··· 123 123 /* File is opened with O_PATH; almost nothing can be done with it */ 124 124 #define FMODE_PATH ((__force fmode_t)0x4000) 125 125 126 + /* File needs atomic accesses to f_pos */ 127 + #define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) 128 + 126 129 /* File was opened by fanotify and shouldn't generate fanotify events */ 127 130 #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) 128 131 ··· 783 780 const struct file_operations *f_op; 784 781 785 782 /* 786 - * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. 783 + * Protects f_ep_links, f_flags. 787 784 * Must not be taken from IRQ context. 788 785 */ 789 786 spinlock_t f_lock; 790 787 atomic_long_t f_count; 791 788 unsigned int f_flags; 792 789 fmode_t f_mode; 790 + struct mutex f_pos_lock; 793 791 loff_t f_pos; 794 792 struct fown_struct f_owner; 795 793 const struct cred *f_cred;