[PATCH] Add support for the sys_vmsplice syscall

sys_splice() moves data to/from a pipe, with a file as the input or output.
sys_vmsplice() moves data into a pipe, with the input being a user address
range instead.

This uses an approach suggested by Linus: the pages[] map is paired with an
offset/length map, so it can hold partial page ranges. Hopefully this will
be useful for network receive support as well.
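
For illustration only (not part of this patch), a minimal userspace sketch
of the intended usage, assuming the vmsplice()/splice() wrappers that glibc
later grew (older libcs would need syscall(__NR_vmsplice, ...)):

#define _GNU_SOURCE
#include <fcntl.h>              /* vmsplice(), splice() */
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
        char head[] = "hello, ";
        char tail[] = "vmsplice\n";
        struct iovec iov[] = {
                { .iov_base = head, .iov_len = sizeof(head) - 1 },
                { .iov_base = tail, .iov_len = sizeof(tail) - 1 },
        };
        int pfd[2];
        ssize_t ret;

        if (pipe(pfd) < 0)
                return 1;

        /* gather both user ranges into the pipe with one call */
        ret = vmsplice(pfd[1], iov, 2, 0);
        if (ret < 0) {
                perror("vmsplice");
                return 1;
        }

        /*
         * The user pages are now linked into the pipe by reference, so
         * head[]/tail[] must not be reused until the data is consumed.
         * Drain the pipe to stdout to show it arrived.
         */
        if (splice(pfd[0], NULL, STDOUT_FILENO, NULL, ret, 0) < 0) {
                perror("splice");
                return 1;
        }

        return 0;
}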

Signed-off-by: Jens Axboe <axboe@suse.de>

9 files changed, 268 insertions(+), 43 deletions(-)
arch/ia64/kernel/entry.S (+1)
···
         data8 sys_get_robust_list
         data8 sys_sync_file_range               // 1300
         data8 sys_tee
+        data8 sys_vmsplice

         .org sys_call_table + 8*NR_syscalls     // guard against failures to increase NR_syscalls
arch/powerpc/kernel/systbl.S (+1)
···
 SYSCALL(unshare)
 SYSCALL(splice)
 SYSCALL(tee)
+SYSCALL(vmsplice)

 /*
  * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c
arch/powerpc/platforms/cell/spu_callbacks.c (+1)
···
         [__NR_unshare]          sys_unshare,
         [__NR_splice]           sys_splice,
         [__NR_tee]              sys_tee,
+        [__NR_vmsplice]         sys_vmsplice,
 };

 long spu_sys_callback(struct spu_syscall_block *s)
fs/splice.c (+253 -39)
···
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/uio.h>

 /*
  * Passed to the actors
···
         unsigned int flags;             /* splice flags */
         struct file *file;              /* file to read/write */
         loff_t pos;                     /* file position */
+};
+
+struct partial_page {
+        unsigned int offset;
+        unsigned int len;
+};
+
+/*
+ * Passed to move_to_pipe
+ */
+struct splice_pipe_desc {
+        struct page **pages;            /* page map */
+        struct partial_page *partial;   /* pages[] may not be contig */
+        int nr_pages;                   /* number of pages in map */
+        unsigned int flags;             /* splice flags */
+        struct pipe_buf_operations *ops;/* ops associated with output pipe */
 };

 /*
···
         kunmap(buf->page);
 }

+static void *user_page_pipe_buf_map(struct file *file,
+                                    struct pipe_inode_info *pipe,
+                                    struct pipe_buffer *buf)
+{
+        return kmap(buf->page);
+}
+
+static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
+                                     struct pipe_buffer *buf)
+{
+        kunmap(buf->page);
+}
+
 static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
                                     struct pipe_buffer *buf)
 {
···
         .get = page_cache_pipe_buf_get,
 };

+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+                                    struct pipe_buffer *buf)
+{
+        return 1;
+}
+
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
+        .can_merge = 0,
+        .map = user_page_pipe_buf_map,
+        .unmap = user_page_pipe_buf_unmap,
+        .release = page_cache_pipe_buf_release,
+        .steal = user_page_pipe_buf_steal,
+        .get = page_cache_pipe_buf_get,
+};
+
 /*
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-                            int nr_pages, unsigned long len,
-                            unsigned int offset, unsigned int flags)
+static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
+                            struct splice_pipe_desc *spd)
 {
-        int ret, do_wakeup, i;
+        int ret, do_wakeup, page_nr;

         ret = 0;
         do_wakeup = 0;
-        i = 0;
+        page_nr = 0;

         if (pipe->inode)
                 mutex_lock(&pipe->inode->i_mutex);
···
                 if (pipe->nrbufs < PIPE_BUFFERS) {
                         int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
                         struct pipe_buffer *buf = pipe->bufs + newbuf;
-                        struct page *page = pages[i++];
-                        unsigned long this_len;

-                        this_len = PAGE_CACHE_SIZE - offset;
-                        if (this_len > len)
-                                this_len = len;
-
-                        buf->page = page;
-                        buf->offset = offset;
-                        buf->len = this_len;
-                        buf->ops = &page_cache_pipe_buf_ops;
+                        buf->page = spd->pages[page_nr];
+                        buf->offset = spd->partial[page_nr].offset;
+                        buf->len = spd->partial[page_nr].len;
+                        buf->ops = spd->ops;
                         pipe->nrbufs++;
+                        page_nr++;
+                        ret += buf->len;
+
                         if (pipe->inode)
                                 do_wakeup = 1;

-                        ret += this_len;
-                        len -= this_len;
-                        offset = 0;
-                        if (!--nr_pages)
-                                break;
-                        if (!len)
+                        if (!--spd->nr_pages)
                                 break;
                         if (pipe->nrbufs < PIPE_BUFFERS)
                                 continue;
···
                         break;
                 }

-                if (flags & SPLICE_F_NONBLOCK) {
+                if (spd->flags & SPLICE_F_NONBLOCK) {
                         if (!ret)
                                 ret = -EAGAIN;
                         break;
···
                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
         }

-        while (i < nr_pages)
-                page_cache_release(pages[i++]);
+        while (page_nr < spd->nr_pages)
+                page_cache_release(spd->pages[page_nr++]);

         return ret;
 }
···
                          unsigned int flags)
 {
         struct address_space *mapping = in->f_mapping;
-        unsigned int loff, offset, nr_pages;
+        unsigned int loff, nr_pages;
         struct page *pages[PIPE_BUFFERS];
+        struct partial_page partial[PIPE_BUFFERS];
         struct page *page;
         pgoff_t index, end_index;
         loff_t isize;
-        size_t bytes;
-        int i, error;
+        size_t total_len;
+        int error;
+        struct splice_pipe_desc spd = {
+                .pages = pages,
+                .partial = partial,
+                .flags = flags,
+                .ops = &page_cache_pipe_buf_ops,
+        };

         index = *ppos >> PAGE_CACHE_SHIFT;
-        loff = offset = *ppos & ~PAGE_CACHE_MASK;
-        nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        loff = *ppos & ~PAGE_CACHE_MASK;
+        nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

         if (nr_pages > PIPE_BUFFERS)
                 nr_pages = PIPE_BUFFERS;
···
          * read-ahead if this is a non-zero offset (we are likely doing small
          * chunk splice and the page is already there) for a single page.
          */
-        if (!offset || nr_pages > 1)
-                do_page_cache_readahead(mapping, in, index, nr_pages);
+        if (!loff || spd.nr_pages > 1)
+                do_page_cache_readahead(mapping, in, index, spd.nr_pages);

         /*
          * Now fill in the holes:
          */
         error = 0;
-        bytes = 0;
-        for (i = 0; i < nr_pages; i++, index++) {
+        total_len = 0;
+        for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
                 unsigned int this_len;

                 if (!len)
···
                  */
                 if (end_index == index) {
                         loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
-                        if (bytes + loff > isize) {
+                        if (total_len + loff > isize) {
                                 page_cache_release(page);
                                 break;
                         }
                         /*
                          * force quit after adding this page
                          */
-                        nr_pages = i;
+                        nr_pages = spd.nr_pages;
                         this_len = min(this_len, loff);
+                        loff = 0;
                 }
         }
 fill_it:
-        pages[i] = page;
-        bytes += this_len;
+        pages[spd.nr_pages] = page;
+        partial[spd.nr_pages].offset = loff;
+        partial[spd.nr_pages].len = this_len;
         len -= this_len;
+        total_len += this_len;
         loff = 0;
 }

-        if (i)
-                return move_to_pipe(pipe, pages, i, bytes, offset, flags);
+        if (spd.nr_pages)
+                return move_to_pipe(pipe, &spd);

         return error;
 }
···
         }

         return -EINVAL;
+}
+
+/*
+ * Map an iov into an array of pages and offset/length tuples. With the
+ * partial_page structure, we can map several non-contiguous ranges into
+ * our pages[] map instead of splitting that operation into pieces.
+ * Could easily be exported as a generic helper for other users, in which
+ * case one would probably want to add a 'max_nr_pages' parameter as well.
+ */
+static int get_iovec_page_array(const struct iovec __user *iov,
+                                unsigned int nr_vecs, struct page **pages,
+                                struct partial_page *partial)
+{
+        int buffers = 0, error = 0;
+
+        /*
+         * It's ok to take the mmap_sem for reading, even
+         * across a "get_user()".
+         */
+        down_read(&current->mm->mmap_sem);
+
+        while (nr_vecs) {
+                unsigned long off, npages;
+                void __user *base;
+                size_t len;
+                int i;
+
+                /*
+                 * Get user address base and length for this iovec.
+                 */
+                error = get_user(base, &iov->iov_base);
+                if (unlikely(error))
+                        break;
+                error = get_user(len, &iov->iov_len);
+                if (unlikely(error))
+                        break;
+
+                /*
+                 * Sanity check this iovec. 0 read succeeds.
+                 */
+                if (unlikely(!len))
+                        break;
+                error = -EFAULT;
+                if (unlikely(!base))
+                        break;
+
+                /*
+                 * Get this base offset and number of pages, then map
+                 * in the user pages.
+                 */
+                off = (unsigned long) base & ~PAGE_MASK;
+                npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+                if (npages > PIPE_BUFFERS - buffers)
+                        npages = PIPE_BUFFERS - buffers;
+
+                error = get_user_pages(current, current->mm,
+                                       (unsigned long) base, npages, 0, 0,
+                                       &pages[buffers], NULL);
+
+                if (unlikely(error <= 0))
+                        break;
+
+                /*
+                 * Fill this contiguous range into the partial page map.
+                 */
+                for (i = 0; i < error; i++) {
+                        const int plen = min_t(size_t, len, PAGE_SIZE) - off;
+
+                        partial[buffers].offset = off;
+                        partial[buffers].len = plen;
+
+                        off = 0;
+                        len -= plen;
+                        buffers++;
+                }
+
+                /*
+                 * We didn't complete this iov, stop here since it probably
+                 * means we have to move some of this into a pipe to
+                 * be able to continue.
+                 */
+                if (len)
+                        break;
+
+                /*
+                 * Don't continue if we mapped fewer pages than we asked for,
+                 * or if we mapped the max number of pages that we have
+                 * room for.
+                 */
+                if (error < npages || buffers == PIPE_BUFFERS)
+                        break;
+
+                nr_vecs--;
+                iov++;
+        }
+
+        up_read(&current->mm->mmap_sem);
+
+        if (buffers)
+                return buffers;
+
+        return error;
+}
+
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user memory and fill it into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ *        - memcpy() the data internally, at which point we might as well just
+ *          do a regular read() on the buffer anyway.
+ *        - Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *          has restrictions on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ */
+static long do_vmsplice(struct file *file, const struct iovec __user *iov,
+                        unsigned long nr_segs, unsigned int flags)
+{
+        struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
+        struct page *pages[PIPE_BUFFERS];
+        struct partial_page partial[PIPE_BUFFERS];
+        struct splice_pipe_desc spd = {
+                .pages = pages,
+                .partial = partial,
+                .flags = flags,
+                .ops = &user_page_pipe_buf_ops,
+        };
+
+        if (unlikely(!pipe))
+                return -EBADF;
+        if (unlikely(nr_segs > UIO_MAXIOV))
+                return -EINVAL;
+        else if (unlikely(!nr_segs))
+                return 0;
+
+        spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
+        if (spd.nr_pages <= 0)
+                return spd.nr_pages;
+
+        return move_to_pipe(pipe, &spd);
+}
+
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+                             unsigned long nr_segs, unsigned int flags)
+{
+        struct file *file;
+        long error;
+        int fput;
+
+        error = -EBADF;
+        file = fget_light(fd, &fput);
+        if (file) {
+                if (file->f_mode & FMODE_WRITE)
+                        error = do_vmsplice(file, iov, nr_segs, flags);
+
+                fput_light(file, fput);
+        }
+
+        return error;
 }

 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
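
To make the pages[]/partial[] pairing concrete, here is a standalone
user-space sketch (illustration only, not part of the patch) of the
arithmetic get_iovec_page_array() uses to carve one iovec into per-page
offset/length tuples. The numbers are invented: with a 4096-byte page, an
iovec starting 3000 bytes into a page with length 6000 spans three pages.

#include <stdio.h>

#define PAGE_SIZE       4096

struct partial_page {
        unsigned int offset;
        unsigned int len;
};

int main(void)
{
        struct partial_page partial[8];
        unsigned long off = 3000;       /* base & ~PAGE_MASK */
        unsigned long len = 6000;       /* iov_len */
        int i, buffers = 0;

        /* the same carving the kernel loop performs, one entry per page */
        while (len) {
                unsigned int plen = PAGE_SIZE - off;

                if (plen > len)
                        plen = len;

                partial[buffers].offset = off;
                partial[buffers].len = plen;
                off = 0;
                len -= plen;
                buffers++;
        }

        for (i = 0; i < buffers; i++)
                printf("partial[%d] = { .offset = %u, .len = %u }\n",
                       i, partial[i].offset, partial[i].len);

        return 0;
}

This prints {3000, 1096}, {0, 4096} and {0, 808}; move_to_pipe() then
attaches each tuple, with its page, to one pipe buffer, which is how
several non-contiguous ranges share a single pages[] map.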
include/asm-i386/unistd.h (+2 -1)
···
 #define __NR_splice             313
 #define __NR_sync_file_range    314
 #define __NR_tee                315
+#define __NR_vmsplice           316

-#define NR_syscalls 316
+#define NR_syscalls 317

 /*
  * user-visible error numbers are in the range -1 - -128:  see
include/asm-ia64/unistd.h (+2 -1)
···
 #define __NR_get_robust_list            1299
 #define __NR_sync_file_range            1300
 #define __NR_tee                        1301
+#define __NR_vmsplice                   1302

 #ifdef __KERNEL__

 #include <linux/config.h>

-#define NR_syscalls                     278 /* length of syscall table */
+#define NR_syscalls                     279 /* length of syscall table */

 #define __ARCH_WANT_SYS_RT_SIGACTION
include/asm-powerpc/unistd.h (+2 -1)
···
 #define __NR_unshare            282
 #define __NR_splice             283
 #define __NR_tee                284
+#define __NR_vmsplice           285

-#define __NR_syscalls           285
+#define __NR_syscalls           286

 #ifdef __KERNEL__
 #define __NR__exit __NR_exit
include/asm-x86_64/unistd.h (+3 -1)
···
 __SYSCALL(__NR_tee, sys_tee)
 #define __NR_sync_file_range    277
 __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
+#define __NR_vmsplice           278
+__SYSCALL(__NR_vmsplice, sys_vmsplice)

-#define __NR_syscall_max __NR_sync_file_range
+#define __NR_syscall_max __NR_vmsplice

 #ifndef __NO_STUBS
include/linux/syscalls.h (+3)
···
                            int fd_out, loff_t __user *off_out,
                            size_t len, unsigned int flags);

+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+                             unsigned long nr_segs, unsigned int flags);
+
 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);

 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,