Merge branch 'splice' of git://brick.kernel.dk/data/git/linux-2.6-block

* 'splice' of git://brick.kernel.dk/data/git/linux-2.6-block:
[PATCH] splice: add ->splice_write support for /dev/null
[PATCH] splice: rearrange moving to/from pipe helpers
[PATCH] Add support for the sys_vmsplice syscall
[PATCH] splice: fix offset problems
[PATCH] splice: fix min() warning

+328 -77
+1
arch/ia64/kernel/entry.S
··· 1610 data8 sys_get_robust_list 1611 data8 sys_sync_file_range // 1300 1612 data8 sys_tee 1613 1614 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
··· 1610 data8 sys_get_robust_list 1611 data8 sys_sync_file_range // 1300 1612 data8 sys_tee 1613 + data8 sys_vmsplice 1614 1615 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
+1
arch/powerpc/kernel/systbl.S
··· 324 SYSCALL(unshare) 325 SYSCALL(splice) 326 SYSCALL(tee) 327 328 /* 329 * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c
··· 324 SYSCALL(unshare) 325 SYSCALL(splice) 326 SYSCALL(tee) 327 + SYSCALL(vmsplice) 328 329 /* 330 * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c
+1
arch/powerpc/platforms/cell/spu_callbacks.c
··· 318 [__NR_unshare] sys_unshare, 319 [__NR_splice] sys_splice, 320 [__NR_tee] sys_tee, 321 }; 322 323 long spu_sys_callback(struct spu_syscall_block *s)
··· 318 [__NR_unshare] sys_unshare, 319 [__NR_splice] sys_splice, 320 [__NR_tee] sys_tee, 321 + [__NR_vmsplice] sys_vmsplice, 322 }; 323 324 long spu_sys_callback(struct spu_syscall_block *s)
+14
drivers/char/mem.c
··· 27 #include <linux/crash_dump.h> 28 #include <linux/backing-dev.h> 29 #include <linux/bootmem.h> 30 31 #include <asm/uaccess.h> 32 #include <asm/io.h> ··· 579 return count; 580 } 581 582 #ifdef CONFIG_MMU 583 /* 584 * For fun, we are using the MMU for this. ··· 798 .llseek = null_lseek, 799 .read = read_null, 800 .write = write_null, 801 }; 802 803 #if defined(CONFIG_ISA) || !defined(__mc68000__)
··· 27 #include <linux/crash_dump.h> 28 #include <linux/backing-dev.h> 29 #include <linux/bootmem.h> 30 + #include <linux/pipe_fs_i.h> 31 32 #include <asm/uaccess.h> 33 #include <asm/io.h> ··· 578 return count; 579 } 580 581 + static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf, 582 + struct splice_desc *sd) 583 + { 584 + return sd->len; 585 + } 586 + 587 + static ssize_t splice_write_null(struct pipe_inode_info *pipe,struct file *out, 588 + loff_t *ppos, size_t len, unsigned int flags) 589 + { 590 + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null); 591 + } 592 + 593 #ifdef CONFIG_MMU 594 /* 595 * For fun, we are using the MMU for this. ··· 785 .llseek = null_lseek, 786 .read = read_null, 787 .write = write_null, 788 + .splice_write = splice_write_null, 789 }; 790 791 #if defined(CONFIG_ISA) || !defined(__mc68000__)
+282 -73
fs/splice.c
··· 27 #include <linux/buffer_head.h> 28 #include <linux/module.h> 29 #include <linux/syscalls.h> 30 31 /* 32 - * Passed to the actors 33 */ 34 - struct splice_desc { 35 - unsigned int len, total_len; /* current and remaining length */ 36 unsigned int flags; /* splice flags */ 37 - struct file *file; /* file to read/write */ 38 - loff_t pos; /* file position */ 39 }; 40 41 /* ··· 135 kunmap(buf->page); 136 } 137 138 static void page_cache_pipe_buf_get(struct pipe_inode_info *info, 139 struct pipe_buffer *buf) 140 { ··· 163 .get = page_cache_pipe_buf_get, 164 }; 165 166 /* 167 * Pipe output worker. This sets up our pipe format with the page cache 168 * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 169 */ 170 - static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, 171 - int nr_pages, unsigned long len, 172 - unsigned int offset, unsigned int flags) 173 { 174 - int ret, do_wakeup, i; 175 176 ret = 0; 177 do_wakeup = 0; 178 - i = 0; 179 180 if (pipe->inode) 181 mutex_lock(&pipe->inode->i_mutex); ··· 205 if (pipe->nrbufs < PIPE_BUFFERS) { 206 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 207 struct pipe_buffer *buf = pipe->bufs + newbuf; 208 - struct page *page = pages[i++]; 209 - unsigned long this_len; 210 211 - this_len = PAGE_CACHE_SIZE - offset; 212 - if (this_len > len) 213 - this_len = len; 214 - 215 - buf->page = page; 216 - buf->offset = offset; 217 - buf->len = this_len; 218 - buf->ops = &page_cache_pipe_buf_ops; 219 pipe->nrbufs++; 220 if (pipe->inode) 221 do_wakeup = 1; 222 223 - ret += this_len; 224 - len -= this_len; 225 - offset = 0; 226 - if (!--nr_pages) 227 - break; 228 - if (!len) 229 break; 230 if (pipe->nrbufs < PIPE_BUFFERS) 231 continue; ··· 225 break; 226 } 227 228 - if (flags & SPLICE_F_NONBLOCK) { 229 if (!ret) 230 ret = -EAGAIN; 231 break; ··· 260 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 261 } 262 263 - while (i < nr_pages) 264 - page_cache_release(pages[i++]); 265 266 return ret; 267 } ··· 272 unsigned int flags) 273 { 274 struct address_space *mapping = in->f_mapping; 275 - unsigned int loff, offset, nr_pages; 276 struct page *pages[PIPE_BUFFERS]; 277 struct page *page; 278 pgoff_t index, end_index; 279 loff_t isize; 280 - size_t bytes; 281 - int i, error; 282 283 index = *ppos >> PAGE_CACHE_SHIFT; 284 - loff = offset = *ppos & ~PAGE_CACHE_MASK; 285 - nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 286 287 if (nr_pages > PIPE_BUFFERS) 288 nr_pages = PIPE_BUFFERS; ··· 299 * read-ahead if this is a non-zero offset (we are likely doing small 300 * chunk splice and the page is already there) for a single page. 301 */ 302 - if (!offset || nr_pages > 1) 303 - do_page_cache_readahead(mapping, in, index, nr_pages); 304 305 /* 306 * Now fill in the holes: 307 */ 308 error = 0; 309 - bytes = 0; 310 - for (i = 0; i < nr_pages; i++, index++) { 311 unsigned int this_len; 312 313 if (!len) ··· 316 /* 317 * this_len is the max we'll use from this page 318 */ 319 - this_len = min(len, PAGE_CACHE_SIZE - loff); 320 find_page: 321 /* 322 * lookup the page for this index ··· 400 */ 401 if (end_index == index) { 402 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); 403 - if (bytes + loff > isize) { 404 page_cache_release(page); 405 break; 406 } 407 /* 408 * force quit after adding this page 409 */ 410 - nr_pages = i; 411 this_len = min(this_len, loff); 412 } 413 } 414 fill_it: 415 - pages[i] = page; 416 - bytes += this_len; 417 len -= this_len; 418 loff = 0; 419 } 420 421 - if (i) 422 - return move_to_pipe(pipe, pages, i, bytes, offset, flags); 423 424 return error; 425 } ··· 475 476 /* 477 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 478 - * using sendpage(). 479 */ 480 static int pipe_to_sendpage(struct pipe_inode_info *info, 481 struct pipe_buffer *buf, struct splice_desc *sd) 482 { 483 struct file *file = sd->file; 484 loff_t pos = sd->pos; 485 - unsigned int offset; 486 ssize_t ret; 487 void *ptr; 488 int more; ··· 496 if (IS_ERR(ptr)) 497 return PTR_ERR(ptr); 498 499 - offset = pos & ~PAGE_CACHE_MASK; 500 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 501 502 - ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); 503 504 buf->ops->unmap(info, buf); 505 - if (ret == sd->len) 506 - return 0; 507 - 508 - return -EIO; 509 } 510 511 /* ··· 531 struct file *file = sd->file; 532 struct address_space *mapping = file->f_mapping; 533 gfp_t gfp_mask = mapping_gfp_mask(mapping); 534 - unsigned int offset; 535 struct page *page; 536 pgoff_t index; 537 char *src; ··· 546 547 index = sd->pos >> PAGE_CACHE_SHIFT; 548 offset = sd->pos & ~PAGE_CACHE_MASK; 549 550 /* 551 * Reuse buf page, if SPLICE_F_MOVE is set. ··· 594 * the full page. 595 */ 596 if (!PageUptodate(page)) { 597 - if (sd->len < PAGE_CACHE_SIZE) { 598 ret = mapping->a_ops->readpage(file, page); 599 if (unlikely(ret)) 600 goto out; ··· 618 } 619 } 620 621 - ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); 622 if (ret == AOP_TRUNCATED_PAGE) { 623 page_cache_release(page); 624 goto find_page; ··· 628 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { 629 char *dst = kmap_atomic(page, KM_USER0); 630 631 - memcpy(dst + offset, src + buf->offset, sd->len); 632 flush_dcache_page(page); 633 kunmap_atomic(dst, KM_USER0); 634 } 635 636 - ret = mapping->a_ops->commit_write(file, page, 0, sd->len); 637 if (ret == AOP_TRUNCATED_PAGE) { 638 page_cache_release(page); 639 goto find_page; 640 } else if (ret) 641 goto out; 642 643 mark_page_accessed(page); 644 balance_dirty_pages_ratelimited(mapping); 645 out: ··· 656 return ret; 657 } 658 659 - typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, 660 - struct splice_desc *); 661 - 662 /* 663 * Pipe input worker. Most of this logic works like a regular pipe, the 664 * key here is the 'actor' worker passed in that actually moves the data 665 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 666 */ 667 - static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, 668 - loff_t *ppos, size_t len, unsigned int flags, 669 - splice_actor *actor) 670 { 671 int ret, do_wakeup, err; 672 struct splice_desc sd; ··· 689 sd.len = sd.total_len; 690 691 err = actor(pipe, buf, &sd); 692 - if (err) { 693 if (!ret && err != -ENODATA) 694 ret = err; 695 696 break; 697 } 698 699 - ret += sd.len; 700 - buf->offset += sd.len; 701 - buf->len -= sd.len; 702 703 if (!buf->len) { 704 buf->ops = NULL; ··· 715 do_wakeup = 1; 716 } 717 718 - sd.pos += sd.len; 719 - sd.total_len -= sd.len; 720 if (!sd.total_len) 721 break; 722 } ··· 782 struct address_space *mapping = out->f_mapping; 783 ssize_t ret; 784 785 - ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); 786 if (ret > 0) { 787 struct inode *inode = mapping->host; 788 ··· 824 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 825 loff_t *ppos, size_t len, unsigned int flags) 826 { 827 - return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); 828 } 829 830 EXPORT_SYMBOL(generic_splice_sendpage); ··· 911 912 /* 913 * We don't have an immediate reader, but we'll read the stuff 914 - * out of the pipe right after the move_to_pipe(). So set 915 * PIPE_READERS appropriately. 916 */ 917 pipe->readers = 1; ··· 1049 } 1050 1051 return -EINVAL; 1052 } 1053 1054 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
··· 27 #include <linux/buffer_head.h> 28 #include <linux/module.h> 29 #include <linux/syscalls.h> 30 + #include <linux/uio.h> 31 + 32 + struct partial_page { 33 + unsigned int offset; 34 + unsigned int len; 35 + }; 36 37 /* 38 + * Passed to splice_to_pipe 39 */ 40 + struct splice_pipe_desc { 41 + struct page **pages; /* page map */ 42 + struct partial_page *partial; /* pages[] may not be contig */ 43 + int nr_pages; /* number of pages in map */ 44 unsigned int flags; /* splice flags */ 45 + struct pipe_buf_operations *ops;/* ops associated with output pipe */ 46 }; 47 48 /* ··· 128 kunmap(buf->page); 129 } 130 131 + static void *user_page_pipe_buf_map(struct file *file, 132 + struct pipe_inode_info *pipe, 133 + struct pipe_buffer *buf) 134 + { 135 + return kmap(buf->page); 136 + } 137 + 138 + static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe, 139 + struct pipe_buffer *buf) 140 + { 141 + kunmap(buf->page); 142 + } 143 + 144 static void page_cache_pipe_buf_get(struct pipe_inode_info *info, 145 struct pipe_buffer *buf) 146 { ··· 143 .get = page_cache_pipe_buf_get, 144 }; 145 146 + static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, 147 + struct pipe_buffer *buf) 148 + { 149 + return 1; 150 + } 151 + 152 + static struct pipe_buf_operations user_page_pipe_buf_ops = { 153 + .can_merge = 0, 154 + .map = user_page_pipe_buf_map, 155 + .unmap = user_page_pipe_buf_unmap, 156 + .release = page_cache_pipe_buf_release, 157 + .steal = user_page_pipe_buf_steal, 158 + .get = page_cache_pipe_buf_get, 159 + }; 160 + 161 /* 162 * Pipe output worker. This sets up our pipe format with the page cache 163 * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 164 */ 165 + static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 166 + struct splice_pipe_desc *spd) 167 { 168 + int ret, do_wakeup, page_nr; 169 170 ret = 0; 171 do_wakeup = 0; 172 + page_nr = 0; 173 174 if (pipe->inode) 175 mutex_lock(&pipe->inode->i_mutex); ··· 171 if (pipe->nrbufs < PIPE_BUFFERS) { 172 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 173 struct pipe_buffer *buf = pipe->bufs + newbuf; 174 175 + buf->page = spd->pages[page_nr]; 176 + buf->offset = spd->partial[page_nr].offset; 177 + buf->len = spd->partial[page_nr].len; 178 + buf->ops = spd->ops; 179 pipe->nrbufs++; 180 + page_nr++; 181 + ret += buf->len; 182 + 183 if (pipe->inode) 184 do_wakeup = 1; 185 186 + if (!--spd->nr_pages) 187 break; 188 if (pipe->nrbufs < PIPE_BUFFERS) 189 continue; ··· 199 break; 200 } 201 202 + if (spd->flags & SPLICE_F_NONBLOCK) { 203 if (!ret) 204 ret = -EAGAIN; 205 break; ··· 234 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 235 } 236 237 + while (page_nr < spd->nr_pages) 238 + page_cache_release(spd->pages[page_nr++]); 239 240 return ret; 241 } ··· 246 unsigned int flags) 247 { 248 struct address_space *mapping = in->f_mapping; 249 + unsigned int loff, nr_pages; 250 struct page *pages[PIPE_BUFFERS]; 251 + struct partial_page partial[PIPE_BUFFERS]; 252 struct page *page; 253 pgoff_t index, end_index; 254 loff_t isize; 255 + size_t total_len; 256 + int error; 257 + struct splice_pipe_desc spd = { 258 + .pages = pages, 259 + .partial = partial, 260 + .flags = flags, 261 + .ops = &page_cache_pipe_buf_ops, 262 + }; 263 264 index = *ppos >> PAGE_CACHE_SHIFT; 265 + loff = *ppos & ~PAGE_CACHE_MASK; 266 + nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 267 268 if (nr_pages > PIPE_BUFFERS) 269 nr_pages = PIPE_BUFFERS; ··· 266 * read-ahead if this is a non-zero offset (we are likely doing small 267 * chunk splice and the page is already there) for a single page. 268 */ 269 + if (!loff || spd.nr_pages > 1) 270 + do_page_cache_readahead(mapping, in, index, spd.nr_pages); 271 272 /* 273 * Now fill in the holes: 274 */ 275 error = 0; 276 + total_len = 0; 277 + for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) { 278 unsigned int this_len; 279 280 if (!len) ··· 283 /* 284 * this_len is the max we'll use from this page 285 */ 286 + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 287 find_page: 288 /* 289 * lookup the page for this index ··· 367 */ 368 if (end_index == index) { 369 loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); 370 + if (total_len + loff > isize) { 371 page_cache_release(page); 372 break; 373 } 374 /* 375 * force quit after adding this page 376 */ 377 + nr_pages = spd.nr_pages; 378 this_len = min(this_len, loff); 379 + loff = 0; 380 } 381 } 382 fill_it: 383 + pages[spd.nr_pages] = page; 384 + partial[spd.nr_pages].offset = loff; 385 + partial[spd.nr_pages].len = this_len; 386 len -= this_len; 387 + total_len += this_len; 388 loff = 0; 389 } 390 391 + if (spd.nr_pages) 392 + return splice_to_pipe(pipe, &spd); 393 394 return error; 395 } ··· 439 440 /* 441 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 442 + * using sendpage(). Return the number of bytes sent. 443 */ 444 static int pipe_to_sendpage(struct pipe_inode_info *info, 445 struct pipe_buffer *buf, struct splice_desc *sd) 446 { 447 struct file *file = sd->file; 448 loff_t pos = sd->pos; 449 ssize_t ret; 450 void *ptr; 451 int more; ··· 461 if (IS_ERR(ptr)) 462 return PTR_ERR(ptr); 463 464 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 465 466 + ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len, 467 + &pos, more); 468 469 buf->ops->unmap(info, buf); 470 + return ret; 471 } 472 473 /* ··· 499 struct file *file = sd->file; 500 struct address_space *mapping = file->f_mapping; 501 gfp_t gfp_mask = mapping_gfp_mask(mapping); 502 + unsigned int offset, this_len; 503 struct page *page; 504 pgoff_t index; 505 char *src; ··· 514 515 index = sd->pos >> PAGE_CACHE_SHIFT; 516 offset = sd->pos & ~PAGE_CACHE_MASK; 517 + 518 + this_len = sd->len; 519 + if (this_len + offset > PAGE_CACHE_SIZE) 520 + this_len = PAGE_CACHE_SIZE - offset; 521 522 /* 523 * Reuse buf page, if SPLICE_F_MOVE is set. ··· 558 * the full page. 559 */ 560 if (!PageUptodate(page)) { 561 + if (this_len < PAGE_CACHE_SIZE) { 562 ret = mapping->a_ops->readpage(file, page); 563 if (unlikely(ret)) 564 goto out; ··· 582 } 583 } 584 585 + ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); 586 if (ret == AOP_TRUNCATED_PAGE) { 587 page_cache_release(page); 588 goto find_page; ··· 592 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { 593 char *dst = kmap_atomic(page, KM_USER0); 594 595 + memcpy(dst + offset, src + buf->offset, this_len); 596 flush_dcache_page(page); 597 kunmap_atomic(dst, KM_USER0); 598 } 599 600 + ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); 601 if (ret == AOP_TRUNCATED_PAGE) { 602 page_cache_release(page); 603 goto find_page; 604 } else if (ret) 605 goto out; 606 607 + /* 608 + * Return the number of bytes written. 609 + */ 610 + ret = this_len; 611 mark_page_accessed(page); 612 balance_dirty_pages_ratelimited(mapping); 613 out: ··· 616 return ret; 617 } 618 619 /* 620 * Pipe input worker. Most of this logic works like a regular pipe, the 621 * key here is the 'actor' worker passed in that actually moves the data 622 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 623 */ 624 + ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 625 + loff_t *ppos, size_t len, unsigned int flags, 626 + splice_actor *actor) 627 { 628 int ret, do_wakeup, err; 629 struct splice_desc sd; ··· 652 sd.len = sd.total_len; 653 654 err = actor(pipe, buf, &sd); 655 + if (err <= 0) { 656 if (!ret && err != -ENODATA) 657 ret = err; 658 659 break; 660 } 661 662 + ret += err; 663 + buf->offset += err; 664 + buf->len -= err; 665 + 666 + sd.len -= err; 667 + sd.pos += err; 668 + sd.total_len -= err; 669 + if (sd.len) 670 + continue; 671 672 if (!buf->len) { 673 buf->ops = NULL; ··· 672 do_wakeup = 1; 673 } 674 675 if (!sd.total_len) 676 break; 677 } ··· 741 struct address_space *mapping = out->f_mapping; 742 ssize_t ret; 743 744 + ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); 745 if (ret > 0) { 746 struct inode *inode = mapping->host; 747 ··· 783 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 784 loff_t *ppos, size_t len, unsigned int flags) 785 { 786 + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); 787 } 788 789 EXPORT_SYMBOL(generic_splice_sendpage); ··· 870 871 /* 872 * We don't have an immediate reader, but we'll read the stuff 873 + * out of the pipe right after the splice_to_pipe(). So set 874 * PIPE_READERS appropriately. 875 */ 876 pipe->readers = 1; ··· 1008 } 1009 1010 return -EINVAL; 1011 + } 1012 + 1013 + /* 1014 + * Map an iov into an array of pages and offset/length tupples. With the 1015 + * partial_page structure, we can map several non-contiguous ranges into 1016 + * our ones pages[] map instead of splitting that operation into pieces. 1017 + * Could easily be exported as a generic helper for other users, in which 1018 + * case one would probably want to add a 'max_nr_pages' parameter as well. 1019 + */ 1020 + static int get_iovec_page_array(const struct iovec __user *iov, 1021 + unsigned int nr_vecs, struct page **pages, 1022 + struct partial_page *partial) 1023 + { 1024 + int buffers = 0, error = 0; 1025 + 1026 + /* 1027 + * It's ok to take the mmap_sem for reading, even 1028 + * across a "get_user()". 1029 + */ 1030 + down_read(&current->mm->mmap_sem); 1031 + 1032 + while (nr_vecs) { 1033 + unsigned long off, npages; 1034 + void __user *base; 1035 + size_t len; 1036 + int i; 1037 + 1038 + /* 1039 + * Get user address base and length for this iovec. 1040 + */ 1041 + error = get_user(base, &iov->iov_base); 1042 + if (unlikely(error)) 1043 + break; 1044 + error = get_user(len, &iov->iov_len); 1045 + if (unlikely(error)) 1046 + break; 1047 + 1048 + /* 1049 + * Sanity check this iovec. 0 read succeeds. 1050 + */ 1051 + if (unlikely(!len)) 1052 + break; 1053 + error = -EFAULT; 1054 + if (unlikely(!base)) 1055 + break; 1056 + 1057 + /* 1058 + * Get this base offset and number of pages, then map 1059 + * in the user pages. 1060 + */ 1061 + off = (unsigned long) base & ~PAGE_MASK; 1062 + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1063 + if (npages > PIPE_BUFFERS - buffers) 1064 + npages = PIPE_BUFFERS - buffers; 1065 + 1066 + error = get_user_pages(current, current->mm, 1067 + (unsigned long) base, npages, 0, 0, 1068 + &pages[buffers], NULL); 1069 + 1070 + if (unlikely(error <= 0)) 1071 + break; 1072 + 1073 + /* 1074 + * Fill this contiguous range into the partial page map. 1075 + */ 1076 + for (i = 0; i < error; i++) { 1077 + const int plen = min_t(size_t, len, PAGE_SIZE) - off; 1078 + 1079 + partial[buffers].offset = off; 1080 + partial[buffers].len = plen; 1081 + 1082 + off = 0; 1083 + len -= plen; 1084 + buffers++; 1085 + } 1086 + 1087 + /* 1088 + * We didn't complete this iov, stop here since it probably 1089 + * means we have to move some of this into a pipe to 1090 + * be able to continue. 1091 + */ 1092 + if (len) 1093 + break; 1094 + 1095 + /* 1096 + * Don't continue if we mapped fewer pages than we asked for, 1097 + * or if we mapped the max number of pages that we have 1098 + * room for. 1099 + */ 1100 + if (error < npages || buffers == PIPE_BUFFERS) 1101 + break; 1102 + 1103 + nr_vecs--; 1104 + iov++; 1105 + } 1106 + 1107 + up_read(&current->mm->mmap_sem); 1108 + 1109 + if (buffers) 1110 + return buffers; 1111 + 1112 + return error; 1113 + } 1114 + 1115 + /* 1116 + * vmsplice splices a user address range into a pipe. It can be thought of 1117 + * as splice-from-memory, where the regular splice is splice-from-file (or 1118 + * to file). In both cases the output is a pipe, naturally. 1119 + * 1120 + * Note that vmsplice only supports splicing _from_ user memory to a pipe, 1121 + * not the other way around. Splicing from user memory is a simple operation 1122 + * that can be supported without any funky alignment restrictions or nasty 1123 + * vm tricks. We simply map in the user memory and fill them into a pipe. 1124 + * The reverse isn't quite as easy, though. There are two possible solutions 1125 + * for that: 1126 + * 1127 + * - memcpy() the data internally, at which point we might as well just 1128 + * do a regular read() on the buffer anyway. 1129 + * - Lots of nasty vm tricks, that are neither fast nor flexible (it 1130 + * has restriction limitations on both ends of the pipe). 1131 + * 1132 + * Alas, it isn't here. 1133 + * 1134 + */ 1135 + static long do_vmsplice(struct file *file, const struct iovec __user *iov, 1136 + unsigned long nr_segs, unsigned int flags) 1137 + { 1138 + struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe; 1139 + struct page *pages[PIPE_BUFFERS]; 1140 + struct partial_page partial[PIPE_BUFFERS]; 1141 + struct splice_pipe_desc spd = { 1142 + .pages = pages, 1143 + .partial = partial, 1144 + .flags = flags, 1145 + .ops = &user_page_pipe_buf_ops, 1146 + }; 1147 + 1148 + if (unlikely(!pipe)) 1149 + return -EBADF; 1150 + if (unlikely(nr_segs > UIO_MAXIOV)) 1151 + return -EINVAL; 1152 + else if (unlikely(!nr_segs)) 1153 + return 0; 1154 + 1155 + spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial); 1156 + if (spd.nr_pages <= 0) 1157 + return spd.nr_pages; 1158 + 1159 + return splice_to_pipe(pipe, &spd); 1160 + } 1161 + 1162 + asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, 1163 + unsigned long nr_segs, unsigned int flags) 1164 + { 1165 + struct file *file; 1166 + long error; 1167 + int fput; 1168 + 1169 + error = -EBADF; 1170 + file = fget_light(fd, &fput); 1171 + if (file) { 1172 + if (file->f_mode & FMODE_WRITE) 1173 + error = do_vmsplice(file, iov, nr_segs, flags); 1174 + 1175 + fput_light(file, fput); 1176 + } 1177 + 1178 + return error; 1179 } 1180 1181 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
+2 -1
include/asm-i386/unistd.h
··· 321 #define __NR_splice 313 322 #define __NR_sync_file_range 314 323 #define __NR_tee 315 324 325 - #define NR_syscalls 316 326 327 /* 328 * user-visible error numbers are in the range -1 - -128: see
··· 321 #define __NR_splice 313 322 #define __NR_sync_file_range 314 323 #define __NR_tee 315 324 + #define __NR_vmsplice 316 325 326 + #define NR_syscalls 317 327 328 /* 329 * user-visible error numbers are in the range -1 - -128: see
+2 -1
include/asm-ia64/unistd.h
··· 290 #define __NR_get_robust_list 1299 291 #define __NR_sync_file_range 1300 292 #define __NR_tee 1301 293 294 #ifdef __KERNEL__ 295 296 #include <linux/config.h> 297 298 - #define NR_syscalls 278 /* length of syscall table */ 299 300 #define __ARCH_WANT_SYS_RT_SIGACTION 301
··· 290 #define __NR_get_robust_list 1299 291 #define __NR_sync_file_range 1300 292 #define __NR_tee 1301 293 + #define __NR_vmsplice 1302 294 295 #ifdef __KERNEL__ 296 297 #include <linux/config.h> 298 299 + #define NR_syscalls 279 /* length of syscall table */ 300 301 #define __ARCH_WANT_SYS_RT_SIGACTION 302
+2 -1
include/asm-powerpc/unistd.h
··· 303 #define __NR_unshare 282 304 #define __NR_splice 283 305 #define __NR_tee 284 306 307 - #define __NR_syscalls 285 308 309 #ifdef __KERNEL__ 310 #define __NR__exit __NR_exit
··· 303 #define __NR_unshare 282 304 #define __NR_splice 283 305 #define __NR_tee 284 306 + #define __NR_vmsplice 285 307 308 + #define __NR_syscalls 286 309 310 #ifdef __KERNEL__ 311 #define __NR__exit __NR_exit
+3 -1
include/asm-x86_64/unistd.h
··· 615 __SYSCALL(__NR_tee, sys_tee) 616 #define __NR_sync_file_range 277 617 __SYSCALL(__NR_sync_file_range, sys_sync_file_range) 618 619 - #define __NR_syscall_max __NR_sync_file_range 620 621 #ifndef __NO_STUBS 622
··· 615 __SYSCALL(__NR_tee, sys_tee) 616 #define __NR_sync_file_range 277 617 __SYSCALL(__NR_sync_file_range, sys_sync_file_range) 618 + #define __NR_vmsplice 278 619 + __SYSCALL(__NR_vmsplice, sys_vmsplice) 620 621 + #define __NR_syscall_max __NR_vmsplice 622 623 #ifndef __NO_STUBS 624
+17
include/linux/pipe_fs_i.h
··· 61 /* from/to, of course */ 62 #define SPLICE_F_MORE (0x04) /* expect more data */ 63 64 #endif
··· 61 /* from/to, of course */ 62 #define SPLICE_F_MORE (0x04) /* expect more data */ 63 64 + /* 65 + * Passed to the actors 66 + */ 67 + struct splice_desc { 68 + unsigned int len, total_len; /* current and remaining length */ 69 + unsigned int flags; /* splice flags */ 70 + struct file *file; /* file to read/write */ 71 + loff_t pos; /* file position */ 72 + }; 73 + 74 + typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, 75 + struct splice_desc *); 76 + 77 + extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, 78 + loff_t *, size_t, unsigned int, 79 + splice_actor *); 80 + 81 #endif
+3
include/linux/syscalls.h
··· 574 int fd_out, loff_t __user *off_out, 575 size_t len, unsigned int flags); 576 577 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags); 578 579 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
··· 574 int fd_out, loff_t __user *off_out, 575 size_t len, unsigned int flags); 576 577 + asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, 578 + unsigned long nr_segs, unsigned int flags); 579 + 580 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags); 581 582 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,